diff --git a/.gitignore b/.gitignore index 170f51d..6fcf92a 100644 --- a/.gitignore +++ b/.gitignore @@ -176,4 +176,5 @@ cython_debug/ # Ignore local files notebooks/ data/ -sh/ \ No newline at end of file +sh/ +logs/ \ No newline at end of file diff --git a/README.md b/README.md index 800cd0e..9b926c4 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Open RS > Please press ⭐ button if you feel helpful! -This repository hosts the code and datasets for the **Open RS** project, accompanying the paper [*Reinforcement Learning for Reasoning in Small LLMs: What Works and What Doesn’t*](https://arxiv.org/abs/2503.16219). The project explores enhancing reasoning capabilities in small large language models (LLMs) using reinforcement learning (RL) under resource-constrained conditions. +This repository hosts the code and datasets for the **Open RS** project, accompanying the paper *Reinforcement Learning for Reasoning in Small LLMs: What Works and What Doesn’t*. The project explores enhancing reasoning capabilities in small large language models (LLMs) using reinforcement learning (RL) under resource-constrained conditions. We focus on a 1.5-billion-parameter model, `DeepSeek-R1-Distill-Qwen-1.5B`, trained on 4 NVIDIA A40 GPUs (48 GB VRAM each) within 24 hours. By adapting the Group Relative Policy Optimization (GRPO) algorithm and leveraging a curated, compact mathematical reasoning dataset, we conducted three experiments to assess performance and behavior. Key findings include: @@ -13,21 +13,6 @@ These results showcase RL-based fine-tuning as a cost-effective approach for sma ![Performance Metrics](assets/overall.png) -## Resources - -### Models -- [Open-RS1](https://huggingface.co/knoveleng/Open-RS1) -- [Open-RS2](https://huggingface.co/knoveleng/Open-RS2) -- [Open-RS3](https://huggingface.co/knoveleng/Open-RS3) -- Additional models in training: [knoveleng/OpenRS-GRPO](https://huggingface.co/knoveleng/OpenRS-GRPO/commits/main), [quyanh/OpenRS-GRPO](https://huggingface.co/quyanh/OpenRS-GRPO/commits/main) - -### Datasets -- [open-s1](https://huggingface.co/datasets/knoveleng/open-s1) -- [open-deepscaler](https://huggingface.co/datasets/knoveleng/open-deepscaler) -- [open-rs](https://huggingface.co/datasets/knoveleng/open-rs) (used in Experiments 2 and 3) - -### Collection -- [Open-RS Collection](https://huggingface.co/collections/knoveleng/open-rs-67d940abc201a7e7f252ca4e) ## Installation @@ -156,16 +141,3 @@ Our approach uses 7,000 samples (42,000 total outputs) and costs ~$42 on 4x A40 ## Acknowledgements Thanks to the Hugging Face team for their [open-r1](https://github.com/huggingface/open-r1) project. -## Citation -If this project aids your work, please cite it as: -``` -@misc{dang2025reinforcementlearningreasoningsmall, - title={Reinforcement Learning for Reasoning in Small LLMs: What Works and What Doesn't}, - author={Quy-Anh Dang and Chris Ngo}, - year={2025}, - eprint={2503.16219}, - archivePrefix={arXiv}, - primaryClass={cs.LG}, - url={https://arxiv.org/abs/2503.16219}, -} -``` diff --git a/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-12-53.542208.json b/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-12-53.542208.json deleted file mode 100644 index d96bf85..0000000 --- a/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-12-53.542208.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2551784.221348991, - "end_time": 2552485.226336305, - "total_evaluation_time_secondes": "701.0049873138778", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - }, - "all": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "5141f36c632674a8" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "f8465f893370f47e" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-36-50.272791.json b/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-36-50.272791.json deleted file mode 100644 index 8d4ce9b..0000000 --- a/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-36-50.272791.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2552567.750112632, - "end_time": 2553921.892538854, - "total_evaluation_time_secondes": "1354.1424262220971", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.838, - "extractive_match_stderr": 0.016494123566423505 - }, - "all": { - "extractive_match": 0.838, - "extractive_match_stderr": 0.016494123566423505 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "4e007e6ddae6bd47" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "fa9f7bbd65512b23" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-48-54.788605.json b/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-48-54.788605.json deleted file mode 100644 index e6b5c17..0000000 --- a/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-48-54.788605.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2554015.285077489, - "end_time": 2554646.476080566, - "total_evaluation_time_secondes": "631.1910030771978", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - }, - "all": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "b589babd0b65f01d" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "605820c49e2b0f60" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-49-09.435323.json b/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-49-09.435323.json deleted file mode 100644 index 19e1410..0000000 --- a/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-49-09.435323.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2604391.1116118, - "end_time": 2605061.115226257, - "total_evaluation_time_secondes": "670.0036144573241", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - }, - "all": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "5141f36c632674a8" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "f8465f893370f47e" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-13-14.418707.json b/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-13-14.418707.json deleted file mode 100644 index 1d66207..0000000 --- a/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-13-14.418707.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2605133.897685678, - "end_time": 2606506.01809599, - "total_evaluation_time_secondes": "1372.1204103119671", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.838, - "extractive_match_stderr": 0.016494123566423505 - }, - "all": { - "extractive_match": 0.838, - "extractive_match_stderr": 0.016494123566423505 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "4e007e6ddae6bd47" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "fa9f7bbd65512b23" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-25-17.786800.json b/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-25-17.786800.json deleted file mode 100644 index 5811d9f..0000000 --- a/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-25-17.786800.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2606610.004414024, - "end_time": 2607229.474309994, - "total_evaluation_time_secondes": "619.4698959700763", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - }, - "all": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "b589babd0b65f01d" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "605820c49e2b0f60" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-02-07.142924.json b/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-02-07.142924.json deleted file mode 100644 index b78e1ff..0000000 --- a/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-02-07.142924.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2554717.479748211, - "end_time": 2555438.820359841, - "total_evaluation_time_secondes": "721.3406116301194", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - }, - "all": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "b63238639d2fcf76" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "ecb6fd6a17a1ef8e" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-35-55.755190.json b/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-35-55.755190.json deleted file mode 100644 index 8e27875..0000000 --- a/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-35-55.755190.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2555529.780258504, - "end_time": 2557467.354353926, - "total_evaluation_time_secondes": "1937.5740954219364", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.798, - "extractive_match_stderr": 0.017973260031288248 - }, - "all": { - "extractive_match": 0.798, - "extractive_match_stderr": 0.017973260031288248 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "eba9b0271e8c8b2f" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "aafc6b7d92bee8a3" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-48-41.771959.json b/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-48-41.771959.json deleted file mode 100644 index 4490e74..0000000 --- a/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-48-41.771959.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2557548.485465054, - "end_time": 2558233.455578798, - "total_evaluation_time_secondes": "684.9701137440279", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.575, - "extractive_match_stderr": 0.07915823166939519 - }, - "all": { - "extractive_match": 0.575, - "extractive_match_stderr": 0.07915823166939519 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "ccaa56a9b9fe16b8" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "0acfbefb98e82705" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-37-59.982413.json b/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-37-59.982413.json deleted file mode 100644 index 86d20c4..0000000 --- a/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-37-59.982413.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2607292.420535898, - "end_time": 2607991.665164641, - "total_evaluation_time_secondes": "699.2446287432685", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - }, - "all": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "b63238639d2fcf76" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "ecb6fd6a17a1ef8e" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T01-02-29.088363.json b/logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T01-02-29.088363.json deleted file mode 100644 index 4ecc448..0000000 --- a/logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T01-02-29.088363.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2558280.776523193, - "end_time": 2559060.750369516, - "total_evaluation_time_secondes": "779.9738463233225", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.26666666666666666, - "extractive_match_stderr": 0.0821175682735253 - }, - "all": { - "extractive_match": 0.26666666666666666, - "extractive_match_stderr": 0.0821175682735253 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "839a511efa4a6078" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "680d67ec0912d741" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T01-51-01.120880.json b/logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T01-51-01.120880.json deleted file mode 100644 index e365403..0000000 --- a/logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T01-51-01.120880.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2559129.286908899, - "end_time": 2561972.673322505, - "total_evaluation_time_secondes": "2843.3864136058837", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.756, - "extractive_match_stderr": 0.01922673489361459 - }, - "all": { - "extractive_match": 0.756, - "extractive_match_stderr": 0.01922673489361459 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "f124f232f03e4b3c" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "2afd387b6678b989" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-04-41.905474.json b/logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-04-41.905474.json deleted file mode 100644 index c4a90b2..0000000 --- a/logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-04-41.905474.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2562049.988997336, - "end_time": 2562793.583693815, - "total_evaluation_time_secondes": "743.5946964789182", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.575, - "extractive_match_stderr": 0.07915823166939519 - }, - "all": { - "extractive_match": 0.575, - "extractive_match_stderr": 0.07915823166939519 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "9dd35889080f8eca" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "812448b1bfa8fbc7" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-14-11.184777.json b/logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-14-11.184777.json deleted file mode 100644 index 62a8213..0000000 --- a/logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-14-11.184777.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2562859.065263796, - "end_time": 2563362.871206422, - "total_evaluation_time_secondes": "503.8059426257387", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.3333333333333333, - "extractive_match_stderr": 0.08753762190648168 - }, - "all": { - "extractive_match": 0.3333333333333333, - "extractive_match_stderr": 0.08753762190648168 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "57cd2595e3fb4e15" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "2ea2a7e8250f8005" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-36-02.705517.json b/logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-36-02.705517.json deleted file mode 100644 index 22fb6fe..0000000 --- a/logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-36-02.705517.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2563429.852101547, - "end_time": 2564674.316530785, - "total_evaluation_time_secondes": "1244.4644292378798", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.82, - "extractive_match_stderr": 0.01719859247631427 - }, - "all": { - "extractive_match": 0.82, - "extractive_match_stderr": 0.01719859247631427 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "dda7db86b5891548" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "2ed97aefa363786c" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-44-26.970544.json b/logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-44-26.970544.json deleted file mode 100644 index 379326a..0000000 --- a/logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-44-26.970544.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2564788.62842531, - "end_time": 2565178.658914464, - "total_evaluation_time_secondes": "390.03048915416", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.675, - "extractive_match_stderr": 0.07499999999999998 - }, - "all": { - "extractive_match": 0.675, - "extractive_match_stderr": 0.07499999999999998 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "fdaf5a9fa19c5c25" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "a46bfd91934aff2c" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-52-04.589810.json b/logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-52-04.589810.json deleted file mode 100644 index c9f19b4..0000000 --- a/logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-52-04.589810.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2565227.071577089, - "end_time": 2565636.278604793, - "total_evaluation_time_secondes": "409.2070277039893", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.23333333333333334, - "extractive_match_stderr": 0.07854032324531728 - }, - "all": { - "extractive_match": 0.23333333333333334, - "extractive_match_stderr": 0.07854032324531728 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "17884cbea1d03333" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "887aa0bacc8f8844" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-11-26.478898.json b/logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-11-26.478898.json deleted file mode 100644 index 330f27e..0000000 --- a/logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-11-26.478898.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2565700.44115151, - "end_time": 2566798.118010773, - "total_evaluation_time_secondes": "1097.676859262865", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.798, - "extractive_match_stderr": 0.017973260031288265 - }, - "all": { - "extractive_match": 0.798, - "extractive_match_stderr": 0.017973260031288265 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "6b327eeca1fea757" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "6dbad7c9862d4052" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-18-43.114223.json b/logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-18-43.114223.json deleted file mode 100644 index b2b0ea3..0000000 --- a/logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-18-43.114223.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2566869.350252712, - "end_time": 2567234.803742991, - "total_evaluation_time_secondes": "365.4534902786836", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.675, - "extractive_match_stderr": 0.07499999999999998 - }, - "all": { - "extractive_match": 0.675, - "extractive_match_stderr": 0.07499999999999998 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "aeca4b45e767c732" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "e916b987c51016d8" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-13-19.506547.json b/logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-13-19.506547.json deleted file mode 100644 index 3395bd2..0000000 --- a/logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-13-19.506547.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2569877.59925465, - "end_time": 2570511.193301663, - "total_evaluation_time_secondes": "633.5940470127389", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.3333333333333333, - "extractive_match_stderr": 0.08753762190648169 - }, - "all": { - "extractive_match": 0.3333333333333333, - "extractive_match_stderr": 0.08753762190648169 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "9d95073fa199fea1" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "9581b1a436f91769" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-35-14.751095.json b/logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-35-14.751095.json deleted file mode 100644 index 460082d..0000000 --- a/logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-35-14.751095.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2570571.236023537, - "end_time": 2571826.342473873, - "total_evaluation_time_secondes": "1255.106450336054", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.856, - "extractive_match_stderr": 0.015716934945725767 - }, - "all": { - "extractive_match": 0.856, - "extractive_match_stderr": 0.015716934945725767 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "ce035e0da82bf62c" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "185044a2d01d14ca" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-44-39.651229.json b/logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-44-39.651229.json deleted file mode 100644 index 113a278..0000000 --- a/logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-44-39.651229.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2571904.690897501, - "end_time": 2572391.342055246, - "total_evaluation_time_secondes": "486.6511577446945", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - }, - "all": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "806ab92dedff279f" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "a0e7c914c07ced33" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-54-03.406254.json b/logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-54-03.406254.json deleted file mode 100644 index b2524cc..0000000 --- a/logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-54-03.406254.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2572478.041585189, - "end_time": 2572955.095596673, - "total_evaluation_time_secondes": "477.0540114841424", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.36666666666666664, - "extractive_match_stderr": 0.0894855453983996 - }, - "all": { - "extractive_match": 0.36666666666666664, - "extractive_match_stderr": 0.0894855453983996 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "39278bd920db3487" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "b9f7ed706698f186" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-13-57.844846.json b/logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-13-57.844846.json deleted file mode 100644 index 5c51ed5..0000000 --- a/logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-13-57.844846.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2573032.477990599, - "end_time": 2574149.485158068, - "total_evaluation_time_secondes": "1117.0071674692445", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.854, - "extractive_match_stderr": 0.01580720517583485 - }, - "all": { - "extractive_match": 0.854, - "extractive_match_stderr": 0.01580720517583485 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "0e843829e85cfe94" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "db7b20002a364074" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-20-07.618719.json b/logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-20-07.618719.json deleted file mode 100644 index fc7b574..0000000 --- a/logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-20-07.618719.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2574199.530597124, - "end_time": 2574519.310388266, - "total_evaluation_time_secondes": "319.77979114232585", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.75, - "extractive_match_stderr": 0.06933752452815363 - }, - "all": { - "extractive_match": 0.75, - "extractive_match_stderr": 0.06933752452815363 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "6a096939eeb8097d" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "bdec73a60303c3fc" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-31-02.871828.json b/logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-31-02.871828.json deleted file mode 100644 index 3d82ec2..0000000 --- a/logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-31-02.871828.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2574556.171576032, - "end_time": 2575174.544545133, - "total_evaluation_time_secondes": "618.3729691011831", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.2, - "extractive_match_stderr": 0.07427813527082075 - }, - "all": { - "extractive_match": 0.2, - "extractive_match_stderr": 0.07427813527082075 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "6077ed83d15d6889" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "c1a9ed1d84a0d925" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-51-04.119358.json b/logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-51-04.119358.json deleted file mode 100644 index a2ed530..0000000 --- a/logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-51-04.119358.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2575244.022829783, - "end_time": 2576375.754693394, - "total_evaluation_time_secondes": "1131.7318636109121", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.8, - "extractive_match_stderr": 0.017906459241433855 - }, - "all": { - "extractive_match": 0.8, - "extractive_match_stderr": 0.017906459241433855 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "758890b1dfee93f1" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "6da72493ad7e20fa" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-00-23.069698.json b/logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-00-23.069698.json deleted file mode 100644 index b0a09fb..0000000 --- a/logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-00-23.069698.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2576450.971500249, - "end_time": 2576934.750199688, - "total_evaluation_time_secondes": "483.77869943901896", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.575, - "extractive_match_stderr": 0.07915823166939519 - }, - "all": { - "extractive_match": 0.575, - "extractive_match_stderr": 0.07915823166939519 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "56e8f52dea2b9ec6" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "6f3eceda2cec8b33" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-09-49.021006.json b/logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-09-49.021006.json deleted file mode 100644 index 5d83067..0000000 --- a/logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-09-49.021006.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2576972.829442079, - "end_time": 2577500.710391771, - "total_evaluation_time_secondes": "527.8809496918693", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - }, - "all": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "9dea059f367c048a" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "e716ced2d7d9fa84" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-28-55.702973.json b/logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-28-55.702973.json deleted file mode 100644 index 01c658b..0000000 --- a/logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-28-55.702973.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2577587.191756647, - "end_time": 2578647.345242229, - "total_evaluation_time_secondes": "1060.1534855817445", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.826, - "extractive_match_stderr": 0.016971271257516147 - }, - "all": { - "extractive_match": 0.826, - "extractive_match_stderr": 0.016971271257516147 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "7b5df8b0938240b1" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "e53039268588162c" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-34-45.013493.json b/logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-34-45.013493.json deleted file mode 100644 index 38e6e6e..0000000 --- a/logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-34-45.013493.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2578699.013531457, - "end_time": 2578996.706382657, - "total_evaluation_time_secondes": "297.6928511997685", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.65, - "extractive_match_stderr": 0.07637626158259733 - }, - "all": { - "extractive_match": 0.65, - "extractive_match_stderr": 0.07637626158259733 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "0a77fd8b65d5693c" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "a169c269d2eaaac4" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-42-52.541003.json b/logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-42-52.541003.json deleted file mode 100644 index b01d23f..0000000 --- a/logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-42-52.541003.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2579092.107139461, - "end_time": 2579484.220244874, - "total_evaluation_time_secondes": "392.1131054125726", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.2, - "extractive_match_stderr": 0.07427813527082076 - }, - "all": { - "extractive_match": 0.2, - "extractive_match_stderr": 0.07427813527082076 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "8a299fcdd85d83a4" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "d5533616a87b80ed" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-56-52.023106.json b/logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-56-52.023106.json deleted file mode 100644 index 52cf37d..0000000 --- a/logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-56-52.023106.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2579564.124064897, - "end_time": 2580323.670382602, - "total_evaluation_time_secondes": "759.5463177049533", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.824, - "extractive_match_stderr": 0.017047852020622263 - }, - "all": { - "extractive_match": 0.824, - "extractive_match_stderr": 0.017047852020622263 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "b2d7391eac1f81f2" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "8e0031f22a0aa723" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-04-05.239254.json b/logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-04-05.239254.json deleted file mode 100644 index 221cb66..0000000 --- a/logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-04-05.239254.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2580424.328730572, - "end_time": 2580756.931524183, - "total_evaluation_time_secondes": "332.60279361112043", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.675, - "extractive_match_stderr": 0.07499999999999998 - }, - "all": { - "extractive_match": 0.675, - "extractive_match_stderr": 0.07499999999999998 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "b38129b60c769bd6" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "4a03618c47084d90" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-10-27.028672.json b/logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-10-27.028672.json deleted file mode 100644 index 7fea938..0000000 --- a/logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-10-27.028672.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2580812.042311833, - "end_time": 2581138.714750407, - "total_evaluation_time_secondes": "326.67243857402354", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.3333333333333333, - "extractive_match_stderr": 0.0875376219064817 - }, - "all": { - "extractive_match": 0.3333333333333333, - "extractive_match_stderr": 0.0875376219064817 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "314d40cde0446e00" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "83c2fe7147a45ed0" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-24-32.807123.json b/logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-24-32.807123.json deleted file mode 100644 index cde79fb..0000000 --- a/logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-24-32.807123.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2581209.555826082, - "end_time": 2581984.464070361, - "total_evaluation_time_secondes": "774.9082442792132", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.814, - "extractive_match_stderr": 0.01741880678058394 - }, - "all": { - "extractive_match": 0.814, - "extractive_match_stderr": 0.01741880678058394 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "a050975778a6be25" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "20a4136b5748e4f1" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-29-47.478300.json b/logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-29-47.478300.json deleted file mode 100644 index a46f867..0000000 --- a/logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-29-47.478300.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2582057.657695042, - "end_time": 2582299.170243019, - "total_evaluation_time_secondes": "241.51254797680303", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.625, - "extractive_match_stderr": 0.07752170911825529 - }, - "all": { - "extractive_match": 0.625, - "extractive_match_stderr": 0.07752170911825529 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "02bf8270f672d019" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "9bcd38a007e3c1f7" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-36-30.994685.json b/logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-36-30.994685.json deleted file mode 100644 index c327fc4..0000000 --- a/logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-36-30.994685.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2582347.378834666, - "end_time": 2582702.685794419, - "total_evaluation_time_secondes": "355.30695975292474", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.26666666666666666, - "extractive_match_stderr": 0.0821175682735253 - }, - "all": { - "extractive_match": 0.26666666666666666, - "extractive_match_stderr": 0.0821175682735253 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "77b972bf9c1cfb26" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "585ab41a6bd09268" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-49-32.062451.json b/logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-49-32.062451.json deleted file mode 100644 index 745c63a..0000000 --- a/logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-49-32.062451.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2582781.263596611, - "end_time": 2583483.690574673, - "total_evaluation_time_secondes": "702.426978061907", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.822, - "extractive_match_stderr": 0.017123622189062257 - }, - "all": { - "extractive_match": 0.822, - "extractive_match_stderr": 0.017123622189062257 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "36524c3d10393f7f" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "e4920254817e7bfc" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-55-51.416032.json b/logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-55-51.416032.json deleted file mode 100644 index 1f6f1e6..0000000 --- a/logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-55-51.416032.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2583567.839449735, - "end_time": 2583863.106798447, - "total_evaluation_time_secondes": "295.2673487123102", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - }, - "all": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "f52cd39075e5792e" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "17e169df3b2c15f2" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-30-59.655917.json b/logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-30-59.655917.json deleted file mode 100644 index e8b2e4b..0000000 --- a/logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-30-59.655917.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2567298.375852412, - "end_time": 2567971.3367268, - "total_evaluation_time_secondes": "672.9608743879944", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - }, - "all": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "76473421bbe93410" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "a2125e595bab4673" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-53-53.011460.json b/logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-53-53.011460.json deleted file mode 100644 index 60727c1..0000000 --- a/logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-53-53.011460.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2568037.999388426, - "end_time": 2569344.647350468, - "total_evaluation_time_secondes": "1306.6479620421305", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.854, - "extractive_match_stderr": 0.01580720517583485 - }, - "all": { - "extractive_match": 0.854, - "extractive_match_stderr": 0.01580720517583485 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "961808827104c739" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "0a4348ce590f191d" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-01-54.627135.json b/logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-01-54.627135.json deleted file mode 100644 index 748b51a..0000000 --- a/logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-01-54.627135.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2569429.209718564, - "end_time": 2569826.319747058, - "total_evaluation_time_secondes": "397.11002849414945", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.8, - "extractive_match_stderr": 0.06405126152203487 - }, - "all": { - "extractive_match": 0.8, - "extractive_match_stderr": 0.06405126152203487 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "d1d854bc50c70679" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "198d7b3d60a17a97" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-52-22.004802.json b/logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-52-22.004802.json deleted file mode 100644 index ea4b990..0000000 --- a/logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-52-22.004802.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2586637.696856398, - "end_time": 2587253.686011266, - "total_evaluation_time_secondes": "615.9891548678279", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.08509629433967632 - }, - "all": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.08509629433967632 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "f899e2531ad5b6e1" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "a424417871d83f65" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-16-52.530386.json b/logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-16-52.530386.json deleted file mode 100644 index 4e49221..0000000 --- a/logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-16-52.530386.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2587310.198941251, - "end_time": 2588724.15183972, - "total_evaluation_time_secondes": "1413.952898469288", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.844, - "extractive_match_stderr": 0.016243636028391104 - }, - "all": { - "extractive_match": 0.844, - "extractive_match_stderr": 0.016243636028391104 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "0ff22df8a2e32ff6" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "13d8e0072ee283ae" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-26-36.766492.json b/logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-26-36.766492.json deleted file mode 100644 index 0fce891..0000000 --- a/logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-26-36.766492.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2588822.652426989, - "end_time": 2589308.453780031, - "total_evaluation_time_secondes": "485.80135304201394", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - }, - "all": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "cff1c1f58a10c3ec" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "9f14f44a8006a380" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-36-59.825027.json b/logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-36-59.825027.json deleted file mode 100644 index 9deb9a6..0000000 --- a/logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-36-59.825027.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2589368.024663066, - "end_time": 2589931.508379323, - "total_evaluation_time_secondes": "563.4837162569165", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.43333333333333335, - "extractive_match_stderr": 0.0920186554465537 - }, - "all": { - "extractive_match": 0.43333333333333335, - "extractive_match_stderr": 0.0920186554465537 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "5f98c2f013b48f03" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "ed47f783736df3bf" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-55-14.957560.json b/logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-55-14.957560.json deleted file mode 100644 index fe94103..0000000 --- a/logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-55-14.957560.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2589983.778860533, - "end_time": 2591026.655002043, - "total_evaluation_time_secondes": "1042.8761415099725", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.856, - "extractive_match_stderr": 0.015716934945725767 - }, - "all": { - "extractive_match": 0.856, - "extractive_match_stderr": 0.015716934945725767 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "0fab9c626e9e9390" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "a22503828d7d43e3" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-04-33.528650.json b/logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-04-33.528650.json deleted file mode 100644 index 53699cc..0000000 --- a/logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-04-33.528650.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2591099.695949811, - "end_time": 2591585.220585029, - "total_evaluation_time_secondes": "485.52463521808386", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.75, - "extractive_match_stderr": 0.06933752452815363 - }, - "all": { - "extractive_match": 0.75, - "extractive_match_stderr": 0.06933752452815363 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "1b6a1b4002eec6c6" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "21b4cbba6be064a5" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-15-17.421583.json b/logs/evals/Exp3_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-15-17.421583.json deleted file mode 100644 index 59dcc60..0000000 --- a/logs/evals/Exp3_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-15-17.421583.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2591637.731648111, - "end_time": 2592229.108160378, - "total_evaluation_time_secondes": "591.3765122671612", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.23333333333333334, - "extractive_match_stderr": 0.07854032324531728 - }, - "all": { - "extractive_match": 0.23333333333333334, - "extractive_match_stderr": 0.07854032324531728 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "aba7231d0ede8892" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "68a37f065ac252b2" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-33-42.826955.json b/logs/evals/Exp3_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-33-42.826955.json deleted file mode 100644 index ef36b01..0000000 --- a/logs/evals/Exp3_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-33-42.826955.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2592299.508424689, - "end_time": 2593334.475003944, - "total_evaluation_time_secondes": "1034.9665792547166", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.852, - "extractive_match_stderr": 0.015896458561251246 - }, - "all": { - "extractive_match": 0.852, - "extractive_match_stderr": 0.015896458561251246 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "e3e867c652ccb694" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "d6ee582ea943bd6e" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-39-54.446838.json b/logs/evals/Exp3_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-39-54.446838.json deleted file mode 100644 index c415d01..0000000 --- a/logs/evals/Exp3_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-39-54.446838.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2593401.704100977, - "end_time": 2593706.140676645, - "total_evaluation_time_secondes": "304.43657566793263", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.775, - "extractive_match_stderr": 0.06686668711812967 - }, - "all": { - "extractive_match": 0.775, - "extractive_match_stderr": 0.06686668711812967 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "e1ea54b6a1bf95b4" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "a71856567a33efac" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-50-49.693635.json b/logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-50-49.693635.json deleted file mode 100644 index d648f60..0000000 --- a/logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-50-49.693635.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2593753.251117188, - "end_time": 2594361.376800723, - "total_evaluation_time_secondes": "608.1256835348904", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.3333333333333333, - "extractive_match_stderr": 0.08753762190648169 - }, - "all": { - "extractive_match": 0.3333333333333333, - "extractive_match_stderr": 0.08753762190648169 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "490dc46f67c29942" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "18c8551d8a1015a6" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-11-55.345666.json b/logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-11-55.345666.json deleted file mode 100644 index fc6908a..0000000 --- a/logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-11-55.345666.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2594416.381958949, - "end_time": 2595626.985790251, - "total_evaluation_time_secondes": "1210.603831301909", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.804, - "extractive_match_stderr": 0.017770751227744862 - }, - "all": { - "extractive_match": 0.804, - "extractive_match_stderr": 0.017770751227744862 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "1678101c78f1f285" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "9f08e5d1c86fbf9c" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-22-45.088452.json b/logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-22-45.088452.json deleted file mode 100644 index bcb180e..0000000 --- a/logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-22-45.088452.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2595694.940795575, - "end_time": 2596276.773000479, - "total_evaluation_time_secondes": "581.8322049044073", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.6, - "extractive_match_stderr": 0.07844645405527362 - }, - "all": { - "extractive_match": 0.6, - "extractive_match_stderr": 0.07844645405527362 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "597b0bb4fc1adfd9" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "66cde9f29d4c2f5c" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-33-37.815028.json b/logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-33-37.815028.json deleted file mode 100644 index 7464a62..0000000 --- a/logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-33-37.815028.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2596337.815541219, - "end_time": 2596929.497024281, - "total_evaluation_time_secondes": "591.6814830619842", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.26666666666666666, - "extractive_match_stderr": 0.0821175682735253 - }, - "all": { - "extractive_match": 0.26666666666666666, - "extractive_match_stderr": 0.0821175682735253 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "d893bd40128f3f01" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "304e7ec495522d6e" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-56-26.434504.json b/logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-56-26.434504.json deleted file mode 100644 index 8510407..0000000 --- a/logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-56-26.434504.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2596993.780300828, - "end_time": 2598298.12918594, - "total_evaluation_time_secondes": "1304.3488851119764", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.83, - "extractive_match_stderr": 0.016815633531393415 - }, - "all": { - "extractive_match": 0.83, - "extractive_match_stderr": 0.016815633531393415 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "2d8a8b96257009fe" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "69e17b0bd9a55619" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-07-33.637126.json b/logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-07-33.637126.json deleted file mode 100644 index facdefe..0000000 --- a/logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-07-33.637126.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2598411.082974548, - "end_time": 2598965.327884916, - "total_evaluation_time_secondes": "554.2449103682302", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - }, - "all": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "40d9bb73f755c8ae" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "0f0dc5923fa3d806" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-19-09.426338.json b/logs/evals/Exp3_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-19-09.426338.json deleted file mode 100644 index 2a617ce..0000000 --- a/logs/evals/Exp3_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-19-09.426338.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2599055.52137683, - "end_time": 2599661.112662593, - "total_evaluation_time_secondes": "605.5912857628427", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.23333333333333334, - "extractive_match_stderr": 0.07854032324531728 - }, - "all": { - "extractive_match": 0.23333333333333334, - "extractive_match_stderr": 0.07854032324531728 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "b2815827e166fb14" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "26c35b29751f9b6d" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-45-32.618178.json b/logs/evals/Exp3_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-45-32.618178.json deleted file mode 100644 index 3a9c6e6..0000000 --- a/logs/evals/Exp3_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-45-32.618178.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2599724.580276648, - "end_time": 2601244.233675632, - "total_evaluation_time_secondes": "1519.6533989841118", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.814, - "extractive_match_stderr": 0.017418806780583957 - }, - "all": { - "extractive_match": 0.814, - "extractive_match_stderr": 0.017418806780583957 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "966cfae3386821e2" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "06e0fffe6fd58548" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-54-08.288591.json b/logs/evals/Exp3_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-54-08.288591.json deleted file mode 100644 index f9452b5..0000000 --- a/logs/evals/Exp3_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-54-08.288591.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2601346.196358103, - "end_time": 2601759.980020389, - "total_evaluation_time_secondes": "413.7836622861214", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - }, - "all": { - "extractive_match": 0.7, - "extractive_match_stderr": 0.07337993857053426 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "3d0089b1cb6faa2d" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "adf30c7654ed9c16" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-02-26.672622.json b/logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-02-26.672622.json deleted file mode 100644 index 543ac9e..0000000 --- a/logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-02-26.672622.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2601826.367747913, - "end_time": 2602258.347004526, - "total_evaluation_time_secondes": "431.9792566127144", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - }, - "all": { - "extractive_match": 0.3, - "extractive_match_stderr": 0.0850962943396763 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "699ac3b1b5301ac1" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "0245497f2bebbcc5" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-24-00.237541.json b/logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-24-00.237541.json deleted file mode 100644 index 030f850..0000000 --- a/logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-24-00.237541.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2602298.434195373, - "end_time": 2603551.813672935, - "total_evaluation_time_secondes": "1253.3794775619172", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.824, - "extractive_match_stderr": 0.01704785202062228 - }, - "all": { - "extractive_match": 0.824, - "extractive_match_stderr": 0.01704785202062228 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "17b24f3c520b6310" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "fcb10a4340d55cf4" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-32-38.296980.json b/logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-32-38.296980.json deleted file mode 100644 index f752c62..0000000 --- a/logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-32-38.296980.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2603652.439867107, - "end_time": 2604069.984818537, - "total_evaluation_time_secondes": "417.5449514295906", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.675, - "extractive_match_stderr": 0.07499999999999998 - }, - "all": { - "extractive_match": 0.675, - "extractive_match_stderr": 0.07499999999999998 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "3e0fd04e750b02c0" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "02c3a8d49a2e3ee3" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-07-05.491769.json b/logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-07-05.491769.json deleted file mode 100644 index 237ea1c..0000000 --- a/logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-07-05.491769.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2583924.187252476, - "end_time": 2584537.177876145, - "total_evaluation_time_secondes": "612.9906236692332", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|aime24|0": { - "extractive_match": 0.4666666666666667, - "extractive_match_stderr": 0.09264111117062017 - }, - "all": { - "extractive_match": 0.4666666666666667, - "extractive_match_stderr": 0.09264111117062017 - } - }, - "versions": { - "custom|aime24|0": 1 - }, - "config_tasks": { - "custom|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "253167becf0dfed7", - "hash_input_tokens": "bf1cc75b5f12dfb8", - "hash_cont_tokens": "fd81113ccd505a4e" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "84ff409b6bbf7cc0", - "hash_input_tokens": "9a8c7e54ce09af84", - "hash_cont_tokens": "2a36a200d6cd2247" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-31-16.022306.json b/logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-31-16.022306.json deleted file mode 100644 index b02cc15..0000000 --- a/logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-31-16.022306.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2584606.174923473, - "end_time": 2585987.719419564, - "total_evaluation_time_secondes": "1381.5444960910827", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|math_500|0": { - "extractive_match": 0.844, - "extractive_match_stderr": 0.016243636028391097 - }, - "all": { - "extractive_match": 0.844, - "extractive_match_stderr": 0.016243636028391097 - } - }, - "versions": { - "custom|math_500|0": 1 - }, - "config_tasks": { - "custom|math_500": { - "name": "math_500", - "prompt_function": "math_prompt_fn", - "hf_repo": "HuggingFaceH4/MATH-500", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": false, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 500, - "effective_num_docs": 500, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|math_500|0": { - "hashes": { - "hash_examples": "adf0cc8311011db2", - "hash_full_prompts": "63c902dbdbaf1552", - "hash_input_tokens": "2af397a095a31139", - "hash_cont_tokens": "6f3bc11d908efbf4" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "bfaad1993ff37a60", - "hash_full_prompts": "3ceaaade5cf43911", - "hash_input_tokens": "c663dbac8a64d3e4", - "hash_cont_tokens": "c2619f7c35dda61a" - }, - "truncated": 0, - "non_truncated": 500, - "padded": 0, - "non_padded": 500, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-41-29.644364.json b/logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-41-29.644364.json deleted file mode 100644 index dff2a60..0000000 --- a/logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-41-29.644364.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": -1, - "max_samples": null, - "job_id": 0, - "start_time": 2586061.970029914, - "end_time": 2586601.331146573, - "total_evaluation_time_secondes": "539.3611166593619", - "model_name": "quyanh/OpenRS-GRPO", - "model_sha": "", - "model_dtype": null, - "model_size": null - }, - "results": { - "custom|amc23|0": { - "extractive_match": 0.725, - "extractive_match_stderr": 0.0714995069016527 - }, - "all": { - "extractive_match": 0.725, - "extractive_match_stderr": 0.0714995069016527 - } - }, - "versions": { - "custom|amc23|0": 1 - }, - "config_tasks": { - "custom|amc23": { - "name": "amc23", - "prompt_function": "amc_prompt_fn", - "hf_repo": "knoveleng/AMC-23", - "hf_subset": "default", - "metric": [ - { - "metric_name": "extractive_match", - "higher_is_better": true, - "category": "3", - "use_case": "1", - "sample_level_fn": "sample_level_fn", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "custom|amc23|0": { - "hashes": { - "hash_examples": "57f3ead69f601b6a", - "hash_full_prompts": "64c05856286ef8dc", - "hash_input_tokens": "b64afe4485ef61c0", - "hash_cont_tokens": "c0838ad8f27af064" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "fe1f1f4512256bec", - "hash_full_prompts": "882107366cadc5ce", - "hash_input_tokens": "638cd1079164f374", - "hash_cont_tokens": "68bf0a6b43ec34ae" - }, - "truncated": 0, - "non_truncated": 40, - "padded": 0, - "non_padded": 40, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/logs/logs.txt b/logs/logs.txt deleted file mode 100644 index 89cd797..0000000 --- a/logs/logs.txt +++ /dev/null @@ -1,15910 +0,0 @@ -2025-03-14T22:55:26.695749046Z ========== -2025-03-14T22:55:26.695756246Z == CUDA == -2025-03-14T22:55:26.695764279Z ========== -2025-03-14T22:55:26.704045697Z CUDA Version 12.4.1 -2025-03-14T22:55:26.704050902Z Container image Copyright (c) 2016-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -2025-03-14T22:55:26.704054511Z This container image and its contents are governed by the NVIDIA Deep Learning Container License. -2025-03-14T22:55:26.704056371Z By pulling and using the container, you accept the terms and conditions of this license: -2025-03-14T22:55:26.704058066Z https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license -2025-03-14T22:55:26.704061413Z A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience. -2025-03-14T22:55:27.751169509Z downloading uv 0.6.6 x86_64-unknown-linux-gnu -2025-03-14T22:55:28.645220727Z no checksums to verify -2025-03-14T22:55:28.996691222Z installing to /root/.local/bin -2025-03-14T22:55:29.005883510Z uv -2025-03-14T22:55:29.009218297Z uvx -2025-03-14T22:55:29.009299117Z everything's installed! -2025-03-14T22:55:29.042812582Z To add $HOME/.local/bin to your PATH, either restart your shell or run: -2025-03-14T22:55:29.042817149Z source $HOME/.local/bin/env (sh, bash, zsh) -2025-03-14T22:55:29.042819746Z source $HOME/.local/bin/env.fish (fish) -2025-03-14T22:55:29.044182069Z WARNING: The following commands are shadowed by other commands in your PATH: uv uvx -2025-03-14T22:55:29.108367987Z Using CPython 3.11.10 interpreter at: /usr/bin/python3.11 -2025-03-14T22:55:29.108403119Z Creating virtual environment at: openr1 -2025-03-14T22:55:35.532567444Z Activate with: source openr1/bin/activate -2025-03-14T22:55:35.618466882Z Using Python 3.11.10 environment at: openr1 -2025-03-14T22:55:35.648821477Z Resolved 1 package in 27ms -2025-03-14T22:55:35.650741114Z Downloading pip (1.8MiB) -2025-03-14T22:55:35.736787946Z Downloaded pip -2025-03-14T22:55:35.737076985Z Prepared 1 package in 88ms -2025-03-14T22:55:35.751612142Z warning: Failed to hardlink files; falling back to full copy. This may lead to degraded performance. -2025-03-14T22:55:35.751638070Z If the cache and target directories are on different filesystems, hardlinking may not be supported. -2025-03-14T22:55:35.751641686Z If this is intentional, set `export UV_LINK_MODE=copy` or use `--link-mode=copy` to suppress this warning. -2025-03-14T22:55:37.241942480Z Installed 1 package in 1.50s -2025-03-14T22:55:37.242001713Z + pip==25.0.1 -2025-03-14T22:55:37.261831027Z Using Python 3.11.10 environment at: openr1 -2025-03-14T22:55:38.032721612Z Resolved 127 packages in 768ms -2025-03-14T22:55:38.048294865Z Downloading aiohttp (1.6MiB) -2025-03-14T22:55:38.048435690Z Downloading numpy (17.4MiB) -2025-03-14T22:55:38.048581700Z Downloading networkx (1.6MiB) -2025-03-14T22:55:38.048711432Z Downloading tiktoken (1.1MiB) -2025-03-14T22:55:38.052309404Z Downloading virtualenv (4.1MiB) -2025-03-14T22:55:38.052517510Z Downloading triton (199.8MiB) -2025-03-14T22:55:38.052625913Z Downloading torchaudio (3.2MiB) -2025-03-14T22:55:38.052733977Z Downloading mistral-common (6.2MiB) -2025-03-14T22:55:38.052844303Z Downloading sentencepiece (1.2MiB) -2025-03-14T22:55:38.053094499Z Downloading ray (64.6MiB) -2025-03-14T22:55:38.053205889Z Downloading uvloop (3.8MiB) -2025-03-14T22:55:38.053332129Z Downloading pycountry (6.0MiB) -2025-03-14T22:55:38.053442386Z Downloading py-spy (2.6MiB) -2025-03-14T22:55:38.053567832Z Downloading xgrammar (4.7MiB) -2025-03-14T22:55:38.053772001Z Downloading torchvision (6.9MiB) -2025-03-14T22:55:38.053882225Z Downloading pillow (4.3MiB) -2025-03-14T22:55:38.054061384Z Downloading pydantic-core (1.9MiB) -2025-03-14T22:55:38.054245215Z Downloading vllm (252.0MiB) -2025-03-14T22:55:38.054311825Z Downloading xformers (15.9MiB) -2025-03-14T22:55:38.054397337Z Downloading grpcio (5.7MiB) -2025-03-14T22:55:38.054483804Z Downloading nvidia-nvjitlink-cu12 (20.1MiB) -2025-03-14T22:55:38.054556753Z Downloading tokenizers (2.9MiB) -2025-03-14T22:55:38.054645774Z Downloading nvidia-cuda-cupti-cu12 (13.2MiB) -2025-03-14T22:55:38.054717854Z Downloading nvidia-cudnn-cu12 (634.0MiB) -2025-03-14T22:55:38.054812629Z Downloading nvidia-nccl-cu12 (179.9MiB) -2025-03-14T22:55:38.054877079Z Downloading torch (864.5MiB) -2025-03-14T22:55:38.054967122Z Downloading nvidia-curand-cu12 (53.7MiB) -2025-03-14T22:55:38.055038513Z Downloading transformers (9.5MiB) -2025-03-14T22:55:38.055175996Z Downloading nvidia-cusolver-cu12 (122.0MiB) -2025-03-14T22:55:38.055247693Z Downloading opencv-python-headless (47.7MiB) -2025-03-14T22:55:38.055328576Z Downloading nvidia-cublas-cu12 (346.6MiB) -2025-03-14T22:55:38.055381786Z Downloading sympy (5.9MiB) -2025-03-14T22:55:38.055443999Z Downloading nvidia-cufft-cu12 (201.7MiB) -2025-03-14T22:55:38.060542881Z Downloading nvidia-cuda-nvrtc-cu12 (23.5MiB) -2025-03-14T22:55:38.065437687Z Downloading nvidia-cusparse-cu12 (197.8MiB) -2025-03-14T22:55:38.702101838Z Downloaded tiktoken -2025-03-14T22:55:38.805377664Z Downloaded sentencepiece -2025-03-14T22:55:38.982627700Z Downloaded aiohttp -2025-03-14T22:55:39.121139763Z Downloaded pydantic-core -2025-03-14T22:55:39.418175569Z Downloaded py-spy -2025-03-14T22:55:39.546469855Z Downloaded tokenizers -2025-03-14T22:55:39.764630443Z Downloaded networkx -2025-03-14T22:55:39.771676735Z Downloaded torchaudio -2025-03-14T22:55:39.977772768Z Downloaded virtualenv -2025-03-14T22:55:39.998160069Z Downloaded uvloop -2025-03-14T22:55:40.002822045Z Downloaded pillow -2025-03-14T22:55:40.122323677Z Downloaded xgrammar -2025-03-14T22:55:40.506722921Z Downloaded grpcio -2025-03-14T22:55:40.576072562Z Downloaded mistral-common -2025-03-14T22:55:40.585897201Z Downloaded pycountry -2025-03-14T22:55:40.777980339Z Downloaded torchvision -2025-03-14T22:55:41.560415009Z Downloaded nvidia-cuda-cupti-cu12 -2025-03-14T22:55:41.841844182Z Downloaded sympy -2025-03-14T22:55:42.038605606Z Downloaded numpy -2025-03-14T22:55:42.059996159Z Downloaded xformers -2025-03-14T22:55:42.419988111Z Downloaded nvidia-nvjitlink-cu12 -2025-03-14T22:55:42.672184431Z Downloaded nvidia-cuda-nvrtc-cu12 -2025-03-14T22:55:44.143648529Z Downloaded transformers -2025-03-14T22:55:44.864373264Z Downloaded nvidia-curand-cu12 -2025-03-14T22:55:45.522874426Z Downloaded opencv-python-headless -2025-03-14T22:55:49.510410546Z Downloaded nvidia-cusolver-cu12 -2025-03-14T22:55:52.232253613Z Downloaded nvidia-nccl-cu12 -2025-03-14T22:55:53.189774148Z Downloaded triton -2025-03-14T22:55:53.388934911Z Downloaded nvidia-cusparse-cu12 -2025-03-14T22:55:53.499140352Z Downloaded nvidia-cufft-cu12 -2025-03-14T22:55:56.928291222Z Downloaded vllm -2025-03-14T22:55:57.419144098Z Downloaded nvidia-cublas-cu12 -2025-03-14T22:56:01.061562477Z Downloaded nvidia-cudnn-cu12 -2025-03-14T22:56:01.693486028Z Downloaded ray -2025-03-14T22:56:07.265525304Z Downloaded torch -2025-03-14T22:56:07.266159920Z Prepared 127 packages in 29.22s -2025-03-14T22:57:33.659851701Z Installed 127 packages in 1m 26s -2025-03-14T22:57:33.660781541Z + aiohappyeyeballs==2.6.1 -2025-03-14T22:57:33.660816883Z + aiohttp==3.11.13 -2025-03-14T22:57:33.660822513Z + aiohttp-cors==0.7.0 -2025-03-14T22:57:33.660827239Z + aiosignal==1.3.2 -2025-03-14T22:57:33.660831911Z + airportsdata==20250224 -2025-03-14T22:57:33.660837273Z + annotated-types==0.7.0 -2025-03-14T22:57:33.660842291Z + anyio==4.8.0 -2025-03-14T22:57:33.660869236Z + astor==0.8.1 -2025-03-14T22:57:33.660874819Z + attrs==25.3.0 -2025-03-14T22:57:33.660937379Z + blake3==1.0.4 -2025-03-14T22:57:33.660993399Z + cachetools==5.5.2 -2025-03-14T22:57:33.661029770Z + certifi==2025.1.31 -2025-03-14T22:57:33.661035393Z + charset-normalizer==3.4.1 -2025-03-14T22:57:33.661040989Z + click==8.1.8 -2025-03-14T22:57:33.661048493Z + cloudpickle==3.1.1 -2025-03-14T22:57:33.661070273Z + colorful==0.5.6 -2025-03-14T22:57:33.661106436Z + compressed-tensors==0.9.1 -2025-03-14T22:57:33.661186610Z + depyf==0.18.0 -2025-03-14T22:57:33.661229286Z + dill==0.3.9 -2025-03-14T22:57:33.661262960Z + diskcache==5.6.3 -2025-03-14T22:57:33.661269870Z + distlib==0.3.9 -2025-03-14T22:57:33.661297283Z + distro==1.9.0 -2025-03-14T22:57:33.661324283Z + einops==0.8.1 -2025-03-14T22:57:33.661344905Z + fastapi==0.115.11 -2025-03-14T22:57:33.661400348Z + filelock==3.18.0 -2025-03-14T22:57:33.661446570Z + frozenlist==1.5.0 -2025-03-14T22:57:33.661451578Z + fsspec==2025.3.0 -2025-03-14T22:57:33.661462132Z + gguf==0.10.0 -2025-03-14T22:57:33.661502668Z + google-api-core==2.8.0 -2025-03-14T22:57:33.661538962Z + google-auth==2.38.0 -2025-03-14T22:57:33.661636102Z + googleapis-common-protos==1.56.1 -2025-03-14T22:57:33.661650245Z + grpcio==1.71.0 -2025-03-14T22:57:33.661689845Z + h11==0.14.0 -2025-03-14T22:57:33.661702192Z + httpcore==1.0.7 -2025-03-14T22:57:33.661709725Z + httptools==0.6.4 -2025-03-14T22:57:33.661758572Z + httpx==0.28.1 -2025-03-14T22:57:33.661859799Z + huggingface-hub==0.29.3 -2025-03-14T22:57:33.661883672Z + idna==3.10 -2025-03-14T22:57:33.661890122Z + importlib-metadata==8.6.1 -2025-03-14T22:57:33.661919747Z + interegular==0.3.3 -2025-03-14T22:57:33.661924994Z + jinja2==3.1.6 -2025-03-14T22:57:33.661971272Z + jiter==0.9.0 -2025-03-14T22:57:33.661976669Z + jsonschema==4.23.0 -2025-03-14T22:57:33.662025244Z + jsonschema-specifications==2024.10.1 -2025-03-14T22:57:33.662085517Z + lark==1.2.2 -2025-03-14T22:57:33.662108031Z + lm-format-enforcer==0.10.11 -2025-03-14T22:57:33.662124358Z + markupsafe==3.0.2 -2025-03-14T22:57:33.662172277Z + mistral-common==1.5.3 -2025-03-14T22:57:33.662177734Z + mpmath==1.3.0 -2025-03-14T22:57:33.662214418Z + msgpack==1.1.0 -2025-03-14T22:57:33.662238104Z + msgspec==0.19.0 -2025-03-14T22:57:33.662270148Z + multidict==6.1.0 -2025-03-14T22:57:33.662324398Z + nest-asyncio==1.6.0 -2025-03-14T22:57:33.662356568Z + networkx==3.4.2 -2025-03-14T22:57:33.662382547Z + numpy==1.26.4 -2025-03-14T22:57:33.662432501Z + nvidia-cublas-cu12==12.4.5.8 -2025-03-14T22:57:33.662463346Z + nvidia-cuda-cupti-cu12==12.4.127 -2025-03-14T22:57:33.662471036Z + nvidia-cuda-nvrtc-cu12==12.4.127 -2025-03-14T22:57:33.662494948Z + nvidia-cuda-runtime-cu12==12.4.127 -2025-03-14T22:57:33.662516988Z + nvidia-cudnn-cu12==9.1.0.70 -2025-03-14T22:57:33.662559763Z + nvidia-cufft-cu12==11.2.1.3 -2025-03-14T22:57:33.662592043Z + nvidia-curand-cu12==10.3.5.147 -2025-03-14T22:57:33.662646403Z + nvidia-cusolver-cu12==11.6.1.9 -2025-03-14T22:57:33.662656890Z + nvidia-cusparse-cu12==12.3.1.170 -2025-03-14T22:57:33.662686113Z + nvidia-ml-py==12.570.86 -2025-03-14T22:57:33.662725137Z + nvidia-nccl-cu12==2.21.5 -2025-03-14T22:57:33.662758156Z + nvidia-nvjitlink-cu12==12.4.127 -2025-03-14T22:57:33.662788100Z + nvidia-nvtx-cu12==12.4.127 -2025-03-14T22:57:33.662838750Z + openai==1.66.3 -2025-03-14T22:57:33.662850780Z + opencensus==0.11.4 -2025-03-14T22:57:33.662873047Z + opencensus-context==0.1.3 -2025-03-14T22:57:33.662897047Z + opencv-python-headless==4.11.0.86 -2025-03-14T22:57:33.662931487Z + outlines==0.1.11 -2025-03-14T22:57:33.662989797Z + outlines-core==0.1.26 -2025-03-14T22:57:33.662998377Z + packaging==24.2 -2025-03-14T22:57:33.663060853Z + partial-json-parser==0.2.1.1.post5 -2025-03-14T22:57:33.663144230Z + pillow==11.1.0 -2025-03-14T22:57:33.663180115Z + platformdirs==4.3.6 -2025-03-14T22:57:33.663193000Z + prometheus-client==0.21.1 -2025-03-14T22:57:33.663201545Z + prometheus-fastapi-instrumentator==7.0.2 -2025-03-14T22:57:33.663208887Z + propcache==0.3.0 -2025-03-14T22:57:33.663256467Z + protobuf==6.30.1 -2025-03-14T22:57:33.663295027Z + psutil==7.0.0 -2025-03-14T22:57:33.663322475Z + py-cpuinfo==9.0.0 -2025-03-14T22:57:33.663332019Z + py-spy==0.4.0 -2025-03-14T22:57:33.663364302Z + pyasn1==0.6.1 -2025-03-14T22:57:33.663411602Z + pyasn1-modules==0.4.1 -2025-03-14T22:57:33.663439705Z + pycountry==24.6.1 -2025-03-14T22:57:33.663448255Z + pydantic==2.10.6 -2025-03-14T22:57:33.663482052Z + pydantic-core==2.27.2 -2025-03-14T22:57:33.663510055Z + python-dotenv==1.0.1 -2025-03-14T22:57:33.663536396Z + pyyaml==6.0.2 -2025-03-14T22:57:33.663583475Z + pyzmq==26.3.0 -2025-03-14T22:57:33.663597729Z + ray==2.43.0 -2025-03-14T22:57:33.663709479Z + referencing==0.36.2 -2025-03-14T22:57:33.663746626Z + regex==2024.11.6 -2025-03-14T22:57:33.663751762Z + requests==2.32.3 -2025-03-14T22:57:33.663785142Z + rpds-py==0.23.1 -2025-03-14T22:57:33.663815416Z + rsa==4.9 -2025-03-14T22:57:33.663853106Z + safetensors==0.5.3 -2025-03-14T22:57:33.663871694Z + sentencepiece==0.2.0 -2025-03-14T22:57:33.663899641Z + six==1.17.0 -2025-03-14T22:57:33.663922801Z + smart-open==7.1.0 -2025-03-14T22:57:33.663981038Z + sniffio==1.3.1 -2025-03-14T22:57:33.664008275Z + starlette==0.46.1 -2025-03-14T22:57:33.664015434Z + sympy==1.13.1 -2025-03-14T22:57:33.664047481Z + tiktoken==0.9.0 -2025-03-14T22:57:33.664123801Z + tokenizers==0.21.1 -2025-03-14T22:57:33.664131015Z + torch==2.5.1 -2025-03-14T22:57:33.664194595Z + torchaudio==2.5.1 -2025-03-14T22:57:33.664201615Z + torchvision==0.20.1 -2025-03-14T22:57:33.664242735Z + tqdm==4.67.1 -2025-03-14T22:57:33.664252394Z + transformers==4.49.0 -2025-03-14T22:57:33.664302978Z + triton==3.1.0 -2025-03-14T22:57:33.664341341Z + typing-extensions==4.12.2 -2025-03-14T22:57:33.664347121Z + urllib3==2.3.0 -2025-03-14T22:57:33.664388888Z + uvicorn==0.34.0 -2025-03-14T22:57:33.664417458Z + uvloop==0.21.0 -2025-03-14T22:57:33.664442405Z + virtualenv==20.29.3 -2025-03-14T22:57:33.664468233Z + vllm==0.7.2 -2025-03-14T22:57:33.664495978Z + watchfiles==1.0.4 -2025-03-14T22:57:33.664523713Z + websockets==15.0.1 -2025-03-14T22:57:33.664581295Z + wrapt==1.17.2 -2025-03-14T22:57:33.664607897Z + xformers==0.0.28.post3 -2025-03-14T22:57:33.664640127Z + xgrammar==0.1.15 -2025-03-14T22:57:33.664716194Z + yarl==1.18.3 -2025-03-14T22:57:33.664722654Z + zipp==3.21.0 -2025-03-14T22:57:33.718066659Z Using Python 3.11.10 environment at: openr1 -2025-03-14T22:57:34.033719558Z Resolved 1 package in 63ms -2025-03-14T22:57:34.040456885Z Downloading setuptools (1.2MiB) -2025-03-14T22:57:34.196471770Z Downloaded setuptools -2025-03-14T22:57:34.196882955Z Prepared 1 package in 159ms -2025-03-14T22:57:37.491110176Z Installed 1 package in 3.29s -2025-03-14T22:57:37.491165843Z + setuptools==76.0.0 -2025-03-14T22:57:37.517450526Z Using Python 3.11.10 environment at: openr1 -2025-03-14T22:57:40.097994884Z Resolved 24 packages in 2.35s -2025-03-14T22:57:40.101760489Z Building flash-attn==2.7.4.post1 -2025-03-14T22:58:03.501532325Z Built flash-attn==2.7.4.post1 -2025-03-14T22:58:05.037551234Z Prepared 1 package in 24.93s -2025-03-14T22:58:07.650974948Z Installed 1 package in 2.61s -2025-03-14T22:58:07.651005282Z + flash-attn==2.7.4.post1 -2025-03-14T22:58:07.670349476Z Using Python 3.11.10 environment at: openr1 -2025-03-14T22:58:09.049962151Z Updating https://github.com/huggingface/trl.git (69ad852e5654a77f1695eb4c608906fe0c7e8624) -2025-03-14T22:58:10.289499452Z Updated https://github.com/huggingface/trl.git (69ad852e5654a77f1695eb4c608906fe0c7e8624) -2025-03-14T22:58:12.845481407Z Resolved 148 packages in 5.06s -2025-03-14T22:58:12.860740266Z Building open-r1 @ file:///workspace/reasoning-llm/open-r1 -2025-03-14T22:58:12.866016225Z Updating https://github.com/huggingface/lighteval.git (ed084813e0bd12d82a06d9f913291fdbee774905) -2025-03-14T22:58:12.878442145Z Building trl @ git+https://github.com/huggingface/trl.git@69ad852e5654a77f1695eb4c608906fe0c7e8624 -2025-03-14T22:58:12.887142460Z Downloading pygments (1.2MiB) -2025-03-14T22:58:12.887515831Z Downloading pyarrow (40.1MiB) -2025-03-14T22:58:12.887703436Z Downloading bitsandbytes (72.5MiB) -2025-03-14T22:58:12.888156850Z Downloading wandb (19.8MiB) -2025-03-14T22:58:12.888386609Z Downloading hf-transfer (3.4MiB) -2025-03-14T22:58:12.888667967Z Downloading scipy (35.9MiB) -2025-03-14T22:58:12.888892599Z Downloading language-data (5.1MiB) -2025-03-14T22:58:12.889140877Z Downloading lxml (4.8MiB) -2025-03-14T22:58:12.889362298Z Downloading ruff (10.8MiB) -2025-03-14T22:58:12.890219997Z Downloading nltk (1.4MiB) -2025-03-14T22:58:12.890486372Z Downloading pandas (12.5MiB) -2025-03-14T22:58:12.891191805Z Downloading marisa-trie (1.3MiB) -2025-03-14T22:58:12.891799024Z Downloading srsly (1.1MiB) -2025-03-14T22:58:12.891983984Z Downloading spacy (6.3MiB) -2025-03-14T22:58:12.892164039Z Downloading blis (9.7MiB) -2025-03-14T22:58:12.892346409Z Downloading scikit-learn (12.9MiB) -2025-03-14T22:58:13.344820406Z Building deepspeed==0.15.4 -2025-03-14T22:58:13.346590850Z Building rouge-score==0.1.2 -2025-03-14T22:58:13.552868662Z Downloaded marisa-trie -2025-03-14T22:58:13.565399569Z Downloaded srsly -2025-03-14T22:58:13.677852168Z Updated https://github.com/huggingface/lighteval.git (ed084813e0bd12d82a06d9f913291fdbee774905) -2025-03-14T22:58:13.679117758Z Building lighteval @ git+https://github.com/huggingface/lighteval.git@ed084813e0bd12d82a06d9f913291fdbee774905 -2025-03-14T22:58:13.685513778Z Building langdetect==1.0.9 -2025-03-14T22:58:13.741132661Z Built trl @ git+https://github.com/huggingface/trl.git@69ad852e5654a77f1695eb4c608906fe0c7e8624 -2025-03-14T22:58:13.832970389Z Downloaded pygments -2025-03-14T22:58:13.910848672Z Built open-r1 @ file:///workspace/reasoning-llm/open-r1 -2025-03-14T22:58:13.947891707Z Downloaded nltk -2025-03-14T22:58:14.087163710Z Built rouge-score==0.1.2 -2025-03-14T22:58:14.126085249Z Downloaded hf-transfer -2025-03-14T22:58:14.446342889Z Downloaded lxml -2025-03-14T22:58:14.599470269Z Built langdetect==1.0.9 -2025-03-14T22:58:14.696970979Z Built lighteval @ git+https://github.com/huggingface/lighteval.git@ed084813e0bd12d82a06d9f913291fdbee774905 -2025-03-14T22:58:14.815017491Z Downloaded spacy -2025-03-14T22:58:14.874517034Z Downloaded language-data -2025-03-14T22:58:14.930680746Z Downloaded blis -2025-03-14T22:58:15.067438509Z Downloaded ruff -2025-03-14T22:58:15.350339480Z Built deepspeed==0.15.4 -2025-03-14T22:58:15.679232627Z Downloaded scikit-learn -2025-03-14T22:58:16.035826588Z Downloaded pandas -2025-03-14T22:58:16.305623185Z Downloaded bitsandbytes -2025-03-14T22:58:16.334773342Z Downloaded wandb -2025-03-14T22:58:16.705452180Z Downloaded scipy -2025-03-14T22:58:16.720679796Z Downloaded pyarrow -2025-03-14T22:58:16.721355398Z Prepared 92 packages in 3.86s -2025-03-14T22:58:16.868503370Z Uninstalled 4 packages in 146ms -2025-03-14T22:58:33.691292745Z Installed 92 packages in 16.82s -2025-03-14T22:58:33.691880987Z + absl-py==2.1.0 -2025-03-14T22:58:33.691919011Z + accelerate==1.4.0 -2025-03-14T22:58:33.691958259Z + aenum==3.1.15 -2025-03-14T22:58:33.692015457Z + antlr4-python3-runtime==4.13.2 -2025-03-14T22:58:33.692059431Z + bitsandbytes==0.45.3 -2025-03-14T22:58:33.692090346Z + blis==0.7.11 -2025-03-14T22:58:33.692110919Z + catalogue==2.0.10 -2025-03-14T22:58:33.692141243Z + chardet==5.2.0 -2025-03-14T22:58:33.692174741Z + cloudpathlib==0.16.0 -2025-03-14T22:58:33.692231196Z + colorama==0.4.6 -2025-03-14T22:58:33.692271943Z + colorlog==6.9.0 -2025-03-14T22:58:33.692313970Z + confection==0.1.5 -2025-03-14T22:58:33.692338306Z + cymem==2.0.11 -2025-03-14T22:58:33.692353306Z + dataproperty==1.1.0 -2025-03-14T22:58:33.692392799Z + datasets==3.4.0 -2025-03-14T22:58:33.692459903Z + deepspeed==0.15.4 -2025-03-14T22:58:33.692496916Z - dill==0.3.9 -2025-03-14T22:58:33.692520646Z + dill==0.3.8 -2025-03-14T22:58:33.692572110Z + docker-pycreds==0.4.0 -2025-03-14T22:58:33.692608758Z + flake8==7.1.2 -2025-03-14T22:58:33.692628173Z - fsspec==2025.3.0 -2025-03-14T22:58:33.692672940Z + fsspec==2024.12.0 -2025-03-14T22:58:33.692711600Z + gitdb==4.0.12 -2025-03-14T22:58:33.692732388Z + gitpython==3.1.44 -2025-03-14T22:58:33.692796483Z + hf-transfer==0.1.9 -2025-03-14T22:58:33.692863650Z + hjson==3.1.0 -2025-03-14T22:58:33.692885370Z + iniconfig==2.0.0 -2025-03-14T22:58:33.692893120Z + inquirerpy==0.3.4 -2025-03-14T22:58:33.692951298Z + isort==6.0.1 -2025-03-14T22:58:33.692969762Z + joblib==1.4.2 -2025-03-14T22:58:33.693019175Z + langcodes==3.5.0 -2025-03-14T22:58:33.693055562Z + langdetect==1.0.9 -2025-03-14T22:58:33.693069952Z + language-data==1.3.0 -2025-03-14T22:58:33.693128032Z + latex2sympy2-extended==1.0.6 -2025-03-14T22:58:33.693144475Z + liger-kernel==0.5.3 -2025-03-14T22:58:33.693203489Z + lighteval==0.6.0.dev0 (from git+https://github.com/huggingface/lighteval.git@ed084813e0bd12d82a06d9f913291fdbee774905) -2025-03-14T22:58:33.693262945Z + lxml==5.3.1 -2025-03-14T22:58:33.693301772Z + marisa-trie==1.2.1 -2025-03-14T22:58:33.693327682Z + markdown-it-py==3.0.0 -2025-03-14T22:58:33.693349237Z + math-verify==0.5.2 -2025-03-14T22:58:33.693407654Z + mbstrdecoder==1.1.4 -2025-03-14T22:58:33.693424509Z + mccabe==0.7.0 -2025-03-14T22:58:33.693521219Z + mdurl==0.1.2 -2025-03-14T22:58:33.693546801Z + multiprocess==0.70.16 -2025-03-14T22:58:33.693554449Z + murmurhash==1.0.12 -2025-03-14T22:58:33.693587451Z + ninja==1.11.1.3 -2025-03-14T22:58:33.693611101Z + nltk==3.9.1 -2025-03-14T22:58:33.693652191Z + open-r1==0.1.0.dev0 (from file:///workspace/reasoning-llm/open-r1) -2025-03-14T22:58:33.693700757Z + pandas==2.2.3 -2025-03-14T22:58:33.693723294Z + parameterized==0.9.0 -2025-03-14T22:58:33.693754877Z + pathvalidate==3.2.3 -2025-03-14T22:58:33.693798248Z + pfzy==0.3.4 -2025-03-14T22:58:33.693868948Z + pluggy==1.5.0 -2025-03-14T22:58:33.693879018Z + portalocker==3.1.1 -2025-03-14T22:58:33.693939528Z + preshed==3.0.9 -2025-03-14T22:58:33.693945307Z + prompt-toolkit==3.0.50 -2025-03-14T22:58:33.693993784Z - protobuf==6.30.1 -2025-03-14T22:58:33.694036096Z + protobuf==3.20.3 -2025-03-14T22:58:33.694077776Z + pyarrow==19.0.1 -2025-03-14T22:58:33.694097958Z + pycodestyle==2.12.1 -2025-03-14T22:58:33.694120243Z + pyflakes==3.2.0 -2025-03-14T22:58:33.694181108Z + pygments==2.19.1 -2025-03-14T22:58:33.694217376Z + pytablewriter==1.2.1 -2025-03-14T22:58:33.694286186Z + pytest==8.3.5 -2025-03-14T22:58:33.694295047Z + python-dateutil==2.9.0.post0 -2025-03-14T22:58:33.694325236Z + pytz==2025.1 -2025-03-14T22:58:33.694376787Z + rich==13.9.4 -2025-03-14T22:58:33.694393763Z + rouge-score==0.1.2 -2025-03-14T22:58:33.694425947Z + ruff==0.11.0 -2025-03-14T22:58:33.694472327Z + sacrebleu==2.5.1 -2025-03-14T22:58:33.694523900Z + scikit-learn==1.6.1 -2025-03-14T22:58:33.694557810Z + scipy==1.15.2 -2025-03-14T22:58:33.694606983Z + sentry-sdk==2.22.0 -2025-03-14T22:58:33.694628687Z + setproctitle==1.3.5 -2025-03-14T22:58:33.694646517Z - smart-open==7.1.0 -2025-03-14T22:58:33.694696395Z + smart-open==6.4.0 -2025-03-14T22:58:33.694734087Z + smmap==5.0.2 -2025-03-14T22:58:33.694769785Z + spacy==3.7.2 -2025-03-14T22:58:33.694814035Z + spacy-legacy==3.0.12 -2025-03-14T22:58:33.694837755Z + spacy-loggers==1.0.5 -2025-03-14T22:58:33.694868819Z + srsly==2.5.1 -2025-03-14T22:58:33.694928976Z + tabledata==1.3.4 -2025-03-14T22:58:33.694963772Z + tabulate==0.9.0 -2025-03-14T22:58:33.694998316Z + tcolorpy==0.1.7 -2025-03-14T22:58:33.695038969Z + termcolor==2.3.0 -2025-03-14T22:58:33.695054222Z + thinc==8.2.5 -2025-03-14T22:58:33.695134402Z + threadpoolctl==3.6.0 -2025-03-14T22:58:33.695155296Z + trl==0.16.0.dev0 (from git+https://github.com/huggingface/trl.git@69ad852e5654a77f1695eb4c608906fe0c7e8624) -2025-03-14T22:58:33.695205236Z + typepy==1.3.4 -2025-03-14T22:58:33.695254369Z + typer==0.9.4 -2025-03-14T22:58:33.695272126Z + tzdata==2025.1 -2025-03-14T22:58:33.695320952Z + wandb==0.19.8 -2025-03-14T22:58:33.695359016Z + wasabi==1.1.3 -2025-03-14T22:58:33.695398521Z + wcwidth==0.2.13 -2025-03-14T22:58:33.695425469Z + weasel==0.3.4 -2025-03-14T22:58:33.695465864Z + xxhash==3.5.0 -2025-03-14T22:58:42.060357914Z The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -2025-03-14T22:58:42.217097161Z Token is valid (permission: write). -2025-03-14T22:58:42.218557335Z The token `Ubuntu-2204` has been saved to /root/.cache/huggingface/stored_tokens -2025-03-14T22:58:42.219548334Z Your token has been saved to /root/.cache/huggingface/token -2025-03-14T22:58:42.219584896Z Login successful. -2025-03-14T22:58:42.219613752Z The current active token is: `Ubuntu-2204` -2025-03-14T22:59:15.995561731Z wandb: No netrc file found, creating one. -2025-03-14T22:59:15.995681956Z wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc -2025-03-14T22:59:16.010565337Z wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin -2025-03-14T22:59:17.330564379Z ======================================== -2025-03-14T22:59:17.330591958Z Running Experiment 1 -2025-03-14T22:59:17.330594485Z Steps: 100 200 300 400 500 -2025-03-14T22:59:17.330596377Z ======================================== -2025-03-14T22:59:17.331408732Z ---------------------------------------- -2025-03-14T22:59:17.331450986Z Running evaluations for experiment 1, step 100 -2025-03-14T22:59:17.331456024Z Revision: rev1_100 = 67b0341a5a775c7821777abdfc4aab82ef8154f5 -2025-03-14T22:59:17.331460726Z Output directory: data/evals/Exp1_100 -2025-03-14T22:59:17.338413463Z Evaluating task: aime24 -2025-03-14T22:59:32.650896055Z [2025-03-14 22:59:32,650] [ INFO]: PyTorch version 2.5.1 available. (config.py:54) -2025-03-14T23:01:02.993955604Z INFO 03-14 23:01:02 __init__.py:190] Automatically detected platform cuda. -2025-03-14T23:01:09.923544368Z [nltk_data] Downloading package punkt to /root/nltk_data... -2025-03-14T23:01:09.996015681Z [nltk_data] Unzipping tokenizers/punkt.zip. -2025-03-14T23:01:12.579430166Z [2025-03-14 23:01:12,578] [ INFO]: --- LOADING MODEL --- (pipeline.py:186) -2025-03-14T23:02:43.590725657Z [2025-03-14 23:02:43,589] [ INFO]: This model supports multiple tasks: {'generate', 'embed', 'score', 'classify', 'reward'}. Defaulting to 'generate'. (config.py:542) -2025-03-14T23:02:43.625618871Z [2025-03-14 23:02:43,624] [ INFO]: Initializing a V0 LLM engine (v0.7.2) with config: model='quyanh/OpenRS-GRPO', speculative_config=None, tokenizer='quyanh/OpenRS-GRPO', skip_tokenizer_init=False, tokenizer_mode=auto, revision=67b0341a5a775c7821777abdfc4aab82ef8154f5, override_neuron_config=None, tokenizer_revision=67b0341a5a775c7821777abdfc4aab82ef8154f5, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=1234, served_model_name=quyanh/OpenRS-GRPO, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False, (llm_engine.py:234) -2025-03-14T23:02:55.606913447Z [2025-03-14 23:02:55,606] [ INFO]: Using Flash Attention backend. (cuda.py:230) -2025-03-14T23:02:56.361446797Z [2025-03-14 23:02:56,360] [ INFO]: Starting to load model quyanh/OpenRS-GRPO... (model_runner.py:1110) -2025-03-14T23:02:57.172998465Z [2025-03-14 23:02:57,172] [ INFO]: Using model weights format ['*.safetensors'] (weight_utils.py:252) -2025-03-14T23:03:04.969246874Z [2025-03-14 23:03:04,968] [ INFO]: No model.safetensors.index.json found in remote. (weight_utils.py:297) -2025-03-14T23:03:04.970972921Z Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00