🕵️♂️ Agent training #4300
Open

qgallouedec wants to merge 216 commits into `main` from `tool-call-finally` (+2,136 −121).
Changes from 193 of 216 commits.
Commits are authored by qgallouedec unless noted otherwise.

- 552e899 Refactor image handling: replace `image_split_sizes` with `image_grid…`
- 449ef07 simpler
- c8933aa gfpo
- 229c554 multi-image grpo
- 3ca6ad5 log with wandb
- dcf4b92 no vlm reward models
- 30ad7ca rloo
- 86cc30b gfpo
- 088897b fix
- d2adc63 test peft
- f4c82bf fix gfpo
- 1257796 rloo test
- 099a39b peft rloo
- 529add6 oops
- fc6b11f update test
- ae1f497 generate method
- f998432 debug
- fa73876 skip failing test
- 52d8bd9 Merge branch 'main' into drop-image_split_sizes
- dfc0d38 Merge branch 'drop-image_split_sizes' into multi-image-support
- fc52e68 test fixed!
- 4d12aeb Merge branch 'multi-image-support' into generate-method
- 4fc2b5b gfpo
- b628744 rm vllm
- d3a769f fix doc
- e17ec42 Merge branch 'main' into drop-image_split_sizes
- efbb03a Merge branch 'drop-image_split_sizes' into multi-image-support
- 562c662 Merge branch 'main' into multi-image-support
- 485781c Merge branch 'main' into multi-image-support
- 05270f8 update layers to ignore
- 1c53094 clarify image column desc
- 9b6652e rm VLM x RM warning
- c500440 Merge branch 'multi-image-support' into generate-method
- a6a8c44 Merge branch 'main' into generate-method
- d8665e1 Merge branch 'main' into generate-method
- 365d501 Merge branch 'main' into generate-method
- cdb4c76 Merge branch 'main' into generate-method
- c83e710 same for rloo
- ec6ad25 nits style and align
- b4cadde Merge branch 'main' into generate-method
- b0dceb9 restart
- ebe32c2 progress
- 0213662 progress continues
- 8b3a724 progress again again
- c1ae6aa back to working point
- 1a66b43 revert chage data utils
- 2dc69a6 Merge branch 'main' into generate-method
- 9435a94 refactor in grpo
- d3f1d3c Merge branch 'main' into refactor_generate
- 3d8ea27 wrong merge commit
- 27dc958 fix num_input_tokens_seen
- 53772ef getting closer
- 8766fa5 consistent naming
- 236b78b better
- 9da4830 simplify a bit + comment
- b3bd0b0 another one
- d79b9e1 get prompt ids from generation
- 8d34d54 remove pad token removal
- e770efe Merge branch 'refactor_generate' into refactor_generate_2
- 0e2ae34 rely on generator for prompt truncation
- 46d8eb7 revert
- 11acc75 rm enforce eager
- acee7d8 rm truncate_with_protected_tokens
- 0b5865e ensure proper truncation and side
- d8af003 rm useless comment
- fc263a3 rm imports
- 35f99fd requires padding
- 8149d05 rm truncation test
- 9925199 move forward_kwargs outside of generate
- 48a1c30 don't re-prepare data
- 15c6620 refactor: update prepare_multimodal_messages to accept images directl…
- 55a2480 rloo + doc
- c8041e1 Merge branch 'refactor_generate' into refactor_generate_2
- b8c0c9b Merge branch 'refactor_generate_2' into refactor_generate_3
- 7b7a11d test and doc
- c5064d6 gfpo
- effb41b Merge branch 'main' into refactor_generate
- e82bfb4 Merge branch 'main' into refactor_generate
- 4b9c126 Merge branch 'refactor_generate' into refactor_generate_2
- 3f02702 Merge branch 'refactor_generate_2' into refactor_generate_3
- b0e0279 Merge branch 'refactor_generate_3' into refactor_generate_4
- a01b9ca Merge branch 'refactor_generate_4' into refactor_generate_5
- 6bc15a3 wip
- f11759e Merge branch 'main' into refactor_generate_2
- e7aa945 fix vllm client server
- e164ec5 repicate all_prompt_ids
- 49577ad Same for RLOO
- 5fca5b8 fix normal generation path
- 5cc6af5 Merge branch 'refactor_generate_2' into refactor_generate_3
- 4dce145 remove vision tokens
- ddfd3b5 same for rloo
- c434fa2 truncation_side=left
- 377b081 rm test_training_vlm_and_prompt_truncation
- d599c20 Merge branch 'main' into refactor_generate_2
- e82db74 🔣 Fix test: replace `trainer.tokenizer` by `trainer.processing_class`…
- 192deb3 Fix CI ImportError: FlashAttention2 and decorator order for all param… (albertvillanova)
- cf9d8e7 Hotfix wrong formatting of docstrings with blockquote tips (#4187) (albertvillanova)
- f9c3c3c 🌡️ Have vLLM return processed (temperature scaled) log probs (#4163) (YonatanGideoni)
- 6489479 Replace remaining trainer.tokenizer with trainer.processing_class in … (albertvillanova)
- 21a67fc [DOCS] Lora without regret (#4181) (burtenshaw)
- c1e7ad2 [DOCS/FIX] lora without regrets - fix lr (#4207) (burtenshaw)
- 5d34144 Remove custome_container for building the docs (#4198) (albertvillanova)
- ae2a0e7 Remove tokenizer creation from `sft` example script (#4197) (sergiopaniego)
- 6543f51 Hotfix: Exclude transformers 4.57.0 for Python 3.9 (#4209) (albertvillanova)
- 8319ce0 Replace unittest with pytest (#4188) (albertvillanova)
- 4fdaa4c Updated vLLM integration guide (#4162) (sergiopaniego)
- d258e36 Remove `Optional` from `processing_class` in `PPOTrainer` (#4212) (sergiopaniego)
- 7f5b499 Replace setup with pyproject and fix packaging unintended modules (#4… (albertvillanova)
- df386f9 Merge branch 'main' into refactor_generate_2
- 5b9a6ab Merge branch 'main' into refactor_generate_2
- 766bbce Merge branch 'refactor_generate_2' into refactor_generate_3
- ac2717f Merge branch 'refactor_generate_3' into refactor_generate_4
- 4a274d5 Merge branch 'main' into refactor_generate_2
- db552be Merge branch 'refactor_generate_2' into refactor_generate_3
- 2c012dc Merge branch 'refactor_generate_3' into refactor_generate_4
- cb1d420 Merge branch 'refactor_generate_4' into refactor_generate_5
- a84325c style
- 34034e7 Merge branch 'refactor_generate_3' into refactor_generate_4
- 2ce6c1f token_type_ids and RLOO
- ddf3405 gfpo
- e3c679c style
- ee03478 remove test case for prompt truncation
- ed54e2a Merge branch 'refactor_generate_3' into refactor_generate_4
- 5e4a026 Merge branch 'refactor_generate_4' into refactor_generate_5
- 45290c9 Merge branch 'main' into refactor_generate_3
- a0ee1e6 Merge branch 'refactor_generate_3' into refactor_generate_4
- f6e7c20 Merge branch 'refactor_generate_4' into refactor_generate_5
- 919ff5b Merge branch 'main' into refactor_generate_5
- fe11512 dedup and some fixes
- c0c8807 fix style
- ba8b938 rloo
- 7a2936e style
- 1a6f040 test
- b5c0078 Merge branch 'refactor_generate_5' into tool-call-finally
- 26ffb04 style
- ced5450 safe prepare_multimodal_messages_vllm
- 23d13f9 oops
- f98fe13 Merge branch 'refactor_generate_5' into tool-call-finally
- 5f87ee9 fix return-dict
- 89cff94 Merge branch 'refactor_generate_5' into tool-call-finally
- 0dac326 Merge branch 'main' into tool-call-finally
- 14afe75 Merge branch 'main' into tool-call-finally
- ddcbbae Merge branch 'main' into tool-call-finally
- cb16cab Merge branch 'main' into tool-call-finally
- 9102ba3 Merge branch 'main' into tool-call-finally
- 2d945f2 move extraction to util + doc
- 65ad930 using response parser
- 67e8f29 backward compat
- a4eac3c fixes
- 1e32b0a don't truncate prompt
- e816ef4 remove max_length
- 400bee4 move to chat template utils
- b86483c tool mask
- 93c7999 hard coded chat template
- 24ea4a4 almost done!!
- 5edee5c Merge branch 'main' into tool-call-finally
- 9dfc511 fix chat template
- 2542320 just report error (not the traceback
- 1db53c1 style
- f31996a deprecate max_length + chat utils doc
- 6f2524d test chat template utils
- eb9eca9 test
- 19fa924 remove max_prompt_length
- 278703e better doc
- 6828ba2 doc example and skip version below dev
- ae653d8 fix overlong case
- 96387b3 test parse
- 714b9ea example in the doc
- 3a1c7fb comment in test
- a1ebcba version.parse -> Version
- c340f52 comment chat template for vllm
- d338c84 qol
- f8444df use chat template arg instead of ugly patch
- 6ac02e0 refactor: simplify response parsing in tokenizer and trainer
- b8125bf why it doesn't render well?
- be255df Merge branch 'main' into tool-call-finally
- 37d77ba raw
- a136592 style
- e63a46c fix: update xfail reason for tool parsing in TestParseResponse
- d082309 revert rloo for now
- 0707baa grpo with replay buffer
- 753d70d jmespath dep
- 06414f2 is_jmespath_available
- 21792da style
- 850a9eb new section
- 438b586 ignore TestParseResponse for transformers<5
- 1c026ce fix qwen schema
- c54bf4f another fix
- 9f0aa3d remove unsused schemas
- fbb625f rename processor to tokenizer in add_response_schema function
- ce6341b deprecate max_prompt_length argument and add warning for future removal
- 493881f Apply suggestions from code review
- 4d6a064 nit simplification
- 5a9bb20 Docs updated (sergiopaniego)
- 90a1ed1 Add monkey-patch for vLLM compatibility with TRL
- a584e42 VLLM_LOGGING_LEVEL", "ERROR
- fb4c694 Merge branch 'main' into tool-call-finally
- aa2615a Merge branch 'main' into tool-call-finally
- c36ea41 Merge branch 'main' into tool-call-finally
- caf1ad2 flip tool mask
- 94c2ff2 isolate tool call loop
- 3cbb28e Add example script (sergiopaniego)
- 6074ade code quality (sergiopaniego)
- fc3d759 Update to more strict reward funcs (sergiopaniego)
- e37508d Update steps (sergiopaniego)
- af749c1 Clarify token counting in reward metrics and adjust completion length…
- 988efc1 Updated example script with elaborated reward funcs (sergiopaniego)
- ce7d607 Add example notebook and update docs (sergiopaniego)
- 6f65553 Merge branch 'main' into tool-call-finally
- 797da51 Merge branch 'main' into tool-call-finally
- f0d7972 Apply suggestions from code review
- faf9667 minor version fixes
- bd8905d vllm_max_model_length
- bfec5f1 update comment with new vllm range
- 70680db style
- 2f814e9 fix version and vllm
New documentation file (`@@ -0,0 +1,17 @@`):

```markdown
# Chat template utilities

## add_response_schema

[[autodoc]] chat_template_utils.add_response_schema

## is_chat_template_prefix_preserving

[[autodoc]] chat_template_utils.is_chat_template_prefix_preserving

## get_training_chat_template

[[autodoc]] chat_template_utils.get_training_chat_template

## parse_response

[[autodoc]] chat_template_utils.parse_response
```
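The "prefix-preserving" property that `is_chat_template_prefix_preserving` checks can be illustrated with a toy, self-contained sketch (hypothetical helpers, not TRL's implementation): a renderer is prefix-preserving if rendering the first k messages always yields a string prefix of rendering the full conversation, so appending a turn never rewrites earlier text.

```python
# Toy sketch of the prefix-preserving property (hypothetical, not TRL's code).

def render(messages):
    # Minimal ChatML-style renderer: each message is appended verbatim.
    return "".join(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in messages)

def render_last_special(messages):
    # Non-preserving renderer: the final message is rendered differently,
    # analogous to templates that keep the <think> block only for the last turn.
    out = []
    for i, m in enumerate(messages):
        tag = "FINAL" if i == len(messages) - 1 else m["role"]
        out.append(f"<|im_start|>{tag}\n{m['content']}<|im_end|>\n")
    return "".join(out)

def is_prefix_preserving(render_fn, messages):
    # True iff every partial render is a string prefix of the full render.
    full = render_fn(messages)
    return all(full.startswith(render_fn(messages[:k])) for k in range(1, len(messages)))

msgs = [
    {"role": "user", "content": "hi"},
    {"role": "assistant", "content": "hello"},
    {"role": "user", "content": "bye"},
]
print(is_prefix_preserving(render, msgs))              # True
print(is_prefix_preserving(render_last_special, msgs)) # False
```

This matters for training because token masks computed over a partial rendering stay valid only when later turns never alter earlier text.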
New test file (`@@ -0,0 +1,171 @@`):

```python
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import textwrap

import pytest
import transformers
from packaging.version import Version
from transformers import AutoTokenizer

from trl.chat_template_utils import (
    add_response_schema,
    get_training_chat_template,
    is_chat_template_prefix_preserving,
    parse_response,
)


class TestAddResponseSchema:
    @pytest.mark.xfail(
        condition=Version(transformers.__version__) < Version("5.0.0.dev0"),
        reason="Response parsing is not supported in transformers versions below 5.0.0.dev0",
        strict=True,
    )
    def test_add_response_schema(self):
        tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen3MoeForSequenceClassification")
        tokenizer = add_response_schema(tokenizer)
        assistant_text = '<tool_call>\n{"name": "multiply", "arguments": {"a": 3, "b": 4}}\n</tool_call><|im_end|>'
        parsed = tokenizer.parse_response(assistant_text)
        expected = {
            "role": "assistant",
            "content": "",
            "tool_calls": [{"type": "function", "function": {"name": "multiply", "arguments": {"a": 3, "b": 4}}}],
        }
        assert parsed == expected


class TestIsChatTemplatePrefixPreserving:
    def test_prefix_preserving_template(self):
        tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen3MoeForSequenceClassification")
        tokenizer.chat_template = textwrap.dedent(r"""
        {%- for message in messages %}
        {%- if message.role == 'user' %}
        {{- '<|im_start|>user\n' + message.content + '<|im_end|>\n' }}
        {%- elif message.role == 'assistant' %}
        {{- '<|im_start|>assistant\n' + message.content + '<|im_end|>\n' }}
        {%- endif %}
        {%- endfor %}
        {%- if add_generation_prompt %}
        {{- '<|im_start|>assistant\n' }}
        {%- endif %}""")
        assert is_chat_template_prefix_preserving(tokenizer) is True

    def test_non_prefix_preserving_template(self):
        tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen3MoeForSequenceClassification")
        # The following template is quite typical of models like Qwen3 and GPT-OSS, where the thinking part is
        # only present for the last assistant message, which makes it non-prefix-preserving.
        # docstyle-ignore
        tokenizer.chat_template = textwrap.dedent(r"""
        {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
        {%- endif %}
        {%- set ns = namespace(last_query_index=messages|length - 1) %}
        {%- for message in messages[::-1] %}
        {%- set index = (messages|length - 1) - loop.index0 %}
        {%- if message.role == "user" and message.content is string %}
        {%- set ns.last_query_index = index %}
        {%- break %}
        {%- endif %}
        {%- endfor %}
        {%- for message in messages %}
        {%- set content = message.content if message.content is string else '' %}
        {%- if message.role == "user" or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>\n' }}
        {%- elif message.role == "assistant" %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is string %}
        {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
        {%- if '</think>' in content %}
        {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
        {%- set content = content.split('</think>')[-1].lstrip('\n') %}
        {%- endif %}
        {%- endif %}
        {%- if loop.index0 > ns.last_query_index %}
        {%- if loop.last or (not loop.last and reasoning_content) %}
        {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
        {%- else %}
        {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {%- else %}
        {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {{- '<|im_end|>\n' }}
        {%- endif %}
        {%- endfor %}
        {%- if add_generation_prompt %}
        {{- '<|im_start|>assistant\n' }}
        {%- if enable_thinking is defined and enable_thinking is false %}
        {{- '<think>\n\n</think>\n\n' }}
        {%- endif %}
        {%- endif %}""")
        assert is_chat_template_prefix_preserving(tokenizer) is False


class TestGetTrainingChatTemplate:
    def test_qwen3(self):
        tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen3MoeForSequenceClassification")
        assert is_chat_template_prefix_preserving(tokenizer) is False
        tokenizer.chat_template = get_training_chat_template(tokenizer)
        assert is_chat_template_prefix_preserving(tokenizer) is True


@pytest.mark.xfail(
    condition=Version(transformers.__version__) < Version("5.0.0.dev0"),
    reason="Tool parsing is not supported in transformers versions below 5.0.0.dev0",
    strict=True,
)
class TestParseResponse:
    def test_parse_response(self):
        tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen3MoeForSequenceClassification")
        tokenizer = add_response_schema(tokenizer)
        text = '<tool_call>\n{"name": "multiply", "arguments": {"a": 3, "b": 4}}\n</tool_call><|im_end|>'
        assistant_text = tokenizer(text)["input_ids"]
        parsed = parse_response(tokenizer, assistant_text)
        expected = {
            "role": "assistant",
            "content": "",
            "tool_calls": [{"type": "function", "function": {"name": "multiply", "arguments": {"a": 3, "b": 4}}}],
        }
        assert parsed == expected

    def test_parse_response_no_tool_call(self):
        tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen3MoeForSequenceClassification")
        tokenizer = add_response_schema(tokenizer)
        text = "Here is the answer to your question.<|im_end|>"
        assistant_text = tokenizer(text)["input_ids"]
        parsed = parse_response(tokenizer, assistant_text)
        expected = {
            "role": "assistant",
            "content": "Here is the answer to your question.",
        }
        assert parsed == expected

    def test_parse_response_malformed_tool_call(self):
        tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen3MoeForSequenceClassification")
        tokenizer = add_response_schema(tokenizer)
        text = '<tool_call>\n{"name": "multiply", "arguments": {"a": 3, "b": 4}\n</tool_call><|im_end|>'
        assistant_text = tokenizer(text)["input_ids"]
        parsed = parse_response(tokenizer, assistant_text)
        expected = {
            "role": "assistant",
            "content": '<tool_call>\n{"name": "multiply", "arguments": {"a": 3, "b": 4}\n</tool_call>',
        }
        assert parsed == expected
```
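The behavior the `TestParseResponse` cases exercise — extract a `<tool_call>` block, parse its JSON, and fall back to raw text when the JSON is malformed — can be sketched with a small self-contained parser (a hypothetical stand-in, not TRL's or transformers' implementation):

```python
import json
import re

# Hypothetical simplified parser mirroring the tested behavior: a well-formed
# <tool_call> becomes a structured tool call; malformed JSON is kept as content.
TOOL_CALL_RE = re.compile(r"<tool_call>\n(.*?)\n</tool_call>", re.DOTALL)

def parse_assistant_text(text):
    text = text.removesuffix("<|im_end|>")
    match = TOOL_CALL_RE.search(text)
    if match:
        try:
            call = json.loads(match.group(1))
            return {
                "role": "assistant",
                "content": text[: match.start()] + text[match.end():],
                "tool_calls": [{"type": "function", "function": call}],
            }
        except json.JSONDecodeError:
            pass  # malformed JSON: fall through and keep the raw text as content
    return {"role": "assistant", "content": text}

ok = parse_assistant_text('<tool_call>\n{"name": "multiply", "arguments": {"a": 3, "b": 4}}\n</tool_call><|im_end|>')
bad = parse_assistant_text('<tool_call>\n{"name": "multiply", "arguments": {"a": 3, "b": 4}\n</tool_call><|im_end|>')
print(ok["tool_calls"][0]["function"]["name"])  # multiply
print("tool_calls" in bad)                      # False
```

Falling back to raw content rather than raising keeps training robust: an overlong or malformed completion still yields a valid assistant message, and a reward function can penalize it instead of crashing the run.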