From 20b13d8413a584e312c70dbc7de078bf4c91b116 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 28 Jan 2025 13:10:23 +0100 Subject: [PATCH 1/4] update --- .../models/aria/image_processing_aria.py | 2 + .../models/aria/processing_aria.py | 4 + .../models/emu3/image_processing_emu3.py | 2 +- .../models/emu3/processing_emu3.py | 3 +- .../idefics2/image_processing_idefics2.py | 2 +- .../idefics3/image_processing_idefics3.py | 2 +- .../models/idefics3/processing_idefics3.py | 3 +- .../llava_next/image_processing_llava_next.py | 2 +- .../models/mllama/processing_mllama.py | 10 +- src/transformers/processing_utils.py | 75 +++--- tests/models/aria/test_processor_aria.py | 49 ++++ tests/models/emu3/test_processor_emu3.py | 5 + tests/models/llava/test_processor_llava.py | 33 +-- .../llava_next/test_processor_llava_next.py | 6 +- .../test_processor_llava_onevision.py | 6 +- tests/models/mllama/test_processor_mllama.py | 3 + .../qwen2_5_vl/test_processor_qwen2_5_vl.py | 145 ++++++++++- .../qwen2_vl/test_processor_qwen2_vl.py | 145 ++++++++++- tests/test_modeling_common.py | 6 + tests/test_processing_common.py | 225 +++++++++++++++++- 20 files changed, 660 insertions(+), 68 deletions(-) diff --git a/src/transformers/models/aria/image_processing_aria.py b/src/transformers/models/aria/image_processing_aria.py index 7b00665aa285..10ffde2f9952 100644 --- a/src/transformers/models/aria/image_processing_aria.py +++ b/src/transformers/models/aria/image_processing_aria.py @@ -133,6 +133,8 @@ class AriaImageProcessor(BaseImageProcessor): The resampling filter to use if resizing the image. """ + model_input_names = ["pixel_values", "pixel_mask", "num_crops"] + def __init__( self, image_mean: List[float] = None, diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index 2cfbd72a0020..4b7163db8fd3 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -158,6 +158,10 @@ def decode(self, *args, **kwargs): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names + + # Remove `num_crops`, it is popped and used only when processing. 
Make a copy of the list when removing
+ # otherwise `self.image_processor.model_input_names` is also modified
+ image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"]
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
diff --git a/src/transformers/models/emu3/image_processing_emu3.py b/src/transformers/models/emu3/image_processing_emu3.py
index f28bc501ba16..1cc02f58ddce 100644
--- a/src/transformers/models/emu3/image_processing_emu3.py
+++ b/src/transformers/models/emu3/image_processing_emu3.py
@@ -132,7 +132,7 @@ class Emu3ImageProcessor(BaseImageProcessor):
The spatial downsample factor the image will be downsampled in feature extracting phase
"""
- model_input_names = ["pixel_values"]
+ model_input_names = ["pixel_values", "image_sizes"]
def __init__(
self,
diff --git a/src/transformers/models/emu3/processing_emu3.py b/src/transformers/models/emu3/processing_emu3.py
index 2c536f5f2463..01966e470bdf 100644
--- a/src/transformers/models/emu3/processing_emu3.py
+++ b/src/transformers/models/emu3/processing_emu3.py
@@ -63,6 +63,7 @@ class Emu3Processor(ProcessorMixin):
"""
attributes = ["image_processor", "tokenizer"]
+ valid_kwargs = ["chat_template"]
tokenizer_class = ("GPT2Tokenizer", "GPT2TokenizerFast")
image_processor_class = "Emu3ImageProcessor"
@@ -179,7 +180,7 @@ def __call__(
data = self.tokenizer(text, **output_kwargs["text_kwargs"])
data.update(**image_features)
- return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"])
+ return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].pop("return_tensors", None))
def calculate_generate_size(self, ratio, image_area, spatial_factor):
width, height = map(int, ratio.split(":"))
diff --git a/src/transformers/models/idefics2/image_processing_idefics2.py b/src/transformers/models/idefics2/image_processing_idefics2.py
index 65d5a8285416..daf31174366a 100644
--- a/src/transformers/models/idefics2/image_processing_idefics2.py
+++ b/src/transformers/models/idefics2/image_processing_idefics2.py
@@ -217,7 +217,7 @@ class Idefics2ImageProcessor(BaseImageProcessor):
strategy was first introduced in https://arxiv.org/abs/2311.06607.
"""
- model_input_names = ["pixel_values"]
+ model_input_names = ["pixel_values", "pixel_attention_mask"]
def __init__(
self,
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index df71a8bf0e85..452b5ebfee2a 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -323,7 +323,7 @@ class Idefics3ImageProcessor(BaseImageProcessor):
sample in the batch, such that the returned tensor is of shape (batch_size, max_num_images, num_channels, max_height, max_width).
""" - model_input_names = ["pixel_values"] + model_input_names = ["pixel_values", "pixel_attention_mask"] def __init__( self, diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 40c8829fe76e..2cf605cf65fc 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -129,6 +129,7 @@ class Idefics3Processor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["image_seq_len", "chat_template"] image_processor_class = "Idefics3ImageProcessor" tokenizer_class = "AutoTokenizer" @@ -354,7 +355,7 @@ def decode(self, *args, **kwargs): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return list(dict.fromkeys(image_processor_input_names + tokenizer_input_names)) __all__ = ["Idefics3Processor"] diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index 8e2a4f4644fc..b4ee3bf6dc3f 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -186,7 +186,7 @@ class LlavaNextImageProcessor(BaseImageProcessor): Whether to convert the image to RGB. """ - model_input_names = ["pixel_values"] + model_input_names = ["pixel_values", "image_sizes"] def __init__( self, diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py index 5905f3313f78..03c95a085156 100644 --- a/src/transformers/models/mllama/processing_mllama.py +++ b/src/transformers/models/mllama/processing_mllama.py @@ -209,10 +209,11 @@ class MllamaProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template"] image_processor_class = "MllamaImageProcessor" tokenizer_class = "PreTrainedTokenizerFast" - def __init__(self, image_processor, tokenizer): + def __init__(self, image_processor, tokenizer, chat_template=None): if not hasattr(tokenizer, "image_token"): self.image_token = "<|image|>" self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) @@ -223,8 +224,7 @@ def __init__(self, image_processor, tokenizer): self.python_token = "<|python_tag|>" self.python_token_id = tokenizer.convert_tokens_to_ids(self.python_token) self.bos_token = tokenizer.bos_token - self.chat_template = tokenizer.chat_template - super().__init__(image_processor, tokenizer) + super().__init__(image_processor, tokenizer, chat_template=chat_template) def __call__( self, @@ -367,6 +367,10 @@ def post_process_image_text_to_text(self, generated_outputs): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names + + # Remove `num_tiles`, it is popped and used only when processing. 
Make a copy of the list when removing
+ # otherwise `self.image_processor.model_input_names` is also modified
+ image_processor_input_names = [name for name in image_processor_input_names if name != "num_tiles"]
return list(tokenizer_input_names + image_processor_input_names + ["cross_attention_mask"])
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index b94230c7d4a1..ca55e9d63cdb 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -762,7 +762,11 @@ def get_processor_dict(
# (`cached_file` called using `_raise_exceptions_for_missing_entries=False` to avoid exception)
# However, for models added in the future, we won't get the expected error if this file is missing.
if resolved_processor_file is None:
- return {}, kwargs
+ # In any case we need to pass `chat_template` if it is available
+ processor_dict = {}
+ if "chat_template" in kwargs:
+ processor_dict = {"chat_template": kwargs.pop("chat_template")}
+ return processor_dict, kwargs
try:
# Load processor dict
@@ -786,6 +790,9 @@ def get_processor_dict(
"in the processor's config. Make sure to move your template to its own file."
)
+ if "chat_template" in kwargs:
+ processor_dict["chat_template"] = kwargs.pop("chat_template")
+
if not is_local:
if "auto_map" in processor_dict:
processor_dict["auto_map"] = add_model_info_to_auto_map(
@@ -817,7 +824,6 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs):
"""
processor_dict = processor_dict.copy()
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
- chat_template = kwargs.pop("chat_template", None)
# We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs
# If we don't pop, some specific kwargs will raise a warning
@@ -829,8 +835,6 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs):
unused_kwargs = cls.validate_init_kwargs(processor_config=processor_dict, valid_kwargs=cls.valid_kwargs)
processor = cls(*args, **processor_dict)
- if chat_template is not None:
- setattr(processor, "chat_template", chat_template)
# Update processor with kwargs if needed
for key in set(kwargs.keys()):
@@ -1199,12 +1203,6 @@ def apply_chat_template(
"https://huggingface.co/docs/transformers/main/en/chat_templating for more information."
)
- text_kwargs = {}
- for key in TextKwargs.__annotations__.keys():
- value = kwargs.pop(key, None)
- if value is not None:
- text_kwargs[key] = value
-
chat_template_kwargs = {}
for key in ChatTemplateKwargs.__annotations__.keys():
value = kwargs.pop(key, getattr(ChatTemplateKwargs, key))
@@ -1221,31 +1219,52 @@ def apply_chat_template(
chat_template=chat_template,
tokenize=False,
return_dict=False,
- **text_kwargs,
**chat_template_kwargs,
)
- # we will have to return all processed inputs in a dict
+ if isinstance(conversation, (list, tuple)) and (
+ isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
+ ):
+ conversations = conversation
+ is_batched = True
+ else:
+ conversations = [conversation]
+ is_batched = False
+
+ # We will have to return all processed inputs in a dict
+ # Currently all processors can accept a flat list of visuals, but not all can accept a nested list of batches
+ # So we'll make a simple list of images in the order they appear
if tokenize:
- images, videos = [], []
- for message in conversation:
- visuals = [content for content in message["content"] if content["type"] in ["image", "video"]]
- for vision_info in visuals:
- if vision_info["type"] == "image":
- for key in ["image", "url", "path", "base64"]:
- if key in vision_info:
- images.append(load_image(vision_info[key]))
- elif vision_info["type"] == "video":
- for key in ["video", "url", "path"]:
- if key in vision_info:
- videos.append(
- load_video(vision_info[key], num_frames=num_frames, backend=video_load_backend)
- )
+ batch_images, batch_videos = [], []
+ for conversation in conversations:
+ for message in conversation:
+ visuals = [content for content in message["content"] if content["type"] in ["image", "video"]]
+ for vision_info in visuals:
+ if vision_info["type"] == "image":
+ for key in ["image", "url", "path", "base64"]:
+ if key in vision_info:
+ batch_images.append(load_image(vision_info[key]))
+ elif vision_info["type"] == "video":
+ for key in ["video", "url", "path"]:
+ if key in vision_info:
+ batch_videos.append(
+ load_video(vision_info[key], num_frames=num_frames, backend=video_load_backend)
+ )
+
+ # Tokenizer's `apply_chat_template` never adds special tokens when tokenizing
+ # But the processor's `apply_chat_template` didn't have an option to tokenize, so users had to format the prompt
+ # and pass it to the processor. Users thus never worried about special tokens, relying on the processor handling
+ # everything internally. The line below keeps BC for that and makes it possible to work with models that have
+ # special tokens in the template (consistent with tokenizers).
We don't want to raise a warning, as it would flood the command line
+ # without an actionable solution for users
+ single_prompt = prompt[0] if is_batched else prompt
+ if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
+ kwargs["add_special_tokens"] = False
out = self(
text=prompt,
- images=images if images else None,
- videos=videos if videos else None,
+ images=batch_images if batch_images else None,
+ videos=batch_videos if batch_videos else None,
**kwargs,
)
if return_dict:
diff --git a/tests/models/aria/test_processor_aria.py b/tests/models/aria/test_processor_aria.py
index 7e23d861c775..623153b6798a 100644
--- a/tests/models/aria/test_processor_aria.py
+++ b/tests/models/aria/test_processor_aria.py
@@ -237,6 +237,55 @@ def test_apply_chat_template(self):
"""
self.assertEqual(rendered, expected_rendered)
+ # Override as AriaImageProcessor doesn't accept `do_rescale`
+ def test_chat_template_accepts_processing_kwargs(self):
+ processor = self.get_processor()
+ if processor.chat_template is None:
+ self.skipTest("Processor has no chat template")
+
+ messages = [
+ [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "What is shown in this image?"},
+ ],
+ },
+ ]
+ ]
+
+ formatted_prompt_tokenized = processor.apply_chat_template(
+ messages,
+ add_generation_prompt=True,
+ tokenize=True,
+ padding="max_length",
+ max_length=50,
+ )
+ self.assertEqual(len(formatted_prompt_tokenized[0]), 50)
+
+ formatted_prompt_tokenized = processor.apply_chat_template(
+ messages,
+ add_generation_prompt=True,
+ tokenize=True,
+ truncation=True,
+ max_length=5,
+ )
+ self.assertEqual(len(formatted_prompt_tokenized[0]), 5)
+
+ # Now test the ability to return dict
+ messages[0][0]["content"].append(
+ {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
+ )
+ out_dict = processor.apply_chat_template(
+ messages,
+ add_generation_prompt=True,
+ tokenize=True,
+ return_dict=True,
+ max_image_size=980,
+ return_tensors="np",
+ )
+ self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980])
+
# Override as AriaProcessor needs image tokens in prompts
def prepare_text_inputs(self, batch_size: Optional[int] = None):
if batch_size is None:
diff --git a/tests/models/emu3/test_processor_emu3.py b/tests/models/emu3/test_processor_emu3.py
index 7bc77075b1a6..c7b792c2a391 100644
--- a/tests/models/emu3/test_processor_emu3.py
+++ b/tests/models/emu3/test_processor_emu3.py
@@ -52,6 +52,11 @@ def setUp(self):
)
processor.save_pretrained(self.tmpdirname)
+ def prepare_processor_dict(self):
+ return {
+ "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
+ } # fmt: skip
+
def test_processor_for_generation(self):
processor_components = self.prepare_components()
processor = self.processor_class(**processor_components)
diff --git a/tests/models/llava/test_processor_llava.py
b/tests/models/llava/test_processor_llava.py index fd0ba8cacc18..a411430a1e8a 100644 --- a/tests/models/llava/test_processor_llava.py +++ b/tests/models/llava/test_processor_llava.py @@ -17,7 +17,7 @@ import unittest from transformers import AutoProcessor, AutoTokenizer, LlamaTokenizerFast, LlavaProcessor -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import require_vision from transformers.utils import is_torch_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -27,7 +27,7 @@ from transformers import CLIPImageProcessor if is_torch_available: - import torch + pass @require_vision @@ -53,7 +53,11 @@ def tearDown(self): shutil.rmtree(self.tmpdirname) def prepare_processor_dict(self): - return {"chat_template": "dummy_template", "patch_size": 3, "vision_feature_select_strategy": "default"} + return { + "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}", + "patch_size": 3, + "vision_feature_select_strategy": "default" + } # fmt: skip @unittest.skip( "Skip because the model has no processor kwargs except for chat template and" @@ -123,29 +127,6 @@ def test_chat_template_dict(self): ) self.assertListEqual(list(out_dict_with_image.keys()), ["input_ids", "attention_mask", "pixel_values"]) - @require_torch - def test_chat_template_dict_torch(self): - processor = LlavaProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") - messages = [ - { - "role": "user", - "content": [ - {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - - out_dict_tensors = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="pt", - ) - self.assertListEqual(list(out_dict_tensors.keys()), ["input_ids", "attention_mask", "pixel_values"]) - self.assertTrue(isinstance(out_dict_tensors["input_ids"], torch.Tensor)) - def test_chat_template_with_continue_final_message(self): processor = LlavaProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") expected_prompt = "USER: \nDescribe this image. 
ASSISTANT: There is a dog and" diff --git a/tests/models/llava_next/test_processor_llava_next.py b/tests/models/llava_next/test_processor_llava_next.py index 234e47911000..af1457841163 100644 --- a/tests/models/llava_next/test_processor_llava_next.py +++ b/tests/models/llava_next/test_processor_llava_next.py @@ -50,7 +50,11 @@ def get_image_processor(self, **kwargs): return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor def prepare_processor_dict(self): - return {"chat_template": "dummy_template", "patch_size": 3, "vision_feature_select_strategy": "default"} + return { + "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}", + "patch_size": 3, + "vision_feature_select_strategy": "default" + } # fmt: skip @unittest.skip( "Skip because the model has no processor kwargs except for chat template and" diff --git a/tests/models/llava_onevision/test_processor_llava_onevision.py b/tests/models/llava_onevision/test_processor_llava_onevision.py index 04aafa11a8b0..01e5e1d8384d 100644 --- a/tests/models/llava_onevision/test_processor_llava_onevision.py +++ b/tests/models/llava_onevision/test_processor_llava_onevision.py @@ -61,7 +61,11 @@ def get_video_processor(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor def prepare_processor_dict(self): - return {"chat_template": "dummy_template", "num_image_tokens": 6, "vision_feature_select_strategy": "default"} + return { + "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + ' '}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '