61 changes: 61 additions & 0 deletions inference/Qwen2.5-VL-3B-instruct.py
@@ -0,0 +1,61 @@
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from datasets import load_dataset

# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"smolagents/Qwen2.5-VL-3B-Instruct-Agentic", torch_dtype="auto", device_map="auto"
)

# We recommend enabling flash_attention_2 for better acceleration and memory savings, especially in multi-image and video scenarios
# (it requires the flash-attn package and an `import torch` for the bfloat16 dtype below).
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
# "Qwen/Qwen2.5-VL-3B-Instruct",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
# device_map="auto",
# )

# Default processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

# The default range for the number of visual tokens per image in the model is 4-16384.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

dataset = load_dataset("smolagents/aguvis-stage-2", "mind2web", split="train")
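# Each mind2web example provides `system`, `user`, `assistant`, and `image` fields
# (the same columns listed in the training recipes' dataset_mixture below).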

# Take just the first example for this quick inference demo
example = dataset[0]
messages = [
    {"role": "system", "content": example["system"]},
    {"role": "user", "content": [
        {"type": "image", "image": example["image"]},
        {"type": "text", "text": example["user"]},
    ]},
]

# Preparation for inference
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
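# Move the prepared inputs onto the GPU (this script assumes a CUDA device is available)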
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=4096)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
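# The printed text should be the model's predicted GUI action for the screenshot;
# its exact format follows the `assistant` annotations used during fine-tuning.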
142 changes: 142 additions & 0 deletions recipes/Qwen2.5-VL-3B-Instruct/sft/config_gui.yaml
@@ -0,0 +1,142 @@
# Model arguments
# You can download the model and manually change the RoPE base (rope_theta) to 300k/500k and max_position_embeddings to 32768 for a longer context window
model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct
vision_model: true
model_revision: main
torch_dtype: bfloat16
attn_implementation: sdpa

# Data training arguments
dataset_name: smolagents/aguvis-stage-2
dataset_num_proc: 48

# SFT hyperparameters
max_length: 4096
optim: adamw_torch
lr_scheduler_type: cosine_with_min_lr
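# cosine_with_min_lr decays the learning rate down to min_lr_rate * learning_rate (10% of the peak here)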
lr_scheduler_kwargs:
min_lr_rate: 0.1
max_grad_norm: 0.2
warmup_ratio: 0.03
learning_rate: 2.0e-05
gradient_accumulation_steps: 16
per_device_eval_batch_size: 4
per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M global batch size (GBS).

# Image resize arguments
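# 200704 = 256*28*28 and 1003520 = 1280*28*28, i.e. a budget of roughly 256-1280 visual tokens per image
# (28x28-pixel patches), matching the min_pixels/max_pixels guidance in the inference script.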
image_resize:
factor: 28
min_pixels: 200704
max_pixels: 1003520

# SFT trainer config
max_steps: -1
num_train_epochs: 1
bf16: true
do_eval: true
eval_strategy: 'steps'
eval_steps: 100
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
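# use_reentrant: false selects the non-reentrant checkpointing implementation recommended by recent PyTorch versions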
hub_model_id: A-Mahla/Qwen2.5-VL-3B-Instruct-Agentic-GUI
hub_strategy: end
push_to_hub: true
log_level: info
logging_steps: 5
logging_strategy: steps
output_dir: /fsx/amir_mahla/smolagents-Qwen2.5-VL-3B-Instruct-Agentic
overwrite_output_dir: true
report_to:
- wandb
wandb_project: smolagents
save_strategy: "epoch"
save_steps: 1
save_total_limit: 1
seed: 42

dataset_mixture:
datasets: # List of datasets to include in the mixture
- id: smolagents/aguvis-stage-2 # Hub dataset ID
config: mind2web # Name of the dataset config
split: train # Split to use from the dataset
columns: # Columns to keep
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: guiact-web-single
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: guiact-web-multi
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: miniwob
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: coat
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: android_control
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: gui-odyssey
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: amex
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: aitw
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
seed: 42 # Seed for shuffling the combined dataset
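  # test_split_size below holds out a small fraction (1%) of the shuffled mixture for evaluation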
test_split_size: 0.01
116 changes: 116 additions & 0 deletions recipes/SmolVLM2-2.2B-Instruct/sft/config_gui_phase_1_1152.yaml
@@ -0,0 +1,116 @@
# Model arguments
# You can download the model and manually change the RoPE base (rope_theta) to 300k/500k and max_position_embeddings to 32768 for a longer context window
model_name_or_path: HuggingFaceTB/SmolVLM2-2.2B-Instruct
vision_model: true
model_revision: main
torch_dtype: bfloat16
attn_implementation: sdpa

# Data training arguments
dataset_name: smolagents/aguvis-stage-2
dataset_num_proc: 48

# SFT hyperparameters
max_length: 4096
optim: adamw_torch
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
max_grad_norm: 0.2
warmup_ratio: 0.03
learning_rate: 2.0e-05
gradient_accumulation_steps: 32
per_device_eval_batch_size: 2
per_device_train_batch_size: 2 # Change this depending on the context length of the model to keep a 500M global batch size (GBS).

image_resize:
resolution_max_side: 1152
to_pixel_coordinates: true
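# Resize so the longest image side is at most 1152 px; to_pixel_coordinates presumably converts
# normalized target coordinates to pixel space (cf. the "...-pixel-coordinates" output_dir suffix)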

# SFT trainer config
max_steps: -1
num_train_epochs: 1
bf16: true
do_eval: false
eval_strategy: 'steps'
eval_steps: 100
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI
hub_model_revision: main
hub_strategy: end
push_to_hub: false
log_level: info
logging_steps: 5
logging_strategy: steps
output_dir: /fsx/amir_mahla/smolagents-SmolVLM2-2.2B-Instruct-Agentic-GUI-phase-1-max-size-1152-pixel-coordinates
overwrite_output_dir: true
report_to:
- wandb
wandb_project: smolagents
save_strategy: steps
save_steps: 800
save_total_limit: 1
seed: 42

dataset_mixture:
datasets: # List of datasets to include in the mixture
- id: smolagents/aguvis-stage-1 # Hub dataset ID
config: guienv # Name of the dataset config
split: train # Split to use from the dataset
columns: # Columns to keep
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: omniact
split: train
columns:
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: ricoig16k
split: train
columns:
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: ricosca
split: train
columns:
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: seeclick
split: train
columns:
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: ui_refexp
split: train
columns:
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: webui350k
split: train
columns:
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: widget_captioning
split: train
columns:
- images
- texts
weight: 1.
seed: 42 # Seed for shuffling the combined dataset
test_split_size: 0.007