61 changes: 61 additions & 0 deletions inference/Qwen2.5-VL-3B-instruct.py
@@ -0,0 +1,61 @@
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from datasets import load_dataset

# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"smolagents/Qwen2.5-VL-3B-Instruct-Agentic", torch_dtype="auto", device_map="auto"
)

# We recommend enabling flash_attention_2 for better acceleration and memory savings, especially in multi-image and video scenarios
# (it requires the flash-attn package and an `import torch` for the bfloat16 dtype below).
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
# "Qwen/Qwen2.5-VL-3B-Instruct",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
# device_map="auto",
# )

# Default processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

# The default range for the number of visual tokens per image in the model is 4-16384.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

dataset = load_dataset("smolagents/aguvis-stage-2", "mind2web", split="train")
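# Each mind2web example provides `system`, `user`, `assistant`, and `image` fields
# (the same columns listed in the training recipes' dataset_mixture below).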

# Take just the first example for this quick inference demo
example = dataset[0]
messages = [
    {"role": "system", "content": example["system"]},
    {"role": "user", "content": [
        {"type": "image", "image": example["image"]},
        {"type": "text", "text": example["user"]},
    ]},
]

# Preparation for inference
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
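# Move the prepared inputs onto the GPU (this script assumes a CUDA device is available)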
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=4096)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
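# The printed text should be the model's predicted GUI action for the screenshot;
# its exact format follows the `assistant` annotations used during fine-tuning.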
142 changes: 142 additions & 0 deletions recipes/Qwen2.5-VL-3B-Instruct/sft/config_gui.yaml
@@ -0,0 +1,142 @@
# Model arguments
# You can download the model and manually change the RoPE base (rope_theta) to 300k/500k and max_position_embeddings to 32768 for a longer context window
model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct
vision_model: true
model_revision: main
torch_dtype: bfloat16
attn_implementation: sdpa

# Data training arguments
dataset_name: smolagents/aguvis-stage-2
dataset_num_proc: 48

# SFT hyperparameters
max_length: 4096
optim: adamw_torch
lr_scheduler_type: cosine_with_min_lr
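# cosine_with_min_lr decays the learning rate down to min_lr_rate * learning_rate (10% of the peak here)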
lr_scheduler_kwargs:
min_lr_rate: 0.1
max_grad_norm: 0.2
warmup_ratio: 0.03
learning_rate: 2.0e-05
gradient_accumulation_steps: 16
per_device_eval_batch_size: 4
per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M global batch size (GBS).

# Image resize arguments
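# 200704 = 256*28*28 and 1003520 = 1280*28*28, i.e. a budget of roughly 256-1280 visual tokens per image
# (28x28-pixel patches), matching the min_pixels/max_pixels guidance in the inference script.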
image_resize:
factor: 28
min_pixels: 200704
max_pixels: 1003520

# SFT trainer config
max_steps: -1
num_train_epochs: 1
bf16: true
do_eval: true
eval_strategy: 'steps'
eval_steps: 100
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
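# use_reentrant: false selects the non-reentrant checkpointing implementation recommended by recent PyTorch versions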
hub_model_id: A-Mahla/Qwen2.5-VL-3B-Instruct-Agentic-GUI
hub_strategy: end
push_to_hub: true
log_level: info
logging_steps: 5
logging_strategy: steps
output_dir: /fsx/amir_mahla/smolagents-Qwen2.5-VL-3B-Instruct-Agentic
overwrite_output_dir: true
report_to:
- wandb
wandb_project: smolagents
save_strategy: "epoch"
save_steps: 1
save_total_limit: 1
seed: 42

dataset_mixture:
datasets: # List of datasets to include in the mixture
- id: smolagents/aguvis-stage-2 # Hub dataset ID
config: mind2web # Name of the dataset config
split: train # Split to use from the dataset
columns: # Columns to keep
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: guiact-web-single
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: guiact-web-multi
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: miniwob
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: coat
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: android_control
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: gui-odyssey
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: amex
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
- id: smolagents/aguvis-stage-2
config: aitw
split: train
columns:
- system
- user
- assistant
- image
weight: 1.
seed: 42 # Seed for shuffling the combined dataset
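  # test_split_size below holds out a small fraction (1%) of the shuffled mixture for evaluation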
test_split_size: 0.01
116 changes: 116 additions & 0 deletions recipes/SmolVLM2-2.2B-Instruct/sft/config_gui_phase_1_1152.yaml
@@ -0,0 +1,116 @@
# Model arguments
# You can download the model and manually change the RoPE base (rope_theta) to 300k/500k and max_position_embeddings to 32768 for a longer context window
model_name_or_path: HuggingFaceTB/SmolVLM2-2.2B-Instruct
vision_model: true
model_revision: main
torch_dtype: bfloat16
attn_implementation: sdpa

# Data training arguments
dataset_name: smolagents/aguvis-stage-2
dataset_num_proc: 48

# SFT hyperparameters
max_length: 4096
optim: adamw_torch
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
max_grad_norm: 0.2
warmup_ratio: 0.03
learning_rate: 2.0e-05
gradient_accumulation_steps: 32
per_device_eval_batch_size: 2
per_device_train_batch_size: 2 # Change this depending on the context length of the model to keep a 500M global batch size (GBS).

image_resize:
resolution_max_side: 1152
to_pixel_coordinates: true
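# Resize so the longest image side is at most 1152 px; to_pixel_coordinates presumably converts
# normalized target coordinates to pixel space (cf. the "...-pixel-coordinates" output_dir suffix)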

# SFT trainer config
max_steps: -1
num_train_epochs: 1
bf16: true
do_eval: false
eval_strategy: 'steps'
eval_steps: 100
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI
hub_model_revision: main
hub_strategy: end
push_to_hub: false
log_level: info
logging_steps: 5
logging_strategy: steps
output_dir: /fsx/amir_mahla/smolagents-SmolVLM2-2.2B-Instruct-Agentic-GUI-phase-1-max-size-1152-pixel-coordinates
overwrite_output_dir: true
report_to:
- wandb
wandb_project: smolagents
save_strategy: steps
save_steps: 800
save_total_limit: 1
seed: 42

dataset_mixture:
datasets: # List of datasets to include in the mixture
- id: smolagents/aguvis-stage-1 # Hub dataset ID
config: guienv # Name of the dataset config
split: train # Split to use from the dataset
columns: # Columns to keep
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: omniact
split: train
columns:
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: ricoig16k
split: train
columns:
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: ricosca
split: train
columns:
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: seeclick
split: train
columns:
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: ui_refexp
split: train
columns:
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: webui350k
split: train
columns:
- images
- texts
weight: 1.
- id: smolagents/aguvis-stage-1
config: widget_captioning
split: train
columns:
- images
- texts
weight: 1.
seed: 42 # Seed for shuffling the combined dataset
test_split_size: 0.007