18 changes: 18 additions & 0 deletions .github/scripts/spellcheck_conf/wordlist.txt
@@ -1561,3 +1561,21 @@ VSCode
applyTo
mdc
windsurfrules
FPFT
PagedAdamW
Torchtune
WandB
bfclv
bwd
cellpadding
cellspacing
chartqa
checkpointer
docvqa
enconder
gsm
preprocessors
seqs
torchtune
verifications
Waitlist
3 changes: 2 additions & 1 deletion getting-started/README.md
@@ -21,5 +21,6 @@ If you are new to developing with Meta Llama models, this is where you should st
* The [Build_with_Llama API](./build_with_llama_api.ipynb) notebook highlights some of the features of [Llama API](https://llama.developer.meta.com?utm_source=llama-cookbook&utm_medium=readme&utm_campaign=getting_started).
* The [inference](./inference/) folder contains scripts to deploy Llama for inference on server and mobile. See also [3p_integrations/vllm](../3p-integrations/vllm/) and [3p_integrations/tgi](../3p-integrations/tgi/) for hosting Llama on open-source model servers.
* The [RAG](./RAG/) folder contains a simple Retrieval-Augmented Generation application using Llama.
* The [finetuning](./finetuning/) folder contains resources to help you finetune Llama on your custom datasets, for both single- and multi-GPU setups. The scripts use the native llama-cookbook finetuning code found in [finetuning.py](../src/llama_cookbook/finetuning.py) which supports these features.
* The [finetuning](./finetuning/) folder contains resources to help you finetune Llama on your custom datasets, for both single- and multi-GPU setups. The scripts use the native llama-cookbook finetuning code found in [finetuning.py](../src/llama_cookbook/finetuning.py) which supports these features:
* **NEW:** [Vision fine-tuning recipe](./finetuning/vision/README.md) for Llama 3.2 11B Vision - Learn how to fine-tune multimodal models for document understanding with 98% accuracy on structured data extraction!
* The [llama-tools](./llama-tools/) folder contains resources to help you use Llama tools, such as [llama-prompt-ops](../llama-tools/llama-prompt-ops_101.ipynb).
3 changes: 3 additions & 0 deletions getting-started/finetuning/README.md
@@ -13,6 +13,9 @@ If you are new to fine-tuning techniques, check out [an overview](./LLM_finetuni
> [!TIP]
> If you want to try finetuning Meta Llama 3 in a Jupyter notebook you can find a quickstart notebook [here](./quickstart_peft_finetuning.ipynb)

> [!NOTE]
> **New: Vision Fine-tuning Recipe** - Looking to fine-tune Llama 3.2 11B Vision for structured data extraction? Check out our comprehensive [vision fine-tuning recipe](./vision/README.md) that achieves 98% accuracy on document understanding tasks using torchtune, with detailed benchmarking comparing LoRA vs Full Parameter Fine-Tuning approaches.
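
A minimal launch sketch for the recipe, assuming torchtune is installed and you have access to the gated Llama 3.2 Vision weights on Hugging Face. The config file names and model directory match the files added in this PR, but flags may need adjusting for your environment:

```bash
# Download the base model into the directory the configs expect (gated repo; requires prior access approval).
tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir ./Llama-3.2-11B-Vision-Instruct

# LoRA fine-tuning on a single GPU with the recipe's LoRA config.
tune run lora_finetune_single_device --config 11B_lora_w2.yaml

# Full-parameter fine-tuning (encoder + fusion trainable) with the matching config.
tune run full_finetune_single_device --config 11B_full_w2.yaml
```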


## How to configure finetuning settings?

7 changes: 7 additions & 0 deletions getting-started/finetuning/vision/.gitignore
@@ -0,0 +1,7 @@
results/
outputs/
w2_evaluation_results/
Llama-3.2-11B-Vision-Instruct/
fake_w2_us_tax_form_dataset_train30_test70/
fake_w2_us_tax_form_dataset_train80_test20/
htmlcov/
100 changes: 100 additions & 0 deletions getting-started/finetuning/vision/11B_full_w2.yaml
@@ -0,0 +1,100 @@
# Top-level output directory
output_dir: ./outputs/Llama-3.2-11B-Instruct-w2-full

# Model
model:
  _component_: torchtune.models.llama3_2_vision.llama3_2_vision_11b
  decoder_trainable: False
  encoder_trainable: True
  fusion_trainable: True
  image_size: 560 # Make sure this matches the image_size in tokenizer

# Tokenizer / vision transform
tokenizer:
  _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
  path: ./Llama-3.2-11B-Vision-Instruct/original/tokenizer.model
  image_size: 560
  max_seq_len: 8192

# Checkpointing
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: ./Llama-3.2-11B-Vision-Instruct
  checkpoint_files:
    filename_format: model-{}-of-{}.safetensors
    max_filename: "00005"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA3_VISION

resume_from_checkpoint: false
save_adapter_weights_only: False # PEFT formatting not available yet; this will save in torchtune format only.

# Dataset
dataset:
  _component_: torchtune.datasets.multimodal.vqa_dataset
  source: arrow
  data_files:
    train: "fake_w2_us_tax_form_dataset_train30_test70/train/data-00000-of-00001.arrow"
  split: train
  column_map:
    input: input
    output: ground_truth
    image: image

# General data handling
seed: null
shuffle: true
collate_fn: torchtune.data.padded_collate_tiled_images_and_mask

# Training loop & hyperparams

epochs: 5
max_steps_per_epoch: null
batch_size: 1
gradient_accumulation_steps: 1 # Use to increase effective batch size
# explicit optimizer / scheduler / loss
optimizer:
  _component_: bitsandbytes.optim.PagedAdamW8bit
  lr: 2e-5
optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1

loss:
  _component_: torchtune.modules.loss.LinearCrossEntropyLoss

# clip_grad_norm: 1.0
compile: true

# Device & memory
device: cuda
enable_activation_checkpointing: true
dtype: bf16

# Logging

metric_logger:
  _component_: torchtune.training.metric_logging.WandBLogger
  project: llama3_2_w2_extraction
  entity: <your_wandb_entity>
  job_type: full_finetune_single_device
  group: llama-cookbook
log_every_n_steps: 5
save_steps: 100
log_peak_memory_stats: true
log_level: INFO

# Profiler (off by default)
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: false
  output_dir: ${output_dir}/profiling_outputs
  cpu: true
  cuda: true
  profile_memory: false
  with_stack: false
  record_shapes: true
  with_flops: false
  wait_steps: 5
  warmup_steps: 3
  active_steps: 2
  num_cycles: 1
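
This full fine-tuning config freezes the decoder and trains the vision encoder and fusion layers with a paged 8-bit AdamW optimizer stepped in the backward pass. As a hedged sketch, torchtune configs can be overridden from the command line with dotted key=value pairs; the 80/20 dataset path below is only an assumption based on the directory names in the recipe's .gitignore:

```bash
# Launch the full fine-tune; dotted key=value pairs override fields in the YAML above.
# The train80_test20 path is illustrative; point it at wherever your dataset actually lives.
tune run full_finetune_single_device --config 11B_full_w2.yaml \
  epochs=3 \
  dataset.data_files.train="fake_w2_us_tax_form_dataset_train80_test20/train/data-00000-of-00001.arrow"
```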
118 changes: 118 additions & 0 deletions getting-started/finetuning/vision/11B_lora_w2.yaml
@@ -0,0 +1,118 @@
# Top-level output directory
output_dir: ./outputs/Llama-3.2-11B-Instruct-w2-lora

# Model + LoRA settings
model:
  _component_: torchtune.models.llama3_2_vision.lora_llama3_2_vision_11b
  # LoRA hyperparameters
  lora_rank: 8 # higher increases accuracy and memory
  lora_alpha: 16 # usually alpha=2*rank
  lora_dropout: 0.05
  image_size: 560 # Make sure this matches the image_size in tokenizer
  # Which components are trained / adapted
  decoder_trainable: "frozen"
  encoder_trainable: "lora"
  fusion_trainable: "lora"
  lora_attn_modules:
    - 'q_proj'
    - 'v_proj'
    - 'output_proj'
  apply_lora_to_mlp: true
  apply_lora_to_output: false

# Tokenizer / vision transform
tokenizer:
  _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
  path: ./Llama-3.2-11B-Vision-Instruct/original/tokenizer.model
  image_size: 560
  max_seq_len: 8192

# Checkpointing
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: ./Llama-3.2-11B-Vision-Instruct
  checkpoint_files:
    filename_format: model-{}-of-{}.safetensors
    max_filename: "00005"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA3_VISION

resume_from_checkpoint: false
save_adapter_weights_only: false # PEFT formatting not available yet; this will save in torchtune format only.

# Dataset
dataset:
  _component_: torchtune.datasets.multimodal.vqa_dataset
  source: arrow
  data_files:
    train: "fake_w2_us_tax_form_dataset_train30_test70/train/data-00000-of-00001.arrow"
  split: train
  column_map:
    input: input
    output: ground_truth
    image: image

# General data handling
seed: null
shuffle: true
collate_fn: torchtune.data.padded_collate_tiled_images_and_mask

# Training loop & hyperparams

epochs: 5
max_steps_per_epoch: null
batch_size: 1
gradient_accumulation_steps: 1 # Use to increase effective batch size
# explicit optimizer / scheduler / loss
optimizer:
  _component_: torch.optim.AdamW
  fused: true
  weight_decay: 0.01
  lr: 1e-4
optimizer_in_bwd: true

lr_scheduler:
  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
  num_warmup_steps: 100

loss:
  _component_: torchtune.modules.loss.LinearCrossEntropyLoss

clip_grad_norm: 1.0
compile: false

# Device & memory
device: cuda
enable_activation_checkpointing: true
dtype: bf16

# Logging

metric_logger:
  _component_: torchtune.training.metric_logging.WandBLogger
  project: llama3_2_w2_extraction
  entity: <your_wandb_entity>
  job_type: lora_finetune_single_device
  group: llama-cookbook
log_every_n_steps: 5
save_steps: 100
log_peak_memory_stats: true
log_level: INFO

# Profiler (off by default)
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: false
  output_dir: ${output_dir}/profiling_outputs
  cpu: true
  cuda: true
  profile_memory: false
  with_stack: false
  record_shapes: true
  with_flops: false
  wait_steps: 5
  warmup_steps: 3
  active_steps: 2
  num_cycles: 1
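
The LoRA config keeps the decoder frozen and attaches adapters to the attention projections of the encoder and fusion layers. A sketch of launching it with a larger adapter rank via command-line overrides; rank 16 / alpha 32 are illustrative values following the alpha = 2 * rank convention noted in the config comments:

```bash
# Launch LoRA fine-tuning; raising the rank may improve accuracy at the cost of memory,
# so keep alpha at roughly twice the rank as the config comments suggest.
tune run lora_finetune_single_device --config 11B_lora_w2.yaml \
  model.lora_rank=16 \
  model.lora_alpha=32
```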