Commit 1c7e59f

torchtune lora experiment
1 parent 644b7dd commit 1c7e59f

11 files changed (+558 -9 lines)

examples/models/llama/export_llama_lib.py (+50 -4)

@@ -101,7 +101,7 @@
     "phi_4_mini",
     "smollm2",
 ]
-TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"]
+TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision", "llama3_2_lora"]
 HUGGING_FACE_REPO_IDS = {
     "qwen2_5": "Qwen/Qwen2.5-1.5B",
     "phi_4_mini": "microsoft/Phi-4-mini-instruct",
@@ -209,6 +209,12 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="checkpoint directory. Use with a sharded checkpoint, not for the standard llama2 model. Note, checkpoint_dir takes precedence over checkpoint if both are set.",
     )
 
+    parser.add_argument(
+        "--adapter",
+        default=None,
+        help="Adapter path",
+    )
+
     parser.add_argument(
         "--use_qnn_sha",
         action="store_true",
@@ -585,17 +591,20 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager:
     checkpoint_dir = (
         canonical_path(args.checkpoint_dir) if args.checkpoint_dir else None
     )
+    adapter_path = canonical_path(args.adapter) if args.adapter else None
     params_path = canonical_path(args.params) if args.params else None
     output_dir_path = canonical_path(args.output_dir, dir=True)
     weight_type = WeightType.FAIRSEQ2 if args.fairseq2 else WeightType.LLAMA
 
     # Convert dtype override string arg to actual type.
     dtype_override = DType[args.dtype_override]
 
+    # breakpoint() # 1, OK.
     edge_manager = _load_llama_model(
         args.model,
         checkpoint=checkpoint_path,
         checkpoint_dir=checkpoint_dir,
+        adapter=adapter_path,
         params_path=params_path,
         use_kv_cache=args.use_kv_cache,
         use_sdpa_with_kv_cache=args.use_sdpa_with_kv_cache,
@@ -616,10 +625,16 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager:
         dtype_override=dtype_override,
         args=args,
     )
-
     # At this point, the model is loaded in the default fp32.
 
     # Checkpoint dtype should be lower or equal precision to the dtype override.
+    eg = torch.tensor([[2, 3, 4]], dtype=torch.int64)
+    ip = torch.tensor([[0, 1, 2]], dtype=torch.long)
+
+    em1 = edge_manager.model.forward(eg, input_pos=ip)
+    eager = torch.load("/data/users/lfq/executorch/eager_res.pt")
+    torch.allclose(eager, em1)
+    # breakpoint() # 4, OK.
     checkpoint_dtype = edge_manager.model.checkpoint_dtype
     if not (
         checkpoint_dtype == dtype_override.to_torch_dtype()
@@ -637,6 +652,10 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager:
         )
 
     edge_manager.model = edge_manager.model.to(dtype=dtype_override.to_torch_dtype())
+    # edge_manager.model = edge_manager.model.to(dtype=torch.float32)
+    em2 = edge_manager.model.forward(eg, input_pos=ip)
+    torch.allclose(em2, eager)
+    # breakpoint() # 5, not OK, gets converted to bf16. OK if dtype is consistent.
 
     # We want to quantize (in the source transforms) the weights of the model
     # in the checkpoint dtype.
@@ -649,7 +668,9 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager:
             args=args,
         )
     )
-
+    # torch.allclose here as well.
+    em3 = edge_manager.model.forward(eg, input_pos=ip)
+    torch.allclose(em3, eager)
     return edge_manager
 
 
@@ -777,6 +798,9 @@ def _to_edge_and_lower_llama( # noqa: C901
     builder_exported_to_edge = builder_exported.pt2e_quantize(
         quantizers
     ).export_to_edge()
+    breakpoint()
+    # ^to_edge_res.pt
+    # allclose 1e-1 compared to pre-auto.
 
     # to_backend
     partitioners = []
@@ -911,7 +935,16 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
 
     # export_to_edge
     builder_exported = _prepare_for_llama_export(args).export()
+    eg = torch.tensor([[2, 3, 4]], dtype=torch.int64)
+    ip = torch.tensor([[0, 1, 2]], dtype=torch.long)
+    b_e = builder_exported.model.forward(eg, input_pos=ip)
+    eager = torch.load("/data/users/lfq/executorch/eager_res.pt")
+    torch.allclose(b_e, eager)
+    # breakpoint()
+
     builder_exported.run_canonical_optimizations()
+    b_e2 = builder_exported.model.forward(eg, input_pos=ip)
+    torch.allclose(b_e2, eager)
     modelname = builder_exported.modelname
 
     if args.export_only:
@@ -932,6 +965,9 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
             args,
         )
     else:
+        # breakpoint()
+        b_e3 = builder_exported.model.forward(eg, input_pos=ip)
+        torch.allclose(b_e3, eager)
         builder = _to_edge_and_lower_llama(
             builder_exported,
             modelname,
@@ -941,6 +977,7 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
             quant_dtype,
             args,
         )
+        breakpoint()
 
     if args.profile_memory:
         generate_memory_trace(builder.export_program, "memory_profile.json")
@@ -1004,6 +1041,7 @@ def _load_llama_model(
     *,
     checkpoint: Optional[str] = None,
     checkpoint_dir: Optional[str] = None,
+    adapter: Optional[str] = None,
     params_path: Optional[str] = None,
     use_kv_cache: bool = False,
     use_sdpa_with_kv_cache: bool = False,
@@ -1038,6 +1076,9 @@ def _load_llama_model(
         if modelname == "llama3_2_vision":
             module_name = "llama3_2_vision"
             model_class_name = "Llama3_2Decoder"
+        if modelname == "llama3_2_lora":
+            module_name = "llama3_2_lora"
+            model_class_name = "Llama3_2_Lora"
         else:
             raise ValueError(f"{modelname} is not a valid Llama model.")
     else:
@@ -1051,6 +1092,7 @@ def _load_llama_model(
             model_class_name,
             checkpoint=checkpoint,
             checkpoint_dir=checkpoint_dir,
+            adapter=adapter,
             params=params_path,
             use_kv_cache=use_kv_cache,
             use_sdpa_with_kv_cache=use_sdpa_with_kv_cache,
@@ -1066,6 +1108,7 @@ def _load_llama_model(
         )
     )
 
+    # breakpoint() # 3. OK.
     return LLMEdgeManager(
         model=model,
         modelname=modelname,
@@ -1093,7 +1136,7 @@ def _load_llama_model(
             model.max_seq_len,
             # pyre-fixme[6]: For 6th argument expected `ModelArgs` but got
             # `Union[Tensor, Module]`.
-            model.max_context_len,
+            max_context_len,
            # pyre-fixme[6]: For 7th argument expected `int` but got `Union[Tensor,
             # Module]`.
             model.n_layers,
@@ -1244,6 +1287,9 @@ def _get_source_transforms( # noqa
     if args.vulkan:
         transforms.append(replace_with_vulkan_rotary_emb)
 
+    # transforms.append(
+    #     replace_rope_with_inference_rope()
+    # )
     return transforms
 

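The stage-by-stage checks above all repeat one pattern: fixed example tokens and positions, a saved eager reference, and a torch.allclose comparison. A minimal helper sketch of that pattern; the helper name, default path, and tolerance are illustrative and not part of this commit, and it assumes the staged model still accepts (tokens, input_pos=...) like the debug calls above:

import torch

def check_against_eager(model, eager_path="eager_res.pt", atol=1e-1):
    # Sketch only: mirrors the ad-hoc checks in this commit.
    eg = torch.tensor([[2, 3, 4]], dtype=torch.int64)
    ip = torch.tensor([[0, 1, 2]], dtype=torch.long)
    eager = torch.load(eager_path)  # reference output saved from the eager model
    staged = model.forward(eg, input_pos=ip)
    return torch.allclose(staged, eager, atol=atol)
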
examples/models/llama/source_transformation/sdpa.py (+10)

@@ -15,6 +15,8 @@
 
 from executorch.examples.models.llama.attention import KVCache, SDPA
 
+from executorch.extension.llm.modules.attention import SDPA as TTSDPA
+
 
 class SDPACustom(torch.nn.Module):
     def __init__(
@@ -60,11 +62,19 @@ def forward(
 def _replace_sdpa_with_custom_op(module: torch.nn.Module):
     for name, child in module.named_children():
         if isinstance(child, SDPA):
+            breakpoint()
             setattr(
                 module,
                 name,
                 SDPACustom(child.dim),
             )
+        elif isinstance(child, TTSDPA):
+            breakpoint()
+            setattr(
+                module,
+                name,
+                SDPACustom(child.num_heads * child.head_dim),
+            )
         else:
             _replace_sdpa_with_custom_op(child)

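The TTSDPA branch above follows the same recursive module-swap pattern as the existing SDPA branch; it passes child.num_heads * child.head_dim because SDPACustom takes a single dim argument. A generic sketch of the pattern, with illustrative function and argument names that are not ExecuTorch API:

import torch

def replace_modules(module: torch.nn.Module, target_cls, make_replacement):
    # Recursively swap every child of type target_cls, mirroring
    # _replace_sdpa_with_custom_op above.
    for name, child in module.named_children():
        if isinstance(child, target_cls):
            setattr(module, name, make_replacement(child))
        else:
            replace_modules(child, target_cls, make_replacement)
    return module
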
examples/models/llama3_2_lora/__init__.py (new file, +11)

@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .model import Llama3_2_Lora

__all__ = [
    "Llama3_2_Lora",
]
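
Re-exporting Llama3_2_Lora here lets the exporter resolve it from the module_name / model_class_name strings set in _load_llama_model. A rough sketch of that lookup, assuming the package path follows the llama3_2_vision layout (the path below is an inference, not confirmed by this diff):

import importlib

module = importlib.import_module("executorch.examples.models.llama3_2_lora")  # assumed path
model_cls = getattr(module, "Llama3_2_Lora")
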
examples/models/llama3_2_lora/model.py (new file, +155)

@@ -0,0 +1,155 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe

import json
import os
from typing import Any, Dict

import torch

from executorch.examples.models.checkpoint import get_checkpoint_dtype
from executorch.examples.models.llama.model_args import ModelArgs
from executorch.examples.models.llama.rope import Rope, RotaryEmbedding
from executorch.examples.models.model_base import EagerModelBase
from executorch.extension.llm.modules.attention import (
    replace_mha_with_inference_mha,
    replace_rope_with_inference_rope,
)

from torchtune.models import convert_weights

from torchtune.models.llama3_1._position_embeddings import Llama3ScaledRoPE

from torchtune.models.llama3_2._component_builders import lora_llama3_2


class Llama3_2_Lora(EagerModelBase):
    def __init__(self, **kwargs):
        # Set member vars from kwargs.
        self.max_seq_len = kwargs.get(
            "max_seq_len", 8192
        ) # Trained to be a lot larger, but this value is kept small because of static kv cache at the moment.
        # self.encoder_max_seq_len = kwargs.get(
        #     "encoder_max_seq_len", int(4 * (448 / 14) ** 2 + 1)
        # ) # Same as above.
        self.generate_full_logits = kwargs.get("generate_full_logits", False)
        self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", True)
        self.output_prune_map_path = kwargs.get("output_prune_map_path", None)
        self.use_kv_cache = kwargs.get("use_kv_cache", False)
        self.verbose = kwargs.get("verbose", False)
        self.args = kwargs.get("args", None)
        self.dtype = kwargs.get("dtype", torch.float16)
        self.use_checkpoint = False
        self.max_context_len = kwargs.get("max_context_len", 8192)

        # Single checkpoint file.
        checkpoint_path = kwargs.get("checkpoint")

        if os.path.isfile(checkpoint_path):
            self.use_checkpoint = True

        params_path = kwargs.get("params")
        adapter_path = kwargs.get("adapter")

        # self.input_pos = torch.arange(self.max_seq_len, dtype=torch.int64)
        # Load checkpoint and params.
        device = "cpu"
        if self.use_checkpoint:
            checkpoint = torch.load(
                checkpoint_path, map_location=device, weights_only=False, mmap=True
            )
            checkpoint = convert_weights.meta_to_tune(checkpoint)
            self.dtype = get_checkpoint_dtype(checkpoint)

        adapter = torch.load(
            adapter_path, map_location="cpu", mmap=True, weights_only=False
        )

        checkpoint.update(adapter)

        with open(params_path, "r") as f:
            params = json.loads(f.read())

        # Load model.
        # Cannot use "with torch.device("meta"):" because it causes some exceptions during export,
        # i.e. the model isn't fully initialized or something.
        self.model_ = lora_llama3_2(
            lora_attn_modules=[
                "q_proj",
            ],
            apply_lora_to_mlp=False,
            apply_lora_to_output=False,
            # llama3_2 args
            vocab_size=params["vocab_size"],
            num_layers=params["n_layers"],
            num_heads=params["n_heads"],
            num_kv_heads=params["n_kv_heads"],
            embed_dim=params["dim"],
            max_seq_len=self.max_seq_len, # 131072
            # intermediate_dim=params["intermediate_dim"], # 8192, calc is 4096
            # LoRA args. TODO take in the adapter config.
            lora_rank=8,
            lora_alpha=16,
        )
        self.model_.requires_grad_(False)
        for param_name, param_val in params.items():
            setattr(self.model_, param_name, param_val)

        setattr(self.model_, "enable_dynamic_shape", self.enable_dynamic_shape)
        # Source transformation for MultiHeadAttention
        self.model_ = replace_mha_with_inference_mha(self.model_)

        model_args: ModelArgs = ModelArgs(
            max_seq_len=self.max_seq_len,
            max_context_len=self.max_context_len,
            use_kv_cache=self.use_kv_cache,
            generate_full_logits=self.generate_full_logits,
            enable_dynamic_shape=self.enable_dynamic_shape,
            **params,
        )
        # Source transformation for RoPE
        # self.model_ = replace_rope_with_inference_rope(self.model_, model_args)

        setattr(self.model_, "checkpoint_dtype", self.dtype)
        if self.use_checkpoint:
            # Load checkpoint.
            missing, unexpected = self.model_.load_state_dict(
                checkpoint,
                strict=False,
                assign=True,
            )
            if kwargs.get("verbose", False):
                print("============= missing keys ================")
                print(missing)
                print("============= /missing ================")
                print("============= unexpected keys ================")
                print(unexpected)
                print("============= /unexpected ================")

        self.model_.to(self.dtype)
        # breakpoint() # 2, OK.

    def get_eager_model(self) -> torch.nn.Module:
        return self.model_

    def get_example_inputs(self):
        return (torch.tensor([[2, 3, 4]], dtype=torch.int64),)
        # return (
        #     torch.tensor([[2, 3, 4]], dtype=torch.long),
        #     {"input_pos": torch.tensor([0], dtype=torch.long)},
        # )
        # return (torch.ones(1, self.n_tokens, dtype=torch.int64),)

    # eg=torch.tensor([[2, 3, 4]], dtype=torch.int64)
    # ip=torch.tensor([[0, 1, 2]], dtype=torch.long)
    def get_example_kwarg_inputs(self):
        return {"input_pos": torch.tensor([[0, 1, 2]], dtype=torch.long)}

    def get_dynamic_shapes(self):
        dim = torch.export.Dim("token_dim", min=1, max=self.max_seq_len - 1)
        return ({1: dim}, {1: dim})

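A rough usage sketch of the new wrapper outside the export_llama flow; the import path and file paths below are placeholders, and the forward call matches the debug checks in export_llama_lib.py:

import torch

# Package path assumed from the __init__.py above; checkpoint/adapter/params
# paths are placeholders for a Meta-format Llama 3.2 checkpoint, a torchtune
# LoRA adapter state dict, and the matching params.json.
from executorch.examples.models.llama3_2_lora import Llama3_2_Lora

m = Llama3_2_Lora(
    checkpoint="/path/to/consolidated.00.pth",
    adapter="/path/to/adapter_weights.pt",
    params="/path/to/params.json",
)
model = m.get_eager_model()
(tokens,) = m.get_example_inputs()
kwargs = m.get_example_kwarg_inputs()
with torch.no_grad():
    logits = model(tokens, **kwargs)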