Commit 663f897

Added the latest change for subfunction
1 parent 6bc5256 commit 663f897

File tree: 9 files changed, +23084 -3804 lines


QEfficient/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -12,6 +12,7 @@
 from QEfficient.utils import custom_format_warning
 from QEfficient.utils.logging_utils import logger

+from QEfficient.utils.patches import apply_torch_patches, is_patched
 # For faster downloads via hf_transfer
 # This code is put above import statements as this needs to be executed before
 # hf_transfer is imported (will happen on line 15 via leading imports)
@@ -21,6 +22,9 @@
 # custom warning for the better logging experience
 warnings.formatwarning = custom_format_warning

+# Apply patches
+# TODO: Find a better way to do this, this is temp. fix.
+apply_torch_patches()

 def check_qaic_sdk():
     """Check if QAIC SDK is installed"""
@@ -69,6 +73,8 @@ def check_qaic_sdk():
         "QEFFAutoModelForImageTextToText",
         "QEFFAutoModelForSpeechSeq2Seq",
         "QEFFCommonLoader",
+        "apply_torch_patches",
+        "is_patched",
     ]

 else:
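
Since apply_torch_patches() runs as an import side effect, downstream code only needs to import the package. A minimal sketch of how a caller might confirm the patch took effect (assumes QEfficient is installed):

    import QEfficient

    # is_patched() is exported by this commit; it compares the live
    # torch.onnx.utils._setup_trace_module_map against the patched function.
    assert QEfficient.is_patched(), "apply_torch_patches() did not take effect"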

QEfficient/base/modeling_qeff.py

Lines changed: 1 addition & 18 deletions
@@ -253,22 +253,6 @@ def _export(
             decoder_layer_classes = get_decoder_layer_classes_for_export(self.model)
             export_kwargs = {} if export_kwargs is None else export_kwargs

-            # def sanitize_decoder_layer_for_onnx(module):
-            #     """Remove or simplify attributes that ONNX export cannot handle."""
-            #     unsafe_attrs = ["config", "experts", "router", "cache", "past_key_values", "sliding_window"]
-            #     for attr in unsafe_attrs:
-            #         if hasattr(module, attr):
-            #             try:
-            #                 setattr(module, attr, None)
-            #             except Exception:
-            #                 pass
-
-            # # Sanitize *only* the decoder layers
-            # for m in self.model.modules():
-            #     if m.__class__ in decoder_layer_classes:
-            #         sanitize_decoder_layer_for_onnx(m)
-
-            # import pdb; pdb.set_trace()
             torch.onnx.export(
                 self.model,
                 (example_inputs,),
@@ -285,9 +269,8 @@ def _export(
             logger.info("PyTorch export successful")

             _ = self._offload_model_weights(offload_pt_weights)
-
-            rename_function_outputs(tmp_onnx_path, output_names)
             model = onnx.load(tmp_onnx_path, load_external_data=False)
+            model, transformed = rename_function_outputs(model)
             transform_kwargs = {
                 "onnx_base_dir": str(tmp_onnx_dir),
                 "temp_onnx_path": tmp_onnx_path,

QEfficient/base/onnx_transforms.py

Lines changed: 15 additions & 43 deletions
@@ -223,50 +223,22 @@ def _is_custom_op_used(cls, model: ModelProto, op_name: str, used_op_types: set)

         return False

-
-def rename_function_outputs(onnx_path, expected_output_names):
-    model = onnx.load(onnx_path, load_external_data=False)
+def rename_function_outputs(model):
     graph = model.graph
-    for i, output in enumerate(graph.output):
-        output.name = expected_output_names[i]
-
+    op_type_to_func_map = {func.name: func for func in model.functions}
     decoder_layer_patterns = ["DecoderLayer", "Block", "Layer"]
-    layer_index = 0
-    output_rename_map = {}
-
+    transformed = False
+    model_graph_outputs = [val.name for val in model.graph.output]
     for node in graph.node:
         if any(pattern in node.name or pattern in node.op_type for pattern in decoder_layer_patterns):
-            if "layers.0" in node.name:
-                if len(node.output) >= 4:
-                    print(f"Renaming outputs of node (layers.0): {node.name}")
-                    new_output_0 = f"past_key.{layer_index}_RetainedState"
-                    new_output_1 = f"past_value.{layer_index}_RetainedState"
-                    output_rename_map[node.output[2]] = new_output_0
-                    output_rename_map[node.output[3]] = new_output_1
-                    node.output[2] = new_output_0
-                    node.output[3] = new_output_1
-                    layer_index += 1
-                else:
-                    print(f"Warning: Node {node.name} has fewer than 4 outputs.")
-            elif len(node.output) >= 2:
-                print(f"Renaming outputs of node: {node.name}")
-                new_output_0 = f"past_key.{layer_index}_RetainedState"
-                new_output_1 = f"past_value.{layer_index}_RetainedState"
-                output_rename_map[node.output[0]] = new_output_0
-                output_rename_map[node.output[1]] = new_output_1
-                node.output[0] = new_output_0
-                node.output[1] = new_output_1
-                layer_index += 1
-            else:
-                print(f"Warning: Node {node.name} has fewer than 2 outputs.")
-
-    for node in graph.node:
-        for i, input_name in enumerate(node.input):
-            if input_name in output_rename_map:
-                import pdb
-
-                pdb.set_trace()
-                print(f"Replacing input {input_name} in node {node.name} with {output_rename_map[input_name]}")
-                node.input[i] = output_rename_map[input_name]
-
-    onnx.save(model, onnx_path)
+            func = op_type_to_func_map[node.op_type]
+            for i, out_name in enumerate(func.output):
+                if "_InternalRetainedState" in out_name:
+                    transformed = True
+                    tmp = node.output[i]
+                    new_name = func.output[i].replace("Internal", "")
+                    print(f"renaming {node.output[i]} to {new_name}")
+                    node.output[i] = new_name
+                    model.graph.output[model_graph_outputs.index(tmp)].name = new_name
+
+    return model, transformed
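
The rewritten pass looks up the ONNX function behind each decoder-layer node and drops the "Internal" tag from retained-state outputs, then fixes up the matching graph output by name. A minimal illustration of the naming rule alone (the output name is hypothetical):

    # Illustrative only: a function-local KV-cache output carrying the
    # "_InternalRetainedState" suffix becomes the external retained-state name.
    internal = "past_key.0_InternalRetainedState"  # hypothetical output name
    external = internal.replace("Internal", "")
    assert external == "past_key.0_RetainedState"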

QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ def forward(self, hidden_states):
         up = torch.bmm(expert_in, up_proj) + up_proj_bias.unsqueeze(1)

         # Apply activation with clamping
-        gate = gate.clamp(min=None, max=self.experts.limit)
+        gate = gate.clamp(min=-self.experts.limit, max=self.experts.limit)
         up = up.clamp(min=-self.experts.limit, max=self.experts.limit)

         # GLU activation
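
The one-line change matters because clamp(min=None, ...) bounds a tensor only from above, leaving the gate activation unbounded below. A quick check of the behavioral difference (the limit value is illustrative, not taken from the model config):

    import torch

    limit = 7.0  # illustrative; the real bound comes from self.experts.limit
    gate = torch.tensor([-100.0, 0.0, 100.0])
    print(gate.clamp(min=None, max=limit))    # tensor([-100., 0., 7.])
    print(gate.clamp(min=-limit, max=limit))  # tensor([-7., 0., 7.])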

QEfficient/utils/patches.py

Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+"""Monkey patches for torch.onnx.utils to fix ONNX export issues."""
+
+from typing import Collection, Set, Type, Union
+
+import torch
+import torch.onnx.utils as onnx_utils
+from torch import _C
+
+
+def _setup_trace_module_map_patched(
+    model: Union[torch.nn.Module, torch.jit.ScriptModule],
+    export_modules_as_functions: Union[bool, Collection[Type[torch.nn.Module]]],
+) -> Set[str]:
+    """Patched version of _setup_trace_module_map that fixes onnx_attrs type mismatch."""
+
+    def __register_attribute_hook():
+        attr_name = "_onnx_attrs"
+
+        def _track_module_attributes_forward_pre_hook(module, input):
+            setattr(module, attr_name, _get_module_attributes(module))
+
+        def _track_module_attributes_forward_hook(module, input, output):
+            tracing_state = _C._get_tracing_state()
+            if not tracing_state:
+                return
+            graph = tracing_state.graph()
+            onnx_attrs = {}
+            if hasattr(module, attr_name):
+                onnx_attrs = getattr(module, attr_name)
+                delattr(module, attr_name)
+            # FIX: use empty dict to avoid type mismatch with _jit_pass_onnx_track_scope_attributes
+            # Observed in transformers v4.55 and above
+            onnx_attrs = {}
+            _C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs)
+
+        for m in model.modules():
+            m.register_forward_hook(_track_module_attributes_forward_hook)
+            m.register_forward_pre_hook(_track_module_attributes_forward_pre_hook)
+
+    def _unqualified_variable_name(qualified_name: str) -> str:
+        """
+        Parse qualified variable name and return the unqualified version.
+        Pure numeric atoms are considered inadequate, so this function will look past them,
+        and start from the first non-numeric atom.
+        """
+        name_atoms = qualified_name.split(".")
+        for i, atom in reversed(list(enumerate(name_atoms))):
+            if not atom.isnumeric():
+                return ".".join(name_atoms[i:])
+        return qualified_name
+
+    trace_module_map = {
+        _m: torch._C._jit_onnx_create_full_scope_name(torch.typename(type(_m)), _unqualified_variable_name(_n))
+        for _n, _m in model.named_modules()
+    }
+    torch.jit._trace._trace_module_map = trace_module_map
+
+    if isinstance(export_modules_as_functions, bool) and export_modules_as_functions:
+        module_typenames = {torch.typename(type(module)) for module in trace_module_map}
+    elif isinstance(export_modules_as_functions, set) and export_modules_as_functions:
+
+        def _find_typename(v):
+            if isinstance(v, type):
+                return torch.typename(v)
+            else:
+                raise RuntimeError(
+                    "Only type of the `nn.Module` should be "
+                    "passed in the set for argument `export_modules_as_functions`. "
+                    f"Got `{type(v).__name__}`."
+                )
+
+        module_typenames = {_find_typename(v) for v in export_modules_as_functions}
+    else:
+        module_typenames = set()
+
+    if module_typenames:
+        __register_attribute_hook()
+
+    return module_typenames
+
+
+def _get_module_attributes(module):
+    """Helper function to get module attributes safely."""
+    import typing
+
+    annotations = typing.get_type_hints(type(module))
+    base_m_annotations = typing.get_type_hints(torch.nn.Module)
+    [annotations.pop(k, None) for k in base_m_annotations]
+
+    attrs = {}
+    for k in annotations:
+        try:
+            attrs[k] = getattr(module, k)
+        except AttributeError:
+            _C._jit_onnx_log(f"Skipping module attribute '{k}'")
+            continue
+    return attrs
+
+
+def apply_torch_patches():
+    """Apply all necessary torch patches for ONNX export."""
+    # Monkey patch the function
+    onnx_utils._setup_trace_module_map = _setup_trace_module_map_patched
+
+    if hasattr(onnx_utils, "_get_module_attributes"):
+        onnx_utils._get_module_attributes = _get_module_attributes
+
+    print("Applied torch ONNX export patches for export_modules_as_functions compatibility")
+
+
+def is_patched():
+    """Check if patches have been applied."""
+    return onnx_utils._setup_trace_module_map == _setup_trace_module_map_patched
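
Because only _setup_trace_module_map and _get_module_attributes are swapped out, the patch is scoped to exports that pass export_modules_as_functions. A hedged usage sketch (MyDecoderLayer is a stand-in for whatever module class gets exported as an ONNX function):

    from QEfficient.utils.patches import apply_torch_patches, is_patched

    apply_torch_patches()
    assert is_patched()
    # torch.onnx.export(model, (example_inputs,), "model.onnx",
    #                   export_modules_as_functions={MyDecoderLayer})  # stand-in class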

output.txt

Lines changed: 22935 additions & 3736 deletions
Large diffs are not rendered by default.

pyproject.toml

Lines changed: 3 additions & 3 deletions
@@ -39,11 +39,11 @@ dependencies = [
     "fire",
     "py7zr",
     "torchmetrics==1.7.0",
-    "torch==2.7.0; platform_machine=='aarch64'",
+    "torch==2.4.1; platform_machine=='aarch64'",
     # Specifying torch cpu package URL per python version, update the list once pytorch releases whl for python>3.11
     "torch@https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp38-cp38-linux_x86_64.whl ; python_version=='3.8' and platform_machine=='x86_64'",
-    "torch@https://download.pytorch.org/whl/cpu/torch-2.7.0%2Bcpu-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and platform_machine=='x86_64'",
-    "torch@https://download.pytorch.org/whl/cpu/torch-2.7.0%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and platform_machine=='x86_64'",
+    "torch@https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp39-cp39-linux_x86_64.whl ; python_version=='3.9' and platform_machine=='x86_64'",
+    "torch@https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp310-cp310-linux_x86_64.whl ; python_version=='3.10' and platform_machine=='x86_64'",
 ]

 [project.optional-dependencies]
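
A quick sanity check that an environment picked up the reverted torch pin (an assumption about the target environment, not part of the commit):

    import torch

    # Expect the pinned 2.4.1 CPU build; fail loudly otherwise.
    assert torch.__version__.startswith("2.4.1"), torch.__version__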

run.py

Lines changed: 1 addition & 2 deletions
@@ -84,7 +84,7 @@
 # ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=False)

 qpc_path = qeff_model.compile(
-    prefill_seq_len=Constants.PROMPT_LEN,
+    prefill_seq_len=1,
     ctx_len=Constants.CTX_LEN,
     num_cores=16,
     mxfp6_matmul=False,
@@ -98,7 +98,6 @@
 streamer = TextStreamer(tokenizer)
 exec_info = qeff_model.generate(
     tokenizer,
-    streamer=streamer,
     prompts=Constants.INPUT_STR,
     device_ids=[0, 1, 2, 3],
 )

test.py

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,7 @@
 from QEfficient import QEFFAutoModelForCausalLM
 import torch
 # Initialize the model using from_pretrained similar to transformers.AutoModelForCausalLM
-model_name = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+model_name = "meta-llama/Llama-3.2-1B"
 # model_name="GPT2"
 # model_name="Qwen/Qwen2-1.5B-Instruct"
 import time
@@ -14,6 +14,7 @@
 # print("torch.compile run for model.model")
 # print("time ",t2-t1)
 # print("done")
+# import pdb; pdb.set_trace()
 inputs="Help me with this"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 # tokens=tokenizer([input], return_tensors="pt")
