
Commit d946c3b

Update on "[slimtensor] Add common_shims_slim with basic property getters"
Add SlimTensor-based implementations of basic property getter AOTI shim functions:

1. `aoti_torch_get_data_ptr()` - Returns a pointer to the tensor's data
2. `aoti_torch_get_sizes()` - Returns a pointer to the sizes array (SlimTensor stores int64_t directly)
3. `aoti_torch_get_strides()` - Returns a pointer to the strides array (SlimTensor stores int64_t directly)
4. `aoti_torch_get_dtype()` - Returns the scalar type as int32_t
5. `aoti_torch_get_dim()` - Returns the number of dimensions

Key design:

- Create a new common_shims_slim.h for working on the new API without impacting the current pipeline. common_shims_slim.{h,cpp} will replace the current common_shims.{h,cpp} once everything has been set up.
- Uses `#ifdef CUDA_AVAILABLE` conditional compilation to separate the implementation between the CUDA backend and the MPS backend, since SlimTensor does not have MPS support yet. The branch will be removed once SlimTensor supports MPS.
- Refactored into a header-only library so the caller's preprocessor flags determine which tensor type is used. This design supports both the CUDA backend (SlimTensor) and the MPS backend (ETensor) from a single library.

Differential Revision: [D90126254](https://our.internmc.facebook.com/intern/diff/D90126254/)

[ghstack-poisoned]
2 parents 895b86f + ecd3787 commit d946c3b
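For orientation, here is a minimal caller-side sketch of the shim surface this commit adds. The helper `inspect_tensor` is hypothetical and only illustrates the call pattern implied by the signatures above, not code from this commit:

```cpp
#include <cstdint>

#include <executorch/backends/aoti/common_shims_slim.h>

using namespace executorch::backends::aoti;
using executorch::runtime::Error;

// Hypothetical helper: query every basic property of a tensor through the
// shim ABI. Each getter null-checks its arguments and returns Error::Ok on
// success, so failures can simply be propagated.
Error inspect_tensor(Tensor* t) {
  void* data = nullptr;
  int64_t* sizes = nullptr;
  int64_t* strides = nullptr;
  int32_t dtype = 0;
  int64_t dim = 0;

  if (auto err = aoti_torch_get_data_ptr(t, &data); err != Error::Ok) {
    return err;
  }
  if (auto err = aoti_torch_get_sizes(t, &sizes); err != Error::Ok) {
    return err;
  }
  if (auto err = aoti_torch_get_strides(t, &strides); err != Error::Ok) {
    return err;
  }
  if (auto err = aoti_torch_get_dtype(t, &dtype); err != Error::Ok) {
    return err;
  }
  // sizes/strides point into the tensor's own int64_t arrays, so they stay
  // valid for the lifetime of the tensor and need no separate cleanup.
  return aoti_torch_get_dim(t, &dim);
}
```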

31 files changed: +630 -385 lines changed
backends/aoti/common_shims_slim.cpp

Lines changed: 61 additions & 0 deletions
```diff
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/aoti/common_shims_slim.h>
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+// ============================================================
+// Basic Property Getters - Implementations
+// ============================================================
+
+AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr) {
+  if (tensor == nullptr || ret_data_ptr == nullptr) {
+    return Error::InvalidArgument;
+  }
+  *ret_data_ptr = tensor->data_ptr();
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
+  if (tensor == nullptr || ret_sizes == nullptr) {
+    return Error::InvalidArgument;
+  }
+  *ret_sizes = const_cast<int64_t*>(tensor->sizes().data());
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
+  if (tensor == nullptr || ret_strides == nullptr) {
+    return Error::InvalidArgument;
+  }
+  *ret_strides = const_cast<int64_t*>(tensor->strides().data());
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) {
+  if (tensor == nullptr || ret_dtype == nullptr) {
+    return Error::InvalidArgument;
+  }
+  *ret_dtype = static_cast<int32_t>(tensor->dtype());
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim) {
+  if (tensor == nullptr || ret_dim == nullptr) {
+    return Error::InvalidArgument;
+  }
+  *ret_dim = static_cast<int64_t>(tensor->dim());
+  return Error::Ok;
+}
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
```
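One design note not spelled out in the commit: `tensor->sizes()` and `tensor->strides()` expose const data, while the shim ABI hands back mutable `int64_t**` out-parameters, hence the `const_cast`. The returned pointers alias the tensor's own metadata arrays, so callers should treat them as read-only views. A hypothetical caller sketch (the helper name is illustrative, not from this commit):

```cpp
// Hypothetical helper: compute the element count of `t` from the aliased
// sizes array. aoti_torch_get_sizes() returns a pointer into the tensor's
// own metadata, so it is read-only by convention.
int64_t element_count(Tensor* t) {
  int64_t dim = 0;
  int64_t* sizes = nullptr;
  if (aoti_torch_get_dim(t, &dim) != Error::Ok ||
      aoti_torch_get_sizes(t, &sizes) != Error::Ok) {
    return -1; // signal failure
  }
  int64_t numel = 1;
  for (int64_t i = 0; i < dim; ++i) {
    numel *= sizes[i];
  }
  return numel;
}
```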

backends/aoti/common_shims_slim.h

Lines changed: 13 additions & 177 deletions
```diff
@@ -9,19 +9,9 @@
 #pragma once
 
 #include <executorch/backends/aoti/export.h>
+#include <executorch/backends/aoti/slim/core/SlimTensor.h>
 #include <executorch/runtime/core/error.h>
 #include <cstdint>
-#include <unordered_map>
-#include <vector>
-
-// Uses conditional compilation to separate the implementation between
-// CUDA backend (SlimTensor) and other backends like MPS (ETensor).
-// The caller determines which path is used by defining CUDA_AVAILABLE.
-#ifdef CUDA_AVAILABLE
-#include <executorch/backends/aoti/slim/core/SlimTensor.h>
-#else
-#include <executorch/runtime/core/exec_aten/exec_aten.h>
-#endif
 
 namespace executorch {
 namespace backends {
@@ -30,185 +20,31 @@ namespace aoti {
 // Common using declarations for ExecuTorch types
 using executorch::runtime::Error;
 
-// ============================================================
-// Tensor Type Definition - branched based on CUDA_AVAILABLE
-// ============================================================
-#ifdef CUDA_AVAILABLE
+// Tensor type definition using SlimTensor
 using Tensor = executorch::backends::aoti::slim::SlimTensor;
-#else
-using Tensor = executorch::runtime::etensor::Tensor;
-#endif
 
 // Common AOTI type aliases
 using AOTIRuntimeError = Error;
 using AOTITorchError = Error;
 
-#ifndef CUDA_AVAILABLE
-namespace internal {
-// Global storage for tensor metadata (ETensor path only)
-// SlimTensor stores sizes/strides directly in int64_t[] - no caching needed
-inline std::unordered_map<Tensor*, std::vector<int64_t>>& tensor_to_sizes() {
-  static std::unordered_map<Tensor*, std::vector<int64_t>> instance;
-  return instance;
-}
-inline std::unordered_map<Tensor*, std::vector<int64_t>>& tensor_to_strides() {
-  static std::unordered_map<Tensor*, std::vector<int64_t>> instance;
-  return instance;
-}
-} // namespace internal
-#endif
-
 // ============================================================
-// Basic Property Getters - Inline implementations
+// Basic Property Getters - Declarations
 // ============================================================
 
-inline AOTITorchError aoti_torch_get_data_ptr(
-    Tensor* tensor,
-    void** ret_data_ptr) {
-  if (tensor == nullptr) {
-    return Error::InvalidArgument;
-  }
-  if (ret_data_ptr == nullptr) {
-    return Error::InvalidArgument;
-  }
-
-#ifdef CUDA_AVAILABLE
-  *ret_data_ptr = tensor->data_ptr();
-#else
-  *ret_data_ptr = tensor->mutable_data_ptr();
-#endif
-  return Error::Ok;
-}
-
-inline AOTITorchError aoti_torch_get_sizes(
-    Tensor* tensor,
-    int64_t** ret_sizes) {
-  if (tensor == nullptr) {
-    return Error::InvalidArgument;
-  }
-  if (ret_sizes == nullptr) {
-    return Error::InvalidArgument;
-  }
-
-#ifdef CUDA_AVAILABLE
-  // SlimTensor stores sizes directly in int64_t[] - no caching needed
-  *ret_sizes = const_cast<int64_t*>(tensor->sizes().data());
-#else
-  auto it = internal::tensor_to_sizes().find(tensor);
-  bool needs_update = false;
-
-  if (it == internal::tensor_to_sizes().end()) {
-    needs_update = true;
-  } else {
-    // Validate cached metadata matches current tensor state
-    auto tensor_sizes = tensor->sizes();
-    needs_update = !std::equal(
-        it->second.begin(),
-        it->second.end(),
-        tensor_sizes.begin(),
-        tensor_sizes.end());
-  }
-
-  if (needs_update) {
-    std::vector<int64_t> sizes(tensor->dim());
-    auto tensor_sizes = tensor->sizes();
-    for (int i = 0; i < tensor->dim(); i++) {
-      sizes[i] = tensor_sizes[i];
-    }
-    it = internal::tensor_to_sizes()
-             .insert_or_assign(tensor, std::move(sizes))
-             .first;
-  }
-
-  // For 0D tensors, data() returns nullptr on empty vectors
-  if (it->second.empty()) {
-    static int64_t empty_sizes_placeholder = 0;
-    *ret_sizes = &empty_sizes_placeholder;
-  } else {
-    *ret_sizes = it->second.data();
-  }
-#endif
-  return Error::Ok;
-}
-
-inline AOTITorchError aoti_torch_get_strides(
-    Tensor* tensor,
-    int64_t** ret_strides) {
-  if (tensor == nullptr) {
-    return Error::InvalidArgument;
-  }
-  if (ret_strides == nullptr) {
-    return Error::InvalidArgument;
-  }
-
-#ifdef CUDA_AVAILABLE
-  // SlimTensor stores strides directly in int64_t[] - no caching needed
-  *ret_strides = const_cast<int64_t*>(tensor->strides().data());
-#else
-  auto it = internal::tensor_to_strides().find(tensor);
-  bool needs_update = false;
-
-  if (it == internal::tensor_to_strides().end()) {
-    needs_update = true;
-  } else {
-    // Validate cached metadata matches current tensor state
-    auto tensor_strides = tensor->strides();
-    needs_update = !std::equal(
-        it->second.begin(),
-        it->second.end(),
-        tensor_strides.begin(),
-        tensor_strides.end());
-  }
-
-  if (needs_update) {
-    std::vector<int64_t> strides(tensor->dim());
-    auto tensor_strides = tensor->strides();
-    for (int i = 0; i < tensor->dim(); i++) {
-      strides[i] = tensor_strides[i];
-    }
-    it = internal::tensor_to_strides()
-             .insert_or_assign(tensor, std::move(strides))
-             .first;
-  }
-
-  // For 0D tensors, data() returns nullptr on empty vectors
-  if (it->second.empty()) {
-    static int64_t empty_strides_placeholder = 0;
-    *ret_strides = &empty_strides_placeholder;
-  } else {
-    *ret_strides = it->second.data();
-  }
-#endif
-  return Error::Ok;
-}
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr);
 
-inline AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) {
-  if (tensor == nullptr) {
-    return Error::InvalidArgument;
-  }
-  if (ret_dtype == nullptr) {
-    return Error::InvalidArgument;
-  }
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes);
 
-#ifdef CUDA_AVAILABLE
-  *ret_dtype = static_cast<int32_t>(tensor->dtype());
-#else
-  *ret_dtype = static_cast<int32_t>(tensor->scalar_type());
-#endif
-  return Error::Ok;
-}
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides);
 
-inline AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim) {
-  if (tensor == nullptr) {
-    return Error::InvalidArgument;
-  }
-  if (ret_dim == nullptr) {
-    return Error::InvalidArgument;
-  }
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype);
 
-  *ret_dim = static_cast<int64_t>(tensor->dim());
-  return Error::Ok;
-}
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
 
 } // namespace aoti
 } // namespace backends
```
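The declarations above rely on the `AOTI_SHIM_EXPORT` macro from `executorch/backends/aoti/export.h`, which this diff does not show. As an assumption for readers unfamiliar with it, export macros of this kind usually reduce to a symbol-visibility attribute, roughly along these lines (a sketch only, not the actual contents of export.h):

```cpp
// Sketch only - the real definition lives in executorch/backends/aoti/export.h.
#ifndef AOTI_SHIM_EXPORT
#if defined(_WIN32)
#define AOTI_SHIM_EXPORT __declspec(dllexport)
#else
#define AOTI_SHIM_EXPORT __attribute__((visibility("default")))
#endif
#endif
```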

backends/aoti/slim/c10/cuda/Exception.h

Lines changed: 2 additions & 0 deletions
```diff
@@ -19,12 +19,14 @@
 
 /// Checks a CUDA expression and aborts on error.
 /// @param EXPR The CUDA expression to check.
+#ifndef ET_CUDA_CHECK
 #define ET_CUDA_CHECK(EXPR)                                                 \
   do {                                                                      \
     const cudaError_t __err = EXPR;                                         \
     ET_CHECK_MSG(                                                           \
         __err == cudaSuccess, "CUDA error: %s", cudaGetErrorString(__err)); \
   } while (0)
+#endif
 
 /// Checks a CUDA expression and logs a warning on error (non-fatal).
 /// @param EXPR The CUDA expression to check.
```
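The new `#ifndef` guard makes the header safe to include alongside another definition of `ET_CUDA_CHECK`. For reference, a small usage sketch of the macro (the helper function is hypothetical; `cudaMemcpy` is the standard CUDA runtime call):

```cpp
#include <cuda_runtime.h>
#include <executorch/backends/aoti/slim/c10/cuda/Exception.h>

// Copy host data to a device buffer, aborting with a readable CUDA error
// message if the runtime call fails.
void copy_to_device(void* dst, const void* src, size_t nbytes) {
  ET_CUDA_CHECK(cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice));
}
```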

backends/aoti/targets.bzl

Lines changed: 6 additions & 6 deletions
```diff
@@ -87,20 +87,20 @@ def define_common_targets():
         ],
     )
 
-    # SlimTensor-based common shims (header-only library)
-    # The caller determines which tensor type is used by defining CUDA_AVAILABLE.
-    # - With CUDA_AVAILABLE=1: Uses SlimTensor
-    # - Without CUDA_AVAILABLE: Uses ETensor
+    # SlimTensor-based common shims library
+    # Uses SlimTensor for all tensor operations
     runtime.cxx_library(
         name = "common_shims_slim",
+        srcs = [
+            "common_shims_slim.cpp",
+        ],
         headers = [
             "common_shims_slim.h",
             "export.h",
         ],
         visibility = ["@EXECUTORCH_CLIENTS"],
-        deps = [
+        exported_deps = [
             "//executorch/runtime/core:core",
-            "//executorch/runtime/core/exec_aten:lib",
             "//executorch/backends/aoti/slim/core:slimtensor",
         ],
     )
```
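The switch from `deps` to `exported_deps` matters because `common_shims_slim.h` now includes `SlimTensor.h` in its public interface: in Buck, `exported_deps` propagates a dependency to everything that depends on this target, so clients get the SlimTensor headers transitively. A dependent translation unit then only needs the shim header (client code shown for illustration):

```cpp
// Client translation unit: only the shim header is included directly;
// SlimTensor.h arrives transitively through the exported dependency.
#include <executorch/backends/aoti/common_shims_slim.h>

using executorch::backends::aoti::Tensor; // SlimTensor under the hood
```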

backends/arm/_passes/decompose_add_sub_alpha_pass.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -1,4 +1,4 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -76,7 +76,11 @@ def call_operator(self, op, args, kwargs, meta, updated: bool | None = False):
         lhs, rhs = args
 
         alpha_full = super().call_operator(
-            full_op, ((1,), float(alpha)), {}, meta, updated=True
+            full_op,
+            ((1,), float(alpha)),
+            {"device": meta["val"].device},
+            meta,
+            updated=True,
         )
         scaled_rhs = super().call_operator(
             mul_op,
```