From 51504a8e909fbed001956b86b44f947e4ca90e16 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Mon, 12 Jan 2026 10:32:59 -0800
Subject: [PATCH 1/2] [slimtensor] Introduce Device and ScalarType headers for
 SlimTensor minimal support

Pull Request resolved: https://github.com/pytorch/executorch/pull/16382

This diff introduces the foundational c10 core headers for SlimTensor, a
lightweight tensor implementation used by torchnative, to the CUDA backend
runtime; they will later be used by all AOTI-driven backends, such as MPS.

We add:
- DeviceType.h - Device type enum (CPU only for now)
- Device.h - Device class representing the compute device location
- ScalarType.h - Scalar type enum with an elementSize() helper (Float only
  for now)

These headers are modeled after PyTorch's c10 but simplified for our needs.
The enum values are kept compatible with PyTorch for serialization
compatibility.

This is the first step in migrating SlimTensor to replace ETensor as the
internal tensor representation in the CUDA backend. Future diffs will add
Storage, the SlimTensor class, and additional dtypes/devices incrementally.

ghstack-source-id: 332983719
@exported-using-ghexport

Differential Revision: [D89747061](https://our.internmc.facebook.com/intern/diff/D89747061/)
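A quick usage sketch for reviewers (illustrative only, not part of this
diff; the include paths are assumed from the new file locations):

    #include <executorch/backends/aoti/slim/c10/core/Device.h>
    #include <executorch/backends/aoti/slim/c10/core/ScalarType.h>

    using namespace executorch::backends::aoti::slim::c10;

    Device d("cpu:0");                    // same as Device(DeviceType::CPU, 0)
    ScalarType t = kFloat;                // the only dtype supported so far
    size_t nbytes = 16 * elementSize(t);  // 16 floats -> 64 bytes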
---
 backends/aoti/slim/c10/core/Device.h         | 145 ++++++++++++++++++
 backends/aoti/slim/c10/core/DeviceType.h     |  66 ++++++++
 backends/aoti/slim/c10/core/ScalarType.h     |  83 ++++++++++
 backends/aoti/slim/c10/core/TARGETS          |   3 +
 backends/aoti/slim/c10/core/targets.bzl      |  52 +++++++
 backends/aoti/slim/c10/core/test/TARGETS     |   3 +
 backends/aoti/slim/c10/core/test/targets.bzl |  25 +++
 .../aoti/slim/c10/core/test/test_device.cpp  | 111 ++++++++++++++
 .../slim/c10/core/test/test_scalar_type.cpp  |  61 ++++++++
 9 files changed, 549 insertions(+)
 create mode 100644 backends/aoti/slim/c10/core/Device.h
 create mode 100644 backends/aoti/slim/c10/core/DeviceType.h
 create mode 100644 backends/aoti/slim/c10/core/ScalarType.h
 create mode 100644 backends/aoti/slim/c10/core/TARGETS
 create mode 100644 backends/aoti/slim/c10/core/targets.bzl
 create mode 100644 backends/aoti/slim/c10/core/test/TARGETS
 create mode 100644 backends/aoti/slim/c10/core/test/targets.bzl
 create mode 100644 backends/aoti/slim/c10/core/test/test_device.cpp
 create mode 100644 backends/aoti/slim/c10/core/test/test_scalar_type.cpp

diff --git a/backends/aoti/slim/c10/core/Device.h b/backends/aoti/slim/c10/core/Device.h
new file mode 100644
index 00000000000..5638f6f80e8
--- /dev/null
+++ b/backends/aoti/slim/c10/core/Device.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <ostream>
+#include <string>
+
+#include <executorch/backends/aoti/slim/c10/core/DeviceType.h>
+#include <executorch/runtime/platform/assert.h>
+
+namespace executorch::backends::aoti::slim::c10 {
+
+/// An index representing a specific device; e.g., the 1 in GPU 1.
+/// A DeviceIndex is not independently meaningful without knowing
+/// the DeviceType it is associated with; prefer using Device rather
+/// than DeviceIndex directly.
+using DeviceIndex = int8_t;
+
+/// Represents a compute device on which a tensor is located.
+/// A device is uniquely identified by a type (e.g., CPU) and a device index.
+struct Device final {
+  /// Constructs a new Device from a DeviceType and an optional device index.
+  /// @param type The type of device.
+  /// @param index The device index. For CPU, this should be -1 or 0.
+  /* implicit */
+  Device(DeviceType type, DeviceIndex index = -1)
+      : type_(type), index_(index) {
+    validate();
+  }
+
+  /// Constructs a Device from a string description.
+  /// The string must be "cpu" or "cpu:0".
+  /* implicit */ Device(const std::string& device_string)
+      : Device(DeviceType::CPU) {
+    ET_CHECK_MSG(!device_string.empty(), "Device string must not be empty");
+
+    if (device_string == "cpu" || device_string == "CPU") {
+      type_ = DeviceType::CPU;
+      index_ = -1;
+    } else if (device_string == "cpu:0" || device_string == "CPU:0") {
+      type_ = DeviceType::CPU;
+      index_ = static_cast<DeviceIndex>(device_string.back() - '0');
+    } else {
+      ET_CHECK_MSG(
+          false,
+          "Invalid device string: %s. Currently only 'cpu' is supported.",
+          device_string.c_str());
+    }
+    validate();
+  }
+
+  /// Returns true if the type and index of this Device match those of other.
+  bool operator==(const Device& other) const noexcept {
+    return this->type_ == other.type_ && this->index_ == other.index_;
+  }
+
+  /// Returns true if the type or index of this Device differs from that of
+  /// other.
+  bool operator!=(const Device& other) const noexcept {
+    return !(*this == other);
+  }
+
+  /// Sets the device index.
+  void set_index(DeviceIndex index) {
+    index_ = index;
+  }
+
+  /// Returns the type of device this is.
+  DeviceType type() const noexcept {
+    return type_;
+  }
+
+  /// Returns the device index.
+  DeviceIndex index() const noexcept {
+    return index_;
+  }
+
+  /// Returns true if the device has a non-default index.
+  bool has_index() const noexcept {
+    return index_ != -1;
+  }
+
+  /// Returns true if the device is of CPU type.
+  bool is_cpu() const noexcept {
+    return type_ == DeviceType::CPU;
+  }
+
+  /// Returns a string representation of the device (e.g., "cpu" or "cpu:0").
+  std::string str() const {
+    std::string str = DeviceTypeName(type(), /* lower_case */ true);
+    if (has_index()) {
+      str.push_back(':');
+      str.append(std::to_string(index()));
+    }
+    return str;
+  }
+
+ private:
+  DeviceType type_;
+  DeviceIndex index_ = -1;
+
+  void validate() {
+    ET_DCHECK_MSG(
+        index_ >= -1,
+        "Device index must be -1 or non-negative, got %d",
+        static_cast<int>(index_));
+    ET_DCHECK_MSG(
+        !is_cpu() || index_ <= 0,
+        "CPU device index must be -1 or zero, got %d",
+        static_cast<int>(index_));
+  }
+};
+
+inline std::ostream& operator<<(std::ostream& stream, const Device& device) {
+  stream << device.str();
+  return stream;
+}
+
+} // namespace executorch::backends::aoti::slim::c10
+
+namespace std {
+template <>
+struct hash<executorch::backends::aoti::slim::c10::Device> {
+  size_t operator()(
+      executorch::backends::aoti::slim::c10::Device d) const noexcept {
+    static_assert(
+        sizeof(executorch::backends::aoti::slim::c10::DeviceType) == 1,
+        "DeviceType is not 8-bit");
+    static_assert(
+        sizeof(executorch::backends::aoti::slim::c10::DeviceIndex) == 1,
+        "DeviceIndex is not 8-bit");
+    uint32_t bits = static_cast<uint32_t>(static_cast<uint8_t>(d.type()))
+            << 16 |
+        static_cast<uint32_t>(static_cast<uint8_t>(d.index()));
+    return std::hash<uint32_t>{}(bits);
+  }
+};
+} // namespace std
diff --git a/backends/aoti/slim/c10/core/DeviceType.h b/backends/aoti/slim/c10/core/DeviceType.h
new file mode 100644
index 00000000000..c8c36c7faab
--- /dev/null
+++ b/backends/aoti/slim/c10/core/DeviceType.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <ostream>
+#include <string>
+
+#include <executorch/runtime/platform/assert.h>
+
+namespace executorch::backends::aoti::slim::c10 {
+
+/// Enum representing the type of device.
+enum class DeviceType : int8_t {
+  CPU = 0,
+  COMPILE_TIME_MAX_DEVICE_TYPES = 1,
+};
+
+constexpr DeviceType kCPU = DeviceType::CPU;
+
+/// Maximum number of device types at compile time.
+constexpr int COMPILE_TIME_MAX_DEVICE_TYPES =
+    static_cast<int>(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES);
+
+/// Returns the name of the device type as a string.
+/// @param d The device type.
+/// @param lower_case If true, returns the name in lower case.
+/// @return The name of the device type.
+inline std::string DeviceTypeName(DeviceType d, bool lower_case = false) {
+  switch (d) {
+    case DeviceType::CPU:
+      return lower_case ? "cpu" : "CPU";
+    default:
+      ET_CHECK_MSG(false, "Unknown device type: %d", static_cast<int>(d));
+  }
+}
+
+/// Checks if the device type is valid.
+/// @param d The device type to check.
+/// @return true if the device type is valid, false otherwise.
+inline bool isValidDeviceType(DeviceType d) {
+  return d == DeviceType::CPU;
+}
+
+inline std::ostream& operator<<(std::ostream& stream, DeviceType type) {
+  stream << DeviceTypeName(type, /* lower_case */ true);
+  return stream;
+}
+
+} // namespace executorch::backends::aoti::slim::c10
+
+namespace std {
+template <>
+struct hash<executorch::backends::aoti::slim::c10::DeviceType> {
+  std::size_t operator()(
+      executorch::backends::aoti::slim::c10::DeviceType k) const {
+    return std::hash<int>()(static_cast<int>(k));
+  }
+};
+} // namespace std
diff --git a/backends/aoti/slim/c10/core/ScalarType.h b/backends/aoti/slim/c10/core/ScalarType.h
new file mode 100644
index 00000000000..1ca1a1429ed
--- /dev/null
+++ b/backends/aoti/slim/c10/core/ScalarType.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <ostream>
+
+#include <executorch/runtime/platform/assert.h>
+
+namespace executorch::backends::aoti::slim::c10 {
+
+/// Enum representing the scalar type (dtype) of tensor elements.
+/// Note: Enum values must match PyTorch's c10::ScalarType for compatibility.
+enum class ScalarType : int8_t {
+  // Byte = 0,
+  // Char = 1,
+  // Short = 2,
+  // Int = 3,
+  // Long = 4,
+  Float = 6,
+  // Bool = 11,
+  // BFloat16 = 15,
+  Undefined = -1,
+  NumOptions = 7,
+};
+
+/// Constant for the Float scalar type.
+constexpr ScalarType kFloat = ScalarType::Float;
+
+/// Returns the size in bytes of a single element of the given scalar type.
+/// @param t The scalar type.
+/// @return The size in bytes of a single element.
+inline size_t elementSize(ScalarType t) {
+  switch (t) {
+    case ScalarType::Float:
+      return sizeof(float);
+    default:
+      ET_CHECK_MSG(false, "Unknown ScalarType: %d", static_cast<int>(t));
+  }
+}
+
+/// Returns the name of the scalar type as a string.
+/// @param t The scalar type.
+/// @return The name of the scalar type.
+inline const char* toString(ScalarType t) {
+  switch (t) {
+    case ScalarType::Float:
+      return "Float";
+    case ScalarType::Undefined:
+      return "Undefined";
+    default:
+      return "UNKNOWN_SCALAR";
+  }
+}
+
+/// Checks if the scalar type is a floating point type.
+/// @param t The scalar type to check.
+/// @return true if the scalar type is floating point, false otherwise.
+inline bool isFloatingType(ScalarType t) {
+  return t == ScalarType::Float;
+}
+
+/// Checks if the scalar type is an integral type (optionally including Bool).
+/// @param t The scalar type to check.
+/// @param includeBool Whether to consider Bool as integral.
+/// @return true if the scalar type is integral, false otherwise.
+inline bool isIntegralType(ScalarType t, bool /*includeBool*/) {
+  (void)t;
+  return false;
+}
+
+inline std::ostream& operator<<(std::ostream& stream, ScalarType scalar_type) {
+  return stream << toString(scalar_type);
+}
+
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/TARGETS b/backends/aoti/slim/c10/core/TARGETS
new file mode 100644
index 00000000000..77871de4469
--- /dev/null
+++ b/backends/aoti/slim/c10/core/TARGETS
@@ -0,0 +1,3 @@
+load("targets.bzl", "define_common_targets")
+
+define_common_targets()
diff --git a/backends/aoti/slim/c10/core/targets.bzl b/backends/aoti/slim/c10/core/targets.bzl
new file mode 100644
index 00000000000..9b7d1259df0
--- /dev/null
+++ b/backends/aoti/slim/c10/core/targets.bzl
@@ -0,0 +1,52 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Define targets for SlimTensor c10 core module."""
+
+    # Header-only library for DeviceType
+    runtime.cxx_library(
+        name = "device_type",
+        headers = [
+            "DeviceType.h",
+        ],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        exported_deps = [
+            "//executorch/runtime/platform:platform",
+        ],
+    )
+
+    # Header-only library for Device
+    runtime.cxx_library(
+        name = "device",
+        headers = [
+            "Device.h",
+        ],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        exported_deps = [
+            ":device_type",
+            "//executorch/runtime/platform:platform",
+        ],
+    )
+
+    # Header-only library for ScalarType
+    runtime.cxx_library(
+        name = "scalar_type",
+        headers = [
+            "ScalarType.h",
+        ],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        exported_deps = [
+            "//executorch/runtime/platform:platform",
+        ],
+    )
+
+    # Combined c10 core library
+    runtime.cxx_library(
+        name = "core",
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        exported_deps = [
+            ":device",
+            ":device_type",
+            ":scalar_type",
+        ],
+    )
diff --git a/backends/aoti/slim/c10/core/test/TARGETS b/backends/aoti/slim/c10/core/test/TARGETS
new file mode 100644
index 00000000000..77871de4469
--- /dev/null
+++ b/backends/aoti/slim/c10/core/test/TARGETS
@@ -0,0 +1,3 @@
+load("targets.bzl", "define_common_targets")
+
+define_common_targets()
diff --git a/backends/aoti/slim/c10/core/test/targets.bzl b/backends/aoti/slim/c10/core/test/targets.bzl
new file mode 100644
index 00000000000..f7abf59a273
--- /dev/null
+++ b/backends/aoti/slim/c10/core/test/targets.bzl
@@ -0,0 +1,25 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Define test targets for SlimTensor c10 core module."""
+
+    runtime.cxx_test(
+        name = "test_device",
+        srcs = [
+            "test_device.cpp",
+        ],
+        deps = [
+            "//executorch/backends/aoti/slim/c10/core:device",
+            "//executorch/backends/aoti/slim/c10/core:device_type",
+        ],
+    )
+
+    runtime.cxx_test(
+        name = "test_scalar_type",
+        srcs = [
+            "test_scalar_type.cpp",
+        ],
+        deps = [
+            "//executorch/backends/aoti/slim/c10/core:scalar_type",
+        ],
+    )
diff --git a/backends/aoti/slim/c10/core/test/test_device.cpp b/backends/aoti/slim/c10/core/test/test_device.cpp
new file mode 100644
index 00000000000..57123589775
--- /dev/null
+++ b/backends/aoti/slim/c10/core/test/test_device.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/aoti/slim/c10/core/Device.h>
+#include <executorch/backends/aoti/slim/c10/core/DeviceType.h>
+
+#include <gtest/gtest.h>
+
+using namespace executorch::backends::aoti::slim::c10;
+
+class DeviceTypeTest : public ::testing::Test {};
+
+TEST_F(DeviceTypeTest, CPUEnumValue) {
+  // Verify CPU has the correct enum value (0)
+  EXPECT_EQ(static_cast<int>(DeviceType::CPU), 0);
+}
+
+TEST_F(DeviceTypeTest, DeviceTypeName) {
+  // Verify DeviceTypeName returns correct strings
+  EXPECT_EQ(DeviceTypeName(DeviceType::CPU, false), "CPU");
+  EXPECT_EQ(DeviceTypeName(DeviceType::CPU, true), "cpu");
+}
+
+TEST_F(DeviceTypeTest, IsValidDeviceType) {
+  // Verify isValidDeviceType works correctly
+  EXPECT_TRUE(isValidDeviceType(DeviceType::CPU));
+}
+
+TEST_F(DeviceTypeTest, KCPUConstant) {
+  // Verify kCPU constant
+  EXPECT_EQ(kCPU, DeviceType::CPU);
+}
+
+class DeviceTest : public ::testing::Test {};
+
+TEST_F(DeviceTest, ConstructFromDeviceType) {
+  // Construct Device from DeviceType
+  Device cpu_device(DeviceType::CPU);
+
+  EXPECT_TRUE(cpu_device.is_cpu());
+  EXPECT_EQ(cpu_device.type(), DeviceType::CPU);
+  EXPECT_EQ(cpu_device.index(), -1); // Default index
+  EXPECT_FALSE(cpu_device.has_index());
+}
+
+TEST_F(DeviceTest, ConstructWithIndex) {
+  // Construct Device with explicit index
+  Device cpu_device(DeviceType::CPU, 0);
+
+  EXPECT_TRUE(cpu_device.is_cpu());
+  EXPECT_EQ(cpu_device.type(), DeviceType::CPU);
+  EXPECT_EQ(cpu_device.index(), 0);
+  EXPECT_TRUE(cpu_device.has_index());
+}
+
+TEST_F(DeviceTest, ConstructFromString) {
+  // Construct Device from string
+  Device cpu1("cpu");
+  EXPECT_TRUE(cpu1.is_cpu());
+  EXPECT_EQ(cpu1.index(), -1);
+
+  Device cpu2("CPU");
+  EXPECT_TRUE(cpu2.is_cpu());
+  EXPECT_EQ(cpu2.index(), -1);
+
+  Device cpu3("cpu:0");
+  EXPECT_TRUE(cpu3.is_cpu());
+  EXPECT_EQ(cpu3.index(), 0);
+}
+
+TEST_F(DeviceTest, Equality) {
+  Device cpu1(DeviceType::CPU, 0);
+  Device cpu2(DeviceType::CPU, 0);
+  Device cpu3(DeviceType::CPU, -1);
+
+  EXPECT_EQ(cpu1, cpu2);
+  EXPECT_NE(cpu1, cpu3);
+}
+
+TEST_F(DeviceTest, Str) {
+  Device cpu1(DeviceType::CPU);
+  EXPECT_EQ(cpu1.str(), "cpu");
+
+  Device cpu2(DeviceType::CPU, 0);
+  EXPECT_EQ(cpu2.str(), "cpu:0");
+}
+
+TEST_F(DeviceTest, SetIndex) {
+  Device cpu(DeviceType::CPU);
+  EXPECT_EQ(cpu.index(), -1);
+
+  cpu.set_index(0);
+  EXPECT_EQ(cpu.index(), 0);
+  EXPECT_TRUE(cpu.has_index());
+}
+
+TEST_F(DeviceTest, Hash) {
+  // Verify Device can be hashed (for use in unordered containers)
+  Device cpu1(DeviceType::CPU, 0);
+  Device cpu2(DeviceType::CPU, 0);
+  Device cpu3(DeviceType::CPU, -1);
+
+  std::hash<Device> hasher;
+  EXPECT_EQ(hasher(cpu1), hasher(cpu2));
+  EXPECT_NE(hasher(cpu1), hasher(cpu3));
+}
diff --git a/backends/aoti/slim/c10/core/test/test_scalar_type.cpp b/backends/aoti/slim/c10/core/test/test_scalar_type.cpp
new file mode 100644
index 00000000000..673641d84c7
--- /dev/null
+++ b/backends/aoti/slim/c10/core/test/test_scalar_type.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/aoti/slim/c10/core/ScalarType.h>
+
+#include <gtest/gtest.h>
+#include <sstream>
+
+using namespace executorch::backends::aoti::slim::c10;
+
+class ScalarTypeTest : public ::testing::Test {};
+
+TEST_F(ScalarTypeTest, FloatEnumValue) {
+  // Verify Float has the correct enum value (6) to match PyTorch
+  EXPECT_EQ(static_cast<int>(ScalarType::Float), 6);
+}
+
+TEST_F(ScalarTypeTest, KFloatConstant) {
+  // Verify kFloat constant
+  EXPECT_EQ(kFloat, ScalarType::Float);
+}
+
+TEST_F(ScalarTypeTest, ElementSizeFloat) {
+  // Verify elementSize returns the correct size for Float (4 bytes)
+  EXPECT_EQ(elementSize(ScalarType::Float), sizeof(float));
+  EXPECT_EQ(elementSize(ScalarType::Float), 4);
+}
+
+TEST_F(ScalarTypeTest, ToStringFloat) {
+  // Verify toString returns the correct string for Float
+  EXPECT_STREQ(toString(ScalarType::Float), "Float");
+}
+
+TEST_F(ScalarTypeTest, ToStringUndefined) {
+  // Verify toString returns the correct string for Undefined
+  EXPECT_STREQ(toString(ScalarType::Undefined), "Undefined");
+}
+
+TEST_F(ScalarTypeTest, IsFloatingType) {
+  // Verify isFloatingType works correctly
+  EXPECT_TRUE(isFloatingType(ScalarType::Float));
+}
+
+TEST_F(ScalarTypeTest, IsIntegralType) {
+  // Verify isIntegralType works correctly
+  // Currently no integral types are supported, so Float should return false
+  EXPECT_FALSE(isIntegralType(ScalarType::Float, false));
+  EXPECT_FALSE(isIntegralType(ScalarType::Float, true));
+}
+
+TEST_F(ScalarTypeTest, StreamOperator) {
+  // Verify the stream operator works
+  std::ostringstream oss;
+  oss << ScalarType::Float;
+  EXPECT_EQ(oss.str(), "Float");
+}

From fc492d73279737599716218318543be40063edb5 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Mon, 12 Jan 2026 10:33:01 -0800
Subject: [PATCH 2/2] [slimtensor] Storage and SharedPtr for CPU owning mode

Pull Request resolved: https://github.com/pytorch/executorch/pull/16383

This diff introduces the foundation for tensor storage management in the
SlimTensor migration:

- util/SharedPtr.h - A lightweight, non-thread-safe shared pointer
  optimized for single-threaded tensor operations.
- core/Storage.h - The MaybeOwningStorage class that manages tensor data
  memory:
  - DeviceTraits<CPU> specialization with allocate(), free(), and memcpy()
    implemented via malloc/free
  - Owning mode for the CPU device (CUDA and non-owning mode will be added
    later)
  - Storage type alias for SharedPtr<MaybeOwningStorage>
  - Move semantics for efficient resource transfer
  - clone() and copy_() methods for data management

ghstack-source-id: 332983720
@exported-using-ghexport

Differential Revision: [D89747980](https://our.internmc.facebook.com/intern/diff/D89747980/)
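A usage sketch for reviewers (illustrative only, not part of this diff):

    #include <executorch/backends/aoti/slim/core/Storage.h>

    using namespace executorch::backends::aoti::slim;

    // 64 floats of owning CPU storage, shared by two handles.
    Storage a = make_shared<MaybeOwningStorage>(CPU_DEVICE, 64 * sizeof(float));
    Storage b = a;  // non-atomic refcount bump; no data copy
    // Now a.use_count() == 2 and a->data() == b->data().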
---
 backends/aoti/slim/core/Storage.h             | 245 +++++++++++++++++
 backends/aoti/slim/core/TARGETS               |   3 +
 backends/aoti/slim/core/targets.bzl           |  19 ++
 backends/aoti/slim/core/test/TARGETS          |   3 +
 backends/aoti/slim/core/test/targets.bzl      |  14 +
 backends/aoti/slim/core/test/test_storage.cpp | 259 ++++++++++++++++++
 backends/aoti/slim/util/SharedPtr.h           | 192 +++++++++++++
 backends/aoti/slim/util/TARGETS               |   3 +
 backends/aoti/slim/util/targets.bzl           |  16 ++
 9 files changed, 754 insertions(+)
 create mode 100644 backends/aoti/slim/core/Storage.h
 create mode 100644 backends/aoti/slim/core/TARGETS
 create mode 100644 backends/aoti/slim/core/targets.bzl
 create mode 100644 backends/aoti/slim/core/test/TARGETS
 create mode 100644 backends/aoti/slim/core/test/targets.bzl
 create mode 100644 backends/aoti/slim/core/test/test_storage.cpp
 create mode 100644 backends/aoti/slim/util/SharedPtr.h
 create mode 100644 backends/aoti/slim/util/TARGETS
 create mode 100644 backends/aoti/slim/util/targets.bzl

diff --git a/backends/aoti/slim/core/Storage.h b/backends/aoti/slim/core/Storage.h
new file mode 100644
index 00000000000..ed8bdf88b49
--- /dev/null
+++ b/backends/aoti/slim/core/Storage.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdlib>
+#include <cstring>
+
+#include <executorch/backends/aoti/slim/c10/core/Device.h>
+#include <executorch/backends/aoti/slim/c10/core/ScalarType.h>
+#include <executorch/backends/aoti/slim/util/SharedPtr.h>
+#include <executorch/runtime/platform/assert.h>
+
+namespace executorch::backends::aoti::slim {
+
+/// Type alias for deleter function pointer.
+using DeleterFn = void (*)(void*);
+
+namespace detail {
+/// No-op deleter for non-owning storage.
+inline void noop(void*) {}
+} // namespace detail
+
+/// Default CPU device constant.
+inline const c10::Device CPU_DEVICE = c10::Device(c10::DeviceType::CPU, 0);
+
+/// DeviceTraits template for device-specific operations.
+/// Device-specific specializations provide allocate(), free(), and memcpy().
+template <c10::DeviceType D>
+struct DeviceTraits;
+
+/// CPU specialization of DeviceTraits.
+/// Provides CPU memory allocation and copy operations using
+/// malloc/free/memcpy.
+template <>
+struct DeviceTraits<c10::DeviceType::CPU> {
+  /// Allocates CPU memory using malloc.
+  /// @param nbytes Number of bytes to allocate.
+  /// @param device The target device (unused for CPU).
+  /// @return Pointer to the allocated memory.
+  static void* allocate(
+      size_t nbytes,
+      const c10::Device& device = CPU_DEVICE) {
+    (void)device;
+    // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
+    return malloc(nbytes);
+  }
+
+  /// Frees CPU memory using free.
+  /// @param ptr Pointer to the memory to free.
+  static void free(void* ptr) {
+    // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
+    std::free(ptr);
+  }
+
+  /// Copies memory between CPU locations.
+  /// @param dst Destination pointer.
+  /// @param src Source pointer.
+  /// @param nbytes Number of bytes to copy.
+  /// @param dst_device Destination device (unused for CPU-to-CPU).
+  /// @param src_device Source device (unused for CPU-to-CPU).
+  static void memcpy(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      const c10::Device& dst_device,
+      const c10::Device& src_device) {
+    (void)dst_device;
+    (void)src_device;
+    std::memcpy(dst, src, nbytes);
+  }
+};
+
+/**
+ * MaybeOwningStorage - A storage class that manages tensor data memory.
+ *
+ * This class provides owning memory storage for tensor data on CPU.
+ * Owning storage allocates and manages its own memory, freeing it upon
+ * destruction.
+ *
+ * Current limitations:
+ * - CPU device only
+ * - Owning mode only
+ * Future diffs will add support for non-owning storage and other devices.
+ *
+ * Thread Safety: NOT THREAD-SAFE
+ * - Uses the non-atomic SharedPtr for reference counting
+ * - Must only be used in single-threaded contexts
+ */
+class MaybeOwningStorage {
+ public:
+  /// Constructs owning storage with allocated memory.
+  /// @param device The device for storage (must be CPU).
+  /// @param nbytes Number of bytes to allocate.
+  MaybeOwningStorage(const c10::Device& device, size_t nbytes)
+      : device_(device), capacity_(nbytes), is_owning_(true) {
+    ET_CHECK_MSG(
+        device.is_cpu(),
+        "Only CPU device is currently supported, got: %s",
+        device.str().c_str());
+
+    data_ = DeviceTraits<c10::DeviceType::CPU>::allocate(nbytes, device);
+    deleter_ = DeviceTraits<c10::DeviceType::CPU>::free;
+  }
+
+  /// Default constructor is deleted - storage must have a device.
+  MaybeOwningStorage() = delete;
+
+  /// Copy constructor is deleted - use SharedPtr for shared ownership.
+  MaybeOwningStorage(const MaybeOwningStorage&) = delete;
+
+  /// Copy assignment is deleted - use SharedPtr for shared ownership.
+  MaybeOwningStorage& operator=(const MaybeOwningStorage&) = delete;
+
+  /// Move constructor.
+  MaybeOwningStorage(MaybeOwningStorage&& other) noexcept
+      : device_(other.device_),
+        data_(other.data_),
+        capacity_(other.capacity_),
+        deleter_(other.deleter_),
+        is_owning_(other.is_owning_) {
+    other.data_ = nullptr;
+    other.capacity_ = 0;
+    other.deleter_ = detail::noop;
+    other.is_owning_ = false;
+  }
+
+  /// Move assignment operator.
+  MaybeOwningStorage& operator=(MaybeOwningStorage&& other) noexcept {
+    if (this != &other) {
+      free_data();
+
+      device_ = other.device_;
+      data_ = other.data_;
+      capacity_ = other.capacity_;
+      deleter_ = other.deleter_;
+      is_owning_ = other.is_owning_;
+
+      other.data_ = nullptr;
+      other.capacity_ = 0;
+      other.deleter_ = detail::noop;
+      other.is_owning_ = false;
+    }
+    return *this;
+  }
+
+  /// Destructor - frees owned memory.
+  ~MaybeOwningStorage() {
+    free_data();
+  }
+
+  /// Copies data between storage locations.
+  /// @param dst_data_ptr Destination data pointer.
+  /// @param src_data_ptr Source data pointer.
+  /// @param nbytes Number of bytes to copy.
+  /// @param src_device Source device.
+  void copy_(
+      void* dst_data_ptr,
+      void* src_data_ptr,
+      size_t nbytes,
+      const c10::Device& src_device) {
+    ET_CHECK_MSG(
+        dst_data_ptr, "Storage copy failed: dst_data_ptr cannot be nullptr");
+    ET_CHECK_MSG(
+        src_data_ptr, "Storage copy failed: src_data_ptr cannot be nullptr");
+
+    if (dst_data_ptr == src_data_ptr) {
+      return;
+    }
+
+    ET_CHECK_MSG(
+        device_.is_cpu() && src_device.is_cpu(),
+        "Only CPU-to-CPU copy is currently supported");
+
+    DeviceTraits<c10::DeviceType::CPU>::memcpy(
+        dst_data_ptr, src_data_ptr, nbytes, device_, src_device);
+  }
+
+  /// Creates a clone of this storage on the specified device.
+  /// @param device Target device for the clone (must be CPU).
+  /// @return A new MaybeOwningStorage with copied data.
+  MaybeOwningStorage clone(const c10::Device& device) const {
+    ET_CHECK_MSG(data_, "Storage clone failed: source data cannot be nullptr");
+    ET_CHECK_MSG(
+        device.is_cpu(), "Only CPU device is currently supported for clone");
+
+    MaybeOwningStorage cloned_storage(device, capacity_);
+
+    DeviceTraits<c10::DeviceType::CPU>::memcpy(
+        cloned_storage.data_, data_, capacity_, device, device_);
+
+    return cloned_storage;
+  }
+
+  /// Returns the data pointer, or nullptr for zero-sized storage.
+  void* data() const {
+    if (capacity_ == 0) {
+      return nullptr;
+    }
+    return data_;
+  }
+
+  /// Returns the device this storage is on.
+  const c10::Device& device() const {
+    return device_;
+  }
+
+  /// Returns the capacity in bytes.
+  size_t nbytes() const {
+    return capacity_;
+  }
+
+  /// Returns true if this storage owns its memory.
+  bool is_owning() const {
+    return is_owning_;
+  }
+
+  /// Returns true if the storage can be resized (must be owning).
+  bool is_resizable() const {
+    return is_owning_;
+  }
+
+ private:
+  c10::Device device_ = CPU_DEVICE;
+  void* data_ = nullptr;
+  size_t capacity_ = 0;
+  DeleterFn deleter_ = detail::noop;
+  bool is_owning_ = false;
+
+  /// Frees the data if non-null.
+  void free_data() {
+    if (data_ != nullptr) {
+      deleter_(data_);
+      data_ = nullptr;
+    }
+  }
+};
+
+/// Storage is a shared pointer to MaybeOwningStorage.
+/// Multiple tensors can share the same underlying storage.
+using Storage = SharedPtr<MaybeOwningStorage>;
+
+} // namespace executorch::backends::aoti::slim
diff --git a/backends/aoti/slim/core/TARGETS b/backends/aoti/slim/core/TARGETS
new file mode 100644
index 00000000000..77871de4469
--- /dev/null
+++ b/backends/aoti/slim/core/TARGETS
@@ -0,0 +1,3 @@
+load("targets.bzl", "define_common_targets")
+
+define_common_targets()
diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl
new file mode 100644
index 00000000000..12de67bf8b1
--- /dev/null
+++ b/backends/aoti/slim/core/targets.bzl
@@ -0,0 +1,19 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Define targets for SlimTensor core module."""
+
+    # Header-only library for Storage
+    runtime.cxx_library(
+        name = "storage",
+        headers = [
+            "Storage.h",
+        ],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        exported_deps = [
+            "//executorch/backends/aoti/slim/c10/core:device",
+            "//executorch/backends/aoti/slim/c10/core:scalar_type",
+            "//executorch/backends/aoti/slim/util:shared_ptr",
+            "//executorch/runtime/platform:platform",
+        ],
+    )
diff --git a/backends/aoti/slim/core/test/TARGETS b/backends/aoti/slim/core/test/TARGETS
new file mode 100644
index 00000000000..77871de4469
--- /dev/null
+++ b/backends/aoti/slim/core/test/TARGETS
@@ -0,0 +1,3 @@
+load("targets.bzl", "define_common_targets")
+
+define_common_targets()
diff --git a/backends/aoti/slim/core/test/targets.bzl b/backends/aoti/slim/core/test/targets.bzl
new file mode 100644
index 00000000000..8e580f5ed0e
--- /dev/null
+++ b/backends/aoti/slim/core/test/targets.bzl
@@ -0,0 +1,14 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Define test targets for SlimTensor core module."""
+
+    runtime.cxx_test(
+        name = "test_storage",
+        srcs = [
+            "test_storage.cpp",
+        ],
+        deps = [
+            "//executorch/backends/aoti/slim/core:storage",
+        ],
+    )
diff --git a/backends/aoti/slim/core/test/test_storage.cpp b/backends/aoti/slim/core/test/test_storage.cpp
new file mode 100644
index 00000000000..42a8678c888
--- /dev/null
+++ b/backends/aoti/slim/core/test/test_storage.cpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/aoti/slim/core/Storage.h>
+
+#include <gtest/gtest.h>
+
+namespace executorch::backends::aoti::slim {
+
+// =============================================================================
+// DeviceTraits Tests
+// =============================================================================
+
+TEST(DeviceTraitsCPUTest, AllocateAndFree) {
+  constexpr size_t kSize = 1024;
+  void* ptr = DeviceTraits<c10::DeviceType::CPU>::allocate(kSize);
+  ASSERT_NE(ptr, nullptr);
+
+  DeviceTraits<c10::DeviceType::CPU>::free(ptr);
+}
+
+TEST(DeviceTraitsCPUTest, AllocateZeroBytes) {
+  void* ptr = DeviceTraits<c10::DeviceType::CPU>::allocate(0);
+  DeviceTraits<c10::DeviceType::CPU>::free(ptr);
+}
+
+TEST(DeviceTraitsCPUTest, MemcpyCPUToCPU) {
+  constexpr size_t kSize = 256;
+  float* src = static_cast<float*>(
+      DeviceTraits<c10::DeviceType::CPU>::allocate(kSize * sizeof(float)));
+  float* dst = static_cast<float*>(
+      DeviceTraits<c10::DeviceType::CPU>::allocate(kSize * sizeof(float)));
+
+  for (size_t i = 0; i < kSize; ++i) {
+    src[i] = static_cast<float>(i) * 1.5f;
+  }
+
+  DeviceTraits<c10::DeviceType::CPU>::memcpy(
+      dst, src, kSize * sizeof(float), CPU_DEVICE, CPU_DEVICE);
+
+  for (size_t i = 0; i < kSize; ++i) {
+    EXPECT_FLOAT_EQ(dst[i], static_cast<float>(i) * 1.5f);
+  }
+
+  DeviceTraits<c10::DeviceType::CPU>::free(src);
+  DeviceTraits<c10::DeviceType::CPU>::free(dst);
+}
+
+// =============================================================================
+// MaybeOwningStorage Tests - Owning Mode
+// =============================================================================
+
+TEST(MaybeOwningStorageTest, ConstructOwning) {
+  constexpr size_t kNbytes = 512;
+  MaybeOwningStorage storage(CPU_DEVICE, kNbytes);
+
+  EXPECT_NE(storage.data(), nullptr);
+  EXPECT_EQ(storage.nbytes(), kNbytes);
+  EXPECT_TRUE(storage.device().is_cpu());
+  EXPECT_TRUE(storage.is_owning());
+  EXPECT_TRUE(storage.is_resizable());
+}
+
+TEST(MaybeOwningStorageTest, ConstructOwningZeroBytes) {
+  MaybeOwningStorage storage(CPU_DEVICE, 0);
+
+  EXPECT_EQ(storage.data(), nullptr);
+  EXPECT_EQ(storage.nbytes(), 0);
+  EXPECT_TRUE(storage.device().is_cpu());
+  EXPECT_TRUE(storage.is_owning());
+}
+
+TEST(MaybeOwningStorageTest, DataPersistence) {
+  constexpr size_t kNumFloats = 64;
+  constexpr size_t kNbytes = kNumFloats * sizeof(float);
+  MaybeOwningStorage storage(CPU_DEVICE, kNbytes);
+
+  float* data = static_cast<float*>(storage.data());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    data[i] = static_cast<float>(i) * 2.0f;
+  }
+
+  float* read_data = static_cast<float*>(storage.data());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(read_data[i], static_cast<float>(i) * 2.0f);
+  }
+}
+
+TEST(MaybeOwningStorageTest, MoveConstruct) {
+  constexpr size_t kNbytes = 256;
+  MaybeOwningStorage original(CPU_DEVICE, kNbytes);
+  void* original_data = original.data();
+
+  MaybeOwningStorage moved(std::move(original));
+
+  EXPECT_EQ(moved.data(), original_data);
+  EXPECT_EQ(moved.nbytes(), kNbytes);
+  EXPECT_TRUE(moved.is_owning());
+
+  EXPECT_EQ(original.data(), nullptr);
+  EXPECT_EQ(original.nbytes(), 0);
+  EXPECT_FALSE(original.is_owning());
+}
+
+TEST(MaybeOwningStorageTest, MoveAssign) {
+  constexpr size_t kNbytes1 = 256;
+  constexpr size_t kNbytes2 = 512;
+  MaybeOwningStorage storage1(CPU_DEVICE, kNbytes1);
+  MaybeOwningStorage storage2(CPU_DEVICE, kNbytes2);
+  void* storage2_data = storage2.data();
+
+  storage1 = std::move(storage2);
+
+  EXPECT_EQ(storage1.data(), storage2_data);
+  EXPECT_EQ(storage1.nbytes(), kNbytes2);
+  EXPECT_TRUE(storage1.is_owning());
+
+  EXPECT_EQ(storage2.data(), nullptr);
+  EXPECT_EQ(storage2.nbytes(), 0);
+  EXPECT_FALSE(storage2.is_owning());
+}
+
+TEST(MaybeOwningStorageTest, Clone) {
+  constexpr size_t kNumFloats = 32;
+  constexpr size_t kNbytes = kNumFloats * sizeof(float);
+  MaybeOwningStorage original(CPU_DEVICE, kNbytes);
+
+  float* data = static_cast<float*>(original.data());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    data[i] = static_cast<float>(i) * 3.0f;
+  }
+
+  MaybeOwningStorage cloned = original.clone(CPU_DEVICE);
+
+  EXPECT_NE(cloned.data(), original.data());
+  EXPECT_EQ(cloned.nbytes(), original.nbytes());
+  EXPECT_TRUE(cloned.is_owning());
+
+  float* cloned_data = static_cast<float*>(cloned.data());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(cloned_data[i], static_cast<float>(i) * 3.0f);
+  }
+
+  data[0] = 999.0f;
+  EXPECT_FLOAT_EQ(cloned_data[0], 0.0f);
+}
+
+TEST(MaybeOwningStorageTest, CopyFunction) {
+  constexpr size_t kNumFloats = 16;
+  constexpr size_t kNbytes = kNumFloats * sizeof(float);
+  MaybeOwningStorage src_storage(CPU_DEVICE, kNbytes);
+  MaybeOwningStorage dst_storage(CPU_DEVICE, kNbytes);
+
+  float* src_data = static_cast<float*>(src_storage.data());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    src_data[i] = static_cast<float>(i) + 0.5f;
+  }
+
+  dst_storage.copy_(
+      dst_storage.data(), src_storage.data(), kNbytes, CPU_DEVICE);
+
+  float* dst_data = static_cast<float*>(dst_storage.data());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(dst_data[i], static_cast<float>(i) + 0.5f);
+  }
+}
+
+// =============================================================================
+// Storage (SharedPtr<MaybeOwningStorage>) Tests
+// =============================================================================
+
+TEST(StorageSharedPtrTest, BasicUsage) {
+  constexpr size_t kNbytes = 128;
+  Storage storage(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
+
+  EXPECT_NE(storage.get(), nullptr);
+  EXPECT_NE(storage->data(), nullptr);
+  EXPECT_EQ(storage->nbytes(), kNbytes);
+  EXPECT_TRUE(storage->device().is_cpu());
+  EXPECT_EQ(storage.use_count(), 1);
+}
+
+TEST(StorageSharedPtrTest, SharedOwnership) {
+  constexpr size_t kNbytes = 128;
+  Storage storage1(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
+  void* data_ptr = storage1->data();
+
+  Storage storage2 = storage1; // Copy, not reference - increments ref count
+
+  EXPECT_EQ(storage1.use_count(), 2);
+  EXPECT_EQ(storage2.use_count(), 2);
+  EXPECT_EQ(storage1->data(), storage2->data());
+  EXPECT_EQ(storage2->data(), data_ptr);
+}
+
+TEST(StorageSharedPtrTest, SharedOwnershipModification) {
+  constexpr size_t kNumFloats = 8;
+  constexpr size_t kNbytes = kNumFloats * sizeof(float);
+  Storage storage1(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
+
+  float* data = static_cast<float*>(storage1->data());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    data[i] = 0.0f;
+  }
+
+  const Storage& storage2 = storage1;
+
+  float* data2 = static_cast<float*>(storage2->data());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    data2[i] = static_cast<float>(i) * 10.0f;
+  }
+
+  float* data1 = static_cast<float*>(storage1->data());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(data1[i], static_cast<float>(i) * 10.0f);
+  }
+}
+
+TEST(StorageSharedPtrTest, ReferenceCountDecrement) {
+  constexpr size_t kNbytes = 64;
+  Storage storage1(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
+  EXPECT_EQ(storage1.use_count(), 1);
+
+  {
+    Storage storage2 = storage1; // Copy increments ref count
+    EXPECT_EQ(storage1.use_count(), 2);
+  } // storage2 destroyed, ref count decrements
+
+  EXPECT_EQ(storage1.use_count(), 1);
+}
+
+TEST(StorageSharedPtrTest, MoveSemantics) {
+  constexpr size_t kNbytes = 64;
+  Storage storage1(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
+  void* data_ptr = storage1->data();
+
+  Storage storage2 = std::move(storage1);
+
+  EXPECT_EQ(storage1.get(), nullptr);
+  EXPECT_EQ(storage2->data(), data_ptr);
+  EXPECT_EQ(storage2.use_count(), 1);
+}
+
+TEST(StorageSharedPtrTest, MakeShared) {
+  constexpr size_t kNbytes = 256;
+  Storage storage = make_shared<MaybeOwningStorage>(CPU_DEVICE, kNbytes);
+
+  EXPECT_NE(storage.get(), nullptr);
+  EXPECT_NE(storage->data(), nullptr);
+  EXPECT_EQ(storage->nbytes(), kNbytes);
+  EXPECT_EQ(storage.use_count(), 1);
+}
+
+} // namespace executorch::backends::aoti::slim
diff --git a/backends/aoti/slim/util/SharedPtr.h b/backends/aoti/slim/util/SharedPtr.h
new file mode 100644
index 00000000000..e4e439ee4cb
--- /dev/null
+++ b/backends/aoti/slim/util/SharedPtr.h
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <utility>
+
+#include <executorch/runtime/platform/assert.h>
+
+namespace executorch::backends::aoti::slim {
+
+/**
+ * SharedPtr - A lightweight shared pointer implementation optimized for
+ * single-threaded execution contexts.
+ *
+ * This class provides shared ownership semantics similar to std::shared_ptr,
+ * but without atomic operations, making it faster in single-threaded
+ * contexts. ExecuTorch AOTI-driven backends operate in a single-threaded
+ * context, so this optimization is safe and provides better performance.
+ *
+ * Primary Use Cases:
+ * 1. Intermediate SlimTensor Storage Management:
+ *    - Manages temporary tensors created during model execution
+ *    - Avoids the overhead of atomic reference counting in std::shared_ptr
+ *
+ * 2. Input/Output Tensor References:
+ *    - Provides reference counting for input/output tensors
+ *    - Uses dummy deleters to prevent premature deallocation when needed
+ */
+template <typename T>
+class SharedPtr {
+ private:
+  struct ControlBlock {
+    int count = 1;
+    T* ptr;
+    using Deleter = void (*)(T*);
+    Deleter deleter;
+
+    ControlBlock(T* p, Deleter d) : ptr(p), deleter(d) {}
+    ControlBlock(const ControlBlock&) = delete;
+    ControlBlock& operator=(const ControlBlock&) = delete;
+    ControlBlock(ControlBlock&&) = delete;
+    ControlBlock& operator=(ControlBlock&&) = delete;
+
+    ~ControlBlock() {
+      if (ptr) {
+        deleter(ptr);
+      }
+    }
+  };
+
+  ControlBlock* cb_;
+
+  static void default_deleter(T* p) {
+    delete p;
+  }
+
+  void cleanup() {
+    if (cb_ && --cb_->count == 0) {
+      delete cb_;
+    }
+    cb_ = nullptr;
+  }
+
+ public:
+  /// Default constructor - creates an empty shared pointer.
+  SharedPtr() noexcept : cb_(nullptr) {}
+
+  /// Constructor from a raw pointer.
+  explicit SharedPtr(T* p, typename ControlBlock::Deleter d = default_deleter)
+      : cb_(p ? new ControlBlock(p, d) : nullptr) {}
+
+  /// Copy constructor.
+  SharedPtr(const SharedPtr& other) noexcept : cb_(other.cb_) {
+    if (cb_) {
+      ++cb_->count;
+    }
+  }
+
+  /// Move constructor.
+  SharedPtr(SharedPtr&& other) noexcept : cb_(other.cb_) {
+    other.cb_ = nullptr;
+  }
+
+  /// Destructor.
+  ~SharedPtr() {
+    cleanup();
+  }
+
+  /// Copy assignment.
+  SharedPtr& operator=(const SharedPtr& other) noexcept {
+    if (this != &other) {
+      cleanup();
+      cb_ = other.cb_;
+      if (cb_) {
+        ++cb_->count;
+      }
+    }
+    return *this;
+  }
+
+  /// Move assignment.
+  SharedPtr& operator=(SharedPtr&& other) noexcept {
+    if (this != &other) {
+      cleanup();
+      cb_ = other.cb_;
+      other.cb_ = nullptr;
+    }
+    return *this;
+  }
+
+  /// Resets the shared pointer to manage a new object.
+  void reset(
+      T* p = nullptr,
+      typename ControlBlock::Deleter d = default_deleter) {
+    *this = SharedPtr(p, d);
+  }
+
+  /// Swaps the contents with another shared pointer.
+  void swap(SharedPtr& other) noexcept {
+    std::swap(cb_, other.cb_);
+  }
+
+  /// Returns the managed pointer.
+  T* get() const noexcept {
+    return cb_ ? cb_->ptr : nullptr;
+  }
+
+  /// Dereferences the managed pointer.
+  T& operator*() const {
+    ET_CHECK_MSG(cb_, "Dereferencing null SharedPtr");
+    return *cb_->ptr;
+  }
+
+  /// Accesses members of the managed object.
+  T* operator->() const {
+    ET_CHECK_MSG(cb_, "Accessing member of null SharedPtr");
+    return cb_->ptr;
+  }
+
+  /// Returns the reference count.
+  long use_count() const noexcept {
+    return cb_ ? cb_->count : 0;
+  }
+
+  /// Returns true if the shared pointer is not null.
+  explicit operator bool() const noexcept {
+    return cb_ != nullptr;
+  }
+
+  friend void swap(SharedPtr& a, SharedPtr& b) noexcept {
+    a.swap(b);
+  }
+
+  friend bool operator==(const SharedPtr& lhs, const SharedPtr& rhs) noexcept {
+    return lhs.get() == rhs.get();
+  }
+
+  friend bool operator!=(const SharedPtr& lhs, const SharedPtr& rhs) noexcept {
+    return !(lhs == rhs);
+  }
+
+  friend bool operator==(const SharedPtr& lhs, std::nullptr_t) noexcept {
+    return lhs.get() == nullptr;
+  }
+
+  friend bool operator!=(const SharedPtr& lhs, std::nullptr_t) noexcept {
+    return lhs.get() != nullptr;
+  }
+
+  friend bool operator==(std::nullptr_t, const SharedPtr& rhs) noexcept {
+    return rhs.get() == nullptr;
+  }
+
+  friend bool operator!=(std::nullptr_t, const SharedPtr& rhs) noexcept {
+    return rhs.get() != nullptr;
+  }
+};
+
+/// Creates a SharedPtr managing a new object constructed with the given args.
+template <typename T, typename... Args>
+SharedPtr<T> make_shared(Args&&... args) {
+  return SharedPtr<T>(new T(std::forward<Args>(args)...));
+}
+
+} // namespace executorch::backends::aoti::slim
diff --git a/backends/aoti/slim/util/TARGETS b/backends/aoti/slim/util/TARGETS
new file mode 100644
index 00000000000..77871de4469
--- /dev/null
+++ b/backends/aoti/slim/util/TARGETS
@@ -0,0 +1,3 @@
+load("targets.bzl", "define_common_targets")
+
+define_common_targets()
diff --git a/backends/aoti/slim/util/targets.bzl b/backends/aoti/slim/util/targets.bzl
new file mode 100644
index 00000000000..13f49168a0f
--- /dev/null
+++ b/backends/aoti/slim/util/targets.bzl
@@ -0,0 +1,16 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Define targets for SlimTensor util module."""
+
+    # Header-only library for SharedPtr
+    runtime.cxx_library(
+        name = "shared_ptr",
+        headers = [
+            "SharedPtr.h",
+        ],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        exported_deps = [
+            "//executorch/runtime/platform:platform",
+        ],
+    )
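
Note for reviewers: the "dummy deleter" use case mentioned in SharedPtr.h's
header comment would look like this (hypothetical sketch, not part of the
patch):

    // Reference-count a storage object that someone else owns; the no-op
    // deleter guarantees the SharedPtr never frees it.
    MaybeOwningStorage owned(CPU_DEVICE, 64);
    Storage view(&owned, [](MaybeOwningStorage*) {});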