Skip to content

Commit b1358b9

Browse files
committed
NOCOMMIT: PoC: vectorize just expm1
Here is how we can vectorize expm1, given the prior PRs in this stack. It is marked NOCOMMIT because we should actually implement vectorization for all of the `unary_ufunc_*` ops, not just this one.

ghstack-source-id: e65c2b075587f716b8a746616d3335bab69a261d
ghstack-comment-id: 2751961712
Pull Request resolved: #9586
1 parent 694dfe6 commit b1358b9

File tree

2 files changed

+53
-2
lines changed

2 files changed

+53
-2
lines changed

kernels/portable/cpu/op_expm1.cpp

+52-2
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,66 @@
77
*/
88

99
#include <executorch/kernels/portable/cpu/pattern/pattern.h>
10+
#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
1011
#include <executorch/runtime/kernel/kernel_includes.h>
1112
#include <cmath>
1213

1314
namespace torch {
1415
namespace executor {
1516
namespace native {
1617

18+
// REVIEW: I'm not entirely sure what the best way to implement this
19+
// namespace is. Some options:
20+
// 1) All in one file, with or without an `IMPLEMENT_VECTORIZED_MATH_OP` macro.
21+
// 2) Include in each `unary_ufunc_*` op_foo.cpp, with or without an
22+
// `IMPLEMENT_VECTORIZED_MATH_OP` macro.
23+
//
24+
// I think my preferred option would be (2) with a macro, but I've
25+
// left the macro out for ease of reading this PoC PR.
26+
namespace math {
27+
using std::expm1;
28+
#ifdef ET_USE_PYTORCH_HEADERS
29+
template <typename T>
30+
auto expm1(at::vec::Vectorized<T> x) {
31+
// ATen knows to do this conversion because the TensorIterator for this op
32+
// (and lots of similar ones in aten/src/ATen/native/UnaryOps.cpp) is created
33+
// with build_borrowing_unary_float_op.
34+
if constexpr (!executorch::runtime::is_floating_point<T>::value) {
35+
return at::vec::convert<float>(x).expm1();
36+
} else {
37+
return x.expm1();
38+
}
39+
}
40+
#endif
41+
} // namespace math
1742
Tensor& expm1_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
18-
return internal::unary_ufunc_realhbbf16_to_floathbf16(
19-
std::expm1, ctx, in, out);
43+
ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out);
44+
45+
// Resize for dynamic shape
46+
ET_KERNEL_CHECK_MSG(
47+
ctx,
48+
resize_tensor(out, in.sizes()) == Error::Ok,
49+
InvalidArgument,
50+
out,
51+
"Failed to resize output tensor.");
52+
53+
ET_KERNEL_CHECK(
54+
ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
55+
56+
static constexpr const char op_name[] = "expm1.out";
57+
ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
58+
utils::apply_unitensor_elementwise_fn<
59+
CTYPE_IN,
60+
op_name,
61+
utils::SupportedTensorDtypes::FLOATHBF16>(
62+
[](auto x) { return math::expm1(x); },
63+
ctx,
64+
in,
65+
utils::SupportedTensorDtypes::REALHBBF16,
66+
out);
67+
});
68+
69+
return out;
2070
}
2171

2272
} // namespace native

shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl

+1
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,7 @@ ATEN_OPS = (
527527
name = "op_expm1",
528528
deps = [
529529
"//executorch/kernels/portable/cpu/pattern:pattern",
530+
"//executorch/kernels/portable/cpu/util:elementwise_util",
530531
],
531532
),
532533
op_target(

0 commit comments

Comments
 (0)