cranelift: inline small constant-length array.copy for performance (#13460)

gfx · web-flow · commit a5026422089b · 2026-05-26T17:56:52.000Z
* cranelift: inline small constant-length `array.copy`

When a scalar-element `array.copy` has a compile-time-constant length of at
most 8, expand it inline as loads-then-stores instead of calling the
`memory_copy` libcall. The libcall's fixed per-call cost (a wasm/host
transition and an indirect call) dominates for tiny copies, and is
especially visible in unoptimized builds where the libcall body itself is
not optimized.

Every element is loaded before any is stored so overlapping ranges keep
memmove semantics. Dynamic or larger lengths, and tables, still use the
libcall, whose `memmove` amortizes the overhead. The bound of 8 trades a
little perf (the inline-vs-libcall crossover is ~16 elements) for bounded
code size, capturing the largest wins, which cluster at <= 8 elements.

Assisted-by: Claude Code:claude-opus-4-7

* cranelift: tidy inline `array.copy` helper after review

Reword `emit_inline_array_copy`'s doc to describe a bitwise copy by element
width (it also handles `v128` and copies `f32`/`f64` via integer types), and
replace the `elem_size`/`n` parameter shadowing with `stride`/`count`. No
codegen change.

Assisted-by: Claude Code:claude-opus-4-7

* cranelift: generalize inline copy to all constant-length bulk copies

Per review, move the inline expansion into `raw_bulk_memory_operation`, keyed
on a constant byte length, so it also covers `memory.copy` and `table.copy`,
not just `array.copy`. It now uses width-agnostic wide accesses (`i8x16` down
to `i8`), and `BulkOp::MemoryCopy` carries entity-appropriate flags. Threshold
128 bytes, from measurement (~1.7-2.7x faster up to 128 B, ties by 256).

Assisted-by: Claude Code:claude-opus-4-7

* cranelift: simplify inline bulk copy after review

- Use `MemFlagsData::trusted()` in `emit_inline_memcpy` and drop the
  per-entity flags threaded through `BulkOp::MemoryCopy`; each load only
  feeds its paired store, so unaligned moves are selected regardless.
- Detect a constant length by matching only `iconst` on the raw wasm
  length (plumbed as `const_len`) instead of folding the width casts and
  `* element_size` multiply, which was type-unfaithful.
- Gate on `bytes <= 128` with a zero-length early return.

Assisted-by: Claude Code:claude-opus-4-7
diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs
@@ -3448,6 +3448,32 @@ impl FuncEnvironment<'_> {
     /// epochs are enabled to break up the copy into a loop of chunks with
     /// preemption checks between them.
     fn raw_bulk_memory_operation(&mut self, builder: &mut FunctionBuilder<'_>, mut op: BulkOp) {
+        // Fast path: a copy whose byte length is a small compile-time constant is
+        // expanded inline (see `emit_inline_memcpy`), skipping the libcall's fixed
+        // per-call cost (a wasm/host transition and an indirect call) that
+        // dominates tiny copies. Larger or dynamic copies, and all fills, use the
+        // libcall below, whose `memmove` amortizes that cost.
+        //
+        // The bound is empirical: measured on aarch64, inline is ~1.7-2.7x faster
+        // than the libcall through 128 bytes and ties by 256 (cost grows with the
+        // length, since every chunk is loaded before any is stored).
+        const INLINE_COPY_MAX_BYTES: u64 = 128;
+        if let BulkOp::MemoryCopy {
+            dst,
+            src,
+            const_len: Some(bytes),
+            ..
+        } = op
+        {
+            if bytes <= INLINE_COPY_MAX_BYTES {
+                if self.tunables.consume_fuel {
+                    self.fuel_consumed += bytes as i64;
+                }
+                self.emit_inline_memcpy(builder, dst, src, bytes);
+                return;
+            }
+        }
+
         // Very scientifically chosen. Or, more seriously, this is just an
         // arbitrary number for now. 100k copies of this size locally takes half
         // a second, so seems like a reasonably large chunk size to not hit perf
@@ -3467,7 +3493,7 @@ impl FuncEnvironment<'_> {
                     env.epoch_check(builder);
                 }
                 match *op {
-                    BulkOp::MemoryCopy { dst, src, len } => {
+                    BulkOp::MemoryCopy { dst, src, len, .. } => {
                         if env.tunables.consume_fuel {
                             // Note that fuel is always a 64-bit counter.
                             let fuel_consumed = match env.pointer_type() {
@@ -3530,7 +3556,7 @@ impl FuncEnvironment<'_> {
             };
             let has_chunk = builder.ins().icmp(IntCC::UnsignedGreaterThan, len, chunk);
             match *op {
-                BulkOp::MemoryCopy { dst, src, len } => {
+                BulkOp::MemoryCopy { dst, src, len, .. } => {
                     builder.ins().brif(
                         has_chunk,
                         chunk_block,
@@ -3553,7 +3579,7 @@ impl FuncEnvironment<'_> {
         has_chunk_branch(builder, &op);
 
         let append_block_params = |builder: &mut FunctionBuilder<'_>, block, op: &mut _| match op {
-            BulkOp::MemoryCopy { dst, src, len } => {
+            BulkOp::MemoryCopy { dst, src, len, .. } => {
                 *dst = builder.append_block_param(block, pointer_type);
                 *src = builder.append_block_param(block, pointer_type);
                 *len = builder.append_block_param(block, pointer_type);
@@ -3577,7 +3603,7 @@ impl FuncEnvironment<'_> {
         *op_len = chunk;
         raw_call(self, builder, &op);
         match &mut op {
-            BulkOp::MemoryCopy { dst, src, len } => {
+            BulkOp::MemoryCopy { dst, src, len, .. } => {
                 *dst = builder.ins().iadd(*dst, chunk);
                 *src = builder.ins().iadd(*src, chunk);
                 *len = builder.ins().isub(remaining_len, chunk);
@@ -3976,19 +4002,28 @@ impl FuncEnvironment<'_> {
         let len_ptr =
             self.unchecked_cast_wasm_addr_to_native_addr(&mut builder.cursor(), len, len_idx_ty);
 
+        // A constant wasm length lets a small copy be expanded inline. Capture it
+        // straight from wasm here (a count of entity elements; for memories an
+        // element is a byte) so the fast path only has to recognize an `iconst`,
+        // not the casts and `* element_size` multiply applied further down.
+        let const_count = Self::value_as_const_int(builder, len);
+
         match dst_entity {
             // Memories are always a `memcpy`.
             CheckedEntity::Memory(_) => {
                 assert!(matches!(
                     src_entity,
                     CheckedEntity::Memory(_) | CheckedEntity::Data { .. }
                 ));
+                // A memory's elements are bytes, so the element count is already
+                // the byte length.
                 self.raw_bulk_memory_operation(
                     builder,
                     BulkOp::MemoryCopy {
                         dst: dst_raw_addr,
                         src: src_raw_addr,
                         len: len_ptr,
+                        const_len: const_count,
                     },
                 );
                 Ok(())
@@ -4005,6 +4040,7 @@ impl FuncEnvironment<'_> {
                     src_raw_addr,
                     len_ptr,
                     src,
+                    const_count,
                 ),
 
             // Cannot copy into a data or element segment in wasm.
@@ -4211,7 +4247,9 @@ impl FuncEnvironment<'_> {
     /// `src_elem_addr` and stored to `dst_elem_addr`. The `elem_ty` is the type
     /// being transferred, `one_elem_size` is the byte size of each element,
     /// `copy_len` is the number of elements being copied, and `src_index` is
-    /// the first index within `src_entity` being loaded.
+    /// the first index within `src_entity` being loaded. `const_count` is that
+    /// same element count when it is a wasm constant, used to expand small copies
+    /// inline.
     ///
     /// All values here have type `self.pointer_type()`, except `src_index`
     /// which is typed appropriately to index `src_entity`.
@@ -4227,6 +4265,7 @@ impl FuncEnvironment<'_> {
         src_elem_addr: ir::Value,
         copy_len: ir::Value,
         src_index: ir::Value,
+        const_count: Option<u64>,
     ) -> WasmResult<()> {
         let pointer_type = self.pointer_type();
         assert_eq!(builder.func.dfg.value_type(dst_elem_addr), pointer_type);
@@ -4300,14 +4339,16 @@ impl FuncEnvironment<'_> {
         }
 
         // For memcpy, that's easy, just call the intrinsic with the right
-        // parameters.
+        // parameters (or expand it inline; see `raw_bulk_memory_operation`).
         if !type_forbids_memcpy && dst_element_size == src_element_size {
+            let const_len = const_count.and_then(|c| c.checked_mul(u64::from(dst_element_size)));
             self.raw_bulk_memory_operation(
                 builder,
                 BulkOp::MemoryCopy {
                     dst: dst_elem_addr,
                     src: src_elem_addr,
                     len: dst_copy_byte_len,
+                    const_len,
                 },
             );
             return Ok(());
@@ -4384,6 +4425,72 @@ impl FuncEnvironment<'_> {
         Ok(())
     }
 
+    /// If `value` is an `iconst`, return its immediate as a `u64`.
+    ///
+    /// This deliberately peeks at a single `iconst` and nothing else. Callers
+    /// pass the length exactly as it appears in wasm, before the width casts and
+    /// `* element_size` multiply that the byte-length computation wraps it in, so
+    /// there is no need to (incorrectly) fold those type-changing ops here.
+    fn value_as_const_int(builder: &FunctionBuilder<'_>, value: ir::Value) -> Option<u64> {
+        let inst = builder.func.dfg.value_def(value).inst()?;
+        match builder.func.dfg.insts[inst] {
+            ir::InstructionData::UnaryImm {
+                opcode: ir::Opcode::Iconst,
+                imm,
+            } => Some(imm.bits().cast_unsigned()),
+            _ => None,
+        }
+    }
+
+    /// Expand a copy of `bytes` (a small compile-time constant) into inline loads
+    /// then stores, avoiding the `memory_copy` libcall.
+    ///
+    /// The copy is bitwise and element-type agnostic: the byte range is covered
+    /// greedily with the widest convenient access (`i8x16` down to `i8`). Every
+    /// chunk is loaded before any is stored, so overlapping ranges keep `memmove`
+    /// semantics. The caller has already bounds-checked the range.
+    fn emit_inline_memcpy(
+        &mut self,
+        builder: &mut FunctionBuilder<'_>,
+        dst_addr: ir::Value,
+        src_addr: ir::Value,
+        bytes: u64,
+    ) {
+        // `trusted()` (notrap + aligned) is sound even though the chunks may be
+        // unaligned: each load feeds only its paired store, never an instruction
+        // operand that requires alignment, so the backend selects unaligned
+        // moves regardless of the `aligned` flag. The range was already
+        // bounds-checked, so `notrap` is fine too.
+        let flags = ir::MemFlagsData::trusted();
+        const WIDTHS: &[(u64, ir::Type)] = &[
+            (16, ir::types::I8X16),
+            (8, ir::types::I64),
+            (4, ir::types::I32),
+            (2, ir::types::I16),
+            (1, ir::types::I8),
+        ];
+        // 12 covers the worst case under the 128-byte cap: n=127 decomposes into
+        // 7×i8x16 + i64 + i32 + i16 + i8 = 11 chunks. Sized so both `SmallVec`s
+        // stay inline.
+        let mut chunks: SmallVec<[(i32, ir::Type); 12]> = smallvec![];
+        let mut offset = 0u64;
+        let mut remaining = bytes;
+        for &(width, ty) in WIDTHS {
+            while remaining >= width {
+                chunks.push((i32::try_from(offset).unwrap(), ty));
+                offset += width;
+                remaining -= width;
+            }
+        }
+        let vals: SmallVec<[ir::Value; 12]> = chunks
+            .iter()
+            .map(|&(off, ty)| builder.ins().load(ty, flags, src_addr, off))
+            .collect();
+        for (&(off, _), val) in chunks.iter().zip(vals) {
+            builder.ins().store(flags, val, dst_addr, off);
+        }
+    }
+
     /// For bulk operations (copies, fills, etc) this is an extra check layered
     /// on the spec-defined bounds check that the address is in-bounds.
     ///
@@ -5436,11 +5543,14 @@ enum BulkOp {
     /// A `memory.copy` operation, copying memory from `src` to `dst`.
     ///
     /// All of `dst`, `src`, and `len` must be pre-validated and inbounds. All
-    /// must have type `env.pointer_type()`.
+    /// must have type `env.pointer_type()`. `const_len`, when set, is the
+    /// statically-known byte length (from a constant wasm length); the inline
+    /// fast path in `raw_bulk_memory_operation` uses it to expand small copies.
     MemoryCopy {
         dst: ir::Value,
         src: ir::Value,
         len: ir::Value,
+        const_len: Option<u64>,
     },
 
     /// A `memory.fill` operation, setting all bytes of `dst` to `val`.
diff --git a/tests/disas/array-copy-inline.wat b/tests/disas/array-copy-inline.wat
@@ -0,0 +1,80 @@
+;;! target = 'x86_64'
+;;! test = 'optimize'
+;;! flags = '-Wgc'
+
+;; A small, constant-length `array.copy` is expanded inline as wide loads
+;; followed by stores instead of calling the `memory_copy` libcall. The byte
+;; range is covered greedily with the widest convenient access, so 7 `i32`s (28
+;; bytes) become an `i8x16` + `i64` + `i32`, and every chunk is loaded before any
+;; is stored so overlapping ranges still copy correctly.
+
+(module
+  (type $a (array (mut i32)))
+
+  (func $copy (param (ref $a) i32 (ref $a) i32)
+    (array.copy $a $a (local.get 0) (local.get 1) (local.get 2) (local.get 3) (i32.const 7))
+  )
+)
+;; function u0:0(i64 vmctx, i64, i32, i32, i32, i32) tail {
+;;     gv0 = vmctx
+;;     gv1 = load.i64 notrap aligned readonly gv0+8
+;;     gv2 = load.i64 notrap aligned gv1+24
+;;     gv3 = vmctx
+;;     gv4 = load.i64 notrap aligned readonly can_move gv3+8
+;;     gv5 = load.i64 notrap aligned readonly can_move gv4+32
+;;     gv6 = load.i64 notrap aligned gv4+40
+;;     stack_limit = gv2
+;;
+;;                                 block0(v0: i64, v1: i64, v2: i32, v3: i32, v4: i32, v5: i32):
+;; @002a                               trapz v2, user16
+;; @002a                               v83 = load.i64 notrap aligned readonly can_move v0+8
+;; @002a                               v8 = load.i64 notrap aligned readonly can_move v83+32
+;; @002a                               v7 = uextend.i64 v2
+;; @002a                               v9 = iadd v8, v7
+;; @002a                               v10 = iconst.i64 16
+;; @002a                               v11 = iadd v9, v10  ; v10 = 16
+;; @002a                               v12 = load.i32 user2 readonly v11
+;; @002a                               v14 = uextend.i64 v3
+;;                                     v85 = iconst.i64 7
+;; @002a                               v17 = iadd v14, v85  ; v85 = 7
+;; @002a                               v13 = uextend.i64 v12
+;; @002a                               v18 = icmp ugt v17, v13
+;; @002a                               trapnz v18, user17
+;; @002a                               trapz v4, user16
+;; @002a                               v26 = uextend.i64 v4
+;; @002a                               v28 = iadd v8, v26
+;; @002a                               v30 = iadd v28, v10  ; v10 = 16
+;; @002a                               v31 = load.i32 user2 readonly v30
+;; @002a                               v33 = uextend.i64 v5
+;; @002a                               v36 = iadd v33, v85  ; v85 = 7
+;; @002a                               v32 = uextend.i64 v31
+;; @002a                               v37 = icmp ugt v36, v32
+;; @002a                               trapnz v37, user17
+;; @002a                               v49 = load.i64 notrap aligned v83+40
+;;                                     v79 = iconst.i64 20
+;; @002a                               v22 = iadd v9, v79  ; v79 = 20
+;;                                     v93 = iconst.i64 2
+;;                                     v94 = ishl v14, v93  ; v93 = 2
+;; @002a                               v25 = iadd v22, v94
+;;                                     v98 = iconst.i64 28
+;; @002a                               v51 = uadd_overflow_trap v25, v98, user2  ; v98 = 28
+;; @002a                               v50 = iadd v8, v49
+;; @002a                               v52 = icmp ugt v51, v50
+;; @002a                               trapnz v52, user2
+;; @002a                               v41 = iadd v28, v79  ; v79 = 20
+;;                                     v96 = ishl v33, v93  ; v93 = 2
+;; @002a                               v44 = iadd v41, v96
+;; @002a                               v56 = uadd_overflow_trap v44, v98, user2  ; v98 = 28
+;; @002a                               v57 = icmp ugt v56, v50
+;; @002a                               trapnz v57, user2
+;; @002a                               v58 = load.i8x16 notrap aligned v44
+;; @002a                               v59 = load.i64 notrap aligned v44+16
+;; @002a                               v60 = load.i32 notrap aligned v44+24
+;; @002a                               store notrap aligned v58, v25
+;; @002a                               store notrap aligned v59, v25+16
+;; @002a                               store notrap aligned v60, v25+24
+;; @002e                               jump block1
+;;
+;;                                 block1:
+;; @002e                               return
+;; }
diff --git a/tests/disas/memory-copy-inline.wat b/tests/disas/memory-copy-inline.wat
@@ -0,0 +1,43 @@
+;;! target = 'x86_64'
+;;! test = 'optimize'
+
+;; A constant-length `memory.copy` is expanded inline as wide loads followed by
+;; stores (every byte is loaded before any is stored, so overlapping ranges keep
+;; `memmove` semantics) instead of calling the `memory_copy` libcall.
+
+(module
+  (memory 1)
+  (func $copy (param i32 i32)
+    (memory.copy (local.get 0) (local.get 1) (i32.const 16))
+  )
+)
+;; function u0:0(i64 vmctx, i64, i32, i32) tail {
+;;     gv0 = vmctx
+;;     gv1 = load.i64 notrap aligned readonly gv0+8
+;;     gv2 = load.i64 notrap aligned gv1+24
+;;     gv3 = vmctx
+;;     gv4 = load.i64 notrap aligned gv3+64
+;;     gv5 = load.i64 notrap aligned readonly can_move gv3+56
+;;     stack_limit = gv2
+;;
+;;                                 block0(v0: i64, v1: i64, v2: i32, v3: i32):
+;; @0024                               v6 = load.i64 notrap aligned v0+64
+;; @0024                               v7 = uextend.i64 v2
+;;                                     v35 = iconst.i64 16
+;; @0024                               v10 = iadd v7, v35  ; v35 = 16
+;; @0024                               v11 = icmp ugt v10, v6
+;; @0024                               trapnz v11, heap_oob
+;; @0024                               v18 = uextend.i64 v3
+;; @0024                               v21 = iadd v18, v35  ; v35 = 16
+;; @0024                               v22 = icmp ugt v21, v6
+;; @0024                               trapnz v22, heap_oob
+;; @0024                               v12 = load.i64 notrap aligned readonly can_move v0+56
+;; @0024                               v26 = iadd v12, v18
+;; @0024                               v28 = load.i8x16 notrap aligned v26
+;; @0024                               v15 = iadd v12, v7
+;; @0024                               store notrap aligned v28, v15
+;; @0028                               jump block1
+;;
+;;                                 block1:
+;; @0028                               return
+;; }
diff --git a/tests/misc_testsuite/gc/array-copy-inline.wast b/tests/misc_testsuite/gc/array-copy-inline.wast