Skip to content

Commit a502642

Browse files
authored
cranelift: inline small constant-length array.copy for performance (#13460)
* cranelift: inline small constant-length `array.copy`

  When a scalar-element `array.copy` has a compile-time-constant length of at most 8, expand it inline as loads-then-stores instead of calling the `memory_copy` libcall. The libcall's fixed per-call cost (a wasm/host transition and an indirect call) dominates for tiny copies, and is especially visible in unoptimized builds where the libcall body itself is not optimized. Every element is loaded before any is stored so overlapping ranges keep memmove semantics. Dynamic or larger lengths, and tables, still use the libcall, whose `memmove` amortizes the overhead. The bound of 8 trades a little perf (the inline-vs-libcall crossover is ~16 elements) for bounded code size, capturing the largest wins, which cluster at <= 8 elements.

  Assisted-by: Claude Code:claude-opus-4-7

* cranelift: tidy inline `array.copy` helper after review

  Reword `emit_inline_array_copy`'s doc to describe a bitwise copy by element width (it also handles `v128` and copies `f32`/`f64` via integer types), and replace the `elem_size`/`n` parameter shadowing with `stride`/`count`. No codegen change.

  Assisted-by: Claude Code:claude-opus-4-7

* cranelift: generalize inline copy to all constant-length bulk copies

  Per review, move the inline expansion into `raw_bulk_memory_operation`, keyed on a constant byte length, so it also covers `memory.copy` and `table.copy`, not just `array.copy`. It now uses width-agnostic wide accesses (`i8x16` down to `i8`), and `BulkOp::MemoryCopy` carries entity-appropriate flags. Threshold 128 bytes, from measurement (~1.7-2.7x faster up to 128 B, ties by 256).

  Assisted-by: Claude Code:claude-opus-4-7

* cranelift: simplify inline bulk copy after review

  - Use `MemFlagsData::trusted()` in `emit_inline_memcpy` and drop the per-entity flags threaded through `BulkOp::MemoryCopy`; each load only feeds its paired store, so unaligned moves are selected regardless.
  - Detect a constant length by matching only `iconst` on the raw wasm length (plumbed as `const_len`) instead of folding the width casts and `* element_size` multiply, which was type-unfaithful.
  - Gate on `bytes <= 128` with a zero-length early return.

  Assisted-by: Claude Code:claude-opus-4-7
1 parent 5aa80a1 commit a502642

4 files changed

Lines changed: 407 additions & 7 deletions

File tree

crates/cranelift/src/func_environ.rs

Lines changed: 117 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3448,6 +3448,32 @@ impl FuncEnvironment<'_> {
34483448
/// epochs are enabled to break up the copy into a loop of chunks with
34493449
/// preemption checks between them.
34503450
fn raw_bulk_memory_operation(&mut self, builder: &mut FunctionBuilder<'_>, mut op: BulkOp) {
3451+
// Fast path: a copy whose byte length is a small compile-time constant is
3452+
// expanded inline (see `emit_inline_memcpy`), skipping the libcall's fixed
3453+
// per-call cost (a wasm/host transition and an indirect call) that
3454+
// dominates tiny copies. Larger or dynamic copies, and all fills, use the
3455+
// libcall below, whose `memmove` amortizes that cost.
3456+
//
3457+
// The bound is empirical: measured on aarch64, inline is ~1.7-2.7x faster
3458+
// than the libcall through 128 bytes and ties by 256 (cost grows with the
3459+
// length, since every chunk is loaded before any is stored).
3460+
const INLINE_COPY_MAX_BYTES: u64 = 128;
3461+
if let BulkOp::MemoryCopy {
3462+
dst,
3463+
src,
3464+
const_len: Some(bytes),
3465+
..
3466+
} = op
3467+
{
3468+
if bytes <= INLINE_COPY_MAX_BYTES {
3469+
if self.tunables.consume_fuel {
3470+
self.fuel_consumed += bytes as i64;
3471+
}
3472+
self.emit_inline_memcpy(builder, dst, src, bytes);
3473+
return;
3474+
}
3475+
}
3476+
34513477
// Very scientifically chosen. Or, more seriously, this is just an
34523478
// arbitrary number for now. 100k copies of this size locally takes half
34533479
// a second, so seems like a reasonably large chunk size to not hit perf
@@ -3467,7 +3493,7 @@ impl FuncEnvironment<'_> {
34673493
env.epoch_check(builder);
34683494
}
34693495
match *op {
3470-
BulkOp::MemoryCopy { dst, src, len } => {
3496+
BulkOp::MemoryCopy { dst, src, len, .. } => {
34713497
if env.tunables.consume_fuel {
34723498
// Note that fuel is always a 64-bit counter.
34733499
let fuel_consumed = match env.pointer_type() {
@@ -3530,7 +3556,7 @@ impl FuncEnvironment<'_> {
35303556
};
35313557
let has_chunk = builder.ins().icmp(IntCC::UnsignedGreaterThan, len, chunk);
35323558
match *op {
3533-
BulkOp::MemoryCopy { dst, src, len } => {
3559+
BulkOp::MemoryCopy { dst, src, len, .. } => {
35343560
builder.ins().brif(
35353561
has_chunk,
35363562
chunk_block,
@@ -3553,7 +3579,7 @@ impl FuncEnvironment<'_> {
35533579
has_chunk_branch(builder, &op);
35543580

35553581
let append_block_params = |builder: &mut FunctionBuilder<'_>, block, op: &mut _| match op {
3556-
BulkOp::MemoryCopy { dst, src, len } => {
3582+
BulkOp::MemoryCopy { dst, src, len, .. } => {
35573583
*dst = builder.append_block_param(block, pointer_type);
35583584
*src = builder.append_block_param(block, pointer_type);
35593585
*len = builder.append_block_param(block, pointer_type);
@@ -3577,7 +3603,7 @@ impl FuncEnvironment<'_> {
35773603
*op_len = chunk;
35783604
raw_call(self, builder, &op);
35793605
match &mut op {
3580-
BulkOp::MemoryCopy { dst, src, len } => {
3606+
BulkOp::MemoryCopy { dst, src, len, .. } => {
35813607
*dst = builder.ins().iadd(*dst, chunk);
35823608
*src = builder.ins().iadd(*src, chunk);
35833609
*len = builder.ins().isub(remaining_len, chunk);
@@ -3976,19 +4002,28 @@ impl FuncEnvironment<'_> {
39764002
let len_ptr =
39774003
self.unchecked_cast_wasm_addr_to_native_addr(&mut builder.cursor(), len, len_idx_ty);
39784004

4005+
// A constant wasm length lets a small copy be expanded inline. Capture it
4006+
// straight from wasm here (a count of entity elements; for memories an
4007+
// element is a byte) so the fast path only has to recognize an `iconst`,
4008+
// not the casts and `* element_size` multiply applied further down.
4009+
let const_count = Self::value_as_const_int(builder, len);
4010+
39794011
match dst_entity {
39804012
// Memories are always a `memcpy`.
39814013
CheckedEntity::Memory(_) => {
39824014
assert!(matches!(
39834015
src_entity,
39844016
CheckedEntity::Memory(_) | CheckedEntity::Data { .. }
39854017
));
4018+
// A memory's elements are bytes, so the element count is already
4019+
// the byte length.
39864020
self.raw_bulk_memory_operation(
39874021
builder,
39884022
BulkOp::MemoryCopy {
39894023
dst: dst_raw_addr,
39904024
src: src_raw_addr,
39914025
len: len_ptr,
4026+
const_len: const_count,
39924027
},
39934028
);
39944029
Ok(())
@@ -4005,6 +4040,7 @@ impl FuncEnvironment<'_> {
40054040
src_raw_addr,
40064041
len_ptr,
40074042
src,
4043+
const_count,
40084044
),
40094045

40104046
// Cannot copy into a data or element segment in wasm.
@@ -4211,7 +4247,9 @@ impl FuncEnvironment<'_> {
42114247
/// `src_elem_addr` and stored to `dst_elem_addr`. The `elem_ty` is the type
42124248
/// being transferred, `one_elem_size` is the byte size of each element,
42134249
/// `copy_len` is the number of elements being copied, and `src_index` is
4214-
/// the first index within `src_entity` being loaded.
4250+
/// the first index within `src_entity` being loaded. `const_count` is that
4251+
/// same element count when it is a wasm constant, used to expand small copies
4252+
/// inline.
42154253
///
42164254
/// All values here have type `self.pointer_type()`, except `src_index`
42174255
/// which is typed appropriately to index `src_entity`.
@@ -4227,6 +4265,7 @@ impl FuncEnvironment<'_> {
42274265
src_elem_addr: ir::Value,
42284266
copy_len: ir::Value,
42294267
src_index: ir::Value,
4268+
const_count: Option<u64>,
42304269
) -> WasmResult<()> {
42314270
let pointer_type = self.pointer_type();
42324271
assert_eq!(builder.func.dfg.value_type(dst_elem_addr), pointer_type);
@@ -4300,14 +4339,16 @@ impl FuncEnvironment<'_> {
43004339
}
43014340

43024341
// For memcpy, that's easy, just call the intrinsic with the right
4303-
// parameters.
4342+
// parameters (or expand it inline; see `raw_bulk_memory_operation`).
43044343
if !type_forbids_memcpy && dst_element_size == src_element_size {
4344+
let const_len = const_count.and_then(|c| c.checked_mul(u64::from(dst_element_size)));
43054345
self.raw_bulk_memory_operation(
43064346
builder,
43074347
BulkOp::MemoryCopy {
43084348
dst: dst_elem_addr,
43094349
src: src_elem_addr,
43104350
len: dst_copy_byte_len,
4351+
const_len,
43114352
},
43124353
);
43134354
return Ok(());
@@ -4384,6 +4425,72 @@ impl FuncEnvironment<'_> {
43844425
Ok(())
43854426
}
43864427

4428+
/// If `value` is an `iconst`, return its immediate as a `u64`.
4429+
///
4430+
/// This deliberately peeks at a single `iconst` and nothing else. Callers
4431+
/// pass the length exactly as it appears in wasm, before the width casts and
4432+
/// `* element_size` multiply that the byte-length computation wraps it in, so
4433+
/// there is no need to (incorrectly) fold those type-changing ops here.
4434+
fn value_as_const_int(builder: &FunctionBuilder<'_>, value: ir::Value) -> Option<u64> {
4435+
let inst = builder.func.dfg.value_def(value).inst()?;
4436+
match builder.func.dfg.insts[inst] {
4437+
ir::InstructionData::UnaryImm {
4438+
opcode: ir::Opcode::Iconst,
4439+
imm,
4440+
} => Some(imm.bits().cast_unsigned()),
4441+
_ => None,
4442+
}
4443+
}
4444+
4445+
/// Expand a copy of `bytes` (a small compile-time constant) into inline loads
4446+
/// then stores, avoiding the `memory_copy` libcall.
4447+
///
4448+
/// The copy is bitwise and element-type agnostic: the byte range is covered
4449+
/// greedily with the widest convenient access (`i8x16` down to `i8`). Every
4450+
/// chunk is loaded before any is stored, so overlapping ranges keep `memmove`
4451+
/// semantics. The caller has already bounds-checked the range.
4452+
fn emit_inline_memcpy(
4453+
&mut self,
4454+
builder: &mut FunctionBuilder<'_>,
4455+
dst_addr: ir::Value,
4456+
src_addr: ir::Value,
4457+
bytes: u64,
4458+
) {
4459+
// `trusted()` (notrap + aligned) is sound even though the chunks may be
4460+
// unaligned: each load feeds only its paired store, never an instruction
4461+
// operand that requires alignment, so the backend selects unaligned
4462+
// moves regardless of the `aligned` flag. The range was already
4463+
// bounds-checked, so `notrap` is fine too.
4464+
let flags = ir::MemFlagsData::trusted();
4465+
const WIDTHS: &[(u64, ir::Type)] = &[
4466+
(16, ir::types::I8X16),
4467+
(8, ir::types::I64),
4468+
(4, ir::types::I32),
4469+
(2, ir::types::I16),
4470+
(1, ir::types::I8),
4471+
];
4472+
// 12 covers the worst case under the 128-byte cap: n=127 decomposes into
4473+
// 7×i8x16 + i64 + i32 + i16 + i8 = 11 chunks. Sized so both `SmallVec`s
4474+
// stay inline.
4475+
let mut chunks: SmallVec<[(i32, ir::Type); 12]> = smallvec![];
4476+
let mut offset = 0u64;
4477+
let mut remaining = bytes;
4478+
for &(width, ty) in WIDTHS {
4479+
while remaining >= width {
4480+
chunks.push((i32::try_from(offset).unwrap(), ty));
4481+
offset += width;
4482+
remaining -= width;
4483+
}
4484+
}
4485+
let vals: SmallVec<[ir::Value; 12]> = chunks
4486+
.iter()
4487+
.map(|&(off, ty)| builder.ins().load(ty, flags, src_addr, off))
4488+
.collect();
4489+
for (&(off, _), val) in chunks.iter().zip(vals) {
4490+
builder.ins().store(flags, val, dst_addr, off);
4491+
}
4492+
}
4493+
43874494
/// For bulk operations (copies, fills, etc) this is an extra check layered
43884495
/// on the spec-defined bounds check that the address is in-bounds.
43894496
///
@@ -5436,11 +5543,14 @@ enum BulkOp {
54365543
/// A `memory.copy` operation, copying memory from `src` to `dst`.
54375544
///
54385545
/// All of `dst`, `src`, and `len` must be pre-validated and inbounds. All
5439-
/// must have type `env.pointer_type()`.
5546+
/// must have type `env.pointer_type()`. `const_len`, when set, is the
5547+
/// statically-known byte length (from a constant wasm length); the inline
5548+
/// fast path in `raw_bulk_memory_operation` uses it to expand small copies.
54405549
MemoryCopy {
54415550
dst: ir::Value,
54425551
src: ir::Value,
54435552
len: ir::Value,
5553+
const_len: Option<u64>,
54445554
},
54455555

54465556
/// A `memory.fill` operation, setting all bytes of `dst` to `val`.

tests/disas/array-copy-inline.wat

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
;;! target = 'x86_64'
2+
;;! test = 'optimize'
3+
;;! flags = '-Wgc'
4+
5+
;; A small, constant-length `array.copy` is expanded inline as wide loads
6+
;; followed by stores instead of calling the `memory_copy` libcall. The byte
7+
;; range is covered greedily with the widest convenient access, so 7 `i32`s (28
8+
;; bytes) become an `i8x16` + `i64` + `i32`, and every chunk is loaded before any
9+
;; is stored so overlapping ranges still copy correctly.
10+
11+
(module
12+
(type $a (array (mut i32)))
13+
14+
(func $copy (param (ref $a) i32 (ref $a) i32)
15+
(array.copy $a $a (local.get 0) (local.get 1) (local.get 2) (local.get 3) (i32.const 7))
16+
)
17+
)
18+
;; function u0:0(i64 vmctx, i64, i32, i32, i32, i32) tail {
19+
;; gv0 = vmctx
20+
;; gv1 = load.i64 notrap aligned readonly gv0+8
21+
;; gv2 = load.i64 notrap aligned gv1+24
22+
;; gv3 = vmctx
23+
;; gv4 = load.i64 notrap aligned readonly can_move gv3+8
24+
;; gv5 = load.i64 notrap aligned readonly can_move gv4+32
25+
;; gv6 = load.i64 notrap aligned gv4+40
26+
;; stack_limit = gv2
27+
;;
28+
;; block0(v0: i64, v1: i64, v2: i32, v3: i32, v4: i32, v5: i32):
29+
;; @002a trapz v2, user16
30+
;; @002a v83 = load.i64 notrap aligned readonly can_move v0+8
31+
;; @002a v8 = load.i64 notrap aligned readonly can_move v83+32
32+
;; @002a v7 = uextend.i64 v2
33+
;; @002a v9 = iadd v8, v7
34+
;; @002a v10 = iconst.i64 16
35+
;; @002a v11 = iadd v9, v10 ; v10 = 16
36+
;; @002a v12 = load.i32 user2 readonly v11
37+
;; @002a v14 = uextend.i64 v3
38+
;; v85 = iconst.i64 7
39+
;; @002a v17 = iadd v14, v85 ; v85 = 7
40+
;; @002a v13 = uextend.i64 v12
41+
;; @002a v18 = icmp ugt v17, v13
42+
;; @002a trapnz v18, user17
43+
;; @002a trapz v4, user16
44+
;; @002a v26 = uextend.i64 v4
45+
;; @002a v28 = iadd v8, v26
46+
;; @002a v30 = iadd v28, v10 ; v10 = 16
47+
;; @002a v31 = load.i32 user2 readonly v30
48+
;; @002a v33 = uextend.i64 v5
49+
;; @002a v36 = iadd v33, v85 ; v85 = 7
50+
;; @002a v32 = uextend.i64 v31
51+
;; @002a v37 = icmp ugt v36, v32
52+
;; @002a trapnz v37, user17
53+
;; @002a v49 = load.i64 notrap aligned v83+40
54+
;; v79 = iconst.i64 20
55+
;; @002a v22 = iadd v9, v79 ; v79 = 20
56+
;; v93 = iconst.i64 2
57+
;; v94 = ishl v14, v93 ; v93 = 2
58+
;; @002a v25 = iadd v22, v94
59+
;; v98 = iconst.i64 28
60+
;; @002a v51 = uadd_overflow_trap v25, v98, user2 ; v98 = 28
61+
;; @002a v50 = iadd v8, v49
62+
;; @002a v52 = icmp ugt v51, v50
63+
;; @002a trapnz v52, user2
64+
;; @002a v41 = iadd v28, v79 ; v79 = 20
65+
;; v96 = ishl v33, v93 ; v93 = 2
66+
;; @002a v44 = iadd v41, v96
67+
;; @002a v56 = uadd_overflow_trap v44, v98, user2 ; v98 = 28
68+
;; @002a v57 = icmp ugt v56, v50
69+
;; @002a trapnz v57, user2
70+
;; @002a v58 = load.i8x16 notrap aligned v44
71+
;; @002a v59 = load.i64 notrap aligned v44+16
72+
;; @002a v60 = load.i32 notrap aligned v44+24
73+
;; @002a store notrap aligned v58, v25
74+
;; @002a store notrap aligned v59, v25+16
75+
;; @002a store notrap aligned v60, v25+24
76+
;; @002e jump block1
77+
;;
78+
;; block1:
79+
;; @002e return
80+
;; }

tests/disas/memory-copy-inline.wat

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
;;! target = 'x86_64'
2+
;;! test = 'optimize'
3+
4+
;; A constant-length `memory.copy` is expanded inline as wide loads followed by
5+
;; stores (every byte is loaded before any is stored, so overlapping ranges keep
6+
;; `memmove` semantics) instead of calling the `memory_copy` libcall.
7+
8+
(module
9+
(memory 1)
10+
(func $copy (param i32 i32)
11+
(memory.copy (local.get 0) (local.get 1) (i32.const 16))
12+
)
13+
)
14+
;; function u0:0(i64 vmctx, i64, i32, i32) tail {
15+
;; gv0 = vmctx
16+
;; gv1 = load.i64 notrap aligned readonly gv0+8
17+
;; gv2 = load.i64 notrap aligned gv1+24
18+
;; gv3 = vmctx
19+
;; gv4 = load.i64 notrap aligned gv3+64
20+
;; gv5 = load.i64 notrap aligned readonly can_move gv3+56
21+
;; stack_limit = gv2
22+
;;
23+
;; block0(v0: i64, v1: i64, v2: i32, v3: i32):
24+
;; @0024 v6 = load.i64 notrap aligned v0+64
25+
;; @0024 v7 = uextend.i64 v2
26+
;; v35 = iconst.i64 16
27+
;; @0024 v10 = iadd v7, v35 ; v35 = 16
28+
;; @0024 v11 = icmp ugt v10, v6
29+
;; @0024 trapnz v11, heap_oob
30+
;; @0024 v18 = uextend.i64 v3
31+
;; @0024 v21 = iadd v18, v35 ; v35 = 16
32+
;; @0024 v22 = icmp ugt v21, v6
33+
;; @0024 trapnz v22, heap_oob
34+
;; @0024 v12 = load.i64 notrap aligned readonly can_move v0+56
35+
;; @0024 v26 = iadd v12, v18
36+
;; @0024 v28 = load.i8x16 notrap aligned v26
37+
;; @0024 v15 = iadd v12, v7
38+
;; @0024 store notrap aligned v28, v15
39+
;; @0028 jump block1
40+
;;
41+
;; block1:
42+
;; @0028 return
43+
;; }

0 commit comments

Comments
 (0)