Skip to content

Commit 6849c16

Browse files
committed
Use 128-bit Widening Multiply on More Platforms
The 128-bit widening multiplication was previously gated by simply checking the target pointer width. This works as a simple heuristic, but a better heuristic can be used:

1. Most 64-bit architectures support 64-bit to 128-bit widening multiplication; SPARC64 and Wasm64 are the exceptions, so the widening code path shouldn't be used on those two architectures.
2. The target pointer width doesn't always indicate that we are dealing with a 64-bit architecture, as there are ABIs that reduce the pointer width, especially on AArch64 and x86-64.
3. WebAssembly (regardless of pointer width) supports 64-bit to 128-bit widening multiplication with the `wide-arithmetic` proposal.

The `wide-arithmetic` proposal has been available since the LLVM 20 update and works perfectly for this use case, as can be seen here: https://rust.godbolt.org/z/9jY7fxqxK

Using `wasmtime explore`, we can see it compiles down to the ideal instructions on x86-64:

```nasm
mulx rax, rdx, r10
xor rax, rdx
```

Based on the same change in [`foldhash`](orlp/foldhash#17).
1 parent dc5c33f commit 6849c16

File tree

1 file changed

+21
-9
lines changed

1 file changed

+21
-9
lines changed

src/lib.rs

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -197,11 +197,26 @@ const PREVENT_TRIVIAL_ZERO_COLLAPSE: u64 = 0xa4093822299f31d0;
197197

198198
#[inline]
199199
fn multiply_mix(x: u64, y: u64) -> u64 {
200-
#[cfg(target_pointer_width = "64")]
201-
{
200+
// The following code path is only fast if 64-bit to 128-bit widening
201+
// multiplication is supported by the architecture. Most 64-bit
202+
// architectures except SPARC64 and Wasm64 support it. However, the target
203+
// pointer width doesn't always indicate that we are dealing with a 64-bit
204+
// architecture, as there are ABIs that reduce the pointer width, especially
205+
// on AArch64 and x86-64. WebAssembly (regardless of pointer width) supports
206+
// 64-bit to 128-bit widening multiplication with the `wide-arithmetic`
207+
// proposal.
208+
if cfg!(any(
209+
all(
210+
target_pointer_width = "64",
211+
not(any(target_arch = "sparc64", target_arch = "wasm64")),
212+
),
213+
target_arch = "aarch64",
214+
target_arch = "x86_64",
215+
all(target_family = "wasm", target_feature = "wide-arithmetic"),
216+
)) {
202217
// We compute the full u64 x u64 -> u128 product, this is a single mul
203218
// instruction on x86-64, one mul plus one mulhi on ARM64.
204-
let full = (x as u128) * (y as u128);
219+
let full = (x as u128).wrapping_mul(y as u128);
205220
let lo = full as u64;
206221
let hi = (full >> 64) as u64;
207222

@@ -216,10 +231,7 @@ fn multiply_mix(x: u64, y: u64) -> u64 {
216231
// x * y = 2^64 * hi + lo = (-1) * hi + lo = lo - hi, (mod 2^64 + 1)
217232
// x * y = 2^64 * hi + lo = 1 * hi + lo = lo + hi, (mod 2^64 - 1)
218233
// Multiplicative hashing is universal in a field (like mod p).
219-
}
220-
221-
#[cfg(target_pointer_width = "32")]
222-
{
234+
} else {
223235
// u64 x u64 -> u128 product is prohibitively expensive on 32-bit.
224236
// Decompose into 32-bit parts.
225237
let lx = x as u32;
@@ -228,8 +240,8 @@ fn multiply_mix(x: u64, y: u64) -> u64 {
228240
let hy = (y >> 32) as u32;
229241

230242
// u32 x u32 -> u64 the low bits of one with the high bits of the other.
231-
let afull = (lx as u64) * (hy as u64);
232-
let bfull = (hx as u64) * (ly as u64);
243+
let afull = (lx as u64).wrapping_mul(hy as u64);
244+
let bfull = (hx as u64).wrapping_mul(ly as u64);
233245

234246
// Combine, swapping low/high of one of them so the upper bits of the
235247
// product of one combine with the lower bits of the other.

0 commit comments

Comments
 (0)