From 30ee39d71b4514bb784a6027214b8d10ac9f5632 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 4 Jul 2024 20:15:36 +0100
Subject: [PATCH 1/5] [X86] var-permute-*.ll - check memory address math

Expose the memory addressing to simplify checking stack spill/restore
manipulation codegen

(cherry picked from commit 8270485af12516c159c4a2c6533fa9b149bdc63a)
---
 llvm/test/CodeGen/X86/var-permute-128.ll | 410 +++++++++++------------
 llvm/test/CodeGen/X86/var-permute-256.ll | 14 +-
 llvm/test/CodeGen/X86/var-permute-512.ll | 2 +-
 3 files changed, 213 insertions(+), 213 deletions(-)

diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 99a3821bb9ba91..fddba154fc3383 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --no_x86_scrub_mem_shuffle
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
@@ -19,9 +19,9 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movq %xmm1, %rcx
 ; SSE3-NEXT: andl $1, %ecx
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSE3-NEXT: movsd -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
 ; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE3-NEXT: retq
 ;
@@ -32,9 +32,9 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSSE3-NEXT: movq %xmm1, %rcx
 ; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSSE3-NEXT: movsd -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
 ; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSSE3-NEXT: retq
 ;
@@ -73,16 +73,16 @@ define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
 ; SSE3-NEXT: movd %xmm2, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE3-NEXT: movd %xmm1, %esi
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
 ; SSE3-NEXT: andl $3, %eax
 ; SSE3-NEXT: andl $3, %ecx
 ; SSE3-NEXT: andl $3, %edx
 ; SSE3-NEXT: andl $3, %esi
-; SSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rsi,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
 ; SSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
 ; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE3-NEXT: retq
@@ -137,7 +137,7 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
 ; SSE3-NEXT: pextrw $5, %xmm1, %r8d
 ; SSE3-NEXT: pextrw $6, %xmm1, %r9d
 ; SSE3-NEXT: pextrw $7, %xmm1, %r10d
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
 ; SSE3-NEXT: andl $7, %eax
 ; SSE3-NEXT: andl $7, %ecx
 ; SSE3-NEXT: andl $7, %edx
@@ -226,69 +226,69 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
 define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v16i8:
 ; SSE3: # %bb.0:
-; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movaps %xmm1, -40(%rsp)
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSE3-NEXT: movzbl -25(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm1
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -26(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm2
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -27(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm4
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -28(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -29(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm6
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -30(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm7
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -31(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm8
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -32(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm5
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -33(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm9
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -34(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm10
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -35(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm12
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -36(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm11
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -37(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm13
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -38(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm14
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -39(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm15
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -40(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm0
@@ -382,9 +382,9 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun
 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movq %xmm1, %rcx
 ; SSE3-NEXT: andl $1, %ecx
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE3-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
 ; SSE3-NEXT: retq
 ;
 ; SSSE3-LABEL: var_shuffle_v2f64:
@@ -394,9 +394,9 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSSE3-NEXT: movq %xmm1, %rcx
 ; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: var_shuffle_v2f64:
@@ -434,16 +434,16 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi
 ; SSE3-NEXT: movd %xmm2, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE3-NEXT: movd %xmm1, %esi
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
 ; SSE3-NEXT: andl $3, %eax
 ; SSE3-NEXT: andl $3, %ecx
 ; SSE3-NEXT: andl $3, %edx
 ; SSE3-NEXT: andl $3, %esi
-; SSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rsi,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
 ; SSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
 ; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE3-NEXT: retq
@@ -490,69 +490,69 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi
 define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
 ; SSE3: # %bb.0:
-; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movaps %xmm1, -40(%rsp)
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSE3-NEXT: movzbl -25(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm1
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -26(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm2
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -27(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm4
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -28(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -29(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm6
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -30(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm7
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -31(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm8
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -32(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm5
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -33(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm9
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -34(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm10
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -35(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm12
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -36(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm11
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -37(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm13
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -38(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm14
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -39(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm15
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -40(%rsp), %eax
 ; SSE3-NEXT: andl $15, %eax
 ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm0
@@ -649,56 +649,56 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
 ; SSE3-NEXT: pushq %r12
 ; SSE3-NEXT: pushq %rbx
 ; SSE3-NEXT: subq $424, %rsp # imm = 0x1A8
-; SSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movaps %xmm2, -128(%rsp)
+; SSE3-NEXT: movaps %xmm1, 400(%rsp)
+; SSE3-NEXT: movaps %xmm0, 384(%rsp)
+; SSE3-NEXT: movzbl -128(%rsp), %eax
 ; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm1, 368(%rsp)
+; SSE3-NEXT: movaps %xmm0, 352(%rsp)
+; SSE3-NEXT: movzbl -127(%rsp), %ecx
+; SSE3-NEXT: movaps %xmm1, 336(%rsp)
+; SSE3-NEXT: movaps %xmm0, 320(%rsp)
+; SSE3-NEXT: movzbl -126(%rsp), %edx
+; SSE3-NEXT: movaps %xmm1, 304(%rsp)
+; SSE3-NEXT: movaps %xmm0, 288(%rsp)
+; SSE3-NEXT: movzbl -125(%rsp), %esi
+; SSE3-NEXT: movaps %xmm1, 272(%rsp)
+; SSE3-NEXT: movaps %xmm0, 256(%rsp)
+; SSE3-NEXT: movzbl -124(%rsp), %edi
+; SSE3-NEXT: movaps %xmm1, 240(%rsp)
+; SSE3-NEXT: movaps %xmm0, 224(%rsp)
+; SSE3-NEXT: movzbl -123(%rsp), %r8d
+; SSE3-NEXT: movaps %xmm1, 208(%rsp)
+; SSE3-NEXT: movaps %xmm0, 192(%rsp)
+; SSE3-NEXT: movzbl -122(%rsp), %r9d
+; SSE3-NEXT: movaps %xmm1, 176(%rsp)
+; SSE3-NEXT: movaps %xmm0, 160(%rsp)
+; SSE3-NEXT: movzbl -121(%rsp), %r10d
+; SSE3-NEXT: movaps %xmm1, 144(%rsp)
+; SSE3-NEXT: movaps %xmm0, 128(%rsp)
+; SSE3-NEXT: movzbl -120(%rsp), %r11d
+; SSE3-NEXT: movaps %xmm1, 112(%rsp)
+; SSE3-NEXT: movaps %xmm0, 96(%rsp)
+; SSE3-NEXT: movzbl -119(%rsp), %ebx
+; SSE3-NEXT: movaps %xmm1, 80(%rsp)
+; SSE3-NEXT: movaps %xmm0, 64(%rsp)
+; SSE3-NEXT: movzbl -118(%rsp), %r14d
+; SSE3-NEXT: movaps %xmm1, 48(%rsp)
+; SSE3-NEXT: movaps %xmm0, 32(%rsp)
+; SSE3-NEXT: movzbl -117(%rsp), %r15d
+; SSE3-NEXT: movaps %xmm1, 16(%rsp)
 ; SSE3-NEXT: movaps %xmm0, (%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
-; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
-; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -116(%rsp), %r12d
+; SSE3-NEXT: movaps %xmm1, -16(%rsp)
+; SSE3-NEXT: movaps %xmm0, -32(%rsp)
+; SSE3-NEXT: movzbl -115(%rsp), %r13d
+; SSE3-NEXT: movaps %xmm1, -48(%rsp)
+; SSE3-NEXT: movaps %xmm0, -64(%rsp)
+; SSE3-NEXT: movzbl -114(%rsp), %ebp
+; SSE3-NEXT: movaps %xmm1, -80(%rsp)
+; SSE3-NEXT: movaps %xmm0, -96(%rsp)
+; SSE3-NEXT: movzbl -113(%rsp), %eax
 ; SSE3-NEXT: andl $31, %eax
 ; SSE3-NEXT: movzbl -96(%rsp,%rax), %eax
 ; SSE3-NEXT: movd %eax, %xmm1
@@ -781,56 +781,56 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
 ; SSSE3-NEXT: pushq %r12
 ; SSSE3-NEXT: pushq %rbx
 ; SSSE3-NEXT: subq $424, %rsp # imm = 0x1A8
-; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: movaps %xmm2, -128(%rsp)
+; SSSE3-NEXT: movaps %xmm1, 400(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 384(%rsp)
+; SSSE3-NEXT: movzbl -128(%rsp), %eax
 ; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movaps %xmm1, 368(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 352(%rsp)
+; SSSE3-NEXT: movzbl -127(%rsp), %ecx
+; SSSE3-NEXT: movaps %xmm1, 336(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 320(%rsp)
+; SSSE3-NEXT: movzbl -126(%rsp), %edx
+; SSSE3-NEXT: movaps %xmm1, 304(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 288(%rsp)
+; SSSE3-NEXT: movzbl -125(%rsp), %esi
+; SSSE3-NEXT: movaps %xmm1, 272(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 256(%rsp)
+; SSSE3-NEXT: movzbl -124(%rsp), %edi
+; SSSE3-NEXT: movaps %xmm1, 240(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 224(%rsp)
+; SSSE3-NEXT: movzbl -123(%rsp), %r8d
+; SSSE3-NEXT: movaps %xmm1, 208(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 192(%rsp)
+; SSSE3-NEXT: movzbl -122(%rsp), %r9d
+; SSSE3-NEXT: movaps %xmm1, 176(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 160(%rsp)
+; SSSE3-NEXT: movzbl -121(%rsp), %r10d
+; SSSE3-NEXT: movaps %xmm1, 144(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 128(%rsp)
+; SSSE3-NEXT: movzbl -120(%rsp), %r11d
+; SSSE3-NEXT: movaps %xmm1, 112(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 96(%rsp)
+; SSSE3-NEXT: movzbl -119(%rsp), %ebx
+; SSSE3-NEXT: movaps %xmm1, 80(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 64(%rsp)
+; SSSE3-NEXT: movzbl -118(%rsp), %r14d
+; SSSE3-NEXT: movaps %xmm1, 48(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 32(%rsp)
+; SSSE3-NEXT: movzbl -117(%rsp), %r15d
+; SSSE3-NEXT: movaps %xmm1, 16(%rsp)
 ; SSSE3-NEXT: movaps %xmm0, (%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
-; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
-; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: movzbl -116(%rsp), %r12d
+; SSSE3-NEXT: movaps %xmm1, -16(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -32(%rsp)
+; SSSE3-NEXT: movzbl -115(%rsp), %r13d
+; SSSE3-NEXT: movaps %xmm1, -48(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -64(%rsp)
+; SSSE3-NEXT: movzbl -114(%rsp), %ebp
+; SSSE3-NEXT: movaps %xmm1, -80(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -96(%rsp)
+; SSSE3-NEXT: movzbl -113(%rsp), %eax
 ; SSSE3-NEXT: andl $31, %eax
 ; SSSE3-NEXT: movzbl -96(%rsp,%rax), %eax
 ; SSSE3-NEXT: movd %eax, %xmm1
@@ -908,39 +908,39 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: subq $392, %rsp # imm = 0x188
 ; SSE41-NEXT: movd %xmm2, %eax
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm1, 368(%rsp)
+; SSE41-NEXT: movaps %xmm0, 352(%rsp)
 ; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm1, 336(%rsp)
+; SSE41-NEXT: movaps %xmm0, 320(%rsp)
+; SSE41-NEXT: movaps %xmm1, 304(%rsp)
+; SSE41-NEXT: movaps %xmm0, 288(%rsp)
+; SSE41-NEXT: movaps %xmm1, 272(%rsp)
+; SSE41-NEXT: movaps %xmm0, 256(%rsp)
+; SSE41-NEXT: movaps %xmm1, 240(%rsp)
+; SSE41-NEXT: movaps %xmm0, 224(%rsp)
+; SSE41-NEXT: movaps %xmm1, 208(%rsp)
+; SSE41-NEXT: movaps %xmm0, 192(%rsp)
+; SSE41-NEXT: movaps %xmm1, 176(%rsp)
+; SSE41-NEXT: movaps %xmm0, 160(%rsp)
+; SSE41-NEXT: movaps %xmm1, 144(%rsp)
+; SSE41-NEXT: movaps %xmm0, 128(%rsp)
+; SSE41-NEXT: movaps %xmm1, 112(%rsp)
+; SSE41-NEXT: movaps %xmm0, 96(%rsp)
+; SSE41-NEXT: movaps %xmm1, 80(%rsp)
+; SSE41-NEXT: movaps %xmm0, 64(%rsp)
+; SSE41-NEXT: movaps %xmm1, 48(%rsp)
+; SSE41-NEXT: movaps %xmm0, 32(%rsp)
+; SSE41-NEXT: movaps %xmm1, 16(%rsp)
 ; SSE41-NEXT: movaps %xmm0, (%rsp)
-; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm1, -16(%rsp)
+; SSE41-NEXT: movaps %xmm0, -32(%rsp)
+; SSE41-NEXT: movaps %xmm1, -48(%rsp)
+; SSE41-NEXT: movaps %xmm0, -64(%rsp)
+; SSE41-NEXT: movaps %xmm1, -80(%rsp)
+; SSE41-NEXT: movaps %xmm0, -96(%rsp)
+; SSE41-NEXT: movaps %xmm1, -112(%rsp)
+; SSE41-NEXT: movaps %xmm0, -128(%rsp)
 ; SSE41-NEXT: movzbl 352(%rsp,%rax), %eax
 ; SSE41-NEXT: movd %eax, %xmm0
 ; SSE41-NEXT: pextrb $1, %xmm2, %eax
@@ -1104,16 +1104,16 @@ define void @indices_convert() {
 ; SSE3-NEXT: movdqa (%rax), %xmm0
 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE3-NEXT: movd %xmm1, %eax
-; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movdqa %xmm0, -24(%rsp)
+; SSE3-NEXT: movdqa %xmm0, -40(%rsp)
 ; SSE3-NEXT: andl $3, %eax
 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
 ; SSE3-NEXT: movd %xmm1, %ecx
-; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movdqa %xmm0, -56(%rsp)
+; SSE3-NEXT: movdqa %xmm0, -72(%rsp)
 ; SSE3-NEXT: andl $3, %ecx
-; SSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE3-NEXT: movsd -72(%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
+; SSE3-NEXT: movsd -40(%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
 ; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE3-NEXT: movups %xmm1, (%rax)
 ; SSE3-NEXT: retq
@@ -1123,16 +1123,16 @@ define void @indices_convert() {
 ; SSSE3-NEXT: movdqa (%rax), %xmm0
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSSE3-NEXT: movd %xmm1, %eax
-; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movdqa %xmm0, -24(%rsp)
+; SSSE3-NEXT: movdqa %xmm0, -40(%rsp)
 ; SSSE3-NEXT: andl $3, %eax
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
 ; SSSE3-NEXT: movd %xmm1, %ecx
-; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movdqa %xmm0, -56(%rsp)
+; SSSE3-NEXT: movdqa %xmm0, -72(%rsp)
 ; SSSE3-NEXT: andl $3, %ecx
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT: movsd -72(%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
+; SSSE3-NEXT: movsd -40(%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
 ; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSSE3-NEXT: movups %xmm1, (%rax)
 ; SSSE3-NEXT: retq
@@ -1141,15 +1141,15 @@ define void @indices_convert() {
 ; SSE41: # %bb.0: # %bb
 ; SSE41-NEXT: movaps (%rax), %xmm0
 ; SSE41-NEXT: extractps $2, %xmm0, %eax
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm0, -24(%rsp)
+; SSE41-NEXT: movaps %xmm0, -40(%rsp)
 ; SSE41-NEXT: andl $3, %eax
 ; SSE41-NEXT: extractps $3, %xmm0, %ecx
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm0, -56(%rsp)
+; SSE41-NEXT: movaps %xmm0, -72(%rsp)
 ; SSE41-NEXT: andl $3, %ecx
-; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE41-NEXT: movsd -72(%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
+; SSE41-NEXT: movsd -40(%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
 ; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE41-NEXT: movups %xmm1, (%rax)
 ; SSE41-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index 6c07c4ca523f8f..bf5949a5e55894 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --no_x86_scrub_mem_shuffle
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefix=XOP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=INT256,AVX2
@@ -1189,8 +1189,8 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr
 ; AVX2-NEXT: andl $3, %eax
 ; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
 ; AVX2-NEXT: andl $3, %ecx
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovq (%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq (%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT: vmovapd {{.*#+}} ymm1 = [34,68,102,136]
@@ -1211,8 +1211,8 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr
 ; AVX512-NEXT: andl $3, %eax
 ; AVX512-NEXT: vpextrd $1, %xmm1, %ecx
 ; AVX512-NEXT: andl $3, %ecx
-; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT: vmovq (%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
+; AVX512-NEXT: vmovq (%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX512-NEXT: vpcmpgtq %zmm0, %zmm2, %k1
 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [17,51,85,119]
@@ -1234,8 +1234,8 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr
 ; AVX512VL-NEXT: andl $3, %eax
 ; AVX512VL-NEXT: vpextrd $1, %xmm1, %ecx
 ; AVX512VL-NEXT: andl $3, %ecx
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512VL-NEXT: vmovq (%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
+; AVX512VL-NEXT: vmovq (%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX512VL-NEXT: vpcmpgtq %ymm0, %ymm2, %k1
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [34,68,102,136]
diff --git a/llvm/test/CodeGen/X86/var-permute-512.ll b/llvm/test/CodeGen/X86/var-permute-512.ll
index 032ffb0d0bf7d9..88788013e49431 100644
--- a/llvm/test/CodeGen/X86/var-permute-512.ll
+++ b/llvm/test/CodeGen/X86/var-permute-512.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --no_x86_scrub_mem_shuffle
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI

From 9688bbcb5d237b375a17ab9b72021cedc9b2648e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 4 Jul 2024 20:54:43 +0100
Subject: [PATCH 2/5] [X86] matchAddressRecursively - don't fold
 zext(shl(x,c)) -> shl(zext(x),c)) if the pattern has multiple uses

Fixes #97533 crash where we hit a case where the root node had
referenced the original zext node, which we then deleted - hopefully
I can come up with a better solution, but the codegen changes don't
look too bad atm (pulls out a shift from some complex LEA nodes that
shared the scaled index).
(cherry picked from commit e975ff0a223e79842b693e0ec4d3cac87963869a)
---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 2 +-
 llvm/test/CodeGen/X86/addr-mode-matcher-3.ll | 28 ++++++++++++++++++++
 llvm/test/CodeGen/X86/sttni.ll | 20 ++++++++------
 3 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 553d338b77904a..924bd6bd89e47e 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2692,7 +2692,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
       Src = Src.getOperand(0);
     }
 
-    if (Src.getOpcode() == ISD::SHL && Src.hasOneUse()) {
+    if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
       // Give up if the shift is not a valid scale factor [1,2,3].
       SDValue ShlSrc = Src.getOperand(0);
       SDValue ShlAmt = Src.getOperand(1);
diff --git a/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll b/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
index daa521d3917cdf..0c7275ec28677b 100644
--- a/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
+++ b/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
@@ -70,3 +70,31 @@ define i32 @mask_offset_scale_zext_i32_i64(ptr %base, i32 %i) {
   %load = load i32, ptr %arrayidx, align 4
   ret i32 %load
 }
+
+; PR97533 - multiple uses of shl node (add + gep) in the same dependency chain.
+define i64 @add_shl_zext(ptr %ptr, i8 %arg) nounwind {
+; X86-LABEL: add_shl_zext:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl 4(%esi,%ecx,4), %edx
+; X86-NEXT: leal (,%ecx,8), %eax
+; X86-NEXT: addl (%esi,%ecx,4), %eax
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: add_shl_zext:
+; X64: # %bb.0:
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: shll $3, %eax
+; X64-NEXT: addq (%rdi,%rax), %rax
+; X64-NEXT: retq
+  %idx = zext i8 %arg to i64
+  %gep = getelementptr ptr, ptr %ptr, i64 %idx
+  %val = load i64, ptr %gep, align 8
+  %shl = shl i64 %idx, 3
+  %sum = add i64 %val, %shl
+  ret i64 %sum
+}
diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll
index 870912bb6bb1be..39cbee54737c38 100644
--- a/llvm/test/CodeGen/X86/sttni.ll
+++ b/llvm/test/CodeGen/X86/sttni.ll
@@ -341,9 +341,10 @@ define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs,
 ; X64-NEXT: .LBB8_2: # %compare
 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT: andl $7, %ecx
-; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT: addl %ecx, %ecx
+; X64-NEXT: movzwl -24(%rsp,%rcx), %eax
 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT: subw -40(%rsp,%rcx), %ax
 ; X64-NEXT: movzwl %ax, %eax
 ; X64-NEXT: retq
 entry:
@@ -481,9 +482,10 @@ define i32 @pcmpestri_mem_diff_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32
 ; X64-NEXT: .LBB11_2: # %compare
 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NEXT: andl $7, %ecx
-; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT: addl %ecx, %ecx
+; X64-NEXT: movzwl -24(%rsp,%rcx), %eax
 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT: subw -40(%rsp,%rcx), %ax
 ; X64-NEXT: movzwl %ax, %eax
 ; X64-NEXT: retq
 entry:
@@ -795,9 +797,10 @@ define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
 ; X64-NEXT: .LBB20_2: # %compare
 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT: andl $7, %ecx
-; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT: addl %ecx, %ecx
+; X64-NEXT: movzwl -24(%rsp,%rcx), %eax
 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT: subw -40(%rsp,%rcx), %ax
 ; X64-NEXT: movzwl %ax, %eax
 ; X64-NEXT: retq
 entry:
@@ -915,9 +918,10 @@ define i32 @pcmpistri_mem_diff_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
 ; X64-NEXT: .LBB23_2: # %compare
 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NEXT: andl $7, %ecx
-; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT: addl %ecx, %ecx
+; X64-NEXT: movzwl -24(%rsp,%rcx), %eax
 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT: subw -40(%rsp,%rcx), %ax
 ; X64-NEXT: movzwl %ax, %eax
 ; X64-NEXT: retq
 entry:

From 55cbec47218e3dcb94d1a025a74a81850d85e056 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Thu, 4 Jul 2024 12:47:33 +0200
Subject: [PATCH 3/5] [Mem2Reg] Add some single store tests (NFC)

For https://github.com/llvm/llvm-project/issues/97702.

(cherry picked from commit e7bfd4d77fafdcad890f80f8feee50ca02e0e2c3)
---
 llvm/test/Transforms/Mem2Reg/single-store.ll | 67 ++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 llvm/test/Transforms/Mem2Reg/single-store.ll

diff --git a/llvm/test/Transforms/Mem2Reg/single-store.ll b/llvm/test/Transforms/Mem2Reg/single-store.ll
new file mode 100644
index 00000000000000..b82e26158a361f
--- /dev/null
+++ b/llvm/test/Transforms/Mem2Reg/single-store.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=mem2reg < %s | FileCheck %s
+
+; FIXME: This is a miscompile.
+define i8 @single_store_literal_poison(i1 %cond) {
+; CHECK-LABEL: define i8 @single_store_literal_poison(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[EXIT:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 poison
+;
+  %a = alloca i8, align 1
+  br i1 %cond, label %if, label %exit
+
+if:
+  store i8 poison, ptr %a, align 1
+  br label %exit
+
+exit:
+  %v = load i8, ptr %a, align 1
+  ret i8 %v
+}
+
+; FIXME: This is a miscompile.
+define i8 @single_store_maybe_poison(i1 %cond, i8 %x) {
+; CHECK-LABEL: define i8 @single_store_maybe_poison(
+; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]]) {
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[EXIT:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 [[X]]
+;
+  %a = alloca i8, align 1
+  br i1 %cond, label %if, label %exit
+
+if:
+  store i8 %x, ptr %a, align 1
+  br label %exit
+
+exit:
+  %v = load i8, ptr %a, align 1
+  ret i8 %v
+}
+
+define i8 @single_store_cant_be_poison(i1 %cond, i8 noundef %x) {
+; CHECK-LABEL: define i8 @single_store_cant_be_poison(
+; CHECK-SAME: i1 [[COND:%.*]], i8 noundef [[X:%.*]]) {
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[EXIT:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 [[X]]
+;
+  %a = alloca i8, align 1
+  br i1 %cond, label %if, label %exit
+
+if:
+  store i8 %x, ptr %a, align 1
+  br label %exit
+
+exit:
+  %v = load i8, ptr %a, align 1
+  ret i8 %v
+}

From 9457ded3cd7173179ea0ef76c966368ca2fcfc9d Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Thu, 4 Jul 2024 14:41:12 +0200
Subject: [PATCH 4/5] [Mem2Reg] Don't use single store optimization for
 potentially poison value (#97711)

If there is a single store, then loads must either load the stored
value or uninitialized memory (undef).
If the stored value may be
poison, then replacing an uninitialized memory load with it would be
incorrect. Fall back to the generic code in that case.

This PR only fixes the case where there is a literal poison store --
the case where the value is non-trivially poison will still get
miscompiled by phi simplification later, see #96631.

Fixes https://github.com/llvm/llvm-project/issues/97702.

(cherry picked from commit f58930f705884dfac3bd8c481c827d027a6068cb)
---
 llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 10 ++++++++--
 llvm/test/Transforms/Mem2Reg/single-store.ll | 3 +--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 88b05aab8db4df..d37342faad6ee5 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -511,7 +511,14 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI,
                          SmallSet<DbgAssignIntrinsic *, 8> *DbgAssignsToDelete,
                          SmallSet<DPValue *, 8> *DPVAssignsToDelete) {
   StoreInst *OnlyStore = Info.OnlyStore;
-  bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
+  Value *ReplVal = OnlyStore->getOperand(0);
+  // Loads may either load the stored value or uninitialized memory (undef).
+  // If the stored value may be poison, then replacing an uninitialized memory
+  // load with it would be incorrect.
+  if (!isGuaranteedNotToBePoison(ReplVal))
+    return false;
+
+  bool StoringGlobalVal = !isa<Instruction>(ReplVal);
   BasicBlock *StoreBB = OnlyStore->getParent();
   int StoreIndex = -1;
 
@@ -551,7 +558,6 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI,
   }
 
   // Otherwise, we *can* safely rewrite this load.
-  Value *ReplVal = OnlyStore->getOperand(0);
   // If the replacement value is the load, this must occur in unreachable
   // code.
   if (ReplVal == LI)
diff --git a/llvm/test/Transforms/Mem2Reg/single-store.ll b/llvm/test/Transforms/Mem2Reg/single-store.ll
index b82e26158a361f..f864227c49145c 100644
--- a/llvm/test/Transforms/Mem2Reg/single-store.ll
+++ b/llvm/test/Transforms/Mem2Reg/single-store.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=mem2reg < %s | FileCheck %s
 
-; FIXME: This is a miscompile.
 define i8 @single_store_literal_poison(i1 %cond) {
 ; CHECK-LABEL: define i8 @single_store_literal_poison(
 ; CHECK-SAME: i1 [[COND:%.*]]) {
@@ -9,7 +8,7 @@ define i8 @single_store_literal_poison(i1 %cond) {
 ; CHECK: [[IF]]:
 ; CHECK-NEXT: br label %[[EXIT]]
 ; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret i8 poison
+; CHECK-NEXT: ret i8 undef
 ;
   %a = alloca i8, align 1
   br i1 %cond, label %if, label %exit

From 6d3d4f9a522db5d400b155f3fafc839f4551d4ed Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Fri, 5 Jul 2024 09:13:59 +0200
Subject: [PATCH 5/5] [Mem2Reg] Always allow single-store optimization for
 dominating stores

In #97711 the single-store optimization was disabled for the case
where the value is potentially poison, as this may produce incorrect
results for loads of uninitialized memory. However, this resulted in
compile-time regressions. Address these by still allowing the
single-store optimization to occur in cases where the store dominates
the load, as we know that such a load will always read initialized
memory.
(cherry picked from commit daaea128bb84f8ed7b9de36aa3a51f33b775c05a)
---
 llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index d37342faad6ee5..bc0d12f47055fd 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -514,11 +514,10 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI,
   Value *ReplVal = OnlyStore->getOperand(0);
   // Loads may either load the stored value or uninitialized memory (undef).
   // If the stored value may be poison, then replacing an uninitialized memory
-  // load with it would be incorrect.
-  if (!isGuaranteedNotToBePoison(ReplVal))
-    return false;
-
-  bool StoringGlobalVal = !isa<Instruction>(ReplVal);
+  // load with it would be incorrect. If the store dominates the load, we know
+  // it is always initialized.
+  bool RequireDominatingStore =
+      isa<Instruction>(ReplVal) || !isGuaranteedNotToBePoison(ReplVal);
   BasicBlock *StoreBB = OnlyStore->getParent();
   int StoreIndex = -1;
 
@@ -535,7 +534,7 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI,
   // only value stored to the alloca. We can do this if the value is
   // dominated by the store. If not, we use the rest of the mem2reg machinery
   // to insert the phi nodes as needed.
-  if (!StoringGlobalVal) { // Non-instructions are always dominated.
+  if (RequireDominatingStore) {
     if (LI->getParent() == StoreBB) {
      // If we have a use that is in the same block as the store, compare the
      // indices of the two instructions to see which one came first. If the
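
For illustration (an editor's sketch in the style of single-store.ll, not part
of the series; the function name is hypothetical): PATCH 5/5 keeps the
single-store fast path for a store that dominates the load, since such a load
can never observe uninitialized memory, so forwarding the stored value is safe
even when it is not provably non-poison:

define i8 @single_store_dominating(i1 %cond, i8 %x) {
  %a = alloca i8, align 1
  store i8 %x, ptr %a, align 1    ; the only store; dominates every load of %a
  br i1 %cond, label %if, label %exit

if:
  br label %exit

exit:
  %v = load i8, ptr %a, align 1   ; always executes after the store
  ret i8 %v                       ; mem2reg can forward %x here, poison or not
}

Under the PATCH 4/5 check alone this would fall back to the generic phi-based
rewrite whenever %x is not known non-poison; PATCH 5/5 restores the fast path
because the existing dominance check already guarantees the load reads
initialized memory.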