-
Notifications
You must be signed in to change notification settings - Fork 12.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DAG: Fix vector_shuffle -> splat fold defining undef lanes #123596
base: main
Are you sure you want to change the base?
DAG: Fix vector_shuffle -> splat fold defining undef lanes #123596
Conversation
For shuffle vector splats with undef lanes in the mask, this was introducing real values. Filter out build_vector results based on the undef elements in the mask. This avoids AMDGPU test regressions in a future change. test/CodeGen/X86/urem-seteq-illegal-types.ll looks worse but I didn't investigate.
@llvm/pr-subscribers-backend-powerpc @llvm/pr-subscribers-llvm-selectiondag Author: Matt Arsenault (arsenm) Changes: For shuffle vector splats with undef lanes in the mask, this was introducing real values. Filter out build_vector results based on the undef elements in the mask. This avoids AMDGPU test regressions in a future change. test/CodeGen/X86/urem-seteq-illegal-types.ll looks worse but I didn't investigate. Patch is 24.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123596.diff 9 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 49e5b7d9ef0141..671a14e6250fc6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -26373,9 +26373,17 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
if (AllSame)
return N0;
- // Canonicalize any other splat as a build_vector.
+ // Canonicalize any other splat as a build_vector, but avoid defining any
+ // undefined elements in the mask.
SDValue Splatted = V->getOperand(SplatIndex);
SmallVector<SDValue, 8> Ops(NumElts, Splatted);
+ EVT EltVT = Splatted.getValueType();
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (SVN->getMaskElt(i) < 0)
+ Ops[i] = DAG.getUNDEF(EltVT);
+ }
+
SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);
// We may have jumped through bitcasts, so the type of the
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
index 4a036a7868c1a9..95ff0d9a3a9c60 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
@@ -3628,15 +3628,15 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR9LE-LABEL: v2ppcf128_fast:
; PWR9LE: # %bb.0: # %entry
; PWR9LE-NEXT: mflr r0
-; PWR9LE-NEXT: stdu r1, -64(r1)
-; PWR9LE-NEXT: std r0, 80(r1)
+; PWR9LE-NEXT: stdu r1, -48(r1)
+; PWR9LE-NEXT: std r0, 64(r1)
; PWR9LE-NEXT: bl __gcc_qadd
; PWR9LE-NEXT: nop
; PWR9LE-NEXT: stfd f2, 40(r1)
; PWR9LE-NEXT: stfd f1, 32(r1)
; PWR9LE-NEXT: lxv vs1, 32(r1)
; PWR9LE-NEXT: xxswapd vs2, vs1
-; PWR9LE-NEXT: addi r1, r1, 64
+; PWR9LE-NEXT: addi r1, r1, 48
; PWR9LE-NEXT: ld r0, 16(r1)
; PWR9LE-NEXT: mtlr r0
; PWR9LE-NEXT: blr
@@ -3644,15 +3644,15 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR9BE-LABEL: v2ppcf128_fast:
; PWR9BE: # %bb.0: # %entry
; PWR9BE-NEXT: mflr r0
-; PWR9BE-NEXT: stdu r1, -144(r1)
-; PWR9BE-NEXT: std r0, 160(r1)
+; PWR9BE-NEXT: stdu r1, -128(r1)
+; PWR9BE-NEXT: std r0, 144(r1)
; PWR9BE-NEXT: bl __gcc_qadd
; PWR9BE-NEXT: nop
; PWR9BE-NEXT: stfd f2, 120(r1)
; PWR9BE-NEXT: stfd f1, 112(r1)
; PWR9BE-NEXT: lxv vs1, 112(r1)
; PWR9BE-NEXT: xxswapd vs2, vs1
-; PWR9BE-NEXT: addi r1, r1, 144
+; PWR9BE-NEXT: addi r1, r1, 128
; PWR9BE-NEXT: ld r0, 16(r1)
; PWR9BE-NEXT: mtlr r0
; PWR9BE-NEXT: blr
@@ -3661,13 +3661,13 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR10LE: # %bb.0: # %entry
; PWR10LE-NEXT: mflr r0
; PWR10LE-NEXT: std r0, 16(r1)
-; PWR10LE-NEXT: stdu r1, -64(r1)
+; PWR10LE-NEXT: stdu r1, -48(r1)
; PWR10LE-NEXT: bl __gcc_qadd@notoc
; PWR10LE-NEXT: stfd f2, 40(r1)
; PWR10LE-NEXT: stfd f1, 32(r1)
; PWR10LE-NEXT: lxv vs1, 32(r1)
; PWR10LE-NEXT: xxswapd vs2, vs1
-; PWR10LE-NEXT: addi r1, r1, 64
+; PWR10LE-NEXT: addi r1, r1, 48
; PWR10LE-NEXT: ld r0, 16(r1)
; PWR10LE-NEXT: mtlr r0
; PWR10LE-NEXT: blr
@@ -3676,14 +3676,14 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR10BE: # %bb.0: # %entry
; PWR10BE-NEXT: mflr r0
; PWR10BE-NEXT: std r0, 16(r1)
-; PWR10BE-NEXT: stdu r1, -144(r1)
+; PWR10BE-NEXT: stdu r1, -128(r1)
; PWR10BE-NEXT: bl __gcc_qadd
; PWR10BE-NEXT: nop
; PWR10BE-NEXT: stfd f2, 120(r1)
; PWR10BE-NEXT: stfd f1, 112(r1)
; PWR10BE-NEXT: lxv vs1, 112(r1)
; PWR10BE-NEXT: xxswapd vs2, vs1
-; PWR10BE-NEXT: addi r1, r1, 144
+; PWR10BE-NEXT: addi r1, r1, 128
; PWR10BE-NEXT: ld r0, 16(r1)
; PWR10BE-NEXT: mtlr r0
; PWR10BE-NEXT: blr
diff --git a/llvm/test/CodeGen/WebAssembly/simd.ll b/llvm/test/CodeGen/WebAssembly/simd.ll
index 5ec9f6a2a321b3..7228d5335a33f6 100644
--- a/llvm/test/CodeGen/WebAssembly/simd.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd.ll
@@ -481,21 +481,6 @@ define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v16i8:
; NO-SIMD128: .functype shuffle_undef_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.store8 15($0), $2
-; NO-SIMD128-NEXT: i32.store8 14($0), $2
-; NO-SIMD128-NEXT: i32.store8 13($0), $2
-; NO-SIMD128-NEXT: i32.store8 12($0), $2
-; NO-SIMD128-NEXT: i32.store8 11($0), $2
-; NO-SIMD128-NEXT: i32.store8 10($0), $2
-; NO-SIMD128-NEXT: i32.store8 9($0), $2
-; NO-SIMD128-NEXT: i32.store8 8($0), $2
-; NO-SIMD128-NEXT: i32.store8 7($0), $2
-; NO-SIMD128-NEXT: i32.store8 6($0), $2
-; NO-SIMD128-NEXT: i32.store8 5($0), $2
-; NO-SIMD128-NEXT: i32.store8 4($0), $2
-; NO-SIMD128-NEXT: i32.store8 3($0), $2
-; NO-SIMD128-NEXT: i32.store8 2($0), $2
-; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <16 x i8> %x, <16 x i8> %y,
@@ -994,13 +979,6 @@ define <8 x i16> @shuffle_undef_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v8i16:
; NO-SIMD128: .functype shuffle_undef_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.store16 14($0), $2
-; NO-SIMD128-NEXT: i32.store16 12($0), $2
-; NO-SIMD128-NEXT: i32.store16 10($0), $2
-; NO-SIMD128-NEXT: i32.store16 8($0), $2
-; NO-SIMD128-NEXT: i32.store16 6($0), $2
-; NO-SIMD128-NEXT: i32.store16 4($0), $2
-; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <8 x i16> %x, <8 x i16> %y,
@@ -1288,9 +1266,6 @@ define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4i32:
; NO-SIMD128: .functype shuffle_undef_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.store 12($0), $2
-; NO-SIMD128-NEXT: i32.store 8($0), $2
-; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x i32> %x, <4 x i32> %y,
@@ -1550,7 +1525,6 @@ define <2 x i64> @shuffle_undef_v2i64(<2 x i64> %x, <2 x i64> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v2i64:
; NO-SIMD128: .functype shuffle_undef_v2i64 (i32, i64, i64, i64, i64) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i64.store 8($0), $2
; NO-SIMD128-NEXT: i64.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <2 x i64> %x, <2 x i64> %y,
@@ -1819,9 +1793,6 @@ define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4f32:
; NO-SIMD128: .functype shuffle_undef_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.store 12($0), $2
-; NO-SIMD128-NEXT: f32.store 8($0), $2
-; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x float> %x, <4 x float> %y,
@@ -2082,7 +2053,6 @@ define <2 x double> @shuffle_undef_v2f64(<2 x double> %x, <2 x double> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v2f64:
; NO-SIMD128: .functype shuffle_undef_v2f64 (i32, f64, f64, f64, f64) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f64.store 8($0), $2
; NO-SIMD128-NEXT: f64.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <2 x double> %x, <2 x double> %y,
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 08d9183bd30b67..fa95ce384533c4 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -621,15 +621,14 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -1110,15 +1109,14 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pxor %xmm5, %xmm3
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index 28ac4496acb9be..97cc1f8a156943 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -141,8 +141,10 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
; SSE2-NEXT: movdqa %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 838086e366fbfd..d17f2135cccad5 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -159,19 +159,18 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -237,19 +236,18 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
@@ -536,19 +534,18 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -968,19 +965,18 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 7e081310c35be5..49cb7c707a14f3 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -474,8 +474,6 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pand %xmm7, %xmm6
; SSE2-NEXT: paddd %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
@@ -548,8 +546,6 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
; SSSE3-NEXT: pand %xmm7, %xmm6
; SSSE3-NEXT: paddd %xmm8, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSSE3-NEXT: pmuludq %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
@@ -578,25 +574,23 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmuldq %xmm2, %xmm0
; SSE41-NEXT: pinsrd $3, %r8d, %xmm2
-; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT: movd %r9d, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pmuldq %xmm3, %xmm4
-; SSE41-NEXT: pinsrd $1, %edx, %xmm3
-; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %esi
-; SSE41-NEXT: pinsrd $1, %esi, %xmm5
+; SSE41-NEXT: pinsrd $1, %ecx, %xmm3
+; SSE...
[truncated]
|
For shuffle vector splats with undef lanes in the mask,
this was introducing real values. Filter out build_vector
results based on the undef elements in the mask.
This avoids AMDGPU test regressions in a future change.
test/CodeGen/X86/urem-seteq-illegal-types.ll looks worse
but I didn't investigate.