llvm.org GIT mirror llvm / 524491c

[TargetLowering][AMDGPU][X86] Improve SimplifyDemandedBits bitcast handling

This patch adds support for BigBitWidth -> SmallBitWidth bitcasts, splitting the DemandedBits/Elts accordingly.

The AMDGPU backend needed an extra (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1) combine to encourage BFE creation. I investigated putting this in DAGCombine, but it caused a lot of noise on other targets - some improvements, some regressions. The X86 changes are all definite wins.

Differential Revision: https://reviews.llvm.org/D60462

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@358887 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim

8 changed files with 116 additions and 181 deletions.
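
The core of the change is the index math in the new TargetLowering branch below: when wide source elements of NumSrcEltBits bits are bitcast (on a little-endian target) into Scale = NumSrcEltBits / BitWidth narrow elements each, demanded narrow element i contributes its demanded bits to source element i / Scale at bit offset (i % Scale) * BitWidth. A minimal standalone C++ sketch of that mapping, with plain integer masks standing in for APInt (the parameters and values are illustrative, not LLVM's):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative parameters: bitcast v2i64 -> v8i16 on a little-endian target.
  const unsigned NumSrcEltBits = 64, BitWidth = 16, NumElts = 8;
  const unsigned Scale = NumSrcEltBits / BitWidth; // narrow elts per wide elt

  const uint16_t DemandedBits = 0x00FF; // low byte of each demanded i16
  const uint8_t DemandedElts = 0x12;    // only narrow elements 1 and 4 live

  // As in the patch, one NumSrcEltBits-wide bit mask is shared by all demanded
  // source elements (the union of per-element demands), plus an element mask.
  uint64_t DemandedSrcBits = 0;
  unsigned DemandedSrcElts = 0;
  for (unsigned i = 0; i != NumElts; ++i)
    if (DemandedElts & (1u << i)) {
      unsigned Offset = (i % Scale) * BitWidth; // position in the wide elt
      DemandedSrcBits |= (uint64_t)DemandedBits << Offset;
      DemandedSrcElts |= 1u << (i / Scale);     // which wide elt is live
    }

  // Narrow elt 1 -> src elt 0, bits [16,31]; narrow elt 4 -> src elt 1, bits [0,15].
  assert(DemandedSrcBits == 0x0000000000FF00FFull);
  assert(DemandedSrcElts == 0x3);
  printf("DemandedSrcBits=%#llx DemandedSrcElts=%#x\n",
         (unsigned long long)DemandedSrcBits, (unsigned)DemandedSrcElts);
  return 0;
}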

@@ -1470,12 +1470,36 @@
     if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
                              KnownSrcBits, TLO, Depth + 1))
       return true;
+  } else if ((NumSrcEltBits % BitWidth) == 0 &&
+             TLO.DAG.getDataLayout().isLittleEndian()) {
+    unsigned Scale = NumSrcEltBits / BitWidth;
+    unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
+    APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
+    APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
+    for (unsigned i = 0; i != NumElts; ++i)
+      if (DemandedElts[i]) {
+        unsigned Offset = (i % Scale) * BitWidth;
+        DemandedSrcBits.insertBits(DemandedBits, Offset);
+        DemandedSrcElts.setBit(i / Scale);
+      }
+
+    if (SrcVT.isVector()) {
+      APInt KnownSrcUndef, KnownSrcZero;
+      if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
+                                     KnownSrcZero, TLO, Depth + 1))
+        return true;
+    }
+
+    KnownBits KnownSrcBits;
+    if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
+                             KnownSrcBits, TLO, Depth + 1))
+      return true;
   }
 
   // If this is a bitcast, let computeKnownBits handle it. Only do this on a
   // recursive call where Known may be useful to the caller.
   if (Depth > 0) {
-    Known = TLO.DAG.computeKnownBits(Op, Depth);
+    Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
     return false;
   }
   break;

@@ -3146,30 +3146,44 @@
 
 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
-  if (N->getValueType(0) != MVT::i64)
-    return SDValue();
-
-  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!RHS)
     return SDValue();
 
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
   unsigned ShiftAmt = RHS->getZExtValue();
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+
+  // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
+  // This improves the ability to match BFE patterns in isel.
+  if (LHS.getOpcode() == ISD::AND) {
+    if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
+      if (Mask->getAPIntValue().isShiftedMask() &&
+          Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
+        return DAG.getNode(
+            ISD::AND, SL, VT,
+            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
+            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
+      }
+    }
+  }
+
+  if (VT != MVT::i64)
+    return SDValue();
+
   if (ShiftAmt < 32)
     return SDValue();
 
   // srl i64:x, C for C >= 32
   // =>
   // build_pair (srl hi_32(x), C - 32), 0
-
-  SelectionDAG &DAG = DCI.DAG;
-  SDLoc SL(N);
-
   SDValue One = DAG.getConstant(1, SL, MVT::i32);
   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
 
-  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
-  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
-                           VecOp, One);
+  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
 
   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
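
Both rewrites performed by performSrlCombine above are plain bit identities, checked here in standalone scalar C++ (illustrative values, no LLVM API):

#include <cassert>
#include <cstdint>

int main() {
  // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1):
  // when the mask is a shifted mask whose trailing-zero count equals the
  // shift amount, masking then shifting equals shifting then masking.
  const uint32_t x = 0xDEADBEEF;
  const uint32_t c1 = 0x7F, c2 = 16; // mask 0x7F0000: 16 trailing zeros
  assert(((x & (c1 << c2)) >> c2) == ((x >> c2) & c1));

  // srl i64:x, C for C >= 32  =>  build_pair (srl hi_32(x), C - 32), 0:
  // only the high 32 bits of x can reach the result, so shift them as i32.
  const uint64_t y = 0x123456789ABCDEF0ull;
  const unsigned C = 36;
  const uint32_t Hi = (uint32_t)(y >> 32);
  assert((y >> C) == (uint64_t)(Hi >> (C - 32))); // high result half is 0
  return 0;
}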

@@ -85,8 +85,8 @@
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    ds_write_b16 v1, v2 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f0000, v0
-; GFX9-NEXT:    ds_write_b8_d16_hi v1, v0 offset:6
+; GFX9-NEXT:    v_bfe_u32 v0, v0, 16, 7
+; GFX9-NEXT:    ds_write_b8 v1, v0 offset:6
 ; GFX9-NEXT:    ds_write_b32 v1, v3
 ; GFX9-NEXT:    s_endpgm
   store i55 %arg, i55 addrspace(3)* %ptr, align 8
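
The GFX9 delta above is the motivating case for the new combine: rewriting (srl (and x, 0x7f0000), 16) as (and (srl x, 16), 0x7f) lets isel match a single v_bfe_u32 and drop the d16_hi store variant. A sketch of the unsigned bitfield-extract semantics being matched, as I read the instruction for widths below 32 (illustrative, not compiler code):

#include <cassert>
#include <cstdint>

// v_bfe_u32 dst, src, offset, width (assumed semantics, 0 < width < 32):
// take 'width' bits of 'src' starting at bit 'offset', zero-extended.
static uint32_t bfe_u32(uint32_t src, unsigned offset, unsigned width) {
  return (src >> offset) & ((1u << width) - 1u);
}

int main() {
  const uint32_t v = 0xDEADBEEF;
  // v_bfe_u32 v0, v0, 16, 7 replaces the old v_and_b32 with 0x7f0000
  // followed by a store of the masked value's high 16 bits.
  assert(bfe_u32(v, 16, 7) == ((v & 0x7f0000u) >> 16));
  return 0;
}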

@@ -447,22 +447,6 @@
 define void @bitcast_4i64_store(i4* %p, <4 x i64> %a0) {
 ; SSE2-SSSE3-LABEL: bitcast_4i64_store:
 ; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm1
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSE2-SSSE3-NEXT:    por %xmm4, %xmm1
-; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSE2-SSSE3-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-SSSE3-NEXT:    movmskps %xmm0, %eax
 ; SSE2-SSSE3-NEXT:    movb %al, (%rdi)
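
Many of the X86 test deltas, like the one above, share one explanation: packssdw and movmskps only propagate and read sign bits, and the sign of an i64 lane is just bit 31 of its high dword, so once SimplifyDemandedBits can split demands through the wide-to-narrow bitcast it proves the SSE2/AVX1 64-bit compare-against-zero emulation computes bits nobody reads. The underlying scalar fact (illustrative C++):

#include <cassert>
#include <cstdint>

// movmskps reads only bit 31 of each 32-bit lane, so for a sign test the
// full 64-bit compare against zero can be dropped in favor of anything
// that leaves the high dword's sign bit in place.
static unsigned signbit32(uint32_t lane) { return lane >> 31; }

int main() {
  const int64_t v = -42; // negative i64 lane
  assert((v < 0) == (signbit32((uint32_t)((uint64_t)v >> 32)) == 1));

  const int64_t w = 0x7FFF0000FFFFFFFFll; // positive despite low-half bits
  assert((w < 0) == (signbit32((uint32_t)((uint64_t)w >> 32)) == 1));
  return 0;
}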

@@ -608,15 +608,13 @@
 ;
 ; AVX1-LABEL: bitcast_8i64_store:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    movb %al, (%rdi)

@@ -13,18 +13,11 @@
 ;
 ; X64-LABEL: t:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    ## kill: def $edx killed $edx def $rdx
-; X64-NEXT:    ## kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    imull %ecx, %esi
-; X64-NEXT:    leal (%rsi,%rdx), %eax
-; X64-NEXT:    cltq
+; X64-NEXT:    addl %edx, %esi
+; X64-NEXT:    movslq %esi, %rax
 ; X64-NEXT:    movl (%rdi,%rax), %eax
-; X64-NEXT:    leal 4(%rsi,%rdx), %ecx
-; X64-NEXT:    movslq %ecx, %rcx
-; X64-NEXT:    movzwl (%rdi,%rcx), %ecx
-; X64-NEXT:    shlq $32, %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    movq %rcx, %xmm0
+; X64-NEXT:    movq %rax, %xmm0
 ; X64-NEXT:    movd %xmm0, %eax
 ; X64-NEXT:    retq
 entry:
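
In the X64 test above, the function's result is taken with movd %xmm0, %eax, so only the low 32 bits of the constructed i64 are demanded and the movzwl/shlq $32/orq build of the high half is dead code. A trivial restatement of that demanded-bits fact (illustrative):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Lo = 0x12345678u, Hi = 0x9ABCu;
  const uint64_t Combined = (Hi << 32) | Lo;
  // Truncating to 32 bits never observes the high half, so the work spent
  // computing Hi is dead once only the low dword is demanded.
  assert((uint32_t)Combined == (uint32_t)Lo);
  return 0;
}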

@@ -35,25 +35,21 @@
 define void @store_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %val) {
 ; SSE2-LABEL: store_v2f64_v2i64:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    movd %xmm4, %eax
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB1_2
 ; SSE2-NEXT:    ## %bb.1: ## %cond.store
 ; SSE2-NEXT:    movlpd %xmm1, (%rdi)
 ; SSE2-NEXT:  LBB1_2: ## %else
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    pextrw $4, %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB1_4

@@ -116,20 +112,16 @@
 ; SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pand %xmm5, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm7
-; SSE2-NEXT:    movd %xmm7, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB2_2
 ; SSE2-NEXT:    ## %bb.1: ## %cond.store
 ; SSE2-NEXT:    movlpd %xmm2, (%rdi)
 ; SSE2-NEXT:  LBB2_2: ## %else
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    por %xmm6, %xmm0
 ; SSE2-NEXT:    pextrw $4, %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB2_4

@@ -139,10 +131,9 @@
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pextrw $0, %xmm0, %eax

@@ -862,25 +853,21 @@
 define void @store_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> %val) {
 ; SSE2-LABEL: store_v2i64_v2i64:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    movd %xmm4, %eax
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB7_2
 ; SSE2-NEXT:    ## %bb.1: ## %cond.store
 ; SSE2-NEXT:    movq %xmm1, (%rdi)
 ; SSE2-NEXT:  LBB7_2: ## %else
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    pextrw $4, %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB7_4

@@ -949,20 +936,16 @@
 ; SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pand %xmm5, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm7
-; SSE2-NEXT:    movd %xmm7, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB8_2
 ; SSE2-NEXT:    ## %bb.1: ## %cond.store
 ; SSE2-NEXT:    movq %xmm2, (%rdi)
 ; SSE2-NEXT:  LBB8_2: ## %else
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    por %xmm6, %xmm0
 ; SSE2-NEXT:    pextrw $4, %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB8_4

@@ -973,10 +956,9 @@
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pextrw $0, %xmm0, %eax

@@ -928,22 +928,6 @@
 define i1 @allones_v4i64_sign(<4 x i64> %arg) {
 ; SSE2-LABEL: allones_v4i64_sign:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    movmskps %xmm0, %eax
 ; SSE2-NEXT:    cmpb $15, %al

@@ -988,22 +972,6 @@
 define i1 @allzeros_v4i64_sign(<4 x i64> %arg) {
 ; SSE2-LABEL: allzeros_v4i64_sign:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    movmskps %xmm0, %eax
 ; SSE2-NEXT:    testb %al, %al

@@ -1094,15 +1062,13 @@
 ;
 ; AVX1-LABEL: allones_v8i64_sign:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    cmpb $-1, %al

@@ -1197,15 +1163,13 @@
 ;
 ; AVX1-LABEL: allzeros_v8i64_sign:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    testb %al, %al

@@ -2538,19 +2502,17 @@
 ;
 ; AVX1-LABEL: allones_v8i64_and1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllq $63, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $63, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    cmpb $-1, %al

@@ -2614,19 +2576,17 @@
 ;
 ; AVX1-LABEL: allzeros_v8i64_and1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllq $63, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $63, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    testb %al, %al

@@ -3961,19 +3921,17 @@
 ;
 ; AVX1-LABEL: allones_v8i64_and4:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllq $61, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsllq $61, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $61, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    cmpb $-1, %al

@@ -4037,19 +3995,17 @@
 ;
 ; AVX1-LABEL: allzeros_v8i64_and4:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllq $61, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsllq $61, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $61, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    testb %al, %al

@@ -4169,22 +4125,6 @@
 define i32 @movmskpd256(<4 x double> %x) {
 ; SSE2-LABEL: movmskpd256:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    movmskps %xmm0, %eax
 ; SSE2-NEXT:    retq