llvm / fed0c65 (llvm.org GIT mirror)
Author: Craig Topper

Recommit r358887 "[TargetLowering][AMDGPU][X86] Improve SimplifyDemandedBits bitcast handling"

I've included a new fix in X86RegisterInfo to prevent PR41619 without reintroducing r359392. We might be able to improve that in the base class implementation of shouldRewriteCopySrc somehow, but this hopefully enables forward progress on SimplifyDemandedBits improvements for now.

Original commit message:

This patch adds support for BigBitWidth -> SmallBitWidth bitcasts, splitting the DemandedBits/Elts accordingly.

The AMDGPU backend needed an extra (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1) combine to encourage BFE creation. I investigated putting this in DAGCombine, but it caused a lot of noise on other targets - some improvements, some regressions.

The X86 changes are all definite wins.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360552 91177308-0d34-0410-b5e6-96231b3b80d8
9 changed files with 107 additions and 139 deletions.
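The AMDGPU combine mentioned above is a plain bit identity. A minimal standalone check, where the function names and the test constants (an 8-bit field at bit 8, the shape a BFE would extract) are illustrative only and not taken from the patch:

#include <cassert>
#include <cstdint>

// (srl (and x, (c1 << c2)), c2): mask the field in place, then shift it down.
uint64_t maskThenShift(uint64_t X, uint64_t C1, unsigned C2) {
  return (X & (C1 << C2)) >> C2;
}

// (and (srl x, c2), c1): shift the field down, then mask it.
uint64_t shiftThenMask(uint64_t X, uint64_t C1, unsigned C2) {
  return (X >> C2) & C1;
}

int main() {
  const uint64_t Vals[] = {0, 0x1234, 0xdeadbeefcafef00dULL, ~0ULL};
  for (uint64_t X : Vals)
    assert(maskThenShift(X, 0xff, 8) == shiftThenMask(X, 0xff, 8));
  return 0;
}

The second form is what a bitfield-extract pattern matcher wants to see: the AND is applied to the already-shifted value, so mask and offset appear directly as BFE operands.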
@@ -1580,12 +1580,36 @@
       if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
                                KnownSrcBits, TLO, Depth + 1))
         return true;
+    } else if ((NumSrcEltBits % BitWidth) == 0 &&
+               TLO.DAG.getDataLayout().isLittleEndian()) {
+      unsigned Scale = NumSrcEltBits / BitWidth;
+      unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
+      APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
+      APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
+      for (unsigned i = 0; i != NumElts; ++i)
+        if (DemandedElts[i]) {
+          unsigned Offset = (i % Scale) * BitWidth;
+          DemandedSrcBits.insertBits(DemandedBits, Offset);
+          DemandedSrcElts.setBit(i / Scale);
+        }
+
+      if (SrcVT.isVector()) {
+        APInt KnownSrcUndef, KnownSrcZero;
+        if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
+                                       KnownSrcZero, TLO, Depth + 1))
+          return true;
+      }
+
+      KnownBits KnownSrcBits;
+      if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
+                               KnownSrcBits, TLO, Depth + 1))
+        return true;
     }
 
     // If this is a bitcast, let computeKnownBits handle it. Only do this on a
     // recursive call where Known may be useful to the caller.
     if (Depth > 0) {
-      Known = TLO.DAG.computeKnownBits(Op, Depth);
+      Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
       return false;
     }
     break;
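For reference, a standalone sketch of the bookkeeping in the hunk above, with plain integers standing in for APInt; the <2 x i64> -> <4 x i32> bitcast and the demanded values are illustrative, not from the patch:

#include <cstdint>
#include <cstdio>

int main() {
  // Wide-to-narrow bitcast: <2 x i64> -> <4 x i32>, so each 64-bit source
  // element covers Scale = 2 narrow elements of BitWidth = 32.
  const unsigned BitWidth = 32, NumSrcEltBits = 64, NumElts = 4;
  const unsigned Scale = NumSrcEltBits / BitWidth;

  uint32_t DemandedElts = 0b1000; // only narrow element 3 is live
  uint32_t DemandedBits = 0xffff; // and only its low 16 bits

  uint64_t DemandedSrcBits = 0;   // demanded bits within a source element
  uint32_t DemandedSrcElts = 0;   // demanded source elements

  for (unsigned i = 0; i != NumElts; ++i)
    if (DemandedElts & (1u << i)) {
      unsigned Offset = (i % Scale) * BitWidth;
      DemandedSrcBits |= (uint64_t)DemandedBits << Offset; // insertBits
      DemandedSrcElts |= 1u << (i / Scale);                // setBit
    }

  // Little endian: narrow element 3 is the high half of source element 1,
  // so this prints DemandedSrcElts = 0x2, DemandedSrcBits = 0xffff00000000.
  printf("DemandedSrcElts = 0x%x\n", DemandedSrcElts);
  printf("DemandedSrcBits = 0x%llx\n", (unsigned long long)DemandedSrcBits);
  return 0;
}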
@@ -215,6 +215,21 @@
   }
 }
 
+bool X86RegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                                           unsigned DefSubReg,
+                                           const TargetRegisterClass *SrcRC,
+                                           unsigned SrcSubReg) const {
+  // Prevent rewriting a copy where the destination size is larger than the
+  // input size. See PR41619.
+  // FIXME: Should this be factored into the base implementation somehow.
+  if (DefRC->hasSuperClassEq(&X86::GR64RegClass) && DefSubReg == 0 &&
+      SrcRC->hasSuperClassEq(&X86::GR64RegClass) && SrcSubReg == X86::sub_32bit)
+    return false;
+
+  return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg,
+                                                  SrcRC, SrcSubReg);
+}
+
 const TargetRegisterClass *
 X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
   const Function &F = MF.getFunction();
@@ -73,6 +73,11 @@
   getLargestLegalSuperClass(const TargetRegisterClass *RC,
                             const MachineFunction &MF) const override;
 
+  bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                            unsigned DefSubReg,
+                            const TargetRegisterClass *SrcRC,
+                            unsigned SrcSubReg) const override;
+
   /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
   /// values.
   const TargetRegisterClass *
@@ -447,22 +447,6 @@
 define void @bitcast_4i64_store(i4* %p, <4 x i64> %a0) {
 ; SSE2-SSSE3-LABEL: bitcast_4i64_store:
 ; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm1
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSE2-SSSE3-NEXT:    por %xmm4, %xmm1
-; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSE2-SSSE3-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-SSSE3-NEXT:    movmskps %xmm0, %eax
 ; SSE2-SSSE3-NEXT:    movb %al, (%rdi)
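Why the sixteen deleted instructions were dead: movmskps demands only the sign bit of each 32-bit lane, packssdw's signed saturation preserves the sign of every lane, and the high dword of each i64 element already carries that i64's sign bit, so the pcmpgtd-based i64 sign-compare emulation computed nothing the store needed. A quick standalone check of the saturation invariant (a sketch of the reasoning, not LLVM code):

#include <cassert>
#include <cstdint>

// packssdw lane operation: saturate a signed i32 to a signed i16.
int16_t sat16(int32_t V) {
  if (V > INT16_MAX) return INT16_MAX;
  if (V < INT16_MIN) return INT16_MIN;
  return (int16_t)V;
}

int main() {
  const int64_t Vals[] = {INT64_MIN, -1, 0, 1, INT64_MAX};
  for (int64_t X : Vals) {
    int32_t Hi = (int32_t)((uint64_t)X >> 32); // high dword of the i64 lane
    // Saturation keeps the sign bit, which is all movmskps reads.
    assert((sat16(Hi) < 0) == (X < 0));
  }
  return 0;
}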
@@ -608,15 +608,13 @@
 ;
 ; AVX1-LABEL: bitcast_8i64_store:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    movb %al, (%rdi)
@@ -207,22 +207,6 @@
 define i2 @bitcast_v4i64_to_v2i2(<4 x i64> %a0) nounwind {
 ; SSE2-SSSE3-LABEL: bitcast_v4i64_to_v2i2:
 ; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm1
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSE2-SSSE3-NEXT:    por %xmm4, %xmm1
-; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSE2-SSSE3-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-SSSE3-NEXT:    movmskps %xmm0, %eax
 ; SSE2-SSSE3-NEXT:    movl %eax, %ecx
@@ -531,15 +515,13 @@
 ;
 ; AVX1-LABEL: bitcast_v8i64_to_v2i4:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    movl %eax, %ecx
@@ -13,18 +13,11 @@
 ;
 ; X64-LABEL: t:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    ## kill: def $edx killed $edx def $rdx
-; X64-NEXT:    ## kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    imull %ecx, %esi
-; X64-NEXT:    leal (%rsi,%rdx), %eax
-; X64-NEXT:    cltq
+; X64-NEXT:    addl %edx, %esi
+; X64-NEXT:    movslq %esi, %rax
 ; X64-NEXT:    movl (%rdi,%rax), %eax
-; X64-NEXT:    leal 4(%rsi,%rdx), %ecx
-; X64-NEXT:    movslq %ecx, %rcx
-; X64-NEXT:    movzwl (%rdi,%rcx), %ecx
-; X64-NEXT:    shlq $32, %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    movq %rcx, %xmm0
+; X64-NEXT:    movq %rax, %xmm0
 ; X64-NEXT:    movd %xmm0, %eax
 ; X64-NEXT:    retq
 entry:
@@ -928,22 +928,6 @@
 define i1 @allones_v4i64_sign(<4 x i64> %arg) {
 ; SSE2-LABEL: allones_v4i64_sign:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    movmskps %xmm0, %eax
 ; SSE2-NEXT:    cmpb $15, %al
@@ -988,22 +972,6 @@
 define i1 @allzeros_v4i64_sign(<4 x i64> %arg) {
 ; SSE2-LABEL: allzeros_v4i64_sign:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    movmskps %xmm0, %eax
 ; SSE2-NEXT:    testb %al, %al
@@ -1094,15 +1062,13 @@
 ;
 ; AVX1-LABEL: allones_v8i64_sign:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    cmpb $-1, %al
@@ -1197,15 +1163,13 @@
 ;
 ; AVX1-LABEL: allzeros_v8i64_sign:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    testb %al, %al
@@ -2538,19 +2502,17 @@
 ;
 ; AVX1-LABEL: allones_v8i64_and1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllq $63, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $63, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    cmpb $-1, %al
@@ -2614,19 +2576,17 @@
 ;
 ; AVX1-LABEL: allzeros_v8i64_and1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllq $63, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $63, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    testb %al, %al
@@ -3961,19 +3921,17 @@
 ;
 ; AVX1-LABEL: allones_v8i64_and4:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllq $61, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsllq $61, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $61, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    cmpb $-1, %al
@@ -4037,19 +3995,17 @@
 ;
 ; AVX1-LABEL: allzeros_v8i64_and4:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllq $61, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsllq $61, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $61, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    testb %al, %al
@@ -4169,22 +4125,6 @@
 define i32 @movmskpd256(<4 x double> %x) {
 ; SSE2-LABEL: movmskpd256:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    movmskps %xmm0, %eax
 ; SSE2-NEXT:    retq
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.14.0 -mattr=avx2 | FileCheck %s
+
+define void @foo(double %arg) {
+; CHECK-LABEL: foo:
+; CHECK:       ## %bb.0: ## %bb
+; CHECK-NEXT:    vmovq %xmm0, %rax
+; CHECK-NEXT:    vmovd %eax, %xmm0
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vmovq %xmm0, %rax
+; CHECK-NEXT:    movl %eax, (%rax)
+; CHECK-NEXT:    vmovlps %xmm1, (%rax)
+; CHECK-NEXT:    retq
+bb:
+  %tmp = bitcast double %arg to i64
+  %tmp1 = trunc i64 %tmp to i32
+  %tmp2 = bitcast i32 %tmp1 to float
+  %tmp3 = insertelement <4 x float> zeroinitializer, float %tmp2, i32 2
+  %tmp4 = bitcast <4 x float> %tmp3 to <2 x double>
+  %tmp5 = extractelement <2 x double> %tmp4, i32 0
+  %tmp6 = extractelement <2 x double> %tmp4, i32 1
+  %tmp7 = bitcast double %tmp6 to i64
+  %tmp8 = trunc i64 %tmp7 to i32
+  store i32 %tmp8, i32* undef, align 4
+  store double %tmp5, double* undef, align 16
+  ret void
+}