[X86] Disable combineToExtendVectorInReg under -x86-experimental-vector-widening-legalization. Add custom type legalization for extends.

If we widen illegal types instead of promoting, we should be able to rely on the type legalizer to create the vector_inreg operations for us, with some caveats. This patch disables combineToExtendVectorInReg when we are using widening.

I've enabled custom legalization for v8i8->v8i64 extends under avx512f, since the type legalizer would want to create a vector_inreg with a v64i8 input type, which isn't legal without avx512bw. So we go to v16i8 with custom code, using the relaxation of rules we get from D54346.

I've also enabled custom legalization of v8i64 and v16i32 operations with AVX. When the input type is 128 bits, the default splitting legalization would first extend 128->256, then split into two 128-bit pieces, extend each half to 256, and concatenate the results. The custom legalization I've added instead uses a 128->256 bit vector_inreg extend that only reads the lower 64 bits for the low half of the split, then shuffles the high 64 bits down to the low 64 bits and does another vector_inreg extend.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@347172 91177308-0d34-0410-b5e6-96231b3b80d8

Craig Topper, 11 months ago
4 changed file(s) with 154 addition(s) and 94 deletion(s).
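Before the diff, a minimal sketch of the element-level dataflow the new split lowering aims for, assuming a v16i8 -> v16i32 sign extend on a target where v16i32 must be split (AVX without AVX512). An in-reg extend consumes only the low 64 bits of the 128-bit input, a shuffle moves the high 64 bits down, a second in-reg extend handles them, and the two halves are concatenated. This is plain scalar C++ written for illustration only; names like sextInReg and shiftHighHalfDown are made up here and do not appear in the patch, which implements the same idea with SelectionDAG nodes in ReplaceNodeResults below.

    #include <array>
    #include <cstdint>
    #include <cstdio>

    using V16i8  = std::array<int8_t, 16>;
    using V8i32  = std::array<int32_t, 8>;
    using V16i32 = std::array<int32_t, 16>;

    // Models SIGN_EXTEND_VECTOR_INREG: widen only the low 8 lanes
    // (the low 64 bits) of the 128-bit input, ignore the rest.
    static V8i32 sextInReg(const V16i8 &In) {
      V8i32 Out{};
      for (int i = 0; i < 8; ++i)
        Out[i] = static_cast<int32_t>(In[i]);
      return Out;
    }

    // Models the shuffle that moves lanes 8..15 (the high 64 bits)
    // into lanes 0..7; the upper lanes become don't-care (zero here).
    static V16i8 shiftHighHalfDown(const V16i8 &In) {
      V16i8 Out{};
      for (int i = 0; i < 8; ++i)
        Out[i] = In[i + 8];
      return Out;
    }

    // The custom split: two in-reg extends plus a concat, instead of the
    // default extend-then-split sequence.
    static V16i32 sextV16i8ToV16i32(const V16i8 &In) {
      V8i32 Lo = sextInReg(In);
      V8i32 Hi = sextInReg(shiftHighHalfDown(In));
      V16i32 Res{};
      for (int i = 0; i < 8; ++i) {
        Res[i] = Lo[i];
        Res[i + 8] = Hi[i];
      }
      return Res;
    }

    int main() {
      V16i8 In{};
      for (int i = 0; i < 16; ++i)
        In[i] = static_cast<int8_t>(i * 17 - 100);
      V16i32 Res = sextV16i8ToV16i32(In);
      for (int i = 0; i < 16; ++i)
        std::printf("%d -> %d\n", In[i], Res[i]);
      return 0;
    }

Reading only the low 64 bits per half maps naturally onto the AVX pmovsx/pmovzx-style in-reg extend instructions, which is what the test diffs below reflect; the default path instead performs a full 128->256 extend followed by a split and further extends.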
11031103 setOperationAction(ISD::SRA, VT, Custom);
11041104 }
11051105
1106 if (ExperimentalVectorWideningLegalization) {
1107 // These types need custom splitting if their input is a 128-bit vector.
1108 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1109 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1110 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1111 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1112 }
1113
11061114 setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
11071115 setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
11081116 setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
13661374 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
13671375 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
13681376 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1377
1378 if (ExperimentalVectorWideningLegalization) {
1379 // Need to custom widen this if we don't have AVX512BW.
1380 setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
1381 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
1382 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
1383 }
13691384
13701385 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
13711386 setOperationAction(ISD::FFLOOR, VT, Legal);
1763517650 InVT.getVectorElementType() == MVT::i32) &&
1763617651 "Unexpected element type");
1763717652
17653 // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
17654 if (InVT == MVT::v8i8) {
17655 if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
17656 return SDValue();
17657
17658 In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
17659 MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
17660 // FIXME: This should be ANY_EXTEND_VECTOR_INREG for ANY_EXTEND input.
17661 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, VT, In);
17662 }
17663
1763817664 if (Subtarget.hasInt256())
1763917665 return Op;
1764017666
2008720113 InVT.getVectorElementType() == MVT::i16 ||
2008820114 InVT.getVectorElementType() == MVT::i32) &&
2008920115 "Unexpected element type");
20116
20117 // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
20118 if (InVT == MVT::v8i8) {
20119 if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
20120 return SDValue();
20121
20122 In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
20123 MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
20124 return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
20125 }
2009020126
2009120127 if (Subtarget.hasInt256())
2009220128 return Op;
2630326339 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
2630426340 return;
2630526341 }
26342 }
26343 return;
26344 }
26345 case ISD::SIGN_EXTEND:
26346 case ISD::ZERO_EXTEND: {
26347 if (!ExperimentalVectorWideningLegalization)
26348 return;
26349
26350 EVT VT = N->getValueType(0);
26351 assert((VT == MVT::v16i32 || VT == MVT::v8i64) && "Unexpected VT!");
26352 SDValue In = N->getOperand(0);
26353 EVT InVT = In.getValueType();
26354 if (InVT.is128BitVector()) {
26355 // Perform custom splitting instead of the two stage extend we would get
26356 // by default.
26357 EVT LoVT, HiVT;
26358 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
26359 assert(isTypeLegal(LoVT) && "Split VT not legal?");
26360
26361 bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND;
26362
26363 SDValue Lo = getExtendInVec(IsSigned, dl, LoVT, In, DAG);
26364
26365 // We need to shift the input over by half the number of elements.
26366 unsigned NumElts = InVT.getVectorNumElements();
26367 unsigned HalfNumElts = NumElts / 2;
26368 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
26369 for (unsigned i = 0; i != HalfNumElts; ++i)
26370 ShufMask[i] = i + HalfNumElts;
26371
26372 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
26373 Hi = getExtendInVec(IsSigned, dl, HiVT, Hi, DAG);
26374
26375 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
26376 Results.push_back(Res);
2630626377 }
2630726378 return;
2630826379 }
3871438785 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
3871538786 TargetLowering::DAGCombinerInfo &DCI,
3871638787 const X86Subtarget &Subtarget) {
38788 if (ExperimentalVectorWideningLegalization)
38789 return SDValue();
38790
3871738791 unsigned Opcode = N->getOpcode();
3871838792 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
3871938793 return SDValue();
54345434 define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
54355435 ; SSE2-LABEL: aggregate_sitofp_8i16_to_8f32:
54365436 ; SSE2: # %bb.0:
5437 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
5438 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
5439 ; SSE2-NEXT: psrad $16, %xmm0
5440 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
5441 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
5442 ; SSE2-NEXT: psrad $16, %xmm1
54375443 ; SSE2-NEXT: movq 24(%rdi), %rax
5438 ; SSE2-NEXT: movdqu 8(%rdi), %xmm0
5439 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
5440 ; SSE2-NEXT: psrad $16, %xmm1
54415444 ; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1
5442 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
5443 ; SSE2-NEXT: psrad $16, %xmm0
54445445 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
54455446 ; SSE2-NEXT: movaps %xmm0, 16(%rax)
54465447 ; SSE2-NEXT: movaps %xmm1, (%rax)
54495450 ; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32:
54505451 ; SSE41: # %bb.0:
54515452 ; SSE41-NEXT: movq 24(%rdi), %rax
5452 ; SSE41-NEXT: movdqu 8(%rdi), %xmm0
5453 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
5454 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
5453 ; SSE41-NEXT: pmovsxwd 16(%rdi), %xmm0
5454 ; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1
54555455 ; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1
5456 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
54575456 ; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
5458 ; SSE41-NEXT: movaps %xmm0, (%rax)
5459 ; SSE41-NEXT: movaps %xmm1, 16(%rax)
5457 ; SSE41-NEXT: movaps %xmm0, 16(%rax)
5458 ; SSE41-NEXT: movaps %xmm1, (%rax)
54605459 ; SSE41-NEXT: retq
54615460 ;
54625461 ; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
494494 define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
495495 ; SSE2-LABEL: sext_16i8_to_8i64:
496496 ; SSE2: # %bb.0: # %entry
497 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
498497 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
499498 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
500499 ; SSE2-NEXT: movdqa %xmm4, %xmm1
501500 ; SSE2-NEXT: psrad $31, %xmm1
502501 ; SSE2-NEXT: psrad $24, %xmm4
503502 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
504 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
505 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
506 ; SSE2-NEXT: movdqa %xmm1, %xmm0
507 ; SSE2-NEXT: psrad $31, %xmm0
503 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
504 ; SSE2-NEXT: movdqa %xmm2, %xmm1
505 ; SSE2-NEXT: psrad $31, %xmm1
506 ; SSE2-NEXT: psrad $24, %xmm2
507 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
508 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
509 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
510 ; SSE2-NEXT: movdqa %xmm1, %xmm3
511 ; SSE2-NEXT: psrad $31, %xmm3
508512 ; SSE2-NEXT: psrad $24, %xmm1
509 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
510 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
511 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
512 ; SSE2-NEXT: movdqa %xmm2, %xmm0
513 ; SSE2-NEXT: psrad $31, %xmm0
514 ; SSE2-NEXT: psrad $24, %xmm2
515 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
516 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
513 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
514 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
517515 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
518516 ; SSE2-NEXT: movdqa %xmm3, %xmm0
519517 ; SSE2-NEXT: psrad $31, %xmm0
524522 ;
525523 ; SSSE3-LABEL: sext_16i8_to_8i64:
526524 ; SSSE3: # %bb.0: # %entry
527 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
528525 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
529526 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
530527 ; SSSE3-NEXT: movdqa %xmm4, %xmm1
531528 ; SSSE3-NEXT: psrad $31, %xmm1
532529 ; SSSE3-NEXT: psrad $24, %xmm4
533530 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
534 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
535 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
536 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
537 ; SSSE3-NEXT: psrad $31, %xmm0
531 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
532 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
533 ; SSSE3-NEXT: psrad $31, %xmm1
534 ; SSSE3-NEXT: psrad $24, %xmm2
535 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
536 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
537 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
538 ; SSSE3-NEXT: movdqa %xmm1, %xmm3
539 ; SSSE3-NEXT: psrad $31, %xmm3
538540 ; SSSE3-NEXT: psrad $24, %xmm1
539 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
540 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
541 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
542 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
543 ; SSSE3-NEXT: psrad $31, %xmm0
544 ; SSSE3-NEXT: psrad $24, %xmm2
545 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
546 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
541 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
542 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
547543 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
548544 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
549545 ; SSSE3-NEXT: psrad $31, %xmm0
911907 ; SSE2-NEXT: psrad $31, %xmm3
912908 ; SSE2-NEXT: psrad $16, %xmm1
913909 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
914 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
915 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
910 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
911 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
916912 ; SSE2-NEXT: movdqa %xmm3, %xmm0
917913 ; SSE2-NEXT: psrad $31, %xmm0
918914 ; SSE2-NEXT: psrad $16, %xmm3
937933 ; SSSE3-NEXT: psrad $31, %xmm3
938934 ; SSSE3-NEXT: psrad $16, %xmm1
939935 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
940 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
941 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
936 ; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
937 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
942938 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
943939 ; SSSE3-NEXT: psrad $31, %xmm0
944940 ; SSSE3-NEXT: psrad $16, %xmm3
122122 ;
123123 ; AVX1-LABEL: zext_32i8_to_32i16:
124124 ; AVX1: # %bb.0: # %entry
125 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
126 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
127 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
128 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
125 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
126 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
127 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
128 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
129129 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
130 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
131 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
130 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
132131 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
133 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
132 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
134133 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
135134 ; AVX1-NEXT: retq
136135 ;
397396 define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
398397 ; SSE2-LABEL: zext_16i8_to_8i64:
399398 ; SSE2: # %bb.0: # %entry
400 ; SSE2-NEXT: movdqa %xmm0, %xmm1
399 ; SSE2-NEXT: movdqa %xmm0, %xmm3
401400 ; SSE2-NEXT: pxor %xmm4, %xmm4
402 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
403 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
401 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
402 ; SSE2-NEXT: movdqa %xmm3, %xmm1
404403 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
405404 ; SSE2-NEXT: movdqa %xmm1, %xmm0
406405 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
407406 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
408 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
409 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
407 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
410408 ; SSE2-NEXT: movdqa %xmm3, %xmm2
411409 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
412410 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
414412 ;
415413 ; SSSE3-LABEL: zext_16i8_to_8i64:
416414 ; SSSE3: # %bb.0: # %entry
417 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
418 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
419 ; SSSE3-NEXT: pshufb %xmm4, %xmm0
420 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
421 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
422 ; SSSE3-NEXT: pshufb %xmm5, %xmm1
415 ; SSSE3-NEXT: movdqa %xmm0, %xmm3
416 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
417 ; SSSE3-NEXT: movdqa %xmm3, %xmm1
418 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
423419 ; SSSE3-NEXT: movdqa %xmm3, %xmm2
424 ; SSSE3-NEXT: pshufb %xmm4, %xmm2
425 ; SSSE3-NEXT: pshufb %xmm5, %xmm3
420 ; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
421 ; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
426422 ; SSSE3-NEXT: retq
427423 ;
428424 ; SSE41-LABEL: zext_16i8_to_8i64:
584580 ;
585581 ; AVX1-LABEL: zext_16i16_to_16i32:
586582 ; AVX1: # %bb.0: # %entry
587 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
588 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
589 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
590 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
583 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
584 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
585 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
586 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
591587 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
592 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
593 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
588 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
594589 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
595 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
590 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
596591 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
597592 ; AVX1-NEXT: retq
598593 ;
883878 ;
884879 ; AVX1-LABEL: zext_8i32_to_8i64:
885880 ; AVX1: # %bb.0: # %entry
886 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
887 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
888 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
889 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
881 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
882 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
883 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
884 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
890885 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
891 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
892 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
886 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
893887 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
894 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
888 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
895889 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
896890 ; AVX1-NEXT: retq
897891 ;
11611155 define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
11621156 ; SSE2-LABEL: load_zext_8i8_to_8i64:
11631157 ; SSE2: # %bb.0: # %entry
1164 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
1158 ; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
11651159 ; SSE2-NEXT: pxor %xmm4, %xmm4
1166 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
1167 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1160 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1161 ; SSE2-NEXT: movdqa %xmm3, %xmm1
11681162 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
11691163 ; SSE2-NEXT: movdqa %xmm1, %xmm0
11701164 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
11711165 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1172 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1173 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1166 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
11741167 ; SSE2-NEXT: movdqa %xmm3, %xmm2
11751168 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
11761169 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
11781171 ;
11791172 ; SSSE3-LABEL: load_zext_8i8_to_8i64:
11801173 ; SSSE3: # %bb.0: # %entry
1181 ; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
1182 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
1183 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1184 ; SSSE3-NEXT: pshufb %xmm4, %xmm0
1185 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
1186 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
1187 ; SSSE3-NEXT: pshufb %xmm5, %xmm1
1174 ; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
1175 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
1176 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1177 ; SSSE3-NEXT: movdqa %xmm3, %xmm1
1178 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
11881179 ; SSSE3-NEXT: movdqa %xmm3, %xmm2
1189 ; SSSE3-NEXT: pshufb %xmm4, %xmm2
1190 ; SSSE3-NEXT: pshufb %xmm5, %xmm3
1180 ; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
1181 ; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
11911182 ; SSSE3-NEXT: retq
11921183 ;
11931184 ; SSE41-LABEL: load_zext_8i8_to_8i64:
22282219 ; AVX2-LABEL: zext_32i8_to_32i32:
22292220 ; AVX2: # %bb.0:
22302221 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2231 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2232 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
2233 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
2234 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
2235 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
2222 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
2223 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
2224 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2225 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2226 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
22362227 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
22372228 ; AVX2-NEXT: vmovdqa %ymm4, %ymm0
22382229 ; AVX2-NEXT: retq