llvm.org GIT mirror llvm / 3ecdd44
[X86][SSE4A] Shuffle lowering using SSE4A EXTRQ/INSERTQ instructions This patch adds support for v8i16 and v16i8 shuffle lowering using the immediate versions of the SSE4A EXTRQ and INSERTQ instructions. Although rather limited (they can only act on the lower 64-bits of the source vectors, leave the upper 64-bits of the result vector undefined and don't have VEX encoded variants), the instructions are still useful for the zero extension of any lane (EXTRQ) or inserting a lane into another vector (INSERTQ). Testing demonstrated that it wasn't typically worth it to use these instructions for v2i64 or v4i32 vector shuffles although they are capable of it. As well as adding specific pattern matching for the shuffles, the patch uses EXTRQ for zero extension cases where SSE41 isn't available and it's more efficient than the SSE2 'unpack' default approach. It also adds shuffle decode support for the EXTRQ / INSERTQ cases when the instructions are handling full byte-sized extractions / insertions. From this foundation, future patches will be able to make use of the instructions for situations that use their ability to extract/insert at the bit level. Differential Revision: http://reviews.llvm.org/D10146 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@241508 91177308-0d34-0410-b5e6-96231b3b80d8 Simon Pilgrim 5 years ago
9 changed file(s) with 521 addition(s) and 6 deletion(s). Raw diff Collapse all Expand all
875875 case X86::VMOVDI2PDIrm:
876876 DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);
877877 DestName = getRegName(MI->getOperand(0).getReg());
878 break;
879
880 case X86::EXTRQI:
881 if (MI->getOperand(2).isImm() &&
882 MI->getOperand(3).isImm())
883 DecodeEXTRQIMask(MI->getOperand(2).getImm(),
884 MI->getOperand(3).getImm(),
885 ShuffleMask);
886
887 DestName = getRegName(MI->getOperand(0).getReg());
888 Src1Name = getRegName(MI->getOperand(1).getReg());
889 break;
890
891 case X86::INSERTQI:
892 if (MI->getOperand(3).isImm() &&
893 MI->getOperand(4).isImm())
894 DecodeINSERTQIMask(MI->getOperand(3).getImm(),
895 MI->getOperand(4).getImm(),
896 ShuffleMask);
897
898 DestName = getRegName(MI->getOperand(0).getReg());
899 Src1Name = getRegName(MI->getOperand(1).getReg());
900 Src2Name = getRegName(MI->getOperand(2).getReg());
878901 break;
879902
880903 case X86::PMOVZXBWrr:
430430 for (unsigned i = 1; i < NumElts; i++)
431431 Mask.push_back(IsLoad ? static_cast(SM_SentinelZero) : i);
432432 }
433
434 void DecodeEXTRQIMask(int Len, int Idx,
435 SmallVectorImpl &ShuffleMask) {
436 // Only the bottom 6 bits are valid for each immediate.
437 Len &= 0x3F;
438 Idx &= 0x3F;
439
440 // We can only decode this bit extraction instruction as a shuffle if both the
441 // length and index work with whole bytes.
442 if (0 != (Len % 8) || 0 != (Idx % 8))
443 return;
444
445 // A length of zero is equivalent to a bit length of 64.
446 if (Len == 0)
447 Len = 64;
448
449 // If the length + index exceeds the bottom 64 bits the result is undefined.
450 if ((Len + Idx) > 64) {
451 ShuffleMask.append(16, SM_SentinelUndef);
452 return;
453 }
454
455 // Convert index and index to work with bytes.
456 Len /= 8;
457 Idx /= 8;
458
459 // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes
460 // of the lower 64-bits. The upper 64-bits are undefined.
461 for (int i = 0; i != Len; ++i)
462 ShuffleMask.push_back(i + Idx);
463 for (int i = Len; i != 8; ++i)
464 ShuffleMask.push_back(SM_SentinelZero);
465 for (int i = 8; i != 16; ++i)
466 ShuffleMask.push_back(SM_SentinelUndef);
467 }
468
469 void DecodeINSERTQIMask(int Len, int Idx,
470 SmallVectorImpl &ShuffleMask) {
471 // Only the bottom 6 bits are valid for each immediate.
472 Len &= 0x3F;
473 Idx &= 0x3F;
474
475 // We can only decode this bit insertion instruction as a shuffle if both the
476 // length and index work with whole bytes.
477 if (0 != (Len % 8) || 0 != (Idx % 8))
478 return;
479
480 // A length of zero is equivalent to a bit length of 64.
481 if (Len == 0)
482 Len = 64;
483
484 // If the length + index exceeds the bottom 64 bits the result is undefined.
485 if ((Len + Idx) > 64) {
486 ShuffleMask.append(16, SM_SentinelUndef);
487 return;
488 }
489
490 // Convert index and index to work with bytes.
491 Len /= 8;
492 Idx /= 8;
493
494 // INSERTQ: Extract lowest Len bytes from lower half of second source and
495 // insert over first source starting at Idx byte. The upper 64-bits are
496 // undefined.
497 for (int i = 0; i != Idx; ++i)
498 ShuffleMask.push_back(i);
499 for (int i = 0; i != Len; ++i)
500 ShuffleMask.push_back(i + 16);
501 for (int i = Idx + Len; i != 8; ++i)
502 ShuffleMask.push_back(i);
503 for (int i = 8; i != 16; ++i)
504 ShuffleMask.push_back(SM_SentinelUndef);
505 }
506
433507 } // llvm namespace
9999 /// \brief Decode a scalar float move instruction as a shuffle mask.
100100 void DecodeScalarMoveMask(MVT VT, bool IsLoad,
101101 SmallVectorImpl &ShuffleMask);
102
103 /// \brief Decode a SSE4A EXTRQ instruction as a v16i8 shuffle mask.
104 void DecodeEXTRQIMask(int Len, int Idx,
105 SmallVectorImpl &ShuffleMask);
106
107 /// \brief Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask.
108 void DecodeINSERTQIMask(int Len, int Idx,
109 SmallVectorImpl &ShuffleMask);
102110 } // llvm namespace
103111
104112 #endif
39373937 return Subtarget->hasLZCNT();
39383938 }
39393939
3940 /// isUndefInRange - Return true if every element in Mask, beginning
3941 /// from position Pos and ending in Pos+Size is undef.
3942 static bool isUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size) {
3943 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
3944 if (0 <= Mask[i])
3945 return false;
3946 return true;
3947 }
3948
39403949 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
39413950 /// the specified range (L, H].
39423951 static bool isUndefOrInRange(int Val, int Low, int Hi) {
69136922 return SDValue();
69146923 }
69156924
6925 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
6926 static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
6927 SDValue V2, ArrayRef Mask,
6928 SelectionDAG &DAG) {
6929 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
6930 assert(!Zeroable.all() && "Fully zeroable shuffle mask");
6931
6932 int Size = Mask.size();
6933 int HalfSize = Size / 2;
6934 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
6935
6936 // Upper half must be undefined.
6937 if (!isUndefInRange(Mask, HalfSize, HalfSize))
6938 return SDValue();
6939
6940 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
6941 // Remainder of lower half result is zero and upper half is all undef.
6942 auto LowerAsEXTRQ = [&]() {
6943 // Determine the extraction length from the part of the
6944 // lower half that isn't zeroable.
6945 int Len = HalfSize;
6946 for (; Len >= 0; --Len)
6947 if (!Zeroable[Len - 1])
6948 break;
6949 assert(Len > 0 && "Zeroable shuffle mask");
6950
6951 // Attempt to match first Len sequential elements from the lower half.
6952 SDValue Src;
6953 int Idx = -1;
6954 for (int i = 0; i != Len; ++i) {
6955 int M = Mask[i];
6956 if (M < 0)
6957 continue;
6958 SDValue &V = (M < Size ? V1 : V2);
6959 M = M % Size;
6960
6961 // All mask elements must be in the lower half.
6962 if (M > HalfSize)
6963 return SDValue();
6964
6965 if (Idx < 0 || (Src == V && Idx == (M - i))) {
6966 Src = V;
6967 Idx = M - i;
6968 continue;
6969 }
6970 return SDValue();
6971 }
6972
6973 if (Idx < 0)
6974 return SDValue();
6975
6976 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
6977 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
6978 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
6979 return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
6980 DAG.getConstant(BitLen, DL, MVT::i8),
6981 DAG.getConstant(BitIdx, DL, MVT::i8));
6982 };
6983
6984 if (SDValue ExtrQ = LowerAsEXTRQ())
6985 return ExtrQ;
6986
6987 // INSERTQ: Extract lowest Len elements from lower half of second source and
6988 // insert over first source, starting at Idx.
6989 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
6990 auto LowerAsInsertQ = [&]() {
6991 for (int Idx = 0; Idx != HalfSize; ++Idx) {
6992 SDValue Base;
6993
6994 // Attempt to match first source from mask before insertion point.
6995 if (isUndefInRange(Mask, 0, Idx)) {
6996 /* EMPTY */
6997 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
6998 Base = V1;
6999 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
7000 Base = V2;
7001 } else {
7002 continue;
7003 }
7004
7005 // Extend the extraction length looking to match both the insertion of
7006 // the second source and the remaining elements of the first.
7007 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
7008 SDValue Insert;
7009 int Len = Hi - Idx;
7010
7011 // Match insertion.
7012 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
7013 Insert = V1;
7014 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
7015 Insert = V2;
7016 } else {
7017 continue;
7018 }
7019
7020 // Match the remaining elements of the lower half.
7021 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
7022 /* EMPTY */
7023 } else if ((!Base || (Base == V1)) &&
7024 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
7025 Base = V1;
7026 } else if ((!Base || (Base == V2)) &&
7027 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
7028 Size + Hi)) {
7029 Base = V2;
7030 } else {
7031 continue;
7032 }
7033
7034 // We may not have a base (first source) - this can safely be undefined.
7035 if (!Base)
7036 Base = DAG.getUNDEF(VT);
7037
7038 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
7039 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
7040 return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
7041 DAG.getConstant(BitLen, DL, MVT::i8),
7042 DAG.getConstant(BitIdx, DL, MVT::i8));
7043 }
7044 }
7045
7046 return SDValue();
7047 };
7048
7049 if (SDValue InsertQ = LowerAsInsertQ())
7050 return InsertQ;
7051
7052 return SDValue();
7053 }
7054
69167055 /// \brief Lower a vector shuffle as a zero or any extension.
69177056 ///
69187057 /// Given a specific number of elements, element bit width, and extension
69207059 /// features of the subtarget.
69217060 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
69227061 SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
6923 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7062 ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
69247063 assert(Scale > 1 && "Need a scale to extend.");
69257064 int NumElements = VT.getVectorNumElements();
69267065 int EltBits = VT.getScalarSizeInBits();
69557094 VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
69567095 DAG.getBitcast(MVT::v8i16, InputV),
69577096 getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
7097 }
7098
7099 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
7100 // to 64-bits.
7101 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
7102 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
7103 assert(VT.getSizeInBits() == 128 && "Unexpected vector width!");
7104
7105 SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
7106 DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
7107 DAG.getConstant(EltBits, DL, MVT::i8),
7108 DAG.getConstant(0, DL, MVT::i8)));
7109 if (isUndefInRange(Mask, NumElements/2, NumElements/2))
7110 return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
7111
7112 SDValue Hi =
7113 DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
7114 DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
7115 DAG.getConstant(EltBits, DL, MVT::i8),
7116 DAG.getConstant(EltBits, DL, MVT::i8)));
7117 return DAG.getNode(ISD::BITCAST, DL, VT,
7118 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
69587119 }
69597120
69607121 // If this would require more than 2 unpack instructions to expand, use
70477208 return SDValue();
70487209
70497210 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7050 DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
7211 DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG);
70517212 };
70527213
70537214 // The widest scale possible for extending is to a 64-bit integer.
85748735 lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
85758736 return Shift;
85768737
8738 // See if we can use SSE4A Extraction / Insertion.
8739 if (Subtarget->hasSSE4A())
8740 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
8741 return V;
8742
85778743 // There are special ways we can lower some single-element blends.
85788744 if (NumV2Inputs == 1)
85798745 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
87258891 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
87268892 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
87278893 return ZExt;
8894
8895 // See if we can use SSE4A Extraction / Insertion.
8896 if (Subtarget->hasSSE4A())
8897 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
8898 return V;
87288899
87298900 int NumV2Elements =
87308901 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
1511515286 case INTR_TYPE_3OP:
1511615287 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
1511715288 Op.getOperand(2), Op.getOperand(3));
15289 case INTR_TYPE_4OP:
15290 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
15291 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
1511815292 case INTR_TYPE_1OP_MASK_RM: {
1511915293 SDValue Src = Op.getOperand(1);
1512015294 SDValue PassThru = Op.getOperand(2);
1850818682 case X86ISD::FMINC: return "X86ISD::FMINC";
1850918683 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
1851018684 case X86ISD::FRCP: return "X86ISD::FRCP";
18685 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
18686 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
1851118687 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
1851218688 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
1851318689 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
393393 VINSERT,
394394 VEXTRACT,
395395
396 /// SSE4A Extraction and Insertion.
397 EXTRQI, INSERTQI,
398
396399 // Vector multiply packed unsigned doubleword integers
397400 PMULUDQ,
398401 // Vector multiply packed signed doubleword integers
203203 SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
204204 SDTCisSameAs<1,2>]>>;
205205
206 def X86extrqi : SDNode<"X86ISD::EXTRQI",
207 SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
208 SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>;
209 def X86insertqi : SDNode<"X86ISD::INSERTQI",
210 SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
211 SDTCisSameAs<1,2>, SDTCisVT<3, i8>,
212 SDTCisVT<4, i8>]>>;
213
206214 // Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
207215 // translated into one of the target nodes below during lowering.
208216 // Note: this is a work in progress...
77727772 def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
77737773 (ins VR128:$src, u8imm:$len, u8imm:$idx),
77747774 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
7775 [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
7775 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
77767776 imm:$idx))]>, PD;
77777777 def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
77787778 (ins VR128:$src, VR128:$mask),
77837783 def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
77847784 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
77857785 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
7786 [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
7787 VR128:$src2, imm:$len, imm:$idx))]>, XD;
7786 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
7787 imm:$len, imm:$idx))]>, XD;
77887788 def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
77897789 (ins VR128:$src, VR128:$mask),
77907790 "insertq\t{$mask, $src|$src, $mask}",
1818 enum IntrinsicType {
1919 INTR_NO_TYPE,
2020 GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
21 INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
21 INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
2222 CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
2323 INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
2424 INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK,
10781078 X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
10791079 X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
10801080 X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
1081 X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
1082 X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
10811083 X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
10821084 X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
10831085 X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),
0 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER1
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER2
2
3 ;
4 ; EXTRQI
5 ;
6
define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) {
; BTVER1-LABEL: shuf_0zzzuuuuuuuuuuuu:
; BTVER1: # BB#0:
; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_0zzzuuuuuuuuuuuu:
; BTVER2: # BB#0:
; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; BTVER2-NEXT: retq
  %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %s
}
20
define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
; BTVER1-LABEL: shuf_0zzzzzzz1zzzzzzz:
; BTVER1: # BB#0:
; BTVER1-NEXT: movaps %xmm0, %xmm1
; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_0zzzzzzz1zzzzzzz:
; BTVER2: # BB#0:
; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; BTVER2-NEXT: retq
  %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i8> %s
}
37
define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) {
; BTVER1-LABEL: shuf_01zzuuuuuuuuuuuu:
; BTVER1: # BB#0:
; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_01zzuuuuuuuuuuuu:
; BTVER2: # BB#0:
; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; BTVER2-NEXT: retq
  %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %s
}
51
define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) {
; BTVER1-LABEL: shuf_01zzzzzz23zzzzzz:
; BTVER1: # BB#0:
; BTVER1-NEXT: movaps %xmm0, %xmm1
; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_01zzzzzz23zzzzzz:
; BTVER2: # BB#0:
; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; BTVER2-NEXT: retq
  %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 2, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i8> %s
}
68
define <16 x i8> @shuf_1zzzuuuuuuuuuuuu(<16 x i8> %a0) {
; ALL-LABEL: shuf_1zzzuuuuuuuuuuuu:
; ALL: # BB#0:
; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
  %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %s
}
77
define <8 x i16> @shuf_1zzzuuuu(<8 x i16> %a0) {
; ALL-LABEL: shuf_1zzzuuuu:
; ALL: # BB#0:
; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
  %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %s
}
86
define <8 x i16> @shuf_12zzuuuu(<8 x i16> %a0) {
; ALL-LABEL: shuf_12zzuuuu:
; ALL: # BB#0:
; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3,4,5],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
  %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %s
}
95
define <8 x i16> @shuf_012zuuuu(<8 x i16> %a0) {
; ALL-LABEL: shuf_012zuuuu:
; ALL: # BB#0:
; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
  %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %s
}
104
define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) {
; BTVER1-LABEL: shuf_0zzz1zzz:
; BTVER1: # BB#0:
; BTVER1-NEXT: movaps %xmm0, %xmm1
; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_0zzz1zzz:
; BTVER2: # BB#0:
; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; BTVER2-NEXT: retq
  %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8>
  ret <8 x i16> %s
}
121
define <4 x i32> @shuf_0z1z(<4 x i32> %a0) {
; BTVER1-LABEL: shuf_0z1z:
; BTVER1: # BB#0:
; BTVER1-NEXT: pxor %xmm1, %xmm1
; BTVER1-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_0z1z:
; BTVER2: # BB#0:
; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BTVER2-NEXT: retq
  %s = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
  ret <4 x i32> %s
}
136
137 ;
138 ; INSERTQI
139 ;
140
define <16 x i8> @shuf_0_0_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
; ALL-LABEL: shuf_0_0_2_3_uuuu_uuuu_uuuu:
; ALL: # BB#0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
  %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 0, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %s
}
149
define <16 x i8> @shuf_0_16_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
; ALL-LABEL: shuf_0_16_2_3_uuuu_uuuu_uuuu:
; ALL: # BB#0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3,4,5,6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
  %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %s
}
158
define <16 x i8> @shuf_16_1_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
; ALL-LABEL: shuf_16_1_2_3_uuuu_uuuu_uuuu:
; ALL: # BB#0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
  %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %s
}
167
define <8 x i16> @shuf_0823uuuu(<8 x i16> %a0, <8 x i16> %a1) {
; ALL-LABEL: shuf_0823uuuu:
; ALL: # BB#0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1],xmm0[4,5,6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
  %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %s
}
176
define <8 x i16> @shuf_0183uuuu(<8 x i16> %a0, <8 x i16> %a1) {
; ALL-LABEL: shuf_0183uuuu:
; ALL: # BB#0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[0,1],xmm0[6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
  %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %s
}
185
define <8 x i16> @shuf_0128uuuu(<8 x i16> %a0, <8 x i16> %a1) {
; ALL-LABEL: shuf_0128uuuu:
; ALL: # BB#0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[0,1],xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
  %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %s
}
194
define <8 x i16> @shuf_0893uuuu(<8 x i16> %a0, <8 x i16> %a1) {
; ALL-LABEL: shuf_0893uuuu:
; ALL: # BB#0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
  %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %s
}
203
define <8 x i16> @shuf_089Auuuu(<8 x i16> %a0, <8 x i16> %a1) {
; ALL-LABEL: shuf_089Auuuu:
; ALL: # BB#0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3,4,5],xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
  %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %s
}
212
define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) {
; ALL-LABEL: shuf_089uuuuu:
; ALL: # BB#0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
  %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %s
}