llvm.org GIT mirror llvm / bdecfeb
[x86] Implement v16i16 support with AVX2 in the new vector shuffle lowering.

This also implements the fancy blend lowering for v16i16 using AVX2 and teaches the X86 backend to print shuffle masks for 256-bit PSHUFB and PBLENDW instructions. It also makes the mask decoding correct for PBLENDW instructions. The yaks, they are legion.

Tests are updated accordingly. There are some missing tests for the VBLENDVB lowering, but I'll add those in a follow-up as this commit has accumulated enough cruft already.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218430 91177308-0d34-0410-b5e6-96231b3b80d8

Chandler Carruth, 6 years ago
5 changed file(s) with 220 addition(s) and 262 deletion(s).
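Before the diff, a minimal standalone sketch of the blend-lowering idea (an illustration only; buildBlendImm is a hypothetical helper, not code from this commit): when a v16i16 blend uses the same pattern in both 128-bit lanes, it collapses to a single 8-bit VPBLENDW immediate that the hardware applies to each lane.

    // Hypothetical helper: derive the 8-bit immediate from the repeated
    // 8-element mask. A set bit means "take this element from the second input".
    unsigned buildBlendImm(const int RepeatedMask[8]) {
      unsigned Imm = 0;
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 16) // entries >= 16 name elements of the second vector
          Imm |= 1u << i;
      return Imm;
    }
    // e.g. a repeated mask of {16,1,18,3,4,5,6,7} yields Imm == 0b00000101.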
9696 case X86::VPBLENDWrmi:
9797 if(MI->getOperand(MI->getNumOperands()-1).isImm())
9898 DecodeBLENDMask(MVT::v8i16,
99 MI->getOperand(MI->getNumOperands()-1).getImm(),
100 ShuffleMask);
101 Src1Name = getRegName(MI->getOperand(1).getReg());
102 DestName = getRegName(MI->getOperand(0).getReg());
103 break;
104 case X86::VPBLENDWYrri:
105 Src2Name = getRegName(MI->getOperand(2).getReg());
106 // FALL THROUGH.
107 case X86::VPBLENDWYrmi:
108 if(MI->getOperand(MI->getNumOperands()-1).isImm())
109 DecodeBLENDMask(MVT::v16i16,
99110 MI->getOperand(MI->getNumOperands()-1).getImm(),
100111 ShuffleMask);
101112 Src1Name = getRegName(MI->getOperand(1).getReg());
300300 }
301301 }
302302
303 void DecodeBLENDMask(MVT VT, unsigned Imm,
304 SmallVectorImpl<int> &ShuffleMask) {
303 void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
304 int ElementBits = VT.getScalarSizeInBits();
305305 int NumElements = VT.getVectorNumElements();
306 for (int i = 0; i < NumElements; ++i)
307 ShuffleMask.push_back(((Imm >> i) & 1) ? NumElements + i : i);
306 for (int i = 0; i < NumElements; ++i) {
307 // If there are more than 8 elements in the vector, then any immediate blend
308 // mask applies to each 128-bit lane. There can never be more than
309 // 8 elements in a 128-bit lane with an immediate blend.
310 int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
311 assert(Bit < 8 &&
312 "Immediate blends only operate over 8 elements at a time!");
313 ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
314 }
308315 }
309316
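A quick worked example of the per-lane decoding above (illustrative usage only; the immediate value is arbitrary): with 16 elements, bit (i % 8) of the immediate is reused for both 128-bit lanes.

    SmallVector<int, 16> ShuffleMask;
    DecodeBLENDMask(MVT::v16i16, 0x3, ShuffleMask);
    // Bits 0 and 1 are set, so elements 0 and 1 of *each* 128-bit lane come
    // from the second source:
    // ShuffleMask == {16,17,2,3,4,5,6,7, 24,25,10,11,12,13,14,15}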
310317 /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
71827182 return true;
71837183 }
71847184
7185 /// \brief Test whether there are elements crossing 128-bit lanes in this
7186 /// shuffle mask.
7187 ///
7188 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7189 /// and we routinely test for these.
7190 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7191 int LaneSize = 128 / VT.getScalarSizeInBits();
7192 int Size = Mask.size();
7193 for (int i = 0; i < Size; ++i)
7194 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7195 return true;
7196 return false;
7197 }
7198
7199 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7200 ///
7201 /// This checks a shuffle mask to see if it is performing the same
7202 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7203 /// that it is also not lane-crossing. It may however involve a blend from the
7204 /// same lane of a second vector.
7205 ///
7206 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7207 /// non-trivial to compute in the face of undef lanes. The representation is
7208 /// *not* suitable for use with existing 128-bit shuffles as it will contain
7209 /// entries from both V1 and V2 inputs to the wider mask.
7210 static bool
7211 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7212 SmallVectorImpl<int> &RepeatedMask) {
7213 int LaneSize = 128 / VT.getScalarSizeInBits();
7214 RepeatedMask.resize(LaneSize, -1);
7215 int Size = Mask.size();
7216 for (int i = 0; i < Size; ++i) {
7217 if (Mask[i] < 0)
7218 continue;
7219 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7220 // This entry crosses lanes, so there is no way to model this shuffle.
7221 return false;
7222
7223 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7224 if (RepeatedMask[i % LaneSize] == -1)
7225 // This is the first non-undef entry in this slot of a 128-bit lane.
7226 RepeatedMask[i % LaneSize] =
7227 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7228 else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7229 // Found a mismatch with the repeated mask.
7230 return false;
7231 }
7232 return true;
7233 }
7234
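As an illustrative call (not part of the diff), the interleave mask that the v16i16 lowering later matches to VPUNPCKLWD repeats the same 8-entry pattern in both lanes:

    int Mask[16] = {0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27};
    SmallVector<int, 8> RepeatedMask;
    bool Repeats = is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask);
    // Repeats == true, RepeatedMask == {0,16,1,17,2,18,3,19}: lane-relative
    // indices, with values >= 16 still marking elements of the second input.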
71857235 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
71867236 // 2013 will allow us to use it as a non-type template parameter.
71877237 namespace {
73097359 return DAG.getNode(ISD::BITCAST, DL, VT,
73107360 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
73117361 DAG.getConstant(BlendMask, MVT::i8)));
7362 }
7363
7364 case MVT::v16i16: {
7365 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7366 SmallVector<int, 8> RepeatedMask;
7367 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7368 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7369 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7370 BlendMask = 0;
7371 for (int i = 0; i < 8; ++i)
7372 if (RepeatedMask[i] >= 16)
7373 BlendMask |= 1u << i;
7374 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7375 DAG.getConstant(BlendMask, MVT::i8));
7376 }
7377
7378 // Fall back to a fully general variable byte blend.
7379 SDValue PBLENDVMask[32];
7380 // Scale the blend by the number of bytes per element.
7381 int Scale = VT.getScalarSizeInBits() / 8;
7382 assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
7383 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7384 for (int j = 0; j < Scale; ++j)
7385 PBLENDVMask[Scale * i + j] =
7386 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7387 : DAG.getConstant(Mask[i] < Size ? 0 : 0x80, MVT::i8);
7388
7389 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
7390 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
7391 return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(
7392 X86ISD::BLENDV, DL, MVT::v32i8, V1, V2,
7393 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PBLENDVMask)));
73127394 }
73137395
73147396 default:
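When the repeated-mask check fails, the VPBLENDVB fallback above scales the element mask to bytes. A standalone sketch of that scaling for v16i16 (Scale == 2; the mask values are made up for illustration):

    int Mask16[16] = {0, 17, 2, 3, 4, 5, 6, 7, 8, 9, 26, 11, 12, 13, 14, 15};
    unsigned char ByteSel[32];
    for (int i = 0; i < 16; ++i)
      for (int j = 0; j < 2; ++j)
        ByteSel[2 * i + j] = Mask16[i] < 16 ? 0x00 : 0x80; // sign bit selects V2
    // Bytes 2-3 and 20-21 become 0x80, so the blend pulls 16-bit elements 1
    // and 10 from V2 and keeps everything else from V1.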
92149296 }
92159297 }
92169298
9217 /// \brief Test whether there are elements crossing 128-bit lanes in this
9218 /// shuffle mask.
9219 ///
9220 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9221 /// and we routinely test for these.
9222 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9223 int LaneSize = 128 / VT.getScalarSizeInBits();
9224 int Size = Mask.size();
9225 for (int i = 0; i < Size; ++i)
9226 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9227 return true;
9228 return false;
9229 }
9230
9231 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
9232 ///
9233 /// This checks a shuffle mask to see if it is performing the same
9234 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
9235 /// that it is also not lane-crossing. It may however involve a blend from the
9236 /// same lane of a second vector.
9237 ///
9238 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9239 /// non-trivial to compute in the face of undef lanes. The representation is
9240 /// *not* suitable for use with existing 128-bit shuffles as it will contain
9241 /// entries from both V1 and V2 inputs to the wider mask.
9242 static bool
9243 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9244 SmallVectorImpl<int> &RepeatedMask) {
9245 int LaneSize = 128 / VT.getScalarSizeInBits();
9246 RepeatedMask.resize(LaneSize, -1);
9247 int Size = Mask.size();
9248 for (int i = 0; i < Size; ++i) {
9249 if (Mask[i] < 0)
9250 continue;
9251 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9252 // This entry crosses lanes, so there is no way to model this shuffle.
9253 return false;
9254
9255 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9256 if (RepeatedMask[i % LaneSize] == -1)
9257 // This is the first non-undef entry in this slot of a 128-bit lane.
9258 RepeatedMask[i % LaneSize] =
9259 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
9260 else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
9261 // Found a mismatch with the repeated mask.
9262 return false;
9263 }
9264 return true;
9265 }
9266
92679299 /// \brief Generic routine to split a 256-bit vector shuffle into 128-bit
92689300 /// shuffles.
92699301 ///
95809612 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
95819613 assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
95829614
9583 // FIXME: Actually implement this using AVX2!!!
9584 (void)Mask;
9585 return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
9615 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
9616 Subtarget, DAG))
9617 return Blend;
9618
9619 // If the shuffle mask is repeated in each 128-bit lane we can use more
9620 // efficient instructions that mirror the shuffles across the two 128-bit
9621 // lanes.
9622 SmallVector<int, 8> RepeatedMask;
9623 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
9624 assert(RepeatedMask.size() == 8 && "Unexpected repeated mask size!");
9625 // FIXME: It might be worth it to call into the (terribly complex) v8i16
9626 // lowering here.
9627
9628 // Use dedicated unpack instructions for masks that match their pattern.
9629 //
9630 if (isShuffleEquivalent(Mask,
9631 // First 128-bit lane:
9632 0, 16, 1, 17, 2, 18, 3, 19,
9633 // Second 128-bit lane:
9634 8, 24, 9, 25, 10, 26, 11, 27))
9635 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
9636 if (isShuffleEquivalent(Mask,
9637 // First 128-bit lane:
9638 4, 20, 5, 21, 6, 22, 7, 23,
9639 // Second 128-bit lane:
9640 12, 28, 13, 29, 14, 30, 15, 31))
9641 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
9642 }
9643
9644 // There are no generalized cross-lane shuffle operations available on i16
9645 // element types.
9646 // FIXME: We should teach the "split and lower" path to do something more
9647 // clever, or do it ourselves here. The optimal lowering of cross-lane
9648 // shuffles I am aware of is to swap the lanes into a copy, shuffle both the
9649 // original and the copy, and then blend to pick up the cross-lane elements.
9650 // This is four instructions with a tree height of three which is better than
9651 // the worst case for a gather-cross-scatter approach such as used in SSE2
9652 // v8i16 lowering (where we don't have blends). While for cross-lane blends it
9653 // results in a blend tree, blends are very cheap in AVX2 and newer chips. We
9654 // might also want to special case situations where we can always do a single
9655 // VPERMD to produce a non-lane-crossing shuffle.
9656 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
9657 return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
9658
9659 if (isSingleInputShuffleMask(Mask)) {
9660 SDValue PSHUFBMask[32];
9661 for (int i = 0; i < 16; ++i) {
9662 if (Mask[i] == -1) {
9663 PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
9664 continue;
9665 }
9666
9667 int M = i < 8 ? Mask[i] : Mask[i] - 8;
9668 assert(M >= 0 && M < 8 && "Invalid single-input mask!");
9669 PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
9670 PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
9671 }
9672 return DAG.getNode(
9673 ISD::BITCAST, DL, MVT::v16i16,
9674 DAG.getNode(
9675 X86ISD::PSHUFB, DL, MVT::v32i8,
9676 DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
9677 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
9678 }
9679
9680 // Otherwise fall back on generic blend lowering.
9681 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i16, V1, V2,
9682 Mask, DAG);
95869683 }
95879684
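The single-input PSHUFB path above doubles each 16-bit mask entry into a pair of byte indices. A self-contained sketch (assumption: standalone code mirroring that loop, not part of the commit) for the mask <0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8>:

    #include <cstdio>
    int main() {
      int Mask[16] = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
      int Bytes[32];
      for (int i = 0; i < 16; ++i) {
        int M = i < 8 ? Mask[i] : Mask[i] - 8; // lane-relative word index
        Bytes[2 * i] = 2 * M;                  // low byte of the word
        Bytes[2 * i + 1] = 2 * M + 1;          // high byte of the word
      }
      for (int B : Bytes)
        printf("%d ", B); // 0 1 0 1 ... repeated for both 128-bit lanes
      printf("\n");
      return 0;
    }

VPSHUFB on a ymm register shuffles each 128-bit lane independently with lane-relative byte indices, so the constant repeats 0,1,... in both halves; the asm-comment printer in the updated tests shows the upper lane as 16,17,... because it reports absolute byte positions.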
95889685 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
10971097 // a constant shuffle mask. We won't be able to do this at the MC layer
10981098 // because the mask isn't an immediate.
10991099 case X86::PSHUFBrm:
1100 case X86::VPSHUFBrm: {
1100 case X86::VPSHUFBrm:
1101 case X86::VPSHUFBYrm: {
11011102 if (!OutStreamer.isVerboseAsm())
11021103 break;
11031104 assert(MI->getNumOperands() > 5 &&
384384 ;
385385 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
386386 ; AVX2: # BB#0:
387 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
388 ; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
389 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
390 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
391 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
387 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
392388 ; AVX2-NEXT: retq
393389 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
394390 ret <16 x i16> %shuffle
406402 ;
407403 ; AVX2-LABEL: @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15
408404 ; AVX2: # BB#0:
409 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
410 ; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15]
411 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
412 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
413 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
405 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
414406 ; AVX2-NEXT: retq
415407 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
416408 ret <16 x i16> %shuffle
429421 ;
430422 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
431423 ; AVX2: # BB#0:
432 ; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[0,0,0,0,4,5,6,7]
433 ; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,4,4]
434 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
435 ; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
436 ; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
437 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
424 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
438425 ; AVX2-NEXT: retq
439426 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
440427 ret <16 x i16> %shuffle
453440 ;
454441 ; AVX2-LABEL: @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15
455442 ; AVX2: # BB#0:
456 ; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[3,3,3,3,4,5,6,7]
457 ; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,7,7,7]
458 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
459 ; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[3,3,3,3,4,5,6,7]
460 ; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,7,7,7]
461 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
443 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31]
462444 ; AVX2-NEXT: retq
463445 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
464446 ret <16 x i16> %shuffle
477459 ;
478460 ; AVX2-LABEL: @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14
479461 ; AVX2: # BB#0:
480 ; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[0,0,2,2,4,5,6,7]
481 ; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,6,6]
482 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
483 ; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
484 ; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6]
485 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
462 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29]
486463 ; AVX2-NEXT: retq
487464 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
488465 ret <16 x i16> %shuffle
501478 ;
502479 ; AVX2-LABEL: @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15
503480 ; AVX2: # BB#0:
504 ; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[1,1,3,3,4,5,6,7]
505 ; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,5,5,7,7]
506 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
507 ; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[1,1,3,3,4,5,6,7]
508 ; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,7,7]
509 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
481 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31]
510482 ; AVX2-NEXT: retq
511483 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
512484 ret <16 x i16> %shuffle
636608 ;
637609 ; AVX2-LABEL: @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15
638610 ; AVX2: # BB#0:
639 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
640 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
641 ; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
642 ; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
643 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
611 ; AVX2-NEXT: vpblendw {{.*}} # ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
644612 ; AVX2-NEXT: retq
645613 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
646614 ret <16 x i16> %shuffle
683651 ;
684652 ; AVX2-LABEL: @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24
685653 ; AVX2: # BB#0:
686 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
687 ; AVX2-NEXT: vmovdqa {{.*}} # xmm3 = [0,1,0,1,4,5,0,1,0,1,0,1,12,13,0,1]
688 ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
689 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
690 ; AVX2-NEXT: vpshufd {{.*}} # xmm4 = xmm4[0,0,0,0]
691 ; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4],xmm2[5],xmm4[6],xmm2[7]
692 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
693 ; AVX2-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,0,0,0]
694 ; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
695 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
654 ; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[u,u,0,1,u,u,0,1,u,u,0,1,u,u,0,1,u,u,16,17,u,u,16,17,u,u,16,17,u,u,16,17]
655 ; AVX2-NEXT: vpshufd {{.*}} # ymm0 = ymm0[0,0,0,0,4,4,4,4]
656 ; AVX2-NEXT: vpblendw {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
696657 ; AVX2-NEXT: retq
697658 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
698659 ret <16 x i16> %shuffle
712673 ;
713674 ; AVX2-LABEL: @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15
714675 ; AVX2: # BB#0:
715 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
716 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
717 ; AVX2-NEXT: vpshuflw {{.*}} # xmm3 = xmm3[0,0,0,0,4,5,6,7]
718 ; AVX2-NEXT: vpblendd {{.*}} # xmm2 = xmm3[0,1],xmm2[2,3]
719 ; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
720 ; AVX2-NEXT: vpblendd {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3]
721 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
676 ; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,u,u,u,u,u,u,u,u]
677 ; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
722678 ; AVX2-NEXT: retq
723679 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
724680 ret <16 x i16> %shuffle
740696 ;
741697 ; AVX2-LABEL: @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12
742698 ; AVX2: # BB#0:
743 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
744 ; AVX2-NEXT: vpshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4]
745 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
746 ; AVX2-NEXT: vpshuflw {{.*}} # xmm3 = xmm3[3,2,1,0,4,5,6,7]
747 ; AVX2-NEXT: vpblendd {{.*}} # xmm2 = xmm3[0,1],xmm2[2,3]
748 ; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
749 ; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
750 ; AVX2-NEXT: vpblendd {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3]
751 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
699 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[u,u,u,u,u,u,u,u,14,15,12,13,10,11,8,9,u,u,u,u,u,u,u,u,30,31,28,29,26,27,24,25]
700 ; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
701 ; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
752702 ; AVX2-NEXT: retq
753703 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
754704 ret <16 x i16> %shuffle
772722 ;
773723 ; AVX2-LABEL: @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08
774724 ; AVX2: # BB#0:
775 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
776 ; AVX2-NEXT: vpshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7]
777 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
778 ; AVX2-NEXT: vpshufd {{.*}} # xmm3 = xmm3[0,1,0,1]
779 ; AVX2-NEXT: vpshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4]
780 ; AVX2-NEXT: vpblendd {{.*}} # xmm2 = xmm2[0,1],xmm3[2,3]
781 ; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
782 ; AVX2-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
783 ; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
784 ; AVX2-NEXT: vpblendd {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3]
785 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
725 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[u,u,u,u,u,u,u,u,6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17]
726 ; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
727 ; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
786728 ; AVX2-NEXT: retq
787729 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
788730 ret <16 x i16> %shuffle
800742 ;
801743 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08
802744 ; AVX2: # BB#0:
803 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
804 ; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
805 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
806 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
807 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
745 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
808746 ; AVX2-NEXT: retq
809747 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
810748 ret <16 x i16> %shuffle
822760 ;
823761 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08
824762 ; AVX2: # BB#0:
825 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
826 ; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
827 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
828 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
829 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
763 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
830764 ; AVX2-NEXT: retq
831765 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
832766 ret <16 x i16> %shuffle
844778 ;
845779 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08
846780 ; AVX2: # BB#0:
847 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
848 ; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
849 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
850 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
851 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
781 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
852782 ; AVX2-NEXT: retq
853783 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
854784 ret <16 x i16> %shuffle
866796 ;
867797 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08
868798 ; AVX2: # BB#0:
869 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
870 ; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
871 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
872 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
873 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
799 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17]
874800 ; AVX2-NEXT: retq
875801 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
876802 ret <16 x i16> %shuffle
888814 ;
889815 ; AVX2-LABEL: @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08
890816 ; AVX2: # BB#0:
891 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
892 ; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
893 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
894 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
895 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
817 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17]
896818 ; AVX2-NEXT: retq
897819 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
898820 ret <16 x i16> %shuffle
910832 ;
911833 ; AVX2-LABEL: @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08
912834 ; AVX2: # BB#0:
913 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
914 ; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
915 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
916 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
917 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
835 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17]
918836 ; AVX2-NEXT: retq
919837 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
920838 ret <16 x i16> %shuffle
932850 ;
933851 ; AVX2-LABEL: @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08
934852 ; AVX2: # BB#0:
935 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
936 ; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
937 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
938 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
939 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
853 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
940854 ; AVX2-NEXT: retq
941855 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
942856 ret <16 x i16> %shuffle
958872 ;
959873 ; AVX2-LABEL: @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27
960874 ; AVX2: # BB#0:
961 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
962 ; AVX2-NEXT: vpunpcklwd {{.*}} # xmm2 = xmm2[0,0,1,1,2,2,3,3]
963 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
964 ; AVX2-NEXT: vpmovzxwd %xmm3, %xmm3
965 ; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
966 ; AVX2-NEXT: vpunpcklwd {{.*}} # xmm1 = xmm1[0,0,1,1,2,2,3,3]
967 ; AVX2-NEXT: vpmovzxwd %xmm0, %xmm0
968 ; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
969 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
875 ; AVX2-NEXT: vpunpcklwd {{.*}} # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
970876 ; AVX2-NEXT: retq
971877 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
972878 ret <16 x i16> %shuffle
988894 ;
989895 ; AVX2-LABEL: @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31
990896 ; AVX2: # BB#0:
991 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
992 ; AVX2-NEXT: vpunpckhwd {{.*}} # xmm2 = xmm2[4,4,5,5,6,6,7,7]
993 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
994 ; AVX2-NEXT: vpunpckhwd {{.*}} # xmm3 = xmm3[4,4,5,5,6,6,7,7]
995 ; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
996 ; AVX2-NEXT: vpunpckhwd {{.*}} # xmm1 = xmm1[4,4,5,5,6,6,7,7]
997 ; AVX2-NEXT: vpunpckhwd {{.*}} # xmm0 = xmm0[4,4,5,5,6,6,7,7]
998 ; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
999 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
897 ; AVX2-NEXT: vpunpckhwd {{.*}} # ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
1000898 ; AVX2-NEXT: retq
1001899 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
1002900 ret <16 x i16> %shuffle
1018916 ;
1019917 ; AVX2-LABEL: @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31
1020918 ; AVX2: # BB#0:
1021 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1022 ; AVX2-NEXT: vpunpckhwd {{.*}} # xmm2 = xmm2[4,4,5,5,6,6,7,7]
1023 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
1024 ; AVX2-NEXT: vpunpckhwd {{.*}} # xmm3 = xmm3[4,4,5,5,6,6,7,7]
1025 ; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
1026 ; AVX2-NEXT: vpunpcklwd {{.*}} # xmm1 = xmm1[0,0,1,1,2,2,3,3]
1027 ; AVX2-NEXT: vpmovzxwd %xmm0, %xmm0
1028 ; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
1029 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
919 ; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31]
920 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u]
921 ; AVX2-NEXT: vpblendw {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1030922 ; AVX2-NEXT: retq
1031923 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
1032924 ret <16 x i16> %shuffle
1048940 ;
1049941 ; AVX2-LABEL: @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27
1050942 ; AVX2: # BB#0:
1051 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1052 ; AVX2-NEXT: vpunpcklwd {{.*}} # xmm2 = xmm2[0,0,1,1,2,2,3,3]
1053 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
1054 ; AVX2-NEXT: vpmovzxwd %xmm3, %xmm3
1055 ; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
1056 ; AVX2-NEXT: vpunpckhwd {{.*}} # xmm1 = xmm1[4,4,5,5,6,6,7,7]
1057 ; AVX2-NEXT: vpunpckhwd {{.*}} # xmm0 = xmm0[4,4,5,5,6,6,7,7]
1058 ; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
1059 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
943 ; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
944 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u]
945 ; AVX2-NEXT: vpblendw {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1060946 ; AVX2-NEXT: retq
1061947 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
1062948 ret <16 x i16> %shuffle
1073959 ;
1074960 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08
1075961 ; AVX2: # BB#0:
1076 ; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
1077 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1078 ; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
1079 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
962 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17]
1080963 ; AVX2-NEXT: retq
1081964 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
1082965 ret <16 x i16> %shuffle
1093976 ;
1094977 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08
1095978 ; AVX2: # BB#0:
1096 ; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
1097 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1098 ; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1]
1099 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
979 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17]
1100980 ; AVX2-NEXT: retq
1101981 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
1102982 ret <16 x i16> %shuffle
1113993 ;
1114994 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08
1115995 ; AVX2: # BB#0:
1116 ; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
1117 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1118 ; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1]
1119 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
996 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17]
1120997 ; AVX2-NEXT: retq
1121998 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
1122999 ret <16 x i16> %shuffle
11331010 ;
11341011 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08
11351012 ; AVX2: # BB#0:
1136 ; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
1137 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1138 ; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1]
1139 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1013 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17]
11401014 ; AVX2-NEXT: retq
11411015 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
11421016 ret <16 x i16> %shuffle
11531027 ;
11541028 ; AVX2-LABEL: @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08
11551029 ; AVX2: # BB#0:
1156 ; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
1157 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1158 ; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
1159 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1030 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17]
11601031 ; AVX2-NEXT: retq
11611032 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
11621033 ret <16 x i16> %shuffle
11731044 ;
11741045 ; AVX2-LABEL: @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08
11751046 ; AVX2: # BB#0:
1176 ; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
1177 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1178 ; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1]
1179 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1047 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17]
11801048 ; AVX2-NEXT: retq
11811049 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
11821050 ret <16 x i16> %shuffle
11931061 ;
11941062 ; AVX2-LABEL: @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15
11951063 ; AVX2: # BB#0:
1196 ; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
1197 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1198 ; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15]
1199 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1064 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31]
12001065 ; AVX2-NEXT: retq
12011066 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
12021067 ret <16 x i16> %shuffle
12141079 ;
12151080 ; AVX2-LABEL: @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08
12161081 ; AVX2: # BB#0:
1217 ; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[0,0,2,2,4,5,6,7]
1218 ; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,6,6]
1219 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1220 ; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[12,13,12,13,8,9,8,9,4,5,4,5,0,1,0,1]
1221 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1082 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17]
12221083 ; AVX2-NEXT: retq
12231084 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
12241085 ret <16 x i16> %shuffle
12361097 ;
12371098 ; AVX2-LABEL: @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12
12381099 ; AVX2: # BB#0:
1239 ; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
1240 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1241 ; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
1242 ; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
1243 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1100 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
12441101 ; AVX2-NEXT: retq
12451102 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
12461103 ret <16 x i16> %shuffle
12571114 ;
12581115 ; AVX2-LABEL: @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08
12591116 ; AVX2: # BB#0:
1260 ; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,2,3,4,5,0,1,0,1,0,1,0,1,0,1]
1261 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1262 ; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1]
1263 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1117 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17]
12641118 ; AVX2-NEXT: retq
12651119 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
12661120 ret <16 x i16> %shuffle
12771131 ;
12781132 ; AVX2-LABEL: @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15
12791133 ; AVX2: # BB#0:
1280 ; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
1281 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1282 ; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15]
1283 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1134 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31]
12841135 ; AVX2-NEXT: retq
12851136 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
12861137 ret <16 x i16> %shuffle
12981149 ;
12991150 ; AVX2-LABEL: @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08
13001151 ; AVX2: # BB#0:
1301 ; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[0,1,2,2,4,5,6,7]
1302 ; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,6,6]
1303 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1304 ; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[12,13,12,13,12,13,8,9,4,5,4,5,0,1,0,1]
1305 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1152 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17]
13061153 ; AVX2-NEXT: retq
13071154 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
13081155 ret <16 x i16> %shuffle
13211168 ;
13221169 ; AVX2-LABEL: @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12
13231170 ; AVX2: # BB#0:
1324 ; AVX2-NEXT: vpshufd {{.*}} # xmm1 = xmm0[2,1,2,3]
1325 ; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
1326 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1327 ; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[0,0,0,3,4,5,6,7]
1328 ; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
1329 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1171 ; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25]
13301172 ; AVX2-NEXT: retq
13311173 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
13321174 ret <16 x i16> %shuffle