commit 3496365 (llvm/trunk r260834)
Author: Simon Pilgrim

[X86][AVX] Lower shuffles as repeated lane shuffles then lane-crossing shuffles

This patch attempts to represent a shuffle as a repeating shuffle (recognisable by is128BitLaneRepeatedShuffleMask) with the source input(s) in their original lanes, followed by a single permutation of the 128-bit lanes to their final destinations. On AVX2 we can additionally attempt to match using 64-bit sub-lane permutations. AVX2 can also now match a similar 'broadcasted' repeating shuffle.

This patch has several benefits:

 * Avoids prematurely matching with lowerVectorShuffleByMerging128BitLanes, which can require both inputs to have their input lanes permuted before shuffling.
 * Can replace PERMPS/PERMD instructions - although these are useful for cross-lane unary shuffling, they require their shuffle masks to be pre-loaded (and increase register pressure).
 * Matching the repeating shuffle makes use of a lot of existing shuffle lowering.

There is an outstanding minor AVX1 regression (combine_unneeded_subvector1 in vector-shuffle-combining.ll): a previously 128-bit shuffle + subvector splat is now converted to a subvector splat + (2 instruction) 256-bit shuffle. I intend to fix this in a followup patch for review.

Differential Revision: http://reviews.llvm.org/D16537

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@260834 91177308-0d34-0410-b5e6-96231b3b80d8
9 changed files with 256 additions and 119 deletions.
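The whole-lane decomposition is easiest to see on a unary mask. The following standalone sketch (plain C++ for illustration only - it is not the patch's SelectionDAG code, and all names are invented) applies the same two checks the new lowering performs: every 128-bit lane must use one repeating in-lane pattern, and every destination lane must read from a single source lane:

// Sketch: split an 8 x f32 shuffle mask into an in-lane repeating mask plus
// a 128-bit lane permute. Unary masks only (0 <= M < 8), UNDEF == -1.
#include <array>
#include <cstdio>

int main() {
  constexpr int NumElts = 8, NumLanes = 2, NumLaneElts = 4;
  std::array<int, NumElts> Mask = {7, 6, 5, 4, 3, 2, 1, 0};

  std::array<int, NumLaneElts> RepeatMask = {-1, -1, -1, -1};
  std::array<int, NumLanes> Dst2SrcLane = {-1, -1};

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // UNDEF matches anything.
    // Every lane must ask for the same in-lane element index.
    int &R = RepeatMask[i % NumLaneElts];
    if (R >= 0 && R != M % NumLaneElts)
      return puts("not decomposable"), 1;
    R = M % NumLaneElts;
    // Each destination lane must read from exactly one source lane.
    int &Src = Dst2SrcLane[i / NumLaneElts];
    if (Src >= 0 && Src != M / NumLaneElts)
      return puts("not decomposable"), 1;
    Src = M / NumLaneElts;
  }

  // Prints: repeat [3,2,1,0], lane permute [1,0]
  printf("repeat [%d,%d,%d,%d], lane permute [%d,%d]\n", RepeatMask[0],
         RepeatMask[1], RepeatMask[2], RepeatMask[3], Dst2SrcLane[0],
         Dst2SrcLane[1]);
  return 0;
}

For the reverse mask <7,6,5,4,3,2,1,0> this yields the in-lane repeat [3,2,1,0] plus the lane swap [1,0] - i.e. the VPERMILPS + VPERM2F128 pair visible in the updated shuffle_v8f32_76543210 checks below, with no shuffle mask constant to load.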
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -10754,6 +10754,136 @@
   return true;
 }
 
+/// Handle case where shuffle sources are coming from the same 128-bit lane and
+/// every lane can be represented as the same repeating mask - allowing us to
+/// shuffle the sources with the repeating shuffle and then permute the result
+/// to the destination lanes.
+static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
+    SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumLaneElts = NumElts / NumLanes;
+
+  // On AVX2 we may be able to just shuffle the lowest elements and then
+  // broadcast the result.
+  if (Subtarget.hasAVX2()) {
+    for (unsigned BroadcastSize : {16, 32, 64}) {
+      if (BroadcastSize <= VT.getScalarSizeInBits())
+        continue;
+      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
+
+      // Attempt to match a repeating pattern every NumBroadcastElts,
+      // accounting for UNDEFs but only referencing the lowest 128-bit
+      // lane of the inputs.
+      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
+        for (int i = 0; i != NumElts; i += NumBroadcastElts)
+          for (int j = 0; j != NumBroadcastElts; ++j) {
+            int M = Mask[i + j];
+            if (M < 0)
+              continue;
+            int &R = RepeatMask[j];
+            if (0 != ((M % NumElts) / NumLaneElts))
+              return false;
+            else if (0 <= R && R != M)
+              return false;
+            else
+              R = M;
+          }
+        return true;
+      };
+
+      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
+      if (!FindRepeatingBroadcastMask(RepeatMask))
+        continue;
+
+      // Shuffle the (lowest) repeated elements in place for broadcast.
+      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
+
+      // Shuffle the actual broadcast.
+      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
+      for (int i = 0; i != NumElts; i += NumBroadcastElts)
+        for (int j = 0; j != NumBroadcastElts; ++j)
+          BroadcastMask[i + j] = j;
+      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
+                                  BroadcastMask);
+    }
+  }
+
+  // Bail if we already have a repeated lane shuffle mask.
+  SmallVector<int, 8> RepeatedShuffleMask((unsigned)NumLaneElts, -1);
+  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
+    return SDValue();
+
+  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
+  // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
+  int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
+  int NumSubLanes = NumLanes * SubLaneScale;
+  int NumSubLaneElts = NumLaneElts / SubLaneScale;
+
+  // Check that all the sources are coming from the same lane and see if we
+  // can form a repeating shuffle mask (local to each lane). At the same time,
+  // determine the source sub-lane for each destination sub-lane.
+  int TopSrcSubLane = -1;
+  SmallVector<int, 8> RepeatedLaneMask((unsigned)NumLaneElts, -1);
+  SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
+  for (int i = 0; i != NumElts; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+    assert(0 <= M && M < 2 * NumElts);
+
+    // Check that the local mask index is the same for every lane. We always do
+    // this with 128-bit lanes to match is128BitLaneRepeatedShuffleMask.
+    int LocalM = M < NumElts ? (M % NumLaneElts) : (M % NumLaneElts) + NumElts;
+    int &RepeatM = RepeatedLaneMask[i % NumLaneElts];
+    if (0 <= RepeatM && RepeatM != LocalM)
+      return SDValue();
+    RepeatM = LocalM;
+
+    // Check that the whole of each destination sub-lane comes from the same
+    // sub-lane; we need to calculate the source based on where the repeated
+    // lane mask will have left it.
+    int SrcLane = (M % NumElts) / NumLaneElts;
+    int SrcSubLane = (SrcLane * SubLaneScale) +
+                     ((i % NumLaneElts) / NumSubLaneElts);
+    int &Dst2SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
+    if (0 <= Dst2SrcSubLane && SrcSubLane != Dst2SrcSubLane)
+      return SDValue();
+    Dst2SrcSubLane = SrcSubLane;
+
+    // Track the topmost source sub-lane - by setting the remaining to UNDEF
+    // we can greatly simplify shuffle matching.
+    TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
+  }
+  assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
+         "Unexpected source lane");
+
+  // Create a repeating shuffle mask for the entire vector.
+  SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
+  for (int i = 0, e = ((TopSrcSubLane + 1) * NumSubLaneElts); i != e; ++i) {
+    int M = RepeatedLaneMask[i % NumLaneElts];
+    if (M < 0)
+      continue;
+    int Lane = i / NumLaneElts;
+    RepeatedMask[i] = M + (Lane * NumLaneElts);
+  }
+  SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
+
+  // Shuffle each source sub-lane to its destination.
+  SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
+  for (int i = 0; i != NumElts; i += NumSubLaneElts) {
+    int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
+    if (SrcSubLane < 0)
+      continue;
+    for (int j = 0; j != NumSubLaneElts; ++j)
+      SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
+  }
+
+  return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
+                              SubLaneMask);
+}
+
 static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT,
                                             ArrayRef<int> Mask, SDValue V1,
                                             SDValue V2, SelectionDAG &DAG) {
@@ -10828,6 +10958,12 @@
       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
 
+    // Try to create an in-lane repeating shuffle mask and then shuffle
+    // the results into the target lanes.
+    if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+      return V;
+
     // Otherwise, fall back.
     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                    DAG);
@@ -10846,6 +10982,12 @@
   if (SDValue Op =
           lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
     return Op;
+
+  // Try to create an in-lane repeating shuffle mask and then shuffle
+  // the results into the target lanes.
+  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+    return V;
 
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle. However, if we have AVX2 and either inputs are already in place,
@@ -11000,6 +11142,12 @@
     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
   }
 
+  // Try to create an in-lane repeating shuffle mask and then shuffle
+  // the results into the target lanes.
+  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
   // If we have a single input shuffle with different shuffle patterns in the
   // two 128-bit lanes use the variable mask to VPERMILPS.
   if (isSingleInputShuffleMask(Mask)) {
@@ -11095,6 +11243,12 @@
           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
     return Rotate;
 
+  // Try to create an in-lane repeating shuffle mask and then shuffle
+  // the results into the target lanes.
+  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
   // If the shuffle patterns aren't repeated but it is a single input, directly
   // generate a cross-lane VPERMD instruction.
   if (isSingleInputShuffleMask(Mask)) {
@@ -11164,6 +11318,12 @@
           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
     return Rotate;
 
+  // Try to create an in-lane repeating shuffle mask and then shuffle
+  // the results into the target lanes.
+  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
   if (isSingleInputShuffleMask(Mask)) {
     // There are no generalized cross-lane shuffle operations available on i16
     // element types.
@@ -11254,6 +11414,12 @@
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
     return Rotate;
+
+  // Try to create an in-lane repeating shuffle mask and then shuffle
+  // the results into the target lanes.
+  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+    return V;
 
   if (isSingleInputShuffleMask(Mask)) {
     // There are no generalized cross-lane shuffle operations available on i8
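On AVX2 the same decomposition can run at 64-bit sub-lane granularity (SubLaneScale == 2 above), so the final permute may be a VPERMQ/VPERMPD immediate rather than a whole-lane swap. A sketch of the sub-lane bookkeeping on the v8i32 truncation mask <0,2,4,6,u,u,u,u> (again illustrative standalone C++, not the patch's code):

// Sketch: AVX2 sub-lane variant. The repeating in-lane mask is still checked
// per 128-bit lane, but destinations are mapped at 64-bit (2 x i32) sub-lane
// granularity, so the final permute can be a VPERMQ immediate.
#include <array>
#include <cstdio>

int main() {
  constexpr int NumElts = 8, NumLaneElts = 4;
  constexpr int SubLaneScale = 2; // AVX2, 256-bit vector
  constexpr int NumSubLaneElts = NumLaneElts / SubLaneScale;
  std::array<int, NumElts> Mask = {0, 2, 4, 6, -1, -1, -1, -1};

  std::array<int, NumLaneElts> RepeatMask = {-1, -1, -1, -1};
  std::array<int, 4> Dst2SrcSubLane = {-1, -1, -1, -1};

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int &R = RepeatMask[i % NumLaneElts];
    if (R >= 0 && R != M % NumLaneElts)
      return 1; // not a repeating in-lane pattern
    R = M % NumLaneElts;
    // Where the repeating shuffle leaves this element, in sub-lane units.
    int SrcSubLane = (M / NumLaneElts) * SubLaneScale +
                     (i % NumLaneElts) / NumSubLaneElts;
    int &D = Dst2SrcSubLane[i / NumSubLaneElts];
    if (D >= 0 && D != SrcSubLane)
      return 1; // destination sub-lane mixes source sub-lanes
    D = SrcSubLane;
  }

  // Prints: repeat [0,2,0,2], sub-lane map [0,3,-1,-1]
  // i.e. VPSHUFD ymm[0,2,0,2,4,6,4,6] then VPERMQ with undef trailing lanes.
  printf("repeat [%d,%d,%d,%d], sub-lane map [%d,%d,%d,%d]\n", RepeatMask[0],
         RepeatMask[1], RepeatMask[2], RepeatMask[3], Dst2SrcSubLane[0],
         Dst2SrcSubLane[1], Dst2SrcSubLane[2], Dst2SrcSubLane[3]);
  return 0;
}

This is exactly the VPSHUFD + VPERMQ pair that replaces the loaded <0,2,4,6,u,u,u,u> VPERMD mask in the avx2-conversions.ll and vector-trunc.ll diffs below.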
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -123,9 +123,8 @@
 define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: funcH:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5]
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; CHECK-NEXT: retq
 entry:
   %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
--- a/test/CodeGen/X86/avx2-conversions.ll
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -3,8 +3,8 @@
 define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
 ; CHECK-LABEL: trunc4:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %B = trunc <4 x i64> %A to <4 x i32>
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -717,9 +717,8 @@
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16>
   ret <16 x i16> %shuffle
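The vpunpcklwd + vpbroadcastd pair above comes from the new broadcast path: the mask repeats <0,16> every 32 bits and references only the lowest 128-bit lane of each input, so the two elements are shuffled into place once and then a wider element is broadcast. A standalone sketch of that check (illustrative C++ only, mirroring FindRepeatingBroadcastMask):

// Sketch: match a pattern that repeats every NumBroadcastElts elements and
// only references the lowest 128-bit lane of either input (v16i16, two
// inputs, so input 1's elements are 16..31).
#include <array>
#include <cstdio>

int main() {
  constexpr int NumElts = 16, NumLaneElts = 8, NumBroadcastElts = 2;
  std::array<int, NumElts> Mask;
  for (int i = 0; i != NumElts; i += 2) {
    Mask[i] = 0;      // element 0 of the first input
    Mask[i + 1] = 16; // element 0 of the second input
  }

  std::array<int, NumBroadcastElts> Repeat = {-1, -1};
  for (int i = 0; i != NumElts; i += NumBroadcastElts)
    for (int j = 0; j != NumBroadcastElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      // Reject anything outside the low 128-bit lane of its input.
      if ((M % NumElts) / NumLaneElts != 0)
        return puts("not broadcastable"), 1;
      if (Repeat[j] >= 0 && Repeat[j] != M)
        return puts("not broadcastable"), 1;
      Repeat[j] = M;
    }

  // Prints: broadcast <0,16> -- shuffle those two 16-bit elements together
  // (VPUNPCKLWD), then broadcast the combined 32-bit element (VPBROADCASTD).
  printf("broadcast <%d,%d>\n", Repeat[0], Repeat[1]);
  return 0;
}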
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -976,10 +976,8 @@
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
 ; AVX2: # BB#0:
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32>
   ret <32 x i8> %shuffle
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -111,8 +111,8 @@
 define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_2200:
 ; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_2200:
@@ -152,8 +152,8 @@
 define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_3210:
 ; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_3210:
@@ -516,9 +516,8 @@
 define <4 x double> @shuffle_v4f64_3333(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_3333:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_3333:
@@ -662,8 +661,8 @@
 define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_2200:
 ; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_2200:
@@ -703,8 +702,8 @@
 define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_3210:
 ; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_3210:
@@ -1171,9 +1170,8 @@
 define <4 x i64> @shuffle_v4i64_3333(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_3333:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_3333:
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -194,18 +194,15 @@
 define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_08080808:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8f32_08080808:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastss %xmm1, %ymm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
   ret <8 x float> %shuffle
@@ -646,10 +643,10 @@
 ;
 ; AVX2-LABEL: shuffle_v8f32_c348cda0:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,3,4,u,u,u,u,0>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
-; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,1]
 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
@@ -659,72 +656,53 @@
 define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_f511235a:
 ; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2]
 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,1,1,4,5,5,5]
 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8f32_f511235a:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2>
-; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,5,1,1,2,3,5,u>
 ; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
   ret <8 x float> %shuffle
 }
 
 define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_32103210:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_32103210:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_32103210:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
   ret <8 x float> %shuffle
 }
 
 define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_76547654:
-; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_76547654:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_76547654:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; ALL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
   ret <8 x float> %shuffle
 }
 
 define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_76543210:
-; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_76543210:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_76543210:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x float> %shuffle
 }
@@ -782,10 +760,10 @@
 ;
 ; AVX2-LABEL: PR21138:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,u,u,1,3,5,7>
-; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <1,3,5,7,u,u,u,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,1,3,5,7,5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,3,5,7,5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
 ; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <8 x float> %truc, <8 x float> %tchose, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -843,9 +821,8 @@
 define <8 x float> @shuffle_v8f32_44444444(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_44444444:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8f32_44444444:
@@ -1106,18 +1083,15 @@
 define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_08080808:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_08080808:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
   ret <8 x i32> %shuffle
@@ -1761,19 +1735,18 @@
 define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_6caa87e5:
 ; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,2],ymm2[4,4],ymm1[6,6]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_6caa87e5:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,4,2,2,0,u,6,u>
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,3,2]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,0,3]
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
@@ -1789,8 +1762,8 @@
 ;
 ; AVX2-LABEL: shuffle_v8i32_32103210:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i32> %shuffle
@@ -1799,15 +1772,14 @@
 define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_76547654:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_76547654:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
   ret <8 x i32> %shuffle
@@ -1816,14 +1788,14 @@
 define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_76543210:
 ; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_76543210:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i32> %shuffle
@@ -2007,9 +1979,8 @@
 define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_44444444:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_44444444:
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2635,15 +2635,16 @@
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: combine_unneeded_subvector1:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX2-NEXT: retq
   %b = add <8 x i32> %a,
   %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
@@ -2897,8 +2898,8 @@
 ; AVX2-LABEL: PR22412:
 ; AVX2: # BB#0: # %entry
 ; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
 ; AVX2-NEXT: retq
 entry:
   %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -51,9 +51,10 @@
 ;
 ; AVX2-LABEL: trunc8i64_8i32:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
@@ -134,9 +135,10 @@
 ;
 ; AVX2-LABEL: trunc8i64_8i16:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -186,9 +188,10 @@
 ;
 ; AVX2-LABEL: trunc8i64_8i8:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
@@ -421,9 +424,10 @@
 ;
 ; AVX2-LABEL: trunc2x4i64_8i32:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
@@ -516,9 +520,10 @@
 ;
 ; AVX2-LABEL: trunc2x4i64_8i16:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0