llvm.org GIT mirror llvm / 2ce7031
[X86][AVX512] Improving shuffle lowering by using AVX-512 EXPAND* instructions

This patch fixes PR31351: https://llvm.org/bugs/show_bug.cgi?id=31351
1. It adds a new type of shuffle lowering.
2. We can use the expand instruction when the shuffle pattern is:
   { 0*a[0], 0*a[1], ..., 0*a[n] }, n >= 0, where the a[] elements appear in ascending order.

Reviewers: igorb, guyblank, craig.topper, RKSimon

Differential Revision: https://reviews.llvm.org/D28352

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@291584 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Michael Zuckerman
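To make the pattern concrete: a v8f32 shuffle whose result reads { a[0], 0, a[1], 0, 0, 0, 0, 0 } qualifies and becomes a single zero-masked vexpandps with write-mask 0b00000101 (see the @expand test below). The following standalone C++ sketch is illustrative only, not part of the patch; all names in it are invented. It models the zero-masking VEXPAND semantics the lowering relies on:

#include <cstdio>

// Scalar model of a zero-masked VEXPAND: consecutive source elements are
// placed into the lanes whose write-mask bit is set; every other lane is
// zeroed. This is exactly the shuffle shape the patch matches.
static void expandModel(const float *Src, unsigned KMask, float *Res, int N) {
  int Next = 0; // next consecutive source element to place
  for (int i = 0; i < N; ++i)
    Res[i] = (KMask & (1u << i)) ? Src[Next++] : 0.0f;
}

int main() {
  const float A[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  float Res[8];
  expandModel(A, 0x5, Res, 8); // write-mask 0b00000101, as in "movb $5" below
  for (float F : Res)
    std::printf("%g ", F); // prints: 1 0 2 0 0 0 0 0
  std::puts("");
  return 0;
}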
2 changed files with 448 additions and 6 deletions.

lib/Target/X86/X86ISelLowering.cpp:
   return Zeroable;
 }

+// The shuffle result is as follows:
+// { 0*a[0], 0*a[1], ..., 0*a[n] }, n >= 0, where the a[] elements appear in
+// ascending order. Each element of Zeroable corresponds to one element of
+// Mask, as computed by the computeZeroableShuffleElements function.
+//
+// The function looks for a sub-mask whose nonzero elements are in
+// increasing order. If such a sub-mask exists, the function returns true.
+static bool isNonZeroElementsInOrder(const SmallBitVector &Zeroable,
+                                     ArrayRef<int> Mask, const EVT &VectorType,
+                                     bool &IsZeroSideLeft) {
+  int NextElement = -1;
+  // Check if the Mask's nonzero elements are in increasing order.
+  for (int i = 0, e = Zeroable.size(); i < e; i++) {
+    // This lowering does not handle undef mask elements.
+    if (Mask[i] == -1)
+      return false;
+    if (Zeroable[i])
+      continue;
+    // The lowest nonzero element must come from V1 (element 0) or from V2
+    // (element NumElements, i.e. V2's first element).
+    if (NextElement == -1) {
+      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
+      IsZeroSideLeft = NextElement != 0;
+    }
+    // Exit if the mask's nonzero elements are not in increasing order.
+    if (NextElement != Mask[i])
+      return false;
+    NextElement++;
+  }
+  return true;
+}
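For illustration, here is a standalone restatement of this predicate (a sketch with invented names, not the LLVM sources): reading left to right and skipping zeroable lanes, the mask entries must be exactly 0, 1, 2, ... (expand V1) or NumElts, NumElts + 1, ... (expand V2).

#include <cassert>
#include <vector>

static bool nonZeroElementsInOrder(const std::vector<bool> &Zeroable,
                                   const std::vector<int> &Mask, int NumElts,
                                   bool &IsZeroSideLeft) {
  int Next = -1;
  for (int i = 0, e = (int)Zeroable.size(); i < e; ++i) {
    if (Mask[i] == -1) // undef lanes are not handled
      return false;
    if (Zeroable[i]) // zeroable lanes may appear anywhere
      continue;
    if (Next == -1) { // the first nonzero lane picks the expanded operand
      Next = Mask[i] != 0 ? NumElts : 0;
      IsZeroSideLeft = Next != 0;
    }
    if (Mask[i] != Next) // nonzero lanes must stay consecutive and ascending
      return false;
    ++Next;
  }
  return true;
}

int main() {
  bool Left = false;
  // Accepted: values 0, 1 in order with zeroable lanes between them.
  assert(nonZeroElementsInOrder({false, true, false, true}, {0, 4, 1, 4}, 4,
                                Left) &&
         !Left);
  // Rejected: the nonzero values 1, 0 start neither at 0 nor at NumElts.
  assert(!nonZeroElementsInOrder({false, true, false, true}, {1, 4, 0, 4}, 4,
                                 Left));
  return 0;
}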
+
 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                             ArrayRef<int> Mask, SDValue V1,
...
   return DAG.getBitcast(
       VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
+}
+
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl);
8181
8182 // Function convertBitVectorToUnsigned - The function gets SmallBitVector
8183 // as argument and convert him to unsigned.
8184 // The output of the function is not(zeroable)
8185 static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) {
8186 unsigned convertBit = 0;
8187 for (int i = 0, e = Zeroable.size(); i < e; i++)
8188 convertBit |= !(Zeroable[i]) << i;
8189 return convertBit;
8190 }
8191
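A quick worked example (again a standalone sketch, not the LLVM sources): for a 4-lane shuffle whose lanes 1 and 2 are zeroable, not(Zeroable) packs to 0b1001, i.e. the VEXPAND write-mask keeps lanes 0 and 3.

#include <cassert>

// Bit i of the result is the inverse of Zeroable[i]: a set bit marks a lane
// that receives a source element rather than a zero.
static unsigned bitVectorToWriteMask(const bool *Zeroable, int N) {
  unsigned Bits = 0;
  for (int i = 0; i < N; ++i)
    Bits |= (unsigned)!Zeroable[i] << i;
  return Bits;
}

int main() {
  const bool Zeroable[4] = {false, true, true, false};
  assert(bitVectorToWriteMask(Zeroable, 4) == 0x9); // lanes 0 and 3 kept
  return 0;
}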
+// X86 has a dedicated shuffle that can be lowered to VEXPAND.
+static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
+                                          const SmallBitVector &Zeroable,
+                                          ArrayRef<int> Mask, SDValue &V1,
+                                          SDValue &V2, SelectionDAG &DAG,
+                                          const X86Subtarget &Subtarget) {
+  bool IsLeftZeroSide = true;
+  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
+                                IsLeftZeroSide))
+    return SDValue();
+  unsigned VEXPANDMask = convertBitVectorToUnsigned(Zeroable);
+  MVT IntegerType =
+      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
+  unsigned NumElts = VT.getVectorNumElements();
+  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
+         "Unexpected number of vector elements");
+  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
+                              Subtarget, DAG, DL);
+  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
+  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
+  return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
+                     DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+                     ZeroVector);
 }

 // X86 has dedicated unpack instructions that can handle specific blend
...
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
     return Result;
+  // If we have VLX support, we can use VEXPAND.
+  if (Subtarget.hasVLX())
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;

   // If we have AVX2 then we always want to lower with a blend because at v4 we
   // can fully permute the elements.
...
                                              Zeroable, Subtarget, DAG))
     return Shift;

-  // If we have VLX support, we can use VALIGN.
-  if (Subtarget.hasVLX())
+  // If we have VLX support, we can use VALIGN or VEXPAND.
+  if (Subtarget.hasVLX()) {
     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
                                                     Mask, Subtarget, DAG))
       return Rotate;
+
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
+  }

   // Try to use PALIGNR.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
...
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
     return Result;
+  // If we have VLX support, we can use VEXPAND.
+  if (Subtarget.hasVLX())
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;

   // If we have AVX2 then we always want to lower with a blend because at v8 we
   // can fully permute the elements.
...
                                              Zeroable, Subtarget, DAG))
     return Shift;

-  // If we have VLX support, we can use VALIGN.
-  if (Subtarget.hasVLX())
+  // If we have VLX support, we can use VALIGN or EXPAND.
+  if (Subtarget.hasVLX()) {
     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
                                                     Mask, Subtarget, DAG))
       return Rotate;
+
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
+  }

   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
...

 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
...
           lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
     return Op;

+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask,
+                                             V1, V2, DAG, Subtarget))
+    return V;
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
 }

 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
 static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
+                                        const SmallBitVector &Zeroable,
                                         SDValue V1, SDValue V2,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
...
     // Otherwise, fall back to a SHUFPS sequence.
     return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
   }
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
+                                             V1, V2, DAG, Subtarget))
+    return V;

   return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
 }
...
   if (SDValue Unpck =
           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
     return Unpck;
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask,
+                                             V1, V2, DAG, Subtarget))
+    return V;

   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
 }
...
                                  CastV1, CastV2, DAG);
     return DAG.getBitcast(MVT::v16i32, ShufPS);
   }
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
+                                             V1, V2, DAG, Subtarget))
+    return V;

   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
 }
...
   // the requisite ISA extensions for that element type are available.
   switch (VT.SimpleTy) {
   case MVT::v8f64:
-    return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+    return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v16f32:
-    return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v8i64:
     return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v16i32:

test/CodeGen/X86/vector-shuffle-avx512.ll (new file):

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=skx | FileCheck %s --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=knl | FileCheck %s --check-prefix=KNL

;expand 128 -> 256 include <4 x float> <2 x double>
define <8 x float> @expand(<4 x float> %a) {
; SKX-LABEL: expand:
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
; SKX-NEXT: movb $5, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT: retq
;
; KNL-LABEL: expand:
; KNL: # BB#0:
; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7]
; KNL-NEXT: retq
  %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <8 x float> %res
}

define <8 x float> @expand1(<4 x float> %a ) {
; SKX-LABEL: expand1:
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
; SKX-NEXT: movb $-86, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT: retq
;
; KNL-LABEL: expand1:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0
; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <u,0,u,1,u,2,u,3>
; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; KNL-NEXT: retq
  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x float> %res
}

;Expand 128 -> 256 test <2 x double> -> <4 x double>
define <4 x double> @expand2(<2 x double> %a) {
; SKX-LABEL: expand2:
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
; SKX-NEXT: movb $9, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vexpandpd %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT: retq
;
; KNL-LABEL: expand2:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0
; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; KNL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; KNL-NEXT: retq
  %res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 2, i32 1>
  ret <4 x double> %res
}

;expand 128 -> 256 include case <4 x i32> <8 x i32>
define <8 x i32> @expand3(<4 x i32> %a ) {
; SKX-LABEL: expand3:
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
; SKX-NEXT: movb $-127, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT: retq
;
; KNL-LABEL: expand3:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0
; KNL-NEXT: vpbroadcastq %xmm0, %ymm0
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
; KNL-NEXT: retq
  %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32> <i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5>
  ret <8 x i32> %res
}

;expand 128 -> 256 include case <2 x i64> <4 x i64>
define <4 x i64> @expand4(<2 x i64> %a ) {
; SKX-LABEL: expand4:
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
; SKX-NEXT: movb $9, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT: retq
;
; KNL-LABEL: expand4:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0
; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; KNL-NEXT: retq
  %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> <i32 2, i32 0, i32 0, i32 3>
  ret <4 x i64> %res
}

;Negative test for 128 -> 256
define <8 x float> @expand5(<4 x float> %a ) {
; SKX-LABEL: expand5:
; SKX: # BB#0:
; SKX-NEXT: vbroadcastss %xmm0, %ymm0
; SKX-NEXT: vxorps %ymm1, %ymm1, %ymm1
; SKX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; SKX-NEXT: retq
;
; KNL-LABEL: expand5:
; KNL: # BB#0:
; KNL-NEXT: vbroadcastss %xmm0, %ymm0
; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; KNL-NEXT: retq
  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
  ret <8 x float> %res
}

define <8 x float> @expand6(<4 x float> %a ) {
; SKX-LABEL: expand6:
; SKX: # BB#0:
; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT: vinsertf{{.*}}$1, %xmm0, %ymm1, %ymm0
; SKX-NEXT: retq
;
; KNL-LABEL: expand6:
; KNL: # BB#0:
; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; KNL-NEXT: retq
  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

;expand 256 -> 512 include <8 x float> <16 x float>
define <16 x float> @expand7(<8 x float> %a) {
; SKX-LABEL: expand7:
; SKX: # BB#0:
; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; SKX-NEXT: movw $1285, %ax # imm = 0x505
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
;
; KNL-LABEL: expand7:
; KNL: # BB#0:
; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; KNL-NEXT: movw $1285, %ax # imm = 0x505
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
  %res = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 8, i32 8, i32 8, i32 8, i32 2, i32 8, i32 3, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <16 x float> %res
}

define <16 x float> @expand8(<8 x float> %a ) {
; SKX-LABEL: expand8:
; SKX: # BB#0:
; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
;
; KNL-LABEL: expand8:
; KNL: # BB#0:
; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x float> %res
}

;expand 256 -> 512 include <4 x double> <8 x double>
define <8 x double> @expand9(<4 x double> %a) {
; SKX-LABEL: expand9:
; SKX: # BB#0:
; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; SKX-NEXT: movb $-127, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
;
; KNL-LABEL: expand9:
; KNL: # BB#0:
; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; KNL-NEXT: movb $-127, %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
  %res = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
  ret <8 x double> %res
}

define <16 x i32> @expand10(<8 x i32> %a ) {
; SKX-LABEL: expand10:
; SKX: # BB#0:
; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
;
; KNL-LABEL: expand10:
; KNL: # BB#0:
; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
  %res = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x i32> %res
}

define <8 x i64> @expand11(<4 x i64> %a) {
; SKX-LABEL: expand11:
; SKX: # BB#0:
; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; SKX-NEXT: movb $-127, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
;
; KNL-LABEL: expand11:
; KNL: # BB#0:
; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; KNL-NEXT: movb $-127, %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
  %res = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
  ret <8 x i64> %res
}

;Negative test for 256 -> 512
define <16 x float> @expand12(<8 x float> %a) {
; SKX-LABEL: expand12:
; SKX: # BB#0:
; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
; SKX-NEXT: vxorps %zmm1, %zmm1, %zmm1
; SKX-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
;
; KNL-LABEL: expand12:
; KNL: # BB#0:
; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; KNL-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; KNL-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; KNL-NEXT: vmovaps %zmm1, %zmm0
; KNL-NEXT: retq
  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6, i32 8, i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8>
  ret <16 x float> %res
}

define <16 x float> @expand13(<8 x float> %a ) {
; SKX-LABEL: expand13:
; SKX: # BB#0:
; SKX-NEXT: vxorps %ymm1, %ymm1, %ymm1
; SKX-NEXT: vinsertf32x8 $1, %ymm0, %zmm1, %zmm0
; SKX-NEXT: retq
;
; KNL-LABEL: expand13:
; KNL: # BB#0:
; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %res
}

; This test covers the case where the source vector holds mixed values and the
; shuffle mask points only at the known-zero elements of that vector.

define <8 x float> @expand14(<4 x float> %a) {
; SKX-LABEL: expand14:
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0
; SKX-NEXT: movb $20, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT: retq
;
; KNL-LABEL: expand14:
; KNL: # BB#0:
; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0]
; KNL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
; KNL-NEXT: retq
  %addV = fadd <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 0.000000e+00>, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 0.000000e+00>
  %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 3, i32 3, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
  ret <8 x float> %res
}

;Negative test.
define <8 x float> @expand15(<4 x float> %a) {
; SKX-LABEL: expand15:
; SKX: # BB#0:
; SKX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3]
; SKX-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>
; SKX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0]
; SKX-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
; SKX-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0
; SKX-NEXT: retq
;
; KNL-LABEL: expand15:
; KNL: # BB#0:
; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0]
; KNL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
; KNL-NEXT: retq
  %addV = fadd <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 0.000000e+00>, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 0.000000e+00>
  %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
  ret <8 x float> %res
}