llvm.org GIT mirror: llvm / 8ea1c2f
[X86] Add initial SimplifyDemandedVectorEltsForTargetNode support

This patch adds an initial x86 SimplifyDemandedVectorEltsForTargetNode implementation to handle target shuffles. Currently the patch only decodes a target shuffle, calls SimplifyDemandedVectorElts on its input operands and removes any shuffle that reduces to undef/zero/identity.

Future work will need to integrate this with combineX86ShufflesRecursively, add support for other x86 ops, etc.

NOTE: There is a minor regression that appears to be affecting further (extractelement?) combines which I haven't been able to solve yet - possibly something to do with how nodes are added to the worklist after simplification.

Differential Revision: https://reviews.llvm.org/D52140

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@342564 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim, 1 year, 9 months ago
19 changed files with 444 additions and 400 deletions.
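The core mechanism of the patch below: given which lanes of a shuffle's result are demanded, compute which lanes of each input are actually read, then recurse into the inputs with SimplifyDemandedVectorElts. A minimal standalone sketch of that lane mapping, assuming plain C++ containers rather than the APInt/SDValue types the patch uses (the demandedSrcElts helper and its mask encoding are illustrative, not LLVM API):

#include <cassert>
#include <cstdio>
#include <vector>

// A resolved shuffle mask indexes into NumSrcs concatenated inputs: a mask
// entry m in [Src*NumElts, (Src+1)*NumElts) reads lane (m - Src*NumElts) of
// input Src. Return the lanes of input Src that are live given the demanded
// result lanes. This mirrors the SrcElts loop in the patch.
std::vector<bool> demandedSrcElts(const std::vector<int> &Mask,
                                  const std::vector<bool> &DemandedElts,
                                  int Src) {
  int NumElts = (int)Mask.size();
  int Lo = Src * NumElts;
  std::vector<bool> SrcElts(NumElts, false);
  for (int i = 0; i != NumElts; ++i)
    if (DemandedElts[i]) {
      int M = Mask[i] - Lo;
      if (0 <= M && M < NumElts)
        SrcElts[M] = true;
    }
  return SrcElts;
}

int main() {
  // v4i32 unpcklps-style mask over inputs 0 and 1: <0, 4, 1, 5>.
  std::vector<int> Mask = {0, 4, 1, 5};
  // Only result lanes 0 and 1 are demanded.
  std::vector<bool> Demanded = {true, true, false, false};

  std::vector<bool> Src0 = demandedSrcElts(Mask, Demanded, 0);
  std::vector<bool> Src1 = demandedSrcElts(Mask, Demanded, 1);
  // Only lane 0 of each input feeds a demanded result lane, so the
  // recursion can treat lanes 1-3 of both inputs as dead.
  assert(Src0[0] && !Src0[1] && !Src0[2] && !Src0[3]);
  assert(Src1[0] && !Src1[1] && !Src1[2] && !Src1[3]);
  std::printf("lane mapping OK\n");
}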
3176431764 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
3176531765 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
3176631766 return Res;
31767
31768 // Simplify source operands based on shuffle mask.
31769 // TODO - merge this into combineX86ShufflesRecursively.
31770 APInt KnownUndef, KnownZero;
31771 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
31772 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
31773 return SDValue(N, 0);
3176731774 }
3176831775
3176931776 return SDValue();
31777 }
31778
31779 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
31780 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
31781 TargetLoweringOpt &TLO, unsigned Depth) const {
31782 int NumElts = DemandedElts.getBitWidth();
31783 unsigned Opc = Op.getOpcode();
31784 EVT VT = Op.getValueType();
31785
31786 // Handle special case opcodes.
31787 switch (Opc) {
31788 case X86ISD::VBROADCAST: {
31789 SDValue Src = Op.getOperand(0);
31790 MVT SrcVT = Src.getSimpleValueType();
31791 if (!SrcVT.isVector())
31792 return false;
31793 APInt SrcUndef, SrcZero;
31794 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
31795 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
31796 Depth + 1))
31797 return true;
31798 break;
31799 }
31800 }
31801
31802 // Simplify target shuffles.
31803 if (!isTargetShuffle(Opc))
31804 return false;
31805
31806 // Get target shuffle mask.
31807 SmallVector<int, 64> OpMask;
31808 SmallVector<SDValue, 2> OpInputs;
31809 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, TLO.DAG))
31810 return false;
31811
31812 // Shuffle inputs must be the same type as the result.
31813 if (llvm::any_of(OpInputs,
31814 [VT](SDValue V) { return VT != V.getValueType(); }))
31815 return false;
31816
31817 // Attempt to simplify inputs.
31818 int NumSrcs = OpInputs.size();
31819 for (int Src = 0; Src != NumSrcs; ++Src) {
31820 int Lo = Src * NumElts;
31821 APInt SrcElts = APInt::getNullValue(NumElts);
31822 for (int i = 0; i != NumElts; ++i)
31823 if (DemandedElts[i]) {
31824 int M = OpMask[i] - Lo;
31825 if (0 <= M && M < NumElts)
31826 SrcElts.setBit(M);
31827 }
31828
31829 APInt SrcUndef, SrcZero;
31830 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
31831 TLO, Depth + 1))
31832 return true;
31833 }
31834
31835 // Check if shuffle mask can be simplified to undef/zero/identity.
31836 for (int i = 0; i != NumElts; ++i)
31837 if (!DemandedElts[i])
31838 OpMask[i] = SM_SentinelUndef;
31839
31840 if (isUndefInRange(OpMask, 0, NumElts)) {
31841 KnownUndef.setAllBits();
31842 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
31843 }
31844 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
31845 KnownZero.setAllBits();
31846 return TLO.CombineTo(
31847 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
31848 }
31849 for (int Src = 0; Src != NumSrcs; ++Src)
31850 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
31851 return TLO.CombineTo(Op, OpInputs[Src]);
31852
31853 // Extract known zero/undef elements.
31854 // TODO - Propagate input undef/zero elts.
31855 for (int i = 0; i != NumElts; ++i) {
31856 if (OpMask[i] == SM_SentinelUndef)
31857 KnownUndef.setBit(i);
31858 if (OpMask[i] == SM_SentinelZero)
31859 KnownZero.setBit(i);
31860 }
31861
31862 return false;
3177031863 }
3177131864
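The undef/zero/identity collapse above relies on three mask predicates (isUndefInRange, isUndefOrZeroInRange, isSequentialOrUndefInRange) after undemanded lanes have been set to SM_SentinelUndef. A minimal standalone sketch of equivalent checks, assuming plain C++ with -1/-2 standing in for SM_SentinelUndef/SM_SentinelZero (not the LLVM helpers themselves):

#include <cassert>
#include <vector>

constexpr int SentinelUndef = -1; // stands in for SM_SentinelUndef
constexpr int SentinelZero  = -2; // stands in for SM_SentinelZero

// Every entry undef: the whole shuffle folds to UNDEF.
bool isAllUndef(const std::vector<int> &Mask) {
  for (int M : Mask)
    if (M != SentinelUndef)
      return false;
  return true;
}

// Every entry undef or zero: the shuffle folds to a zero vector.
bool isUndefOrZero(const std::vector<int> &Mask) {
  for (int M : Mask)
    if (M != SentinelUndef && M != SentinelZero)
      return false;
  return true;
}

// Every non-undef entry i reads Pos + i: the shuffle is an identity on the
// input whose lanes start at Pos (Pos is Src * NumElts in the patch).
bool isSequentialOrUndef(const std::vector<int> &Mask, int Pos) {
  for (int i = 0, e = (int)Mask.size(); i != e; ++i)
    if (Mask[i] != SentinelUndef && Mask[i] != Pos + i)
      return false;
  return true;
}

int main() {
  // After clearing undemanded lanes, <4, -1, 6, -1> over two v4 inputs is
  // an identity on input 1, so CombineTo can replace the shuffle with it.
  std::vector<int> Mask = {4, SentinelUndef, 6, SentinelUndef};
  assert(!isAllUndef(Mask));
  assert(!isUndefOrZero(Mask));
  assert(isSequentialOrUndef(Mask, /*Pos=*/4)); // input 1 of v4: Lo = 4
}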
3177231865 /// Check if a vector extract from a target-specific shuffle of a load can be
867867 const SelectionDAG &DAG,
868868 unsigned Depth) const override;
869869
870 bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
871 const APInt &DemandedElts,
872 APInt &KnownUndef,
873 APInt &KnownZero,
874 TargetLoweringOpt &TLO,
875 unsigned Depth) const override;
876
870877 SDValue unwrapAddress(SDValue N) const override;
871878
872879 bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
55 ; CHECK: # %bb.0: # %entry
66 ; CHECK-NEXT: vmovaps (%eax), %ymm0
77 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
8 ; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
9 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
10 ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
8 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
9 ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1110 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
1211 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1312 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
11591159 ; SSE2-NEXT: movdqa %xmm1, %xmm3
11601160 ; SSE2-NEXT: psrad $3, %xmm3
11611161 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1162 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1163 ; SSE2-NEXT: psrad $2, %xmm2
1164 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1165 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1162 ; SSE2-NEXT: psrad $2, %xmm1
1163 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
11661164 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
11671165 ; SSE2-NEXT: movaps %xmm1, %xmm0
11681166 ; SSE2-NEXT: retq
11821180 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
11831181 ; SSE41-NEXT: paddd %xmm0, %xmm2
11841182 ; SSE41-NEXT: movdqa %xmm2, %xmm1
1183 ; SSE41-NEXT: psrad $4, %xmm1
11851184 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1186 ; SSE41-NEXT: psrad $3, %xmm3
1187 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
1188 ; SSE41-NEXT: psrad $4, %xmm2
1189 ; SSE41-NEXT: psrad $2, %xmm1
1190 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1191 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
1192 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
1185 ; SSE41-NEXT: psrad $2, %xmm3
1186 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
1187 ; SSE41-NEXT: psrad $3, %xmm2
1188 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1189 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
11931190 ; SSE41-NEXT: retq
11941191 ;
11951192 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
12061203 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
12071204 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
12081205 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1209 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
1210 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1206 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
12111207 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
12121208 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
12131209 ; AVX1-NEXT: retq
12521248 ; SSE2-NEXT: movdqa %xmm0, %xmm4
12531249 ; SSE2-NEXT: psrad $3, %xmm4
12541250 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1255 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1256 ; SSE2-NEXT: psrad $2, %xmm3
1257 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
1258 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3]
1251 ; SSE2-NEXT: psrad $2, %xmm0
1252 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
12591253 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
12601254 ; SSE2-NEXT: movdqa %xmm1, %xmm2
12611255 ; SSE2-NEXT: psrad $31, %xmm2
12731267 ; SSE2-NEXT: movdqa %xmm2, %xmm4
12741268 ; SSE2-NEXT: psrad $3, %xmm4
12751269 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1276 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1277 ; SSE2-NEXT: psrad $2, %xmm3
1278 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1279 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[0,3]
1270 ; SSE2-NEXT: psrad $2, %xmm2
1271 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
12801272 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
12811273 ; SSE2-NEXT: movaps %xmm2, %xmm1
12821274 ; SSE2-NEXT: retq
12841276 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
12851277 ; SSE41: # %bb.0:
12861278 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1279 ; SSE41-NEXT: psrad $31, %xmm0
12871280 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1288 ; SSE41-NEXT: psrad $31, %xmm3
1289 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1290 ; SSE41-NEXT: psrld $28, %xmm0
1291 ; SSE41-NEXT: movdqa %xmm3, %xmm4
1281 ; SSE41-NEXT: psrld $28, %xmm3
1282 ; SSE41-NEXT: movdqa %xmm0, %xmm4
12921283 ; SSE41-NEXT: psrld $30, %xmm4
1293 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5,6,7]
1294 ; SSE41-NEXT: psrld $29, %xmm3
1295 ; SSE41-NEXT: pxor %xmm5, %xmm5
1296 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
1297 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1298 ; SSE41-NEXT: paddd %xmm2, %xmm3
1299 ; SSE41-NEXT: movdqa %xmm3, %xmm4
1300 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1284 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1285 ; SSE41-NEXT: psrld $29, %xmm0
1286 ; SSE41-NEXT: pxor %xmm3, %xmm3
1287 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
1288 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
1289 ; SSE41-NEXT: paddd %xmm2, %xmm0
1290 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1291 ; SSE41-NEXT: psrad $4, %xmm4
1292 ; SSE41-NEXT: movdqa %xmm0, %xmm5
1293 ; SSE41-NEXT: psrad $2, %xmm5
1294 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
13011295 ; SSE41-NEXT: psrad $3, %xmm0
1302 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
1296 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
1297 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
1298 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1299 ; SSE41-NEXT: psrad $31, %xmm2
1300 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1301 ; SSE41-NEXT: psrld $28, %xmm4
1302 ; SSE41-NEXT: movdqa %xmm2, %xmm5
1303 ; SSE41-NEXT: psrld $30, %xmm5
1304 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1305 ; SSE41-NEXT: psrld $29, %xmm2
1306 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1307 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
1308 ; SSE41-NEXT: paddd %xmm1, %xmm2
1309 ; SSE41-NEXT: movdqa %xmm2, %xmm3
13031310 ; SSE41-NEXT: psrad $4, %xmm3
1311 ; SSE41-NEXT: movdqa %xmm2, %xmm4
13041312 ; SSE41-NEXT: psrad $2, %xmm4
13051313 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1306 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
1307 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
1308 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1309 ; SSE41-NEXT: psrad $31, %xmm3
1310 ; SSE41-NEXT: movdqa %xmm3, %xmm2
1311 ; SSE41-NEXT: psrld $28, %xmm2
1312 ; SSE41-NEXT: movdqa %xmm3, %xmm4
1313 ; SSE41-NEXT: psrld $30, %xmm4
1314 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7]
1315 ; SSE41-NEXT: psrld $29, %xmm3
1316 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
1317 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1318 ; SSE41-NEXT: paddd %xmm1, %xmm3
1319 ; SSE41-NEXT: movdqa %xmm3, %xmm4
1320 ; SSE41-NEXT: movdqa %xmm3, %xmm2
13211314 ; SSE41-NEXT: psrad $3, %xmm2
1322 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1323 ; SSE41-NEXT: psrad $4, %xmm3
1324 ; SSE41-NEXT: psrad $2, %xmm4
1325 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
13261315 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
13271316 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
13281317 ; SSE41-NEXT: movdqa %xmm2, %xmm1
14131402 ; SSE2-NEXT: movdqa %xmm0, %xmm6
14141403 ; SSE2-NEXT: psrad $3, %xmm6
14151404 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1416 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1417 ; SSE2-NEXT: psrad $2, %xmm5
1418 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
1419 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3]
1405 ; SSE2-NEXT: psrad $2, %xmm0
1406 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
14201407 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
14211408 ; SSE2-NEXT: movdqa %xmm4, %xmm1
14221409 ; SSE2-NEXT: psrad $31, %xmm1
14341421 ; SSE2-NEXT: movdqa %xmm1, %xmm6
14351422 ; SSE2-NEXT: psrad $3, %xmm6
14361423 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1437 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1438 ; SSE2-NEXT: psrad $2, %xmm5
1439 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
1440 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3]
1424 ; SSE2-NEXT: psrad $2, %xmm1
1425 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
14411426 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3]
14421427 ; SSE2-NEXT: movdqa %xmm2, %xmm4
14431428 ; SSE2-NEXT: psrad $31, %xmm4
14551440 ; SSE2-NEXT: movdqa %xmm4, %xmm6
14561441 ; SSE2-NEXT: psrad $3, %xmm6
14571442 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1458 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1459 ; SSE2-NEXT: psrad $2, %xmm5
1460 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
1461 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[0,3]
1443 ; SSE2-NEXT: psrad $2, %xmm4
1444 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
14621445 ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
14631446 ; SSE2-NEXT: movdqa %xmm3, %xmm5
14641447 ; SSE2-NEXT: psrad $31, %xmm5
14761459 ; SSE2-NEXT: movdqa %xmm5, %xmm6
14771460 ; SSE2-NEXT: psrad $3, %xmm6
14781461 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
1479 ; SSE2-NEXT: movdqa %xmm5, %xmm2
1480 ; SSE2-NEXT: psrad $2, %xmm2
1481 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
1482 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,3]
1462 ; SSE2-NEXT: psrad $2, %xmm5
1463 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
14831464 ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
14841465 ; SSE2-NEXT: movaps %xmm4, %xmm2
14851466 ; SSE2-NEXT: movaps %xmm5, %xmm3
14891470 ; SSE41: # %bb.0:
14901471 ; SSE41-NEXT: movdqa %xmm1, %xmm4
14911472 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1492 ; SSE41-NEXT: movdqa %xmm0, %xmm6
1493 ; SSE41-NEXT: psrad $31, %xmm6
1494 ; SSE41-NEXT: movdqa %xmm6, %xmm0
1495 ; SSE41-NEXT: psrld $28, %xmm0
1496 ; SSE41-NEXT: movdqa %xmm6, %xmm7
1473 ; SSE41-NEXT: psrad $31, %xmm0
1474 ; SSE41-NEXT: movdqa %xmm0, %xmm5
1475 ; SSE41-NEXT: psrld $28, %xmm5
1476 ; SSE41-NEXT: movdqa %xmm0, %xmm7
14971477 ; SSE41-NEXT: psrld $30, %xmm7
1498 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm0[4,5,6,7]
1499 ; SSE41-NEXT: psrld $29, %xmm6
1500 ; SSE41-NEXT: pxor %xmm5, %xmm5
1501 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
1502 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
1503 ; SSE41-NEXT: paddd %xmm1, %xmm6
1504 ; SSE41-NEXT: movdqa %xmm6, %xmm7
1505 ; SSE41-NEXT: movdqa %xmm6, %xmm0
1478 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
1479 ; SSE41-NEXT: psrld $29, %xmm0
1480 ; SSE41-NEXT: pxor %xmm6, %xmm6
1481 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7]
1482 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7]
1483 ; SSE41-NEXT: paddd %xmm1, %xmm0
1484 ; SSE41-NEXT: movdqa %xmm0, %xmm5
1485 ; SSE41-NEXT: psrad $4, %xmm5
1486 ; SSE41-NEXT: movdqa %xmm0, %xmm7
1487 ; SSE41-NEXT: psrad $2, %xmm7
1488 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
15061489 ; SSE41-NEXT: psrad $3, %xmm0
1507 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7]
1508 ; SSE41-NEXT: psrad $4, %xmm6
1509 ; SSE41-NEXT: psrad $2, %xmm7
1510 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
15111490 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7]
15121491 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1513 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1514 ; SSE41-NEXT: psrad $31, %xmm6
1515 ; SSE41-NEXT: movdqa %xmm6, %xmm1
1516 ; SSE41-NEXT: psrld $28, %xmm1
1517 ; SSE41-NEXT: movdqa %xmm6, %xmm7
1492 ; SSE41-NEXT: movdqa %xmm4, %xmm1
1493 ; SSE41-NEXT: psrad $31, %xmm1
1494 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1495 ; SSE41-NEXT: psrld $28, %xmm5
1496 ; SSE41-NEXT: movdqa %xmm1, %xmm7
15181497 ; SSE41-NEXT: psrld $30, %xmm7
1519 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm1[4,5,6,7]
1520 ; SSE41-NEXT: psrld $29, %xmm6
1521 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
1522 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
1523 ; SSE41-NEXT: paddd %xmm4, %xmm6
1524 ; SSE41-NEXT: movdqa %xmm6, %xmm7
1525 ; SSE41-NEXT: movdqa %xmm6, %xmm1
1498 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
1499 ; SSE41-NEXT: psrld $29, %xmm1
1500 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
1501 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7]
1502 ; SSE41-NEXT: paddd %xmm4, %xmm1
1503 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1504 ; SSE41-NEXT: psrad $4, %xmm5
1505 ; SSE41-NEXT: movdqa %xmm1, %xmm7
1506 ; SSE41-NEXT: psrad $2, %xmm7
1507 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
15261508 ; SSE41-NEXT: psrad $3, %xmm1
1527 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
1528 ; SSE41-NEXT: psrad $4, %xmm6
1529 ; SSE41-NEXT: psrad $2, %xmm7
1530 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
15311509 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7]
15321510 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7]
1533 ; SSE41-NEXT: movdqa %xmm2, %xmm6
1534 ; SSE41-NEXT: psrad $31, %xmm6
1535 ; SSE41-NEXT: movdqa %xmm6, %xmm4
1536 ; SSE41-NEXT: psrld $28, %xmm4
1537 ; SSE41-NEXT: movdqa %xmm6, %xmm7
1511 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1512 ; SSE41-NEXT: psrad $31, %xmm4
1513 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1514 ; SSE41-NEXT: psrld $28, %xmm5
1515 ; SSE41-NEXT: movdqa %xmm4, %xmm7
15381516 ; SSE41-NEXT: psrld $30, %xmm7
1539 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm4[4,5,6,7]
1540 ; SSE41-NEXT: psrld $29, %xmm6
1541 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
1542 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
1543 ; SSE41-NEXT: paddd %xmm2, %xmm6
1544 ; SSE41-NEXT: movdqa %xmm6, %xmm7
1545 ; SSE41-NEXT: movdqa %xmm6, %xmm4
1517 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
1518 ; SSE41-NEXT: psrld $29, %xmm4
1519 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
1520 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7]
1521 ; SSE41-NEXT: paddd %xmm2, %xmm4
1522 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1523 ; SSE41-NEXT: psrad $4, %xmm5
1524 ; SSE41-NEXT: movdqa %xmm4, %xmm7
1525 ; SSE41-NEXT: psrad $2, %xmm7
1526 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
15461527 ; SSE41-NEXT: psrad $3, %xmm4
1547 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
1548 ; SSE41-NEXT: psrad $4, %xmm6
1549 ; SSE41-NEXT: psrad $2, %xmm7
1550 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
15511528 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7]
15521529 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
1553 ; SSE41-NEXT: movdqa %xmm3, %xmm2
1554 ; SSE41-NEXT: psrad $31, %xmm2
1555 ; SSE41-NEXT: movdqa %xmm2, %xmm6
1556 ; SSE41-NEXT: psrld $28, %xmm6
1557 ; SSE41-NEXT: movdqa %xmm2, %xmm7
1530 ; SSE41-NEXT: movdqa %xmm3, %xmm5
1531 ; SSE41-NEXT: psrad $31, %xmm5
1532 ; SSE41-NEXT: movdqa %xmm5, %xmm2
1533 ; SSE41-NEXT: psrld $28, %xmm2
1534 ; SSE41-NEXT: movdqa %xmm5, %xmm7
15581535 ; SSE41-NEXT: psrld $30, %xmm7
1559 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
1560 ; SSE41-NEXT: psrld $29, %xmm2
1561 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
1562 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3],xmm2[4,5],xmm7[6,7]
1563 ; SSE41-NEXT: paddd %xmm3, %xmm2
1564 ; SSE41-NEXT: movdqa %xmm2, %xmm6
1565 ; SSE41-NEXT: movdqa %xmm2, %xmm5
1566 ; SSE41-NEXT: psrad $3, %xmm5
1567 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
1536 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5,6,7]
1537 ; SSE41-NEXT: psrld $29, %xmm5
1538 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1539 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7]
1540 ; SSE41-NEXT: paddd %xmm3, %xmm5
1541 ; SSE41-NEXT: movdqa %xmm5, %xmm2
15681542 ; SSE41-NEXT: psrad $4, %xmm2
1543 ; SSE41-NEXT: movdqa %xmm5, %xmm6
15691544 ; SSE41-NEXT: psrad $2, %xmm6
15701545 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
1546 ; SSE41-NEXT: psrad $3, %xmm5
15711547 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
15721548 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7]
15731549 ; SSE41-NEXT: movdqa %xmm4, %xmm2
22492225 ; SSE2-NEXT: movdqa %xmm0, %xmm3
22502226 ; SSE2-NEXT: psrad $3, %xmm3
22512227 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
2252 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2253 ; SSE2-NEXT: psrad $2, %xmm2
2254 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2255 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[0,3]
2228 ; SSE2-NEXT: psrad $2, %xmm0
2229 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
22562230 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
22572231 ; SSE2-NEXT: pxor %xmm1, %xmm1
22582232 ; SSE2-NEXT: psubd %xmm0, %xmm1
22632237 ;
22642238 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
22652239 ; SSE41: # %bb.0:
2240 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2241 ; SSE41-NEXT: psrad $31, %xmm0
22662242 ; SSE41-NEXT: movdqa %xmm0, %xmm2
2267 ; SSE41-NEXT: psrad $31, %xmm2
2268 ; SSE41-NEXT: movdqa %xmm2, %xmm1
2269 ; SSE41-NEXT: psrld $28, %xmm1
2270 ; SSE41-NEXT: movdqa %xmm2, %xmm3
2243 ; SSE41-NEXT: psrld $28, %xmm2
2244 ; SSE41-NEXT: movdqa %xmm0, %xmm3
22712245 ; SSE41-NEXT: psrld $30, %xmm3
2272 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
2273 ; SSE41-NEXT: psrld $29, %xmm2
2274 ; SSE41-NEXT: pxor %xmm4, %xmm4
2275 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
2276 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
2277 ; SSE41-NEXT: paddd %xmm0, %xmm2
2278 ; SSE41-NEXT: movdqa %xmm2, %xmm3
2279 ; SSE41-NEXT: movdqa %xmm2, %xmm1
2280 ; SSE41-NEXT: psrad $3, %xmm1
2281 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2282 ; SSE41-NEXT: psrad $4, %xmm2
2283 ; SSE41-NEXT: psrad $2, %xmm3
22842246 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2285 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2286 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2287 ; SSE41-NEXT: psubd %xmm1, %xmm4
2288 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
2289 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2247 ; SSE41-NEXT: psrld $29, %xmm0
2248 ; SSE41-NEXT: pxor %xmm2, %xmm2
2249 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
2250 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
2251 ; SSE41-NEXT: paddd %xmm1, %xmm0
2252 ; SSE41-NEXT: movdqa %xmm0, %xmm3
2253 ; SSE41-NEXT: psrad $4, %xmm3
2254 ; SSE41-NEXT: movdqa %xmm0, %xmm4
2255 ; SSE41-NEXT: psrad $2, %xmm4
2256 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
2257 ; SSE41-NEXT: psrad $3, %xmm0
2258 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
2259 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
2260 ; SSE41-NEXT: psubd %xmm0, %xmm2
2261 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
22902262 ; SSE41-NEXT: retq
22912263 ;
22922264 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
23032275 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
23042276 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm4
23052277 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
2306 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm4
2307 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
2278 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
23082279 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
23092280 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
23102281 ; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm1
307307 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
308308 ; SSE-NEXT: paddd %xmm0, %xmm1
309309 ; SSE-NEXT: movdqa %xmm1, %xmm2
310 ; SSE-NEXT: psrad $3, %xmm2
310311 ; SSE-NEXT: movdqa %xmm1, %xmm3
311 ; SSE-NEXT: psrad $2, %xmm3
312 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7]
313 ; SSE-NEXT: psrad $3, %xmm1
314 ; SSE-NEXT: psrad $1, %xmm2
315 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
316 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
317 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7]
318 ; SSE-NEXT: pmulld {{.*}}(%rip), %xmm3
319 ; SSE-NEXT: psubd %xmm3, %xmm0
312 ; SSE-NEXT: psrad $1, %xmm3
313 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
314 ; SSE-NEXT: psrad $2, %xmm1
315 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
316 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
317 ; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1
318 ; SSE-NEXT: psubd %xmm1, %xmm0
320319 ; SSE-NEXT: retq
321320 ;
322321 ; AVX1-LABEL: combine_vec_srem_by_pow2b:
333332 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
334333 ; AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
335334 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
336 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
337 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
335 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
338336 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
339337 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
340338 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
88 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
99 ; X32-NEXT: imull {{[0-9]+}}(%esp), %ecx
1010 ; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
11 ; X32-NEXT: movzwl 4(%eax,%ecx), %edx
1211 ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
13 ; X32-NEXT: movd %edx, %xmm1
14 ; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1512 ; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
1613 ; X32-NEXT: movd %xmm0, %eax
1714 ; X32-NEXT: retl
3027 ; X64-NEXT: shlq $32, %rcx
3128 ; X64-NEXT: orq %rax, %rcx
3229 ; X64-NEXT: movq %rcx, %xmm0
33 ; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
3430 ; X64-NEXT: movd %xmm0, %eax
3531 ; X64-NEXT: retq
3632 entry:
6060 ;
6161 ; X64-SSSE3-LABEL: t3:
6262 ; X64-SSSE3: # %bb.0: # %bb
63 ; X64-SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
64 ; X64-SSSE3-NEXT: movlpd %xmm0, (%rax)
63 ; X64-SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
64 ; X64-SSSE3-NEXT: movsd %xmm0, (%rax)
6565 ; X64-SSSE3-NEXT: retq
6666 ;
6767 ; X64-AVX-LABEL: t3:
6868 ; X64-AVX: # %bb.0: # %bb
69 ; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
70 ; X64-AVX-NEXT: vmovlpd %xmm0, (%rax)
69 ; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
70 ; X64-AVX-NEXT: vmovsd %xmm0, (%rax)
7171 ; X64-AVX-NEXT: retq
7272 bb:
7373 %tmp13 = load <2 x double>, <2 x double>* %a0, align 1
9292 ;
9393 ; X64-SSSE3-LABEL: t4:
9494 ; X64-SSSE3: # %bb.0:
95 ; X64-SSSE3-NEXT: movq (%rdi), %rax
95 ; X64-SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
96 ; X64-SSSE3-NEXT: movq %xmm0, %rax
9697 ; X64-SSSE3-NEXT: retq
9798 ;
9899 ; X64-AVX-LABEL: t4:
283283 define float @extract_lane_insertps_6123(<4 x float> %a0, <4 x float> *%p1) {
284284 ; SSE-LABEL: extract_lane_insertps_6123:
285285 ; SSE: # %bb.0:
286 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
286 ; SSE-NEXT: movaps (%rdi), %xmm0
287 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
287288 ; SSE-NEXT: retq
288289 ;
289290 ; AVX-LABEL: extract_lane_insertps_6123:
290291 ; AVX: # %bb.0:
291 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
292 ; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
292293 ; AVX-NEXT: retq
293294 %a1 = load <4 x float>, <4 x float> *%p1
294295 %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128)
12681268 define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
12691269 ; SSE2-LABEL: interleave_24i32_out:
12701270 ; SSE2: # %bb.0:
1271 ; SSE2-NEXT: movdqu 64(%rdi), %xmm10
12711272 ; SSE2-NEXT: movups 80(%rdi), %xmm8
1272 ; SSE2-NEXT: movups 64(%rdi), %xmm11
1273 ; SSE2-NEXT: movups (%rdi), %xmm0
1274 ; SSE2-NEXT: movups 16(%rdi), %xmm10
1275 ; SSE2-NEXT: movups 32(%rdi), %xmm9
1276 ; SSE2-NEXT: movdqu 48(%rdi), %xmm1
1277 ; SSE2-NEXT: movaps %xmm0, %xmm6
1278 ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm10[2,3]
1279 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
1280 ; SSE2-NEXT: movaps %xmm9, %xmm12
1281 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,2,3]
1282 ; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
1283 ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[0,3]
1284 ; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[2,0]
1285 ; SSE2-NEXT: movaps %xmm0, %xmm3
1286 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm9[2,0]
1287 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1288 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1289 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
1290 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm11[2,3]
1291 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,2,3]
1292 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1273 ; SSE2-NEXT: movdqu (%rdi), %xmm0
1274 ; SSE2-NEXT: movdqu 16(%rdi), %xmm11
1275 ; SSE2-NEXT: movups 32(%rdi), %xmm5
1276 ; SSE2-NEXT: movdqu 48(%rdi), %xmm9
1277 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
1278 ; SSE2-NEXT: movaps %xmm5, %xmm7
1279 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,2,3]
1280 ; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
1281 ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3]
1282 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[2,3]
1283 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1284 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[2,0]
12931285 ; SSE2-NEXT: movaps %xmm8, %xmm5
1294 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[0,3]
1295 ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm1[2,0]
1296 ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm8[2,0]
1297 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm11[0,0]
1298 ; SSE2-NEXT: movaps %xmm4, %xmm1
1299 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[3,3]
1300 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0]
1301 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[2,0]
1302 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0]
1303 ; SSE2-NEXT: movaps %xmm0, %xmm1
1304 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[3,3]
1305 ; SSE2-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm1[2,0]
1306 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm12[2,0]
1307 ; SSE2-NEXT: movups %xmm6, 16(%rsi)
1286 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,3]
1287 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,0,1]
1288 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
1289 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
1290 ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[2,3]
1291 ; SSE2-NEXT: movdqa %xmm9, %xmm2
1292 ; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm8[2,0]
1293 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
1294 ; SSE2-NEXT: movaps %xmm2, %xmm4
1295 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm10[3,3]
1296 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,0]
1297 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,0]
1298 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
1299 ; SSE2-NEXT: movaps %xmm0, %xmm4
1300 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm11[3,3]
1301 ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,0]
1302 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[2,0]
1303 ; SSE2-NEXT: movups %xmm9, 16(%rsi)
13081304 ; SSE2-NEXT: movups %xmm3, (%rsi)
1309 ; SSE2-NEXT: movups %xmm4, 16(%rdx)
1305 ; SSE2-NEXT: movups %xmm2, 16(%rdx)
13101306 ; SSE2-NEXT: movups %xmm0, (%rdx)
1311 ; SSE2-NEXT: movups %xmm2, 16(%rcx)
1312 ; SSE2-NEXT: movups %xmm7, (%rcx)
1307 ; SSE2-NEXT: movups %xmm1, 16(%rcx)
1308 ; SSE2-NEXT: movups %xmm6, (%rcx)
13131309 ; SSE2-NEXT: retq
13141310 ;
13151311 ; SSE42-LABEL: interleave_24i32_out:
2929 ; CHECK-LABEL: pr26070:
3030 ; CHECK: ## %bb.0:
3131 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
32 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
3332 ; CHECK-NEXT: retq
3433 %c = call float @copysignf(float 1.0, float undef) readnone
3534 ret float %c
1919 ; CHECK-NEXT: vmovaps 48(%rbp), %ymm14
2020 ; CHECK-NEXT: vmovaps 16(%rbp), %ymm15
2121 ; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
22 ; CHECK-NEXT: vxorps %xmm6, %xmm6, %xmm6
23 ; CHECK-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
24 ; CHECK-NEXT: # kill: def $xmm9 killed $xmm9 killed $ymm9
22 ; CHECK-NEXT: vmovaps %xmm9, %xmm6
23 ; CHECK-NEXT: vmovdqa %xmm6, %xmm9
24 ; CHECK-NEXT: # kill: def $ymm9 killed $xmm9
2525 ; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
26 ; CHECK-NEXT: vmovdqa %xmm9, %xmm0
27 ; CHECK-NEXT: # kill: def $ymm0 killed $xmm0
26 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
27 ; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
28 ; CHECK-NEXT: # implicit-def: $ymm0
29 ; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
2830 ; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
2931 ; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,0]
30 ; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
31 ; CHECK-NEXT: # implicit-def: $ymm0
32 ; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0
3332 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7]
34 ; CHECK-NEXT: vmovaps %xmm2, %xmm9
33 ; CHECK-NEXT: vmovaps %xmm2, %xmm6
3534 ; CHECK-NEXT: # implicit-def: $ymm2
36 ; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2
37 ; CHECK-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
38 ; CHECK-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
39 ; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
40 ; CHECK-NEXT: vmovaps %xmm7, %xmm9
41 ; CHECK-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7]
42 ; CHECK-NEXT: # implicit-def: $ymm6
43 ; CHECK-NEXT: vmovaps %xmm9, %xmm6
35 ; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
4436 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
45 ; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
46 ; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,3]
47 ; CHECK-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
37 ; CHECK-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm11[2,3],ymm7[4,5],ymm11[6,7]
38 ; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
39 ; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
40 ; CHECK-NEXT: vmovaps %xmm7, %xmm6
41 ; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7]
42 ; CHECK-NEXT: # implicit-def: $ymm11
43 ; CHECK-NEXT: vmovaps %xmm6, %xmm11
44 ; CHECK-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
45 ; CHECK-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3]
46 ; CHECK-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
4847 ; CHECK-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7]
4948 ; CHECK-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,1,3]
5049 ; CHECK-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,1,4,5,4,5]
5251 ; CHECK-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5352 ; CHECK-NEXT: vmovaps %ymm5, %ymm1
5453 ; CHECK-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
55 ; CHECK-NEXT: vmovaps %ymm6, %ymm3
54 ; CHECK-NEXT: vmovaps %ymm9, %ymm3
5655 ; CHECK-NEXT: vmovaps %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
57 ; CHECK-NEXT: vmovaps %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5856 ; CHECK-NEXT: vmovaps %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5957 ; CHECK-NEXT: vmovaps %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
58 ; CHECK-NEXT: vmovaps %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6059 ; CHECK-NEXT: vmovaps %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6160 ; CHECK-NEXT: vmovaps %ymm14, (%rsp) # 32-byte Spill
6261 ; CHECK-NEXT: movq %rbp, %rsp
5656 ;
5757 ; AVX2-LABEL: trunc_shl_16_v8i16_v8i32:
5858 ; AVX2: # %bb.0:
59 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29]
60 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
61 ; AVX2-NEXT: vzeroupper
59 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
6260 ; AVX2-NEXT: retq
6361 %shl = shl <8 x i32> %a,
6462 %conv = trunc <8 x i32> %shl to <8 x i16>
430430 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
431431 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
432432 ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
433 ; CHECK-SSE41-NEXT: psrld $2, %xmm2
434 ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3
435 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
436 ; CHECK-SSE41-NEXT: psrld $3, %xmm1
437 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
438 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
433 ; CHECK-SSE41-NEXT: psrld $3, %xmm2
434 ; CHECK-SSE41-NEXT: psrld $2, %xmm1
435 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
439436 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
440437 ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
441438 ; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
455452 ; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
456453 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
457454 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
458 ; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2
459 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7]
460 ; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1
461 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
462 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
455 ; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm2
456 ; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
457 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
463458 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
464459 ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
465460 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
2626 ; X32: # %bb.0:
2727 ; X32-NEXT: pushl %eax
2828 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
29 ; X32-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
29 ; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
3030 ; X32-NEXT: movss %xmm0, (%esp)
3131 ; X32-NEXT: flds (%esp)
3232 ; X32-NEXT: popl %eax
3434 ;
3535 ; X64-LABEL: t2:
3636 ; X64: # %bb.0:
37 ; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
37 ; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
3838 ; X64-NEXT: retq
3939 %X = load <4 x float>, <4 x float>* %P1
4040 %tmp = extractelement <4 x float> %X, i32 2
55 ; X32-LABEL: t1:
66 ; X32: # %bb.0:
77 ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
8 ; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
98 ; X32-NEXT: movaps %xmm0, %xmm2
109 ; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
1110 ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
267267 define <3 x double> @constrained_vector_frem_v3f64() {
268268 ; NO-FMA-LABEL: constrained_vector_frem_v3f64:
269269 ; NO-FMA: # %bb.0: # %entry
270 ; NO-FMA-NEXT: subq $56, %rsp
271 ; NO-FMA-NEXT: .cfi_def_cfa_offset 64
270 ; NO-FMA-NEXT: subq $24, %rsp
271 ; NO-FMA-NEXT: .cfi_def_cfa_offset 32
272272 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
273273 ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
274274 ; NO-FMA-NEXT: callq fmod
275 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
275 ; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
276276 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
277277 ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
278278 ; NO-FMA-NEXT: callq fmod
279 ; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
280 ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
281 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
279 ; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
282280 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
283281 ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
284282 ; NO-FMA-NEXT: callq fmod
285283 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
286284 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
287 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
288 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
289 ; NO-FMA-NEXT: addq $56, %rsp
285 ; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
286 ; NO-FMA-NEXT: # xmm0 = mem[0],zero
287 ; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
288 ; NO-FMA-NEXT: # xmm1 = mem[0],zero
289 ; NO-FMA-NEXT: addq $24, %rsp
290290 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
291291 ; NO-FMA-NEXT: retq
292292 ;
904904 define <3 x double> @constrained_vector_fma_v3f64() {
905905 ; NO-FMA-LABEL: constrained_vector_fma_v3f64:
906906 ; NO-FMA: # %bb.0: # %entry
907 ; NO-FMA-NEXT: subq $56, %rsp
908 ; NO-FMA-NEXT: .cfi_def_cfa_offset 64
907 ; NO-FMA-NEXT: subq $24, %rsp
908 ; NO-FMA-NEXT: .cfi_def_cfa_offset 32
909909 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
910910 ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
911911 ; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
912912 ; NO-FMA-NEXT: callq fma
913 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
913 ; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
914914 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
915915 ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
916916 ; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
917917 ; NO-FMA-NEXT: callq fma
918 ; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
919 ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
920 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
918 ; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
921919 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
922920 ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
923921 ; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
924922 ; NO-FMA-NEXT: callq fma
925923 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
926924 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
927 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
928 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
929 ; NO-FMA-NEXT: addq $56, %rsp
925 ; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
926 ; NO-FMA-NEXT: # xmm0 = mem[0],zero
927 ; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
928 ; NO-FMA-NEXT: # xmm1 = mem[0],zero
929 ; NO-FMA-NEXT: addq $24, %rsp
930930 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
931931 ; NO-FMA-NEXT: retq
932932 ;
13811381 define <3 x double> @constrained_vector_pow_v3f64() {
13821382 ; NO-FMA-LABEL: constrained_vector_pow_v3f64:
13831383 ; NO-FMA: # %bb.0: # %entry
1384 ; NO-FMA-NEXT: subq $56, %rsp
1385 ; NO-FMA-NEXT: .cfi_def_cfa_offset 64
1384 ; NO-FMA-NEXT: subq $24, %rsp
1385 ; NO-FMA-NEXT: .cfi_def_cfa_offset 32
13861386 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
13871387 ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
13881388 ; NO-FMA-NEXT: callq pow
1389 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1389 ; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
13901390 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
13911391 ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
13921392 ; NO-FMA-NEXT: callq pow
1393 ; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1394 ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
1395 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1393 ; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
13961394 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
13971395 ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
13981396 ; NO-FMA-NEXT: callq pow
13991397 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
14001398 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
1401 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1402 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1403 ; NO-FMA-NEXT: addq $56, %rsp
1399 ; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
1400 ; NO-FMA-NEXT: # xmm0 = mem[0],zero
1401 ; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
1402 ; NO-FMA-NEXT: # xmm1 = mem[0],zero
1403 ; NO-FMA-NEXT: addq $24, %rsp
14041404 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
14051405 ; NO-FMA-NEXT: retq
14061406 ;
16371637 define <3 x double> @constrained_vector_powi_v3f64() {
16381638 ; NO-FMA-LABEL: constrained_vector_powi_v3f64:
16391639 ; NO-FMA: # %bb.0: # %entry
1640 ; NO-FMA-NEXT: subq $56, %rsp
1641 ; NO-FMA-NEXT: .cfi_def_cfa_offset 64
1640 ; NO-FMA-NEXT: subq $24, %rsp
1641 ; NO-FMA-NEXT: .cfi_def_cfa_offset 32
16421642 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
16431643 ; NO-FMA-NEXT: movl $3, %edi
16441644 ; NO-FMA-NEXT: callq __powidf2
1645 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1645 ; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
16461646 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
16471647 ; NO-FMA-NEXT: movl $3, %edi
16481648 ; NO-FMA-NEXT: callq __powidf2
1649 ; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1650 ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
1651 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1649 ; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
16521650 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
16531651 ; NO-FMA-NEXT: movl $3, %edi
16541652 ; NO-FMA-NEXT: callq __powidf2
16551653 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
16561654 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
1657 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1658 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1659 ; NO-FMA-NEXT: addq $56, %rsp
1655 ; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
1656 ; NO-FMA-NEXT: # xmm0 = mem[0],zero
1657 ; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
1658 ; NO-FMA-NEXT: # xmm1 = mem[0],zero
1659 ; NO-FMA-NEXT: addq $24, %rsp
16601660 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
16611661 ; NO-FMA-NEXT: retq
16621662 ;
18771877 define <3 x double> @constrained_vector_sin_v3f64() {
18781878 ; NO-FMA-LABEL: constrained_vector_sin_v3f64:
18791879 ; NO-FMA: # %bb.0: # %entry
1880 ; NO-FMA-NEXT: subq $56, %rsp
1881 ; NO-FMA-NEXT: .cfi_def_cfa_offset 64
1880 ; NO-FMA-NEXT: subq $24, %rsp
1881 ; NO-FMA-NEXT: .cfi_def_cfa_offset 32
18821882 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
18831883 ; NO-FMA-NEXT: callq sin
1884 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1884 ; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
18851885 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
18861886 ; NO-FMA-NEXT: callq sin
1887 ; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1888 ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
1889 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1887 ; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
18901888 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
18911889 ; NO-FMA-NEXT: callq sin
18921890 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
18931891 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
1894 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1895 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1896 ; NO-FMA-NEXT: addq $56, %rsp
1892 ; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
1893 ; NO-FMA-NEXT: # xmm0 = mem[0],zero
1894 ; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
1895 ; NO-FMA-NEXT: # xmm1 = mem[0],zero
1896 ; NO-FMA-NEXT: addq $24, %rsp
18971897 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
18981898 ; NO-FMA-NEXT: retq
18991899 ;
21012101 define <3 x double> @constrained_vector_cos_v3f64() {
21022102 ; NO-FMA-LABEL: constrained_vector_cos_v3f64:
21032103 ; NO-FMA: # %bb.0: # %entry
2104 ; NO-FMA-NEXT: subq $56, %rsp
2105 ; NO-FMA-NEXT: .cfi_def_cfa_offset 64
2104 ; NO-FMA-NEXT: subq $24, %rsp
2105 ; NO-FMA-NEXT: .cfi_def_cfa_offset 32
21062106 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
21072107 ; NO-FMA-NEXT: callq cos
2108 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2108 ; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
21092109 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
21102110 ; NO-FMA-NEXT: callq cos
2111 ; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2112 ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
2113 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2111 ; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
21142112 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
21152113 ; NO-FMA-NEXT: callq cos
21162114 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
21172115 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
2118 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2119 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2120 ; NO-FMA-NEXT: addq $56, %rsp
2116 ; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
2117 ; NO-FMA-NEXT: # xmm0 = mem[0],zero
2118 ; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
2119 ; NO-FMA-NEXT: # xmm1 = mem[0],zero
2120 ; NO-FMA-NEXT: addq $24, %rsp
21212121 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
21222122 ; NO-FMA-NEXT: retq
21232123 ;
23252325 define <3 x double> @constrained_vector_exp_v3f64() {
23262326 ; NO-FMA-LABEL: constrained_vector_exp_v3f64:
23272327 ; NO-FMA: # %bb.0: # %entry
2328 ; NO-FMA-NEXT: subq $56, %rsp
2329 ; NO-FMA-NEXT: .cfi_def_cfa_offset 64
2328 ; NO-FMA-NEXT: subq $24, %rsp
2329 ; NO-FMA-NEXT: .cfi_def_cfa_offset 32
23302330 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
23312331 ; NO-FMA-NEXT: callq exp
2332 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2332 ; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
23332333 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
23342334 ; NO-FMA-NEXT: callq exp
2335 ; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2336 ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
2337 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2335 ; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
23382336 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
23392337 ; NO-FMA-NEXT: callq exp
23402338 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
23412339 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
2342 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2343 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2344 ; NO-FMA-NEXT: addq $56, %rsp
2340 ; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
2341 ; NO-FMA-NEXT: # xmm0 = mem[0],zero
2342 ; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
2343 ; NO-FMA-NEXT: # xmm1 = mem[0],zero
2344 ; NO-FMA-NEXT: addq $24, %rsp
23452345 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
23462346 ; NO-FMA-NEXT: retq
23472347 ;
25492549 define <3 x double> @constrained_vector_exp2_v3f64() {
25502550 ; NO-FMA-LABEL: constrained_vector_exp2_v3f64:
25512551 ; NO-FMA: # %bb.0: # %entry
2552 ; NO-FMA-NEXT: subq $56, %rsp
2553 ; NO-FMA-NEXT: .cfi_def_cfa_offset 64
2552 ; NO-FMA-NEXT: subq $24, %rsp
2553 ; NO-FMA-NEXT: .cfi_def_cfa_offset 32
25542554 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
25552555 ; NO-FMA-NEXT: callq exp2
2556 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2556 ; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
25572557 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
25582558 ; NO-FMA-NEXT: callq exp2
2559 ; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2560 ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
2561 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2559 ; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
25622560 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
25632561 ; NO-FMA-NEXT: callq exp2
25642562 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
25652563 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
2566 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2567 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2568 ; NO-FMA-NEXT: addq $56, %rsp
2564 ; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
2565 ; NO-FMA-NEXT: # xmm0 = mem[0],zero
2566 ; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
2567 ; NO-FMA-NEXT: # xmm1 = mem[0],zero
2568 ; NO-FMA-NEXT: addq $24, %rsp
25692569 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
25702570 ; NO-FMA-NEXT: retq
25712571 ;
27732773 define <3 x double> @constrained_vector_log_v3f64() {
27742774 ; NO-FMA-LABEL: constrained_vector_log_v3f64:
27752775 ; NO-FMA: # %bb.0: # %entry
2776 ; NO-FMA-NEXT: subq $56, %rsp
2777 ; NO-FMA-NEXT: .cfi_def_cfa_offset 64
2776 ; NO-FMA-NEXT: subq $24, %rsp
2777 ; NO-FMA-NEXT: .cfi_def_cfa_offset 32
27782778 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
27792779 ; NO-FMA-NEXT: callq log
2780 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2780 ; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
27812781 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
27822782 ; NO-FMA-NEXT: callq log
2783 ; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2784 ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
2785 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2783 ; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
27862784 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
27872785 ; NO-FMA-NEXT: callq log
27882786 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
27892787 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
2790 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2791 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2792 ; NO-FMA-NEXT: addq $56, %rsp
2788 ; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
2789 ; NO-FMA-NEXT: # xmm0 = mem[0],zero
2790 ; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
2791 ; NO-FMA-NEXT: # xmm1 = mem[0],zero
2792 ; NO-FMA-NEXT: addq $24, %rsp
27932793 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
27942794 ; NO-FMA-NEXT: retq
27952795 ;
29972997 define <3 x double> @constrained_vector_log10_v3f64() {
29982998 ; NO-FMA-LABEL: constrained_vector_log10_v3f64:
29992999 ; NO-FMA: # %bb.0: # %entry
3000 ; NO-FMA-NEXT: subq $56, %rsp
3001 ; NO-FMA-NEXT: .cfi_def_cfa_offset 64
3000 ; NO-FMA-NEXT: subq $24, %rsp
3001 ; NO-FMA-NEXT: .cfi_def_cfa_offset 32
30023002 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
30033003 ; NO-FMA-NEXT: callq log10
3004 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3004 ; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
30053005 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
30063006 ; NO-FMA-NEXT: callq log10
3007 ; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3008 ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
3009 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3007 ; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
30103008 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
30113009 ; NO-FMA-NEXT: callq log10
30123010 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
30133011 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
3014 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3015 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3016 ; NO-FMA-NEXT: addq $56, %rsp
3012 ; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
3013 ; NO-FMA-NEXT: # xmm0 = mem[0],zero
3014 ; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
3015 ; NO-FMA-NEXT: # xmm1 = mem[0],zero
3016 ; NO-FMA-NEXT: addq $24, %rsp
30173017 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
30183018 ; NO-FMA-NEXT: retq
30193019 ;
32213221 define <3 x double> @constrained_vector_log2_v3f64() {
32223222 ; NO-FMA-LABEL: constrained_vector_log2_v3f64:
32233223 ; NO-FMA: # %bb.0: # %entry
3224 ; NO-FMA-NEXT: subq $56, %rsp
3225 ; NO-FMA-NEXT: .cfi_def_cfa_offset 64
3224 ; NO-FMA-NEXT: subq $24, %rsp
3225 ; NO-FMA-NEXT: .cfi_def_cfa_offset 32
32263226 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
32273227 ; NO-FMA-NEXT: callq log2
3228 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3228 ; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
32293229 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
32303230 ; NO-FMA-NEXT: callq log2
3231 ; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3232 ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
3233 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3231 ; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
32343232 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
32353233 ; NO-FMA-NEXT: callq log2
32363234 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
32373235 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
3238 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3239 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3240 ; NO-FMA-NEXT: addq $56, %rsp
3236 ; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
3237 ; NO-FMA-NEXT: # xmm0 = mem[0],zero
3238 ; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
3239 ; NO-FMA-NEXT: # xmm1 = mem[0],zero
3240 ; NO-FMA-NEXT: addq $24, %rsp
32413241 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
32423242 ; NO-FMA-NEXT: retq
32433243 ;
34233423 define <3 x double> @constrained_vector_rint_v3f64() {
34243424 ; NO-FMA-LABEL: constrained_vector_rint_v3f64:
34253425 ; NO-FMA: # %bb.0: # %entry
3426 ; NO-FMA-NEXT: subq $56, %rsp
3427 ; NO-FMA-NEXT: .cfi_def_cfa_offset 64
3426 ; NO-FMA-NEXT: subq $24, %rsp
3427 ; NO-FMA-NEXT: .cfi_def_cfa_offset 32
34283428 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
34293429 ; NO-FMA-NEXT: callq rint
3430 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3430 ; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
34313431 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
34323432 ; NO-FMA-NEXT: callq rint
3433 ; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3434 ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
3435 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3433 ; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
34363434 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
34373435 ; NO-FMA-NEXT: callq rint
34383436 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
34393437 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
3440 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3441 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3442 ; NO-FMA-NEXT: addq $56, %rsp
3438 ; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
3439 ; NO-FMA-NEXT: # xmm0 = mem[0],zero
3440 ; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
3441 ; NO-FMA-NEXT: # xmm1 = mem[0],zero
3442 ; NO-FMA-NEXT: addq $24, %rsp
34433443 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
34443444 ; NO-FMA-NEXT: retq
34453445 ;
35933593 define <3 x double> @constrained_vector_nearby_v3f64() {
35943594 ; NO-FMA-LABEL: constrained_vector_nearby_v3f64:
35953595 ; NO-FMA: # %bb.0: # %entry
3596 ; NO-FMA-NEXT: subq $56, %rsp
3597 ; NO-FMA-NEXT: .cfi_def_cfa_offset 64
3596 ; NO-FMA-NEXT: subq $24, %rsp
3597 ; NO-FMA-NEXT: .cfi_def_cfa_offset 32
35983598 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
35993599 ; NO-FMA-NEXT: callq nearbyint
3600 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3600 ; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
36013601 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
36023602 ; NO-FMA-NEXT: callq nearbyint
3603 ; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3604 ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
3605 ; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3603 ; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
36063604 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
36073605 ; NO-FMA-NEXT: callq nearbyint
36083606 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
36093607 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
3610 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3611 ; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3612 ; NO-FMA-NEXT: addq $56, %rsp
3608 ; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
3609 ; NO-FMA-NEXT: # xmm0 = mem[0],zero
3610 ; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
3611 ; NO-FMA-NEXT: # xmm1 = mem[0],zero
3612 ; NO-FMA-NEXT: addq $24, %rsp
36133613 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
36143614 ; NO-FMA-NEXT: retq
36153615 ;
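The six v3f64 hunks above (exp2 through nearbyint) all exercise the same pattern: each lane of the result is produced by a separate scalar libcall, so only lane 0 of each intermediate vector is ever demanded, which is why the 16-byte movaps spills of whole xmm registers become 8-byte movsd spills and the frame shrinks from 56 to 24 bytes. A minimal sketch of the IR under test, using @constrained_vector_log_v3f64 as the representative (the vector constants below are placeholders, not the values from the test file):

define <3 x double> @constrained_vector_log_v3f64() {
entry:
  ; Each lane is lowered to its own call to log(), so no lane feeds another.
  %log = call <3 x double> @llvm.experimental.constrained.log.v3f64(
               <3 x double> <double 42.0, double 42.5, double 43.0>,
               metadata !"round.dynamic",
               metadata !"fpexcept.strict")
  ret <3 x double> %log
}
declare <3 x double> @llvm.experimental.constrained.log.v3f64(<3 x double>, metadata, metadata)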
762762 ; SSE2-NEXT: xorps %xmm2, %xmm2
763763 ; SSE2-NEXT: xorps %xmm3, %xmm3
764764 ; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
765 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
766765 ; SSE2-NEXT: movdqa %xmm0, %xmm4
767766 ; SSE2-NEXT: pslld %xmm3, %xmm4
768767 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32]
855854 ; X32-SSE-NEXT: xorps %xmm2, %xmm2
856855 ; X32-SSE-NEXT: xorps %xmm3, %xmm3
857856 ; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
858 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
859857 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
860858 ; X32-SSE-NEXT: pslld %xmm3, %xmm4
861859 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32]
875873 define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
876874 ; SSE2-LABEL: splatvar_rotate_v8i16:
877875 ; SSE2: # %bb.0:
878 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
876 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
877 ; SSE2-NEXT: psubw %xmm1, %xmm2
879878 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
880879 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
881 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
882880 ; SSE2-NEXT: movdqa %xmm0, %xmm3
883881 ; SSE2-NEXT: psllw %xmm1, %xmm3
884 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
885 ; SSE2-NEXT: psubw %xmm2, %xmm1
886 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
887 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
888 ; SSE2-NEXT: psrlw %xmm1, %xmm0
882 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
883 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
884 ; SSE2-NEXT: psrlw %xmm2, %xmm0
889885 ; SSE2-NEXT: por %xmm3, %xmm0
890886 ; SSE2-NEXT: retq
891887 ;
991987 ;
992988 ; X32-SSE-LABEL: splatvar_rotate_v8i16:
993989 ; X32-SSE: # %bb.0:
994 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
990 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
991 ; X32-SSE-NEXT: psubw %xmm1, %xmm2
995992 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
996993 ; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
997 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
998994 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
999995 ; X32-SSE-NEXT: psllw %xmm1, %xmm3
1000 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
1001 ; X32-SSE-NEXT: psubw %xmm2, %xmm1
1002 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1003 ; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1004 ; X32-SSE-NEXT: psrlw %xmm1, %xmm0
996 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
997 ; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
998 ; X32-SSE-NEXT: psrlw %xmm2, %xmm0
1005999 ; X32-SSE-NEXT: por %xmm3, %xmm0
10061000 ; X32-SSE-NEXT: retl
10071001 %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
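The excerpt ends at the splat; the remainder of splatvar_rotate_v8i16 expands the rotate in terms of that splat, roughly as follows (a sketch of the usual rotate-by-splatted-amount pattern, not copied verbatim from the test file):

define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
  ; rotate left: (a << n) | (a >> (16 - n)) for a splatted amount n
  %splat16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
  %shl = shl <8 x i16> %a, %splat
  %lshr = lshr <8 x i16> %a, %splat16
  %ror = or <8 x i16> %shl, %lshr
  ret <8 x i16> %ror
}

psllw/psrlw read the shift amount only from the low 64 bits of their operand, so just element 0 of each splat is demanded. That is why the pshuflw/pshufd pair that materialized the full splat disappears from the SSE2 and X32-SSE output, and why the psubw can now be performed on %xmm1 directly, before any shuffling.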
11141114 ; SSE2: # %bb.0:
11151115 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
11161116 ; SSE2-NEXT: xorps %xmm2, %xmm2
1117 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0]
1117 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
11181118 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
11191119 ; SSE2-NEXT: movaps %xmm2, %xmm0
11201120 ; SSE2-NEXT: retq
11231123 ; SSE3: # %bb.0:
11241124 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
11251125 ; SSE3-NEXT: xorps %xmm2, %xmm2
1126 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0]
1126 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
11271127 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
11281128 ; SSE3-NEXT: movaps %xmm2, %xmm0
11291129 ; SSE3-NEXT: retq
11321132 ; SSSE3: # %bb.0:
11331133 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
11341134 ; SSSE3-NEXT: xorps %xmm2, %xmm2
1135 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0]
1135 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
11361136 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
11371137 ; SSSE3-NEXT: movaps %xmm2, %xmm0
11381138 ; SSSE3-NEXT: retq
19221922 ; AVX2-SLOW-LABEL: PR32160:
19231923 ; AVX2-SLOW: # %bb.0:
19241924 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1925 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
19261925 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
19271926 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
19281927 ; AVX2-SLOW-NEXT: vzeroupper
19311930 ; AVX2-FAST-LABEL: PR32160:
19321931 ; AVX2-FAST: # %bb.0:
19331932 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1934 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
19351933 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
19361934 ; AVX2-FAST-NEXT: vzeroupper
19371935 ; AVX2-FAST-NEXT: retq
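PR32160 boils down to broadcasting a single truncated element. A plausible reconstruction of the test IR from the CHECK lines (the element index, 2, and the exact types are inferred, so treat this as an assumption):

define <8 x i16> @PR32160(<8 x i32> %x) {
  %trunc = trunc <8 x i32> %x to <8 x i16>
  %bcast = shufflevector <8 x i16> %trunc, <8 x i16> undef,
                         <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i16> %bcast
}

After the vpshufb pack, the demanded word already sits in the low 128-bit lane, so the cross-lane vpermq merge feeds no demanded element and can be dropped in both the AVX2-SLOW and AVX2-FAST paths.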