llvm.org GIT mirror llvm / d0b69cf
Remove NEON vmull, vmlal, and vmlsl intrinsics, replacing them with multiply, add, and subtract operations with zero-extended or sign-extended vectors. Update tests. Add auto-upgrade support for the old intrinsics.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@112773 91177308-0d34-0410-b5e6-96231b3b80d8
Bob Wilson 9 years ago
10 changed files with 456 additions and 157 deletions.
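To illustrate the new canonical form (a sketch based on the commit description, not text from the diff; the function name is invented), a widening multiply that used to be written with the llvm.arm.neon.vmulls intrinsic is now expressed with ordinary sign-extends feeding a plain mul, which the ARM backend pattern-matches back into a single vmull instruction:

; Hypothetical example: the old intrinsic form
;   %r = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b)
; is now written (and auto-upgraded to) plain IR:
define <4 x i32> @example_vmull_s16(<4 x i16> %a, <4 x i16> %b) nounwind {
  %a.ext = sext <4 x i16> %a to <4 x i32>
  %b.ext = sext <4 x i16> %b to <4 x i32>
  %r = mul <4 x i32> %a.ext, %b.ext
  ret <4 x i32> %r
}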
128128 def int_arm_neon_vmulp : Neon_2Arg_Intrinsic;
129129 def int_arm_neon_vqdmulh : Neon_2Arg_Intrinsic;
130130 def int_arm_neon_vqrdmulh : Neon_2Arg_Intrinsic;
131 def int_arm_neon_vmulls : Neon_2Arg_Long_Intrinsic;
132 def int_arm_neon_vmullu : Neon_2Arg_Long_Intrinsic;
133131 def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic;
134132 def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic;
135
136 // Vector Multiply and Accumulate/Subtract.
137 def int_arm_neon_vmlals : Neon_3Arg_Long_Intrinsic;
138 def int_arm_neon_vmlalu : Neon_3Arg_Long_Intrinsic;
139 def int_arm_neon_vmlsls : Neon_3Arg_Long_Intrinsic;
140 def int_arm_neon_vmlslu : Neon_3Arg_Long_Intrinsic;
141133 def int_arm_neon_vqdmlal : Neon_3Arg_Long_Intrinsic;
142134 def int_arm_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;
143135
301293 def int_arm_neon_vcvtfxs2fp : Neon_CvtFxToFP_Intrinsic;
302294 def int_arm_neon_vcvtfxu2fp : Neon_CvtFxToFP_Intrinsic;
303295
304 // Narrowing and Lengthening Vector Moves.
296 // Narrowing Saturating Vector Moves.
305297 def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic;
306298 def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic;
307299 def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic;
325325
326326 // Neon does not support some operations on v1i64 and v2i64 types.
327327 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
328 setOperationAction(ISD::MUL, MVT::v2i64, Expand);
328 // Custom handling for some quad-vector types to detect VMULL.
329 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
330 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
331 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
329332 setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
330333 setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
331334
683686 case ARMISD::VZIP: return "ARMISD::VZIP";
684687 case ARMISD::VUZP: return "ARMISD::VUZP";
685688 case ARMISD::VTRN: return "ARMISD::VTRN";
689 case ARMISD::VMULLs: return "ARMISD::VMULLs";
690 case ARMISD::VMULLu: return "ARMISD::VMULLu";
686691 case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
687692 case ARMISD::FMAX: return "ARMISD::FMAX";
688693 case ARMISD::FMIN: return "ARMISD::FMIN";
37503755 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Val);
37513756 }
37523757
3758 /// SkipExtension - For a node that is either a SIGN_EXTEND, ZERO_EXTEND, or
3759 /// an extending load, return the unextended value.
3760 static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
3761 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
3762 return N->getOperand(0);
3763 LoadSDNode *LD = cast<LoadSDNode>(N);
3764 return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(),
3765 LD->getBasePtr(), LD->getSrcValue(),
3766 LD->getSrcValueOffset(), LD->isVolatile(),
3767 LD->isNonTemporal(), LD->getAlignment());
3768 }
3769
3770 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
3771 // Multiplications are only custom-lowered for 128-bit vectors so that
3772 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
3773 EVT VT = Op.getValueType();
3774 assert(VT.is128BitVector() && "unexpected type for custom-lowering ISD::MUL");
3775 SDNode *N0 = Op.getOperand(0).getNode();
3776 SDNode *N1 = Op.getOperand(1).getNode();
3777 unsigned NewOpc = 0;
3778 if ((N0->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N0)) &&
3779 (N1->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N1))) {
3780 NewOpc = ARMISD::VMULLs;
3781 } else if ((N0->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N0)) &&
3782 (N1->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N1))) {
3783 NewOpc = ARMISD::VMULLu;
3784 } else if (VT.getSimpleVT().SimpleTy == MVT::v2i64) {
3785 // Fall through to expand this. It is not legal.
3786 return SDValue();
3787 } else {
3788 // Other vector multiplications are legal.
3789 return Op;
3790 }
3791
3792 // Legalize to a VMULL instruction.
3793 DebugLoc DL = Op.getDebugLoc();
3794 SDValue Op0 = SkipExtension(N0, DAG);
3795 SDValue Op1 = SkipExtension(N1, DAG);
3796
3797 assert(Op0.getValueType().is64BitVector() &&
3798 Op1.getValueType().is64BitVector() &&
3799 "unexpected types for extended operands to VMULL");
3800 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
3801 }
3802
37533803 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
37543804 switch (Op.getOpcode()) {
37553805 default: llvm_unreachable("Don't know how to custom lower this!");
37913841 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
37923842 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
37933843 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
3844 case ISD::MUL: return LowerMUL(Op, DAG);
37943845 }
37953846 return SDValue();
37963847 }
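For context (an illustrative sketch, not part of the diff; the function name is invented): the custom lowering above only fires when both multiply operands are sign-extends or sign-extending loads, or both are zero-extends or zero-extending loads. IR like the following therefore becomes an ARMISD::VMULLs node and selects to vmull.s32, while a v2i64 multiply without that structure is still expanded:

; Sketch of a v2i64 multiply that LowerMUL converts to ARMISD::VMULLs.
; SkipExtension strips the two sext nodes so the VMULL operands are the
; original 64-bit (<2 x i32>) vectors.
define <2 x i64> @example_lowermul(<2 x i32> %a, <2 x i32> %b) nounwind {
  %a.ext = sext <2 x i32> %a to <2 x i64>
  %b.ext = sext <2 x i32> %b to <2 x i64>
  %p = mul <2 x i64> %a.ext, %b.ext      ; custom-lowered: becomes vmull.s32
  ret <2 x i64> %p
}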
143143 VZIP, // zip (interleave)
144144 VUZP, // unzip (deinterleave)
145145 VTRN, // transpose
146
147 // Vector multiply long:
148 VMULLs, // ...signed
149 VMULLu, // ...unsigned
146150
147151 // Operands of the standard BUILD_VECTOR node are not legalized, which
148152 // is fine if BUILD_VECTORs are always lowered to shuffles or other
9191 def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>;
9292 def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
9393 def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
94
95 def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
96 SDTCisSameAs<1, 2>]>;
97 def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
98 def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
9499
95100 def SDTARMFMAX : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>,
96101 SDTCisSameAs<0, 2>]>;
12531258 OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
12541259 [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1),
12551260 (OpTy QPR:$src2), (OpTy QPR:$src3))))]>;
1261
1262 // Long Multiply-Add/Sub operations.
1263 class N3VLMulOp op21_20, bits<4> op11_8, bit op4,
1264 InstrItinClass itin, string OpcodeStr, string Dt,
1265 ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
1266 : N3V
1267 (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
1268 OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
1269 [(set QPR:$dst, (OpNode (TyQ QPR:$src1),
1270 (TyQ (MulOp (TyD DPR:$src2),
1271 (TyD DPR:$src3)))))]>;
1272 class N3VLMulOpSL op21_20, bits<4> op11_8,
1273 InstrItinClass itin, string OpcodeStr, string Dt,
1274 ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
1275 : N3V
1276 (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane),
1277 NVMulSLFrm, itin,
1278 OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
1279 [(set QPR:$dst,
1280 (OpNode (TyQ QPR:$src1),
1281 (TyQ (MulOp (TyD DPR:$src2),
1282 (TyD (NEONvduplane (TyD DPR_VFP2:$src3),
1283 imm:$lane))))))]>;
1284 class N3VLMulOpSL16 op21_20, bits<4> op11_8,
1285 InstrItinClass itin, string OpcodeStr, string Dt,
1286 ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
1287 : N3V
1288 (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane),
1289 NVMulSLFrm, itin,
1290 OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
1291 [(set QPR:$dst,
1292 (OpNode (TyQ QPR:$src1),
1293 (TyQ (MulOp (TyD DPR:$src2),
1294 (TyD (NEONvduplane (TyD DPR_8:$src3),
1295 imm:$lane))))))]>;
1296
12561297
12571298 // Neon Long 3-argument intrinsic. The destination register is
12581299 // a quad-register and is also used as the first source operand register.
13051346 // Long 3-register operations.
13061347 class N3VL op21_20, bits<4> op11_8, bit op4,
13071348 InstrItinClass itin, string OpcodeStr, string Dt,
1308 ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
1309 bit Commutable>
1349 ValueType TyQ, ValueType TyD, SDNode OpNode, bit Commutable>
1350 : N3V
1351 (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
1352 OpcodeStr, Dt, "$dst, $src1, $src2", "",
1353 [(set QPR:$dst, (TyQ (OpNode (TyD DPR:$src1), (TyD DPR:$src2))))]> {
1354 let isCommutable = Commutable;
1355 }
1356 class N3VLSL op21_20, bits<4> op11_8,
1357 InstrItinClass itin, string OpcodeStr, string Dt,
1358 ValueType TyQ, ValueType TyD, SDNode OpNode>
1359 : N3V
1360 (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
1361 NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
1362 [(set QPR:$dst,
1363 (TyQ (OpNode (TyD DPR:$src1),
1364 (TyD (NEONvduplane (TyD DPR_VFP2:$src2),imm:$lane)))))]>;
1365 class N3VLSL16 op21_20, bits<4> op11_8,
1366 InstrItinClass itin, string OpcodeStr, string Dt,
1367 ValueType TyQ, ValueType TyD, SDNode OpNode>
1368 : N3V
1369 (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
1370 NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
1371 [(set QPR:$dst,
1372 (TyQ (OpNode (TyD DPR:$src1),
1373 (TyD (NEONvduplane (TyD DPR_8:$src2), imm:$lane)))))]>;
1374
1375 // Long 3-register operations with explicitly extended operands.
1376 class N3VLExt op21_20, bits<4> op11_8, bit op4,
1377 InstrItinClass itin, string OpcodeStr, string Dt,
1378 ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
1379 bit Commutable>
13101380 : N3V
13111381 (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
13121382 OpcodeStr, Dt, "$dst, $src1, $src2", "",
17281798 multiclass N3VL_QHS op11_8, bit op4,
17291799 InstrItinClass itin16, InstrItinClass itin32,
17301800 string OpcodeStr, string Dt,
1731 SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
1801 SDNode OpNode, bit Commutable = 0> {
1802 def v8i16 : N3VL
1803 OpcodeStr, !strconcat(Dt, "8"),
1804 v8i16, v8i8, OpNode, Commutable>;
17321805 def v4i32 : N3VL
17331806 OpcodeStr, !strconcat(Dt, "16"),
1734 v4i32, v4i16, OpNode, ExtOp, Commutable>;
1807 v4i32, v4i16, OpNode, Commutable>;
17351808 def v2i64 : N3VL
17361809 OpcodeStr, !strconcat(Dt, "32"),
1737 v2i64, v2i32, OpNode, ExtOp, Commutable>;
1738 def v8i16 : N3VL
1739 OpcodeStr, !strconcat(Dt, "8"),
1740 v8i16, v8i8, OpNode, ExtOp, Commutable>;
1810 v2i64, v2i32, OpNode, Commutable>;
1811 }
1812
1813 multiclass N3VLSL_HS op11_8,
1814 InstrItinClass itin, string OpcodeStr, string Dt,
1815 SDNode OpNode> {
1816 def v4i16 : N3VLSL16
1817 !strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
1818 def v2i32 : N3VLSL
1819 !strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
1820 }
1821
1822 multiclass N3VLExt_QHS op11_8, bit op4,
1823 InstrItinClass itin16, InstrItinClass itin32,
1824 string OpcodeStr, string Dt,
1825 SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
1826 def v8i16 : N3VLExt
1827 OpcodeStr, !strconcat(Dt, "8"),
1828 v8i16, v8i8, OpNode, ExtOp, Commutable>;
1829 def v4i32 : N3VLExt
1830 OpcodeStr, !strconcat(Dt, "16"),
1831 v4i32, v4i16, OpNode, ExtOp, Commutable>;
1832 def v2i64 : N3VLExt
1833 OpcodeStr, !strconcat(Dt, "32"),
1834 v2i64, v2i32, OpNode, ExtOp, Commutable>;
17411835 }
17421836
17431837 // Neon Long 3-register vector intrinsics.
18531947 OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>;
18541948 def v4i32 : N3VQInt3
18551949 OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>;
1950 }
1951
1952
1953 // Neon Long Multiply-Op vector operations,
1954 // element sizes of 8, 16 and 32 bits:
1955 multiclass N3VLMulOp_QHS op11_8, bit op4,
1956 InstrItinClass itin16, InstrItinClass itin32,
1957 string OpcodeStr, string Dt, SDNode MulOp,
1958 SDNode OpNode> {
1959 def v8i16 : N3VLMulOp
1960 !strconcat(Dt, "8"), v8i16, v8i8, MulOp, OpNode>;
1961 def v4i32 : N3VLMulOp
1962 !strconcat(Dt, "16"), v4i32, v4i16, MulOp, OpNode>;
1963 def v2i64 : N3VLMulOp
1964 !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
1965 }
1966
1967 multiclass N3VLMulOpSL_HS op11_8, string OpcodeStr,
1968 string Dt, SDNode MulOp, SDNode OpNode> {
1969 def v4i16 : N3VLMulOpSL16
1970 !strconcat(Dt,"16"), v4i32, v4i16, MulOp, OpNode>;
1971 def v2i32 : N3VLMulOpSL
1972 !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
18561973 }
18571974
18581975
21292246 def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32",
21302247 v4f32, v4f32, fadd, 1>;
21312248 // VADDL : Vector Add Long (Q = D + D)
2132 defm VADDLs : N3VL_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
2133 "vaddl", "s", add, sext, 1>;
2134 defm VADDLu : N3VL_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
2135 "vaddl", "u", add, zext, 1>;
2249 defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
2250 "vaddl", "s", add, sext, 1>;
2251 defm VADDLu : N3VLExt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
2252 "vaddl", "u", add, zext, 1>;
21362253 // VADDW : Vector Add Wide (Q = Q + D)
21372254 defm VADDWs : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>;
21382255 defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>;
22462363 (SubReg_i32_lane imm:$lane)))>;
22472364
22482365 // VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D)
2249 defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
2250 "vmull", "s", int_arm_neon_vmulls, 1>;
2251 defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
2252 "vmull", "u", int_arm_neon_vmullu, 1>;
2366 defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
2367 "vmull", "s", NEONvmulls, 1>;
2368 defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
2369 "vmull", "u", NEONvmullu, 1>;
22532370 def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
22542371 v8i16, v8i8, int_arm_neon_vmullp, 1>;
2255 defm VMULLsls : N3VLIntSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s",
2256 int_arm_neon_vmulls>;
2257 defm VMULLslu : N3VLIntSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u",
2258 int_arm_neon_vmullu>;
2372 defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
2373 defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;
22592374
22602375 // VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
22612376 defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D,
23052420 (SubReg_i32_lane imm:$lane)))>;
23062421
23072422 // VMLAL : Vector Multiply Accumulate Long (Q += D * D)
2308 defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
2309 "vmlal", "s", int_arm_neon_vmlals>;
2310 defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
2311 "vmlal", "u", int_arm_neon_vmlalu>;
2312
2313 defm VMLALsls : N3VLInt3SL_HS<0, 0b0010, "vmlal", "s", int_arm_neon_vmlals>;
2314 defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal", "u", int_arm_neon_vmlalu>;
2423 defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
2424 "vmlal", "s", NEONvmulls, add>;
2425 defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
2426 "vmlal", "u", NEONvmullu, add>;
2427
2428 defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>;
2429 defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;
23152430
23162431 // VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
23172432 defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
23572472 (SubReg_i32_lane imm:$lane)))>;
23582473
23592474 // VMLSL : Vector Multiply Subtract Long (Q -= D * D)
2360 defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
2361 "vmlsl", "s", int_arm_neon_vmlsls>;
2362 defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
2363 "vmlsl", "u", int_arm_neon_vmlslu>;
2364
2365 defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl", "s", int_arm_neon_vmlsls>;
2366 defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl", "u", int_arm_neon_vmlslu>;
2475 defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
2476 "vmlsl", "s", NEONvmulls, sub>;
2477 defm VMLSLu : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
2478 "vmlsl", "u", NEONvmullu, sub>;
2479
2480 defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>;
2481 defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;
23672482
23682483 // VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
23692484 defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
23802495 def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32",
23812496 v4f32, v4f32, fsub, 0>;
23822497 // VSUBL : Vector Subtract Long (Q = D - D)
2383 defm VSUBLs : N3VL_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
2384 "vsubl", "s", sub, sext, 0>;
2385 defm VSUBLu : N3VL_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
2386 "vsubl", "u", sub, zext, 0>;
2498 defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
2499 "vsubl", "s", sub, sext, 0>;
2500 defm VSUBLu : N3VLExt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
2501 "vsubl", "u", sub, zext, 0>;
23872502 // VSUBW : Vector Subtract Wide (Q = Q - D)
23882503 defm VSUBWs : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>;
23892504 defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>;
8686
8787 ((Name.compare(14, 5, "vaddw", 5) == 0 ||
8888 Name.compare(14, 5, "vsubw", 5) == 0) &&
89 (Name.compare(19, 2, "s.", 2) == 0 ||
90 Name.compare(19, 2, "u.", 2) == 0)) ||
91
92 ((Name.compare(14, 5, "vmull", 5) == 0 ||
93 Name.compare(14, 5, "vmlal", 5) == 0 ||
94 Name.compare(14, 5, "vmlsl", 5) == 0) &&
8995 (Name.compare(19, 2, "s.", 2) == 0 ||
9096 Name.compare(19, 2, "u.", 2) == 0)) ||
9197
358364 return Upgraded;
359365 }
360366
367 /// ExtendNEONArgs - For NEON "long" and "wide" operations, where the results
368 /// have vector elements twice as big as one or both source operands, do the
369 /// sign- or zero-extension that used to be handled by intrinsics. The
370 /// extended values are returned via V0 and V1.
371 static void ExtendNEONArgs(CallInst *CI, Value *Arg0, Value *Arg1,
372 Value *&V0, Value *&V1) {
373 Function *F = CI->getCalledFunction();
374 const std::string& Name = F->getName();
375 bool isLong = (Name.at(18) == 'l');
376 bool isSigned = (Name.at(19) == 's');
377
378 if (isSigned) {
379 if (isLong)
380 V0 = new SExtInst(Arg0, CI->getType(), "", CI);
381 else
382 V0 = Arg0;
383 V1 = new SExtInst(Arg1, CI->getType(), "", CI);
384 } else {
385 if (isLong)
386 V0 = new ZExtInst(Arg0, CI->getType(), "", CI);
387 else
388 V0 = Arg0;
389 V1 = new ZExtInst(Arg1, CI->getType(), "", CI);
390 }
391 }
392
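As an illustration of what this helper produces (a sketch, not copied from the diff; the function and value names are invented): upgrading a signed vmlal call yields two sign-extends from ExtendNEONArgs, followed by the mul and add created in the caller below:

; Hypothetical upgrade of an old signed multiply-accumulate intrinsic call.
; old:  %r = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %acc, <4 x i16> %a, <4 x i16> %b)
; new:
define <4 x i32> @example_vmlal_upgrade(<4 x i32> %acc, <4 x i16> %a, <4 x i16> %b) nounwind {
  %v0 = sext <4 x i16> %a to <4 x i32>   ; from ExtendNEONArgs
  %v1 = sext <4 x i16> %b to <4 x i32>   ; from ExtendNEONArgs
  %m = mul <4 x i32> %v0, %v1
  %r = add <4 x i32> %acc, %m
  ret <4 x i32> %r
}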
361393 // UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call to the
362394 // upgraded intrinsic. All argument and return casting must be provided in
363395 // order to seamlessly integrate with existing context.
375407 // Upgrade ARM NEON intrinsics.
376408 if (Name.compare(5, 9, "arm.neon.", 9) == 0) {
377409 Instruction *NewI;
410 Value *V0, *V1;
378411 if (Name.compare(14, 7, "vmovls.", 7) == 0) {
379412 NewI = new SExtInst(CI->getArgOperand(0), CI->getType(),
380413 "upgraded." + CI->getName(), CI);
381414 } else if (Name.compare(14, 7, "vmovlu.", 7) == 0) {
382415 NewI = new ZExtInst(CI->getArgOperand(0), CI->getType(),
383416 "upgraded." + CI->getName(), CI);
384
385 } else if (Name.compare(14, 4, "vadd", 4) == 0 ||
386 Name.compare(14, 4, "vsub", 4) == 0) {
387 // Extend one (vaddw/vsubw) or both (vaddl/vsubl) operands.
388 Value *V0 = CI->getArgOperand(0);
389 Value *V1 = CI->getArgOperand(1);
390 if (Name.at(19) == 's') {
391 if (Name.at(18) == 'l')
392 V0 = new SExtInst(CI->getArgOperand(0), CI->getType(), "", CI);
393 V1 = new SExtInst(CI->getArgOperand(1), CI->getType(), "", CI);
394 } else {
395 assert(Name.at(19) == 'u' && "unexpected vadd/vsub intrinsic");
396 if (Name.at(18) == 'l')
397 V0 = new ZExtInst(CI->getArgOperand(0), CI->getType(), "", CI);
398 V1 = new ZExtInst(CI->getArgOperand(1), CI->getType(), "", CI);
399 }
400 if (Name.compare(14, 4, "vadd", 4) == 0)
401 NewI = BinaryOperator::CreateAdd(V0, V1,"upgraded."+CI->getName(),CI);
402 else
403 NewI = BinaryOperator::CreateSub(V0, V1,"upgraded."+CI->getName(),CI);
404
417 } else if (Name.compare(14, 4, "vadd", 4) == 0) {
418 ExtendNEONArgs(CI, CI->getArgOperand(0), CI->getArgOperand(1), V0, V1);
419 NewI = BinaryOperator::CreateAdd(V0, V1, "upgraded."+CI->getName(), CI);
420 } else if (Name.compare(14, 4, "vsub", 4) == 0) {
421 ExtendNEONArgs(CI, CI->getArgOperand(0), CI->getArgOperand(1), V0, V1);
422 NewI = BinaryOperator::CreateSub(V0, V1,"upgraded."+CI->getName(),CI);
423 } else if (Name.compare(14, 4, "vmul", 4) == 0) {
424 ExtendNEONArgs(CI, CI->getArgOperand(0), CI->getArgOperand(1), V0, V1);
425 NewI = BinaryOperator::CreateMul(V0, V1,"upgraded."+CI->getName(),CI);
426 } else if (Name.compare(14, 4, "vmla", 4) == 0) {
427 ExtendNEONArgs(CI, CI->getArgOperand(1), CI->getArgOperand(2), V0, V1);
428 Instruction *MulI = BinaryOperator::CreateMul(V0, V1, "", CI);
429 NewI = BinaryOperator::CreateAdd(CI->getArgOperand(0), MulI,
430 "upgraded."+CI->getName(), CI);
431 } else if (Name.compare(14, 4, "vmls", 4) == 0) {
432 ExtendNEONArgs(CI, CI->getArgOperand(1), CI->getArgOperand(2), V0, V1);
433 Instruction *MulI = BinaryOperator::CreateMul(V0, V1, "", CI);
434 NewI = BinaryOperator::CreateSub(CI->getArgOperand(0), MulI,
435 "upgraded."+CI->getName(), CI);
405436 } else if (Name.compare(14, 6, "vmovn.", 6) == 0) {
406437 NewI = new TruncInst(CI->getArgOperand(0), CI->getType(),
407438 "upgraded." + CI->getName(), CI);
5151 ; CHECK: zext <4 x i16>
5252 ; CHECK-NEXT: add <4 x i32>
5353
54 ; vsubl/vsubw should be auto-upgraded to sub with sext/zext
54 ; vsubl/vsubw should be auto-upgraded to subtract with sext/zext
5555
5656 ; CHECK: vsubls16
5757 ; CHECK-NOT: arm.neon.vsubls.v4i32
7474 ; CHECK-NOT: arm.neon.vsubwu.v4i32
7575 ; CHECK: zext <4 x i16>
7676 ; CHECK-NEXT: sub <4 x i32>
77
78 ; vmull should be auto-upgraded to multiply with sext/zext
79 ; (but vmullp should remain an intrinsic)
80
81 ; CHECK: vmulls8
82 ; CHECK-NOT: arm.neon.vmulls.v8i16
83 ; CHECK: sext <8 x i8>
84 ; CHECK-NEXT: sext <8 x i8>
85 ; CHECK-NEXT: mul <8 x i16>
86
87 ; CHECK: vmullu16
88 ; CHECK-NOT: arm.neon.vmullu.v4i32
89 ; CHECK: zext <4 x i16>
90 ; CHECK-NEXT: zext <4 x i16>
91 ; CHECK-NEXT: mul <4 x i32>
92
93 ; CHECK: vmullp8
94 ; CHECK: arm.neon.vmullp.v8i16
95
96 ; vmlal should be auto-upgraded to multiply/add with sext/zext
97
98 ; CHECK: vmlals32
99 ; CHECK-NOT: arm.neon.vmlals.v2i64
100 ; CHECK: sext <2 x i32>
101 ; CHECK-NEXT: sext <2 x i32>
102 ; CHECK-NEXT: mul <2 x i64>
103 ; CHECK-NEXT: add <2 x i64>
104
105 ; CHECK: vmlalu8
106 ; CHECK-NOT: arm.neon.vmlalu.v8i16
107 ; CHECK: zext <8 x i8>
108 ; CHECK-NEXT: zext <8 x i8>
109 ; CHECK-NEXT: mul <8 x i16>
110 ; CHECK-NEXT: add <8 x i16>
111
112 ; vmlsl should be auto-upgraded to multiply/sub with sext/zext
113
114 ; CHECK: vmlsls16
115 ; CHECK-NOT: arm.neon.vmlsls.v4i32
116 ; CHECK: sext <4 x i16>
117 ; CHECK-NEXT: sext <4 x i16>
118 ; CHECK-NEXT: mul <4 x i32>
119 ; CHECK-NEXT: sub <4 x i32>
120
121 ; CHECK: vmlslu32
122 ; CHECK-NOT: arm.neon.vmlslu.v2i64
123 ; CHECK: zext <2 x i32>
124 ; CHECK-NEXT: zext <2 x i32>
125 ; CHECK-NEXT: mul <2 x i64>
126 ; CHECK-NEXT: sub <2 x i64>
77127
78128 ; vmovn should be auto-upgraded to trunc
79129
9393 %tmp1 = load <8 x i16>* %A
9494 %tmp2 = load <8 x i8>* %B
9595 %tmp3 = load <8 x i8>* %C
96 %tmp4 = call <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
97 ret <8 x i16> %tmp4
96 %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
97 %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
98 %tmp6 = mul <8 x i16> %tmp4, %tmp5
99 %tmp7 = add <8 x i16> %tmp1, %tmp6
100 ret <8 x i16> %tmp7
98101 }
99102
100103 define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
103106 %tmp1 = load <4 x i32>* %A
104107 %tmp2 = load <4 x i16>* %B
105108 %tmp3 = load <4 x i16>* %C
106 %tmp4 = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
107 ret <4 x i32> %tmp4
109 %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
110 %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
111 %tmp6 = mul <4 x i32> %tmp4, %tmp5
112 %tmp7 = add <4 x i32> %tmp1, %tmp6
113 ret <4 x i32> %tmp7
108114 }
109115
110116 define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
113119 %tmp1 = load <2 x i64>* %A
114120 %tmp2 = load <2 x i32>* %B
115121 %tmp3 = load <2 x i32>* %C
116 %tmp4 = call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
117 ret <2 x i64> %tmp4
122 %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
123 %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
124 %tmp6 = mul <2 x i64> %tmp4, %tmp5
125 %tmp7 = add <2 x i64> %tmp1, %tmp6
126 ret <2 x i64> %tmp7
118127 }
119128
120129 define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
123132 %tmp1 = load <8 x i16>* %A
124133 %tmp2 = load <8 x i8>* %B
125134 %tmp3 = load <8 x i8>* %C
126 %tmp4 = call <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
127 ret <8 x i16> %tmp4
135 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
136 %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
137 %tmp6 = mul <8 x i16> %tmp4, %tmp5
138 %tmp7 = add <8 x i16> %tmp1, %tmp6
139 ret <8 x i16> %tmp7
128140 }
129141
130142 define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
133145 %tmp1 = load <4 x i32>* %A
134146 %tmp2 = load <4 x i16>* %B
135147 %tmp3 = load <4 x i16>* %C
136 %tmp4 = call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
137 ret <4 x i32> %tmp4
148 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
149 %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
150 %tmp6 = mul <4 x i32> %tmp4, %tmp5
151 %tmp7 = add <4 x i32> %tmp1, %tmp6
152 ret <4 x i32> %tmp7
138153 }
139154
140155 define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
143158 %tmp1 = load <2 x i64>* %A
144159 %tmp2 = load <2 x i32>* %B
145160 %tmp3 = load <2 x i32>* %C
146 %tmp4 = call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
147 ret <2 x i64> %tmp4
161 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
162 %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
163 %tmp6 = mul <2 x i64> %tmp4, %tmp5
164 %tmp7 = add <2 x i64> %tmp1, %tmp6
165 ret <2 x i64> %tmp7
148166 }
149167
150168 define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
152170 ; CHECK: test_vmlal_lanes16
153171 ; CHECK: vmlal.s16 q0, d2, d3[1]
154172 %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> ; <<4 x i16>> [#uses=1]
155 %1 = tail call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
156 ret <4 x i32> %1
173 %1 = sext <4 x i16> %arg1_int16x4_t to <4 x i32>
174 %2 = sext <4 x i16> %0 to <4 x i32>
175 %3 = mul <4 x i32> %1, %2
176 %4 = add <4 x i32> %arg0_int32x4_t, %3
177 ret <4 x i32> %4
157178 }
158179
159180 define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
161182 ; CHECK: test_vmlal_lanes32
162183 ; CHECK: vmlal.s32 q0, d2, d3[1]
163184 %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> ; <<2 x i32>> [#uses=1]
164 %1 = tail call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
165 ret <2 x i64> %1
185 %1 = sext <2 x i32> %arg1_int32x2_t to <2 x i64>
186 %2 = sext <2 x i32> %0 to <2 x i64>
187 %3 = mul <2 x i64> %1, %2
188 %4 = add <2 x i64> %arg0_int64x2_t, %3
189 ret <2 x i64> %4
166190 }
167191
168192 define arm_aapcs_vfpcc <4 x i32> @test_vmlal_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
170194 ; CHECK: test_vmlal_laneu16
171195 ; CHECK: vmlal.u16 q0, d2, d3[1]
172196 %0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> ; <<4 x i16>> [#uses=1]
173 %1 = tail call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
174 ret <4 x i32> %1
197 %1 = zext <4 x i16> %arg1_uint16x4_t to <4 x i32>
198 %2 = zext <4 x i16> %0 to <4 x i32>
199 %3 = mul <4 x i32> %1, %2
200 %4 = add <4 x i32> %arg0_uint32x4_t, %3
201 ret <4 x i32> %4
175202 }
176203
177204 define arm_aapcs_vfpcc <2 x i64> @test_vmlal_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
179206 ; CHECK: test_vmlal_laneu32
180207 ; CHECK: vmlal.u32 q0, d2, d3[1]
181208 %0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> ; <<2 x i32>> [#uses=1]
182 %1 = tail call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
183 ret <2 x i64> %1
184 }
185
186 declare <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
187 declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
188 declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
189
190 declare <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
191 declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
192 declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
209 %1 = zext <2 x i32> %arg1_uint32x2_t to <2 x i64>
210 %2 = zext <2 x i32> %0 to <2 x i64>
211 %3 = mul <2 x i64> %1, %2
212 %4 = add <2 x i64> %arg0_uint64x2_t, %3
213 ret <2 x i64> %4
214 }
9393 %tmp1 = load <8 x i16>* %A
9494 %tmp2 = load <8 x i8>* %B
9595 %tmp3 = load <8 x i8>* %C
96 %tmp4 = call <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
97 ret <8 x i16> %tmp4
96 %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
97 %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
98 %tmp6 = mul <8 x i16> %tmp4, %tmp5
99 %tmp7 = sub <8 x i16> %tmp1, %tmp6
100 ret <8 x i16> %tmp7
98101 }
99102
100103 define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
103106 %tmp1 = load <4 x i32>* %A
104107 %tmp2 = load <4 x i16>* %B
105108 %tmp3 = load <4 x i16>* %C
106 %tmp4 = call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
107 ret <4 x i32> %tmp4
109 %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
110 %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
111 %tmp6 = mul <4 x i32> %tmp4, %tmp5
112 %tmp7 = sub <4 x i32> %tmp1, %tmp6
113 ret <4 x i32> %tmp7
108114 }
109115
110116 define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
113119 %tmp1 = load <2 x i64>* %A
114120 %tmp2 = load <2 x i32>* %B
115121 %tmp3 = load <2 x i32>* %C
116 %tmp4 = call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
117 ret <2 x i64> %tmp4
122 %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
123 %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
124 %tmp6 = mul <2 x i64> %tmp4, %tmp5
125 %tmp7 = sub <2 x i64> %tmp1, %tmp6
126 ret <2 x i64> %tmp7
118127 }
119128
120129 define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
123132 %tmp1 = load <8 x i16>* %A
124133 %tmp2 = load <8 x i8>* %B
125134 %tmp3 = load <8 x i8>* %C
126 %tmp4 = call <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
127 ret <8 x i16> %tmp4
135 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
136 %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
137 %tmp6 = mul <8 x i16> %tmp4, %tmp5
138 %tmp7 = sub <8 x i16> %tmp1, %tmp6
139 ret <8 x i16> %tmp7
128140 }
129141
130142 define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
133145 %tmp1 = load <4 x i32>* %A
134146 %tmp2 = load <4 x i16>* %B
135147 %tmp3 = load <4 x i16>* %C
136 %tmp4 = call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
137 ret <4 x i32> %tmp4
148 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
149 %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
150 %tmp6 = mul <4 x i32> %tmp4, %tmp5
151 %tmp7 = sub <4 x i32> %tmp1, %tmp6
152 ret <4 x i32> %tmp7
138153 }
139154
140155 define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
143158 %tmp1 = load <2 x i64>* %A
144159 %tmp2 = load <2 x i32>* %B
145160 %tmp3 = load <2 x i32>* %C
146 %tmp4 = call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
147 ret <2 x i64> %tmp4
161 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
162 %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
163 %tmp6 = mul <2 x i64> %tmp4, %tmp5
164 %tmp7 = sub <2 x i64> %tmp1, %tmp6
165 ret <2 x i64> %tmp7
148166 }
149167
150168 define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
152170 ; CHECK: test_vmlsl_lanes16
153171 ; CHECK: vmlsl.s16 q0, d2, d3[1]
154172 %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> ; <<4 x i16>> [#uses=1]
155 %1 = tail call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
156 ret <4 x i32> %1
173 %1 = sext <4 x i16> %arg1_int16x4_t to <4 x i32>
174 %2 = sext <4 x i16> %0 to <4 x i32>
175 %3 = mul <4 x i32> %1, %2
176 %4 = sub <4 x i32> %arg0_int32x4_t, %3
177 ret <4 x i32> %4
157178 }
158179
159180 define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
161182 ; CHECK: test_vmlsl_lanes32
162183 ; CHECK: vmlsl.s32 q0, d2, d3[1]
163184 %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> ; <<2 x i32>> [#uses=1]
164 %1 = tail call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
165 ret <2 x i64> %1
185 %1 = sext <2 x i32> %arg1_int32x2_t to <2 x i64>
186 %2 = sext <2 x i32> %0 to <2 x i64>
187 %3 = mul <2 x i64> %1, %2
188 %4 = sub <2 x i64> %arg0_int64x2_t, %3
189 ret <2 x i64> %4
166190 }
167191
168192 define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
170194 ; CHECK: test_vmlsl_laneu16
171195 ; CHECK: vmlsl.u16 q0, d2, d3[1]
172196 %0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> ; <<4 x i16>> [#uses=1]
173 %1 = tail call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
174 ret <4 x i32> %1
197 %1 = zext <4 x i16> %arg1_uint16x4_t to <4 x i32>
198 %2 = zext <4 x i16> %0 to <4 x i32>
199 %3 = mul <4 x i32> %1, %2
200 %4 = sub <4 x i32> %arg0_uint32x4_t, %3
201 ret <4 x i32> %4
175202 }
176203
177204 define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
179206 ; CHECK: test_vmlsl_laneu32
180207 ; CHECK: vmlsl.u32 q0, d2, d3[1]
181208 %0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> ; <<2 x i32>> [#uses=1]
182 %1 = tail call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
183 ret <2 x i64> %1
184 }
185
186 declare <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
187 declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
188 declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
189
190 declare <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
191 declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
192 declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
209 %1 = zext <2 x i32> %arg1_uint32x2_t to <2 x i64>
210 %2 = zext <2 x i32> %0 to <2 x i64>
211 %3 = mul <2 x i64> %1, %2
212 %4 = sub <2 x i64> %arg0_uint64x2_t, %3
213 ret <2 x i64> %4
214 }
151151 ;CHECK: vmull.s8
152152 %tmp1 = load <8 x i8>* %A
153153 %tmp2 = load <8 x i8>* %B
154 %tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
155 ret <8 x i16> %tmp3
154 %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
155 %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
156 %tmp5 = mul <8 x i16> %tmp3, %tmp4
157 ret <8 x i16> %tmp5
156158 }
157159
158160 define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
160162 ;CHECK: vmull.s16
161163 %tmp1 = load <4 x i16>* %A
162164 %tmp2 = load <4 x i16>* %B
163 %tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
164 ret <4 x i32> %tmp3
165 %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
166 %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
167 %tmp5 = mul <4 x i32> %tmp3, %tmp4
168 ret <4 x i32> %tmp5
165169 }
166170
167171 define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
169173 ;CHECK: vmull.s32
170174 %tmp1 = load <2 x i32>* %A
171175 %tmp2 = load <2 x i32>* %B
172 %tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
173 ret <2 x i64> %tmp3
176 %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
177 %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
178 %tmp5 = mul <2 x i64> %tmp3, %tmp4
179 ret <2 x i64> %tmp5
174180 }
175181
176182 define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
178184 ;CHECK: vmull.u8
179185 %tmp1 = load <8 x i8>* %A
180186 %tmp2 = load <8 x i8>* %B
181 %tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
182 ret <8 x i16> %tmp3
187 %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
188 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
189 %tmp5 = mul <8 x i16> %tmp3, %tmp4
190 ret <8 x i16> %tmp5
183191 }
184192
185193 define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
187195 ;CHECK: vmull.u16
188196 %tmp1 = load <4 x i16>* %A
189197 %tmp2 = load <4 x i16>* %B
190 %tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
191 ret <4 x i32> %tmp3
198 %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
199 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
200 %tmp5 = mul <4 x i32> %tmp3, %tmp4
201 ret <4 x i32> %tmp5
192202 }
193203
194204 define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
196206 ;CHECK: vmull.u32
197207 %tmp1 = load <2 x i32>* %A
198208 %tmp2 = load <2 x i32>* %B
199 %tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
200 ret <2 x i64> %tmp3
209 %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
210 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
211 %tmp5 = mul <2 x i64> %tmp3, %tmp4
212 ret <2 x i64> %tmp5
201213 }
202214
203215 define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
214226 ; CHECK: test_vmull_lanes16
215227 ; CHECK: vmull.s16 q0, d0, d1[1]
216228 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> ; <<4 x i16>> [#uses=1]
217 %1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
218 ret <4 x i32> %1
229 %1 = sext <4 x i16> %arg0_int16x4_t to <4 x i32>
230 %2 = sext <4 x i16> %0 to <4 x i32>
231 %3 = mul <4 x i32> %1, %2
232 ret <4 x i32> %3
219233 }
220234
221235 define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
223237 ; CHECK: test_vmull_lanes32
224238 ; CHECK: vmull.s32 q0, d0, d1[1]
225239 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> ; <<2 x i32>> [#uses=1]
226 %1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
227 ret <2 x i64> %1
240 %1 = sext <2 x i32> %arg0_int32x2_t to <2 x i64>
241 %2 = sext <2 x i32> %0 to <2 x i64>
242 %3 = mul <2 x i64> %1, %2
243 ret <2 x i64> %3
228244 }
229245
230246 define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
232248 ; CHECK: test_vmull_laneu16
233249 ; CHECK: vmull.u16 q0, d0, d1[1]
234250 %0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> ; <<4 x i16>> [#uses=1]
235 %1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
236 ret <4 x i32> %1
251 %1 = zext <4 x i16> %arg0_uint16x4_t to <4 x i32>
252 %2 = zext <4 x i16> %0 to <4 x i32>
253 %3 = mul <4 x i32> %1, %2
254 ret <4 x i32> %3
237255 }
238256
239257 define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
241259 ; CHECK: test_vmull_laneu32
242260 ; CHECK: vmull.u32 q0, d0, d1[1]
243261 %0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> ; <<2 x i32>> [#uses=1]
244 %1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
245 ret <2 x i64> %1
246 }
247
248 declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
249 declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
250 declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
251
252 declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
253 declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
254 declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
262 %1 = zext <2 x i32> %arg0_uint32x2_t to <2 x i64>
263 %2 = zext <2 x i32> %0 to <2 x i64>
264 %3 = mul <2 x i64> %1, %2
265 ret <2 x i64> %3
266 }
255267
256268 declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone