llvm.org GIT mirror llvm / eb0c3d3
Replace NEON vabdl, vaba, and vabal intrinsics with combinations of the vabd intrinsic and add and/or zext operations. In the case of vaba, this also avoids the need for a DAG combine pattern to combine vabd with add. Update tests. Auto-upgrade the old intrinsics.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@112941 91177308-0d34-0410-b5e6-96231b3b80d8

Bob Wilson, 9 years ago
8 changed file(s) with 296 addition(s) and 160 deletion(s).
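The shape of the change, as a minimal LLVM IR sketch (value names are illustrative; the mapping mirrors the updated tests below): a call to the removed vabal intrinsic becomes a vabd call whose result is zero-extended and then accumulated with an ordinary add. For vabdl only the zext remains, and for vaba only the add:

  ; before: a single opaque 3-operand intrinsic
  %r = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %acc, <8 x i8> %a, <8 x i8> %b)

  ; after: vabd + zext + add, which the new instruction patterns
  ; select back into a single vabal.s8
  %d = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b)
  %w = zext <8 x i8> %d to <8 x i16>
  %r = add <8 x i16> %acc, %w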
175175 // Vector Absolute Differences.
176176 def int_arm_neon_vabds : Neon_2Arg_Intrinsic;
177177 def int_arm_neon_vabdu : Neon_2Arg_Intrinsic;
178 def int_arm_neon_vabdls : Neon_2Arg_Long_Intrinsic;
179 def int_arm_neon_vabdlu : Neon_2Arg_Long_Intrinsic;
180
181 // Vector Absolute Difference and Accumulate.
182 def int_arm_neon_vabas : Neon_3Arg_Intrinsic;
183 def int_arm_neon_vabau : Neon_3Arg_Intrinsic;
184 def int_arm_neon_vabals : Neon_3Arg_Long_Intrinsic;
185 def int_arm_neon_vabalu : Neon_3Arg_Long_Intrinsic;
186178
187179 // Vector Pairwise Add.
188180 def int_arm_neon_vpadd : Neon_2Arg_Intrinsic;
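Only the plain vabd intrinsics survive; the widening and accumulating variants are expressed in IR from here on. For reference, the declarations of the remaining intrinsics as they appear in the updated tests (8-element byte vectors shown; the other element types are analogous):

  declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
  declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone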
42924292 /// operands.
42934293 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
42944294 TargetLowering::DAGCombinerInfo &DCI) {
4295 SelectionDAG &DAG = DCI.DAG;
4296
42974295 // fold (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
42984296 if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) {
42994297 SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
43004298 if (Result.getNode()) return Result;
43014299 }
4302
4303 // fold (add (arm_neon_vabd a, b), c) -> (arm_neon_vaba c, a, b)
4304 EVT VT = N->getValueType(0);
4305 if (N0.getOpcode() == ISD::INTRINSIC_WO_CHAIN && VT.isInteger()) {
4306 unsigned IntNo = cast<ConstantSDNode>(N0.getOperand(0))->getZExtValue();
4307 if (IntNo == Intrinsic::arm_neon_vabds)
4308 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), VT,
4309 DAG.getConstant(Intrinsic::arm_neon_vabas, MVT::i32),
4310 N1, N0.getOperand(1), N0.getOperand(2));
4311 if (IntNo == Intrinsic::arm_neon_vabdu)
4312 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), VT,
4313 DAG.getConstant(Intrinsic::arm_neon_vabau, MVT::i32),
4314 N1, N0.getOperand(1), N0.getOperand(2));
4315 }
4316
43174300 return SDValue();
43184301 }
43194302
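Removing the vaba intrinsics makes the intrinsic-to-intrinsic rewrite above unnecessary: the N3VDIntOp/N3VQIntOp patterns added to ARMInstrNEON.td below match the add-of-vabd form directly during instruction selection, so no DAG combine is needed. A sketch of IR that still selects to a single vaba.s8 (this is exactly the form the old vabd_combine tests, deleted further down, used to exercise):

  %d = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b)
  %r = add <8 x i8> %acc, %d    ; matched by the VABAs pattern -> vaba.s8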
12871287 (ResTy (NEONvduplane (OpTy DPR_8:$src3),
12881288 imm:$lane)))))))]>;
12891289
1290 // Neon Intrinsic-Op instructions (VABA): double- and quad-register.
1291 class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
1292 InstrItinClass itin, string OpcodeStr, string Dt,
1293 ValueType Ty, Intrinsic IntOp, SDNode OpNode>
1294 : N3V<op24, op23, op21_20, op11_8, 0, op4,
1295 (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
1296 OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
1297 [(set DPR:$dst, (Ty (OpNode DPR:$src1,
1298 (Ty (IntOp (Ty DPR:$src2), (Ty DPR:$src3))))))]>;
1299 class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
1300 InstrItinClass itin, string OpcodeStr, string Dt,
1301 ValueType Ty, Intrinsic IntOp, SDNode OpNode>
1302 : N3V<op24, op23, op21_20, op11_8, 1, op4,
1303 (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin,
1304 OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
1305 [(set QPR:$dst, (Ty (OpNode QPR:$src1,
1306 (Ty (IntOp (Ty QPR:$src2), (Ty QPR:$src3))))))]>;
1307
12901308 // Neon 3-argument intrinsics, both double- and quad-register.
12911309 // The destination register is also used as the first source operand register.
12921310 class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
13411359 (TyD (NEONvduplane (TyD DPR_8:$src3),
13421360 imm:$lane))))))]>;
13431361
1362 // Long Intrinsic-Op vector operations with explicit extend (VABAL).
1363 class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
1364 InstrItinClass itin, string OpcodeStr, string Dt,
1365 ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp,
1366 SDNode OpNode>
1367 : N3V<op24, op23, op21_20, op11_8, 0, op4,
1368 (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
1369 OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
1370 [(set QPR:$dst, (OpNode (TyQ QPR:$src1),
1371 (TyQ (ExtOp (TyD (IntOp (TyD DPR:$src2),
1372 (TyD DPR:$src3)))))))]>;
13441373
13451374 // Neon Long 3-argument intrinsic. The destination register is
13461375 // a quad-register and is also used as the first source operand register.
14321461 let isCommutable = Commutable;
14331462 }
14341463
1464 // Long 3-register intrinsics with explicit extend (VABDL).
1465 class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
1466 InstrItinClass itin, string OpcodeStr, string Dt,
1467 ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp,
1468 bit Commutable>
1469 : N3V<op24, op23, op21_20, op11_8, 0, op4,
1470 (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
1471 OpcodeStr, Dt, "$dst, $src1, $src2", "",
1472 [(set QPR:$dst, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$src1),
1473 (TyD DPR:$src2))))))]> {
1474 let isCommutable = Commutable;
1475 }
1476
14351477 // Long 3-register intrinsics.
14361478 class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
14371479 InstrItinClass itin, string OpcodeStr, string Dt,
19171959 v8i16, v8i8, IntOp, Commutable>;
19181960 }
19191961
1962 // ....with explicit extend (VABDL).
1963 multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
1964 InstrItinClass itin, string OpcodeStr, string Dt,
1965 Intrinsic IntOp, SDNode ExtOp, bit Commutable = 0> {
1966 def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin,
1967 OpcodeStr, !strconcat(Dt, "8"),
1968 v8i16, v8i8, IntOp, ExtOp, Commutable>;
1969 def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin,
1970 OpcodeStr, !strconcat(Dt, "16"),
1971 v4i32, v4i16, IntOp, ExtOp, Commutable>;
1972 def v2i64 : N3VLIntExt<op24, op23, 0b10, op11_8, op4, itin,
1973 OpcodeStr, !strconcat(Dt, "32"),
1974 v2i64, v2i32, IntOp, ExtOp, Commutable>;
1975 }
1976
19201977
19211978 // Neon Wide 3-register vector intrinsics,
19221979 // source operand element sizes of 8, 16 and 32 bits:
19742031 mul, ShOp>;
19752032 }
19762033
2034 // Neon Intrinsic-Op vector operations,
2035 // element sizes of 8, 16 and 32 bits:
2036 multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
2037 InstrItinClass itinD, InstrItinClass itinQ,
2038 string OpcodeStr, string Dt, Intrinsic IntOp,
2039 SDNode OpNode> {
2040 // 64-bit vector types.
2041 def v8i8 : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD,
2042 OpcodeStr, !strconcat(Dt, "8"), v8i8, IntOp, OpNode>;
2043 def v4i16 : N3VDIntOp<op24, op23, 0b01, op11_8, op4, itinD,
2044 OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp, OpNode>;
2045 def v2i32 : N3VDIntOp<op24, op23, 0b10, op11_8, op4, itinD,
2046 OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp, OpNode>;
2047
2048 // 128-bit vector types.
2049 def v16i8 : N3VQIntOp<op24, op23, 0b00, op11_8, op4, itinQ,
2050 OpcodeStr, !strconcat(Dt, "8"), v16i8, IntOp, OpNode>;
2051 def v8i16 : N3VQIntOp<op24, op23, 0b01, op11_8, op4, itinQ,
2052 OpcodeStr, !strconcat(Dt, "16"), v8i16, IntOp, OpNode>;
2053 def v4i32 : N3VQIntOp<op24, op23, 0b10, op11_8, op4, itinQ,
2054 OpcodeStr, !strconcat(Dt, "32"), v4i32, IntOp, OpNode>;
2055 }
2056
19772057 // Neon 3-argument intrinsics,
19782058 // element sizes of 8, 16 and 32 bits:
19792059 multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
20472127 : N3VLInt3_HS<op24, op23, op11_8, op4, itin, OpcodeStr, Dt, IntOp> {
20482128 def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin,
20492129 OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>;
2130 }
2131
2132 // ....with explicit extend (VABAL).
2133 multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
2134 InstrItinClass itin, string OpcodeStr, string Dt,
2135 Intrinsic IntOp, SDNode ExtOp, SDNode OpNode> {
2136 def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin,
2137 OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8,
2138 IntOp, ExtOp, OpNode>;
2139 def v4i32 : N3VLIntExtOp<op24, op23, 0b01, op11_8, op4, itin,
2140 OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16,
2141 IntOp, ExtOp, OpNode>;
2142 def v2i64 : N3VLIntExtOp<op24, op23, 0b10, op11_8, op4, itin,
2143 OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32,
2144 IntOp, ExtOp, OpNode>;
20502145 }
20512146
20522147
27642859 // VABD : Vector Absolute Difference
27652860 defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm,
27662861 IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
2767 "vabd", "s", int_arm_neon_vabds, 0>;
2862 "vabd", "s", int_arm_neon_vabds, 1>;
27682863 defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm,
27692864 IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
2770 "vabd", "u", int_arm_neon_vabdu, 0>;
2865 "vabd", "u", int_arm_neon_vabdu, 1>;
27712866 def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND,
2772 "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 0>;
2867 "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>;
27732868 def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ,
2774 "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 0>;
2869 "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>;
27752870
27762871 // VABDL : Vector Absolute Difference Long (Q = | D - D |)
2777 defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q,
2778 "vabdl", "s", int_arm_neon_vabdls, 0>;
2779 defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q,
2780 "vabdl", "u", int_arm_neon_vabdlu, 0>;
2872 defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q,
2873 "vabdl", "s", int_arm_neon_vabds, zext, 1>;
2874 defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q,
2875 "vabdl", "u", int_arm_neon_vabdu, zext, 1>;
27812876
27822877 // VABA : Vector Absolute Difference and Accumulate
2783 defm VABAs : N3VInt3_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
2784 "vaba", "s", int_arm_neon_vabas>;
2785 defm VABAu : N3VInt3_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
2786 "vaba", "u", int_arm_neon_vabau>;
2878 defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
2879 "vaba", "s", int_arm_neon_vabds, add>;
2880 defm VABAu : N3VIntOp_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
2881 "vaba", "u", int_arm_neon_vabdu, add>;
27872882
27882883 // VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
2789 defm VABALs : N3VLInt3_QHS<0,1,0b0101,0, IIC_VABAD, IIC_VABAD,
2790 "vabal", "s", int_arm_neon_vabals>;
2791 defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, IIC_VABAD, IIC_VABAD,
2792 "vabal", "u", int_arm_neon_vabalu>;
2884 defm VABALs : N3VLIntExtOp_QHS<0,1,0b0101,0, IIC_VABAD,
2885 "vabal", "s", int_arm_neon_vabds, zext, add>;
2886 defm VABALu : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD,
2887 "vabal", "u", int_arm_neon_vabdu, zext, add>;
27932888
27942889 // Vector Maximum and Minimum.
27952890
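With these defm changes, each long form is just vabd at the narrow element type plus an explicit zext (and an add for the accumulating forms), so the patterns above can rebuild the original instructions. For example, IR of this shape now selects vabdl.u16, as in the updated vabdl tests:

  %d = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b)
  %w = zext <4 x i16> %d to <4 x i32>    ; matched by N3VLIntExt -> vabdl.u16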
8080 } else if (Name.compare(5, 9, "arm.neon.", 9) == 0) {
8181 if (((Name.compare(14, 5, "vmovl", 5) == 0 ||
8282 Name.compare(14, 5, "vaddl", 5) == 0 ||
83 Name.compare(14, 5, "vsubl", 5) == 0) &&
83 Name.compare(14, 5, "vsubl", 5) == 0 ||
84 Name.compare(14, 5, "vaddw", 5) == 0 ||
85 Name.compare(14, 5, "vsubw", 5) == 0 ||
86 Name.compare(14, 5, "vmull", 5) == 0 ||
87 Name.compare(14, 5, "vmlal", 5) == 0 ||
88 Name.compare(14, 5, "vmlsl", 5) == 0 ||
89 Name.compare(14, 5, "vabdl", 5) == 0 ||
90 Name.compare(14, 5, "vabal", 5) == 0) &&
8491 (Name.compare(19, 2, "s.", 2) == 0 ||
8592 Name.compare(19, 2, "u.", 2) == 0)) ||
8693
87 ((Name.compare(14, 5, "vaddw", 5) == 0 ||
88 Name.compare(14, 5, "vsubw", 5) == 0) &&
89 (Name.compare(19, 2, "s.", 2) == 0 ||
90 Name.compare(19, 2, "u.", 2) == 0)) ||
91
92 ((Name.compare(14, 5, "vmull", 5) == 0 ||
93 Name.compare(14, 5, "vmlal", 5) == 0 ||
94 Name.compare(14, 5, "vmlsl", 5) == 0) &&
95 (Name.compare(19, 2, "s.", 2) == 0 ||
96 Name.compare(19, 2, "u.", 2) == 0)) ||
94 (Name.compare(14, 4, "vaba", 4) == 0 &&
95 (Name.compare(18, 2, "s.", 2) == 0 ||
96 Name.compare(18, 2, "u.", 2) == 0)) ||
9797
9898 (Name.compare(14, 6, "vmovn.", 6) == 0)) {
9999
390390 }
391391 }
392392
393 /// CallVABD - As part of expanding a call to one of the old NEON vabdl, vaba,
394 /// or vabal intrinsics, construct a call to a vabd intrinsic. Examine the
395 /// name of the old intrinsic to determine whether to use a signed or unsigned
396 /// vabd intrinsic. Get the type from the old call instruction, adjusted for
397 /// half-size vector elements if the old intrinsic was vabdl or vabal.
398 static Instruction *CallVABD(CallInst *CI, Value *Arg0, Value *Arg1) {
399 Function *F = CI->getCalledFunction();
400 const std::string& Name = F->getName();
401 bool isLong = (Name.at(18) == 'l');
402 bool isSigned = (Name.at(isLong ? 19 : 18) == 's');
403
404 Intrinsic::ID intID;
405 if (isSigned)
406 intID = Intrinsic::arm_neon_vabds;
407 else
408 intID = Intrinsic::arm_neon_vabdu;
409
410 const Type *Ty = CI->getType();
411 if (isLong)
412 Ty = VectorType::getTruncatedElementVectorType(cast<const VectorType>(Ty));
413
414 Function *VABD = Intrinsic::getDeclaration(F->getParent(), intID, &Ty, 1);
415 Value *Operands[2];
416 Operands[0] = Arg0;
417 Operands[1] = Arg1;
418 return CallInst::Create(VABD, Operands, Operands+2,
419 "upgraded."+CI->getName(), CI);
420 }
421
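The hard-coded offsets in CallVABD index into the fixed "llvm.arm.neon." prefix; shown against a sample old-style name (a sketch for reference only):

  ; Name:  llvm.arm.neon.vabals.v4i32
  ;        Name.at(14) == 'v' (start of the op name),
  ;        Name.at(18) == 'l' -> isLong, Name.at(19) == 's' -> isSigned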
393422 // UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call to the
394423 // upgraded intrinsic. All argument and return casting must be provided in
395424 // order to seamlessly integrate with existing context.
432461 ExtendNEONArgs(CI, CI->getArgOperand(1), CI->getArgOperand(2), V0, V1);
433462 Instruction *MulI = BinaryOperator::CreateMul(V0, V1, "", CI);
434463 NewI = BinaryOperator::CreateSub(CI->getArgOperand(0), MulI,
464 "upgraded."+CI->getName(), CI);
465 } else if (Name.compare(14, 4, "vabd", 4) == 0) {
466 NewI = CallVABD(CI, CI->getArgOperand(0), CI->getArgOperand(1));
467 NewI = new ZExtInst(NewI, CI->getType(), "upgraded."+CI->getName(), CI);
468 } else if (Name.compare(14, 4, "vaba", 4) == 0) {
469 NewI = CallVABD(CI, CI->getArgOperand(1), CI->getArgOperand(2));
470 if (Name.at(18) == 'l')
471 NewI = new ZExtInst(NewI, CI->getType(), "", CI);
472 NewI = BinaryOperator::CreateAdd(CI->getArgOperand(0), NewI,
435473 "upgraded."+CI->getName(), CI);
436474 } else if (Name.compare(14, 6, "vmovn.", 6) == 0) {
437475 NewI = new TruncInst(CI->getArgOperand(0), CI->getType(),
674712 }
675713
676714 switch (NewFn->getIntrinsicID()) {
677 default: llvm_unreachable("Unknown function for CallInst upgrade.");
715 default: llvm_unreachable("Unknown function for CallInst upgrade.");
678716 case Intrinsic::arm_neon_vld1:
679717 case Intrinsic::arm_neon_vld2:
680718 case Intrinsic::arm_neon_vld3:
125125 ; CHECK-NEXT: mul <2 x i64>
126126 ; CHECK-NEXT: sub <2 x i64>
127127
128 ; vaba should be auto-upgraded to vabd + add
129
130 ; CHECK: vabas32
131 ; CHECK-NOT: arm.neon.vabas.v2i32
132 ; CHECK: arm.neon.vabds.v2i32
133 ; CHECK-NEXT: add <2 x i32>
134
135 ; CHECK: vabaQu8
136 ; CHECK-NOT: arm.neon.vabau.v16i8
137 ; CHECK: arm.neon.vabdu.v16i8
138 ; CHECK-NEXT: add <16 x i8>
139
140 ; vabal should be auto-upgraded to vabd with zext + add
141
142 ; CHECK: vabals16
143 ; CHECK-NOT: arm.neon.vabals.v4i32
144 ; CHECK: arm.neon.vabds.v4i16
145 ; CHECK-NEXT: zext <4 x i16>
146 ; CHECK-NEXT: add <4 x i32>
147
148 ; CHECK: vabalu32
149 ; CHECK-NOT: arm.neon.vabalu.v2i64
150 ; CHECK: arm.neon.vabdu.v2i32
151 ; CHECK-NEXT: zext <2 x i32>
152 ; CHECK-NEXT: add <2 x i64>
153
154 ; vabdl should be auto-upgraded to vabd with zext
155
156 ; CHECK: vabdls8
157 ; CHECK-NOT: arm.neon.vabdls.v8i16
158 ; CHECK: arm.neon.vabds.v8i8
159 ; CHECK-NEXT: zext <8 x i8>
160
161 ; CHECK: vabdlu16
162 ; CHECK-NOT: arm.neon.vabdlu.v4i32
163 ; CHECK: arm.neon.vabdu.v4i16
164 ; CHECK-NEXT: zext <4 x i16>
165
128166 ; vmovn should be auto-upgraded to trunc
129167
130168 ; CHECK: vmovni16
55 %tmp1 = load <8 x i8>* %A
66 %tmp2 = load <8 x i8>* %B
77 %tmp3 = load <8 x i8>* %C
8 %tmp4 = call <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
9 ret <8 x i8> %tmp4
8 %tmp4 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp3)
9 %tmp5 = add <8 x i8> %tmp1, %tmp4
10 ret <8 x i8> %tmp5
1011 }
1112
1213 define <4 x i16> @vabas16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
1516 %tmp1 = load <4 x i16>* %A
1617 %tmp2 = load <4 x i16>* %B
1718 %tmp3 = load <4 x i16>* %C
18 %tmp4 = call <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
19 ret <4 x i16> %tmp4
19 %tmp4 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp2, <4 x i16> %tmp3)
20 %tmp5 = add <4 x i16> %tmp1, %tmp4
21 ret <4 x i16> %tmp5
2022 }
2123
2224 define <2 x i32> @vabas32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
2527 %tmp1 = load <2 x i32>* %A
2628 %tmp2 = load <2 x i32>* %B
2729 %tmp3 = load <2 x i32>* %C
28 %tmp4 = call <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
29 ret <2 x i32> %tmp4
30 %tmp4 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp2, <2 x i32> %tmp3)
31 %tmp5 = add <2 x i32> %tmp1, %tmp4
32 ret <2 x i32> %tmp5
3033 }
3134
3235 define <8 x i8> @vabau8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
3538 %tmp1 = load <8 x i8>* %A
3639 %tmp2 = load <8 x i8>* %B
3740 %tmp3 = load <8 x i8>* %C
38 %tmp4 = call <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
39 ret <8 x i8> %tmp4
41 %tmp4 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp3)
42 %tmp5 = add <8 x i8> %tmp1, %tmp4
43 ret <8 x i8> %tmp5
4044 }
4145
4246 define <4 x i16> @vabau16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
4549 %tmp1 = load <4 x i16>* %A
4650 %tmp2 = load <4 x i16>* %B
4751 %tmp3 = load <4 x i16>* %C
48 %tmp4 = call <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
49 ret <4 x i16> %tmp4
52 %tmp4 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp2, <4 x i16> %tmp3)
53 %tmp5 = add <4 x i16> %tmp1, %tmp4
54 ret <4 x i16> %tmp5
5055 }
5156
5257 define <2 x i32> @vabau32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
5560 %tmp1 = load <2 x i32>* %A
5661 %tmp2 = load <2 x i32>* %B
5762 %tmp3 = load <2 x i32>* %C
58 %tmp4 = call <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
59 ret <2 x i32> %tmp4
63 %tmp4 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp2, <2 x i32> %tmp3)
64 %tmp5 = add <2 x i32> %tmp1, %tmp4
65 ret <2 x i32> %tmp5
6066 }
6167
6268 define <16 x i8> @vabaQs8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
6571 %tmp1 = load <16 x i8>* %A
6672 %tmp2 = load <16 x i8>* %B
6773 %tmp3 = load <16 x i8>* %C
68 %tmp4 = call <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3)
69 ret <16 x i8> %tmp4
74 %tmp4 = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %tmp2, <16 x i8> %tmp3)
75 %tmp5 = add <16 x i8> %tmp1, %tmp4
76 ret <16 x i8> %tmp5
7077 }
7178
7279 define <8 x i16> @vabaQs16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
7582 %tmp1 = load <8 x i16>* %A
7683 %tmp2 = load <8 x i16>* %B
7784 %tmp3 = load <8 x i16>* %C
78 %tmp4 = call <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3)
79 ret <8 x i16> %tmp4
85 %tmp4 = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp3)
86 %tmp5 = add <8 x i16> %tmp1, %tmp4
87 ret <8 x i16> %tmp5
8088 }
8189
8290 define <4 x i32> @vabaQs32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
8593 %tmp1 = load <4 x i32>* %A
8694 %tmp2 = load <4 x i32>* %B
8795 %tmp3 = load <4 x i32>* %C
88 %tmp4 = call <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3)
89 ret <4 x i32> %tmp4
96 %tmp4 = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %tmp2, <4 x i32> %tmp3)
97 %tmp5 = add <4 x i32> %tmp1, %tmp4
98 ret <4 x i32> %tmp5
9099 }
91100
92101 define <16 x i8> @vabaQu8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
95104 %tmp1 = load <16 x i8>* %A
96105 %tmp2 = load <16 x i8>* %B
97106 %tmp3 = load <16 x i8>* %C
98 %tmp4 = call <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3)
99 ret <16 x i8> %tmp4
107 %tmp4 = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %tmp2, <16 x i8> %tmp3)
108 %tmp5 = add <16 x i8> %tmp1, %tmp4
109 ret <16 x i8> %tmp5
100110 }
101111
102112 define <8 x i16> @vabaQu16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
105115 %tmp1 = load <8 x i16>* %A
106116 %tmp2 = load <8 x i16>* %B
107117 %tmp3 = load <8 x i16>* %C
108 %tmp4 = call <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3)
109 ret <8 x i16> %tmp4
118 %tmp4 = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp3)
119 %tmp5 = add <8 x i16> %tmp1, %tmp4
120 ret <8 x i16> %tmp5
110121 }
111122
112123 define <4 x i32> @vabaQu32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
115126 %tmp1 = load <4 x i32>* %A
116127 %tmp2 = load <4 x i32>* %B
117128 %tmp3 = load <4 x i32>* %C
118 %tmp4 = call <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3)
119 ret <4 x i32> %tmp4
120 }
121
122 declare <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
123 declare <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
124 declare <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
125
126 declare <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
127 declare <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
128 declare <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
129
130 declare <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
131 declare <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
132 declare <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
133
134 declare <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
135 declare <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
136 declare <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
129 %tmp4 = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %tmp2, <4 x i32> %tmp3)
130 %tmp5 = add <4 x i32> %tmp1, %tmp4
131 ret <4 x i32> %tmp5
132 }
133
134 declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
135 declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
136 declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
137
138 declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
139 declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
140 declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
141
142 declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
143 declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
144 declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
145
146 declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
147 declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
148 declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
137149
138150 define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
139151 ;CHECK: vabals8:
141153 %tmp1 = load <8 x i16>* %A
142154 %tmp2 = load <8 x i8>* %B
143155 %tmp3 = load <8 x i8>* %C
144 %tmp4 = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
145 ret <8 x i16> %tmp4
156 %tmp4 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp3)
157 %tmp5 = zext <8 x i8> %tmp4 to <8 x i16>
158 %tmp6 = add <8 x i16> %tmp1, %tmp5
159 ret <8 x i16> %tmp6
146160 }
147161
148162 define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
151165 %tmp1 = load <4 x i32>* %A
152166 %tmp2 = load <4 x i16>* %B
153167 %tmp3 = load <4 x i16>* %C
154 %tmp4 = call <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
155 ret <4 x i32> %tmp4
168 %tmp4 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp2, <4 x i16> %tmp3)
169 %tmp5 = zext <4 x i16> %tmp4 to <4 x i32>
170 %tmp6 = add <4 x i32> %tmp1, %tmp5
171 ret <4 x i32> %tmp6
156172 }
157173
158174 define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
161177 %tmp1 = load <2 x i64>* %A
162178 %tmp2 = load <2 x i32>* %B
163179 %tmp3 = load <2 x i32>* %C
164 %tmp4 = call <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
165 ret <2 x i64> %tmp4
180 %tmp4 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp2, <2 x i32> %tmp3)
181 %tmp5 = zext <2 x i32> %tmp4 to <2 x i64>
182 %tmp6 = add <2 x i64> %tmp1, %tmp5
183 ret <2 x i64> %tmp6
166184 }
167185
168186 define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
171189 %tmp1 = load <8 x i16>* %A
172190 %tmp2 = load <8 x i8>* %B
173191 %tmp3 = load <8 x i8>* %C
174 %tmp4 = call <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
175 ret <8 x i16> %tmp4
192 %tmp4 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp3)
193 %tmp5 = zext <8 x i8> %tmp4 to <8 x i16>
194 %tmp6 = add <8 x i16> %tmp1, %tmp5
195 ret <8 x i16> %tmp6
176196 }
177197
178198 define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
181201 %tmp1 = load <4 x i32>* %A
182202 %tmp2 = load <4 x i16>* %B
183203 %tmp3 = load <4 x i16>* %C
184 %tmp4 = call <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
185 ret <4 x i32> %tmp4
204 %tmp4 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp2, <4 x i16> %tmp3)
205 %tmp5 = zext <4 x i16> %tmp4 to <4 x i32>
206 %tmp6 = add <4 x i32> %tmp1, %tmp5
207 ret <4 x i32> %tmp6
186208 }
187209
188210 define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
191213 %tmp1 = load <2 x i64>* %A
192214 %tmp2 = load <2 x i32>* %B
193215 %tmp3 = load <2 x i32>* %C
194 %tmp4 = call <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
195 ret <2 x i64> %tmp4
196 }
197
198 declare <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
199 declare <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
200 declare <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
201
202 declare <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
203 declare <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
204 declare <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
205
206 define <8 x i8> @vabd_combine_s8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
207 ;CHECK: vabd_combine_s8:
208 ;CHECK: vaba.s8
209 %tmp1 = load <8 x i8>* %A
210 %tmp2 = load <8 x i8>* %B
211 %tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
212 %tmp4 = add <8 x i8> %tmp2, %tmp3
213 ret <8 x i8> %tmp4
214 }
215
216 define <4 x i16> @vabd_combine_u16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
217 ;CHECK: vabd_combine_u16:
218 ;CHECK: vaba.u16
219 %tmp1 = load <4 x i16>* %A
220 %tmp2 = load <4 x i16>* %B
221 %tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
222 %tmp4 = add <4 x i16> %tmp3, %tmp1
223 ret <4 x i16> %tmp4
224 }
225
226 declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
227 declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
228
216 %tmp4 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp2, <2 x i32> %tmp3)
217 %tmp5 = zext <2 x i32> %tmp4 to <2 x i64>
218 %tmp6 = add <2 x i64> %tmp1, %tmp5
219 ret <2 x i64> %tmp6
220 }
150150 ;CHECK: vabdl.s8
151151 %tmp1 = load <8 x i8>* %A
152152 %tmp2 = load <8 x i8>* %B
153 %tmp3 = call <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
154 ret <8 x i16> %tmp3
153 %tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
154 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
155 ret <8 x i16> %tmp4
155156 }
156157
157158 define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
159160 ;CHECK: vabdl.s16
160161 %tmp1 = load <4 x i16>* %A
161162 %tmp2 = load <4 x i16>* %B
162 %tmp3 = call <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
163 ret <4 x i32> %tmp3
163 %tmp3 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
164 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
165 ret <4 x i32> %tmp4
164166 }
165167
166168 define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
168170 ;CHECK: vabdl.s32
169171 %tmp1 = load <2 x i32>* %A
170172 %tmp2 = load <2 x i32>* %B
171 %tmp3 = call <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
172 ret <2 x i64> %tmp3
173 %tmp3 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
174 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
175 ret <2 x i64> %tmp4
173176 }
174177
175178 define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
177180 ;CHECK: vabdl.u8
178181 %tmp1 = load <8 x i8>* %A
179182 %tmp2 = load <8 x i8>* %B
180 %tmp3 = call <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
181 ret <8 x i16> %tmp3
183 %tmp3 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
184 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
185 ret <8 x i16> %tmp4
182186 }
183187
184188 define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
186190 ;CHECK: vabdl.u16
187191 %tmp1 = load <4 x i16>* %A
188192 %tmp2 = load <4 x i16>* %B
189 %tmp3 = call <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
190 ret <4 x i32> %tmp3
193 %tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
194 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
195 ret <4 x i32> %tmp4
191196 }
192197
193198 define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
195200 ;CHECK: vabdl.u32
196201 %tmp1 = load <2 x i32>* %A
197202 %tmp2 = load <2 x i32>* %B
198 %tmp3 = call <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
199 ret <2 x i64> %tmp3
200 }
201
202 declare <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
203 declare <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
204 declare <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
205
206 declare <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
207 declare <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
208 declare <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
203 %tmp3 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
204 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
205 ret <2 x i64> %tmp4
206 }