llvm.org GIT mirror llvm / 04d6c28
Remove NEON vaddl, vaddw, vsubl, and vsubw intrinsics. Instead, use llvm IR
add/sub operations with one or both operands sign- or zero-extended.
Auto-upgrade the old intrinsics.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@112416 91177308-0d34-0410-b5e6-96231b3b80d8

Bob Wilson, 10 years ago
7 changed files with 229 additions and 128 deletions.
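At the IR level, each removed intrinsic call becomes an ordinary add or sub whose narrow operands are widened first. A minimal sketch of the mapping for the signed vaddl variant (value names are illustrative; the concrete before/after forms appear in the updated tests below):

    ; before: NEON-specific intrinsic call
    %r = call <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16> %a, <4 x i16> %b)

    ; after: generic IR; both operands sign-extended, then added at full width
    %a.ext = sext <4 x i16> %a to <4 x i32>
    %b.ext = sext <4 x i16> %b to <4 x i32>
    %r = add <4 x i32> %a.ext, %b.ext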
include/llvm/IntrinsicsARM.td

@@ -71,10 +71,6 @@
   : Intrinsic<[llvm_anyvector_ty],
               [LLVMTruncatedElementVectorType<0>,
                LLVMTruncatedElementVectorType<0>],
-              [IntrNoMem]>;
-class Neon_2Arg_Wide_Intrinsic
-  : Intrinsic<[llvm_anyvector_ty],
-              [LLVMMatchType<0>, LLVMTruncatedElementVectorType<0>],
               [IntrNoMem]>;
 class Neon_3Arg_Intrinsic
   : Intrinsic<[llvm_anyvector_ty],
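At the IR level these type profiles correspond to signatures like the following (both taken from the test files in this commit): the long profile takes two operands whose elements are half the result width, while the removed Neon_2Arg_Wide_Intrinsic profile took a first operand that already matched the result type. The long class itself survives here because other intrinsics (vabdl, for example, further down) still use it.

    declare <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
    declare <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone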
@@ -127,10 +123,6 @@
 def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic;
 def int_arm_neon_vaddhn : Neon_2Arg_Narrow_Intrinsic;
 def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic;
-def int_arm_neon_vaddls : Neon_2Arg_Long_Intrinsic;
-def int_arm_neon_vaddlu : Neon_2Arg_Long_Intrinsic;
-def int_arm_neon_vaddws : Neon_2Arg_Wide_Intrinsic;
-def int_arm_neon_vaddwu : Neon_2Arg_Wide_Intrinsic;
 
 // Vector Multiply.
 def int_arm_neon_vmulp : Neon_2Arg_Intrinsic;
@@ -171,10 +163,6 @@
 def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic;
 def int_arm_neon_vsubhn : Neon_2Arg_Narrow_Intrinsic;
 def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic;
-def int_arm_neon_vsubls : Neon_2Arg_Long_Intrinsic;
-def int_arm_neon_vsublu : Neon_2Arg_Long_Intrinsic;
-def int_arm_neon_vsubws : Neon_2Arg_Wide_Intrinsic;
-def int_arm_neon_vsubwu : Neon_2Arg_Wide_Intrinsic;
 
 // Vector Absolute Compare.
 let TargetPrefix = "arm" in {
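With these defs gone, a call such as the one below (matching the old form of the vsub.ll test) no longer refers to any known intrinsic; the auto-upgrader in AutoUpgrade.cpp, shown further down, rewrites it when older bitcode is loaded. Sketch:

    ; stale call from older bitcode; upgraded to zext + sub
    %r = call <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16> %acc, <8 x i8> %d)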
lib/Target/ARM/ARMInstrNEON.td

@@ -1293,6 +1293,19 @@
   let isCommutable = Commutable;
 }
 
+// Long 3-register operations.
+class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
+           bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
+        OpcodeStr, Dt, "$dst, $src1, $src2", "",
+        [(set QPR:$dst, (OpNode (TyQ (ExtOp (TyD DPR:$src1))),
+                                (TyQ (ExtOp (TyD DPR:$src2)))))]> {
+  let isCommutable = Commutable;
+}
+
 // Long 3-register intrinsics.
 class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
               InstrItinClass itin, string OpcodeStr, string Dt,
@@ -1324,14 +1337,15 @@
                        (OpTy (NEONvduplane (OpTy DPR_8:$src2),
                                            imm:$lane)))))]>;
 
-// Wide 3-register intrinsics.
-class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
-              string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD,
-              Intrinsic IntOp, bit Commutable>
+// Wide 3-register operations.
+class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD,
+           SDNode OpNode, SDNode ExtOp, bit Commutable>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
         (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), N3RegFrm, IIC_VSUBiD,
         OpcodeStr, Dt, "$dst, $src1, $src2", "",
-        [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> {
+        [(set QPR:$dst, (OpNode (TyQ QPR:$src1),
+                                (TyQ (ExtOp (TyD DPR:$src2)))))]> {
   let isCommutable = Commutable;
 }
 
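The new classes fold the extension into the selection pattern: N3VL matches OpNode applied to two ExtOp-widened D-register operands, while N3VW matches OpNode applied to a full Q-register operand and one widened D-register operand. In IR terms the matched DAGs look like this (a sketch; names are illustrative):

    ; matched by an N3VL instance, e.g. vaddl.s16: both operands extended
    %e1 = sext <4 x i16> %d1 to <4 x i32>
    %e2 = sext <4 x i16> %d2 to <4 x i32>
    %q  = add <4 x i32> %e1, %e2

    ; matched by an N3VW instance, e.g. vaddw.u16: only the D-register
    ; operand is extended; the Q-register operand is used as-is
    %e3 = zext <4 x i16> %d3 to <4 x i32>
    %q2 = add <4 x i32> %q1, %e3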
@@ -1683,6 +1697,23 @@
 }
 
 
+// Neon Long 3-register vector operations.
+
+multiclass N3VL_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                    InstrItinClass itin16, InstrItinClass itin32,
+                    string OpcodeStr, string Dt,
+                    SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+  def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v4i32, v4i16, OpNode, ExtOp, Commutable>;
+  def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v2i64, v2i32, OpNode, ExtOp, Commutable>;
+  def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v8i16, v8i8, OpNode, ExtOp, Commutable>;
+}
+
 // Neon Long 3-register vector intrinsics.
 
 // First with only element sizes of 16 and 32 bits:
@@ -1722,18 +1753,18 @@
 
 // Neon Wide 3-register vector intrinsics,
 //   source operand element sizes of 8, 16 and 32 bits:
-multiclass N3VWInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
-                       string OpcodeStr, string Dt,
-                       Intrinsic IntOp, bit Commutable = 0> {
-  def v8i16 : N3VWInt<op24, op23, 0b00, op11_8, op4,
-                      OpcodeStr, !strconcat(Dt, "8"),
-                      v8i16, v8i8, IntOp, Commutable>;
-  def v4i32 : N3VWInt<op24, op23, 0b01, op11_8, op4,
-                      OpcodeStr, !strconcat(Dt, "16"),
-                      v4i32, v4i16, IntOp, Commutable>;
-  def v2i64 : N3VWInt<op24, op23, 0b10, op11_8, op4,
-                      OpcodeStr, !strconcat(Dt, "32"),
-                      v2i64, v2i32, IntOp, Commutable>;
+multiclass N3VW_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                    string OpcodeStr, string Dt,
+                    SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+  def v8i16 : N3VW<op24, op23, 0b00, op11_8, op4,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v8i16, v8i8, OpNode, ExtOp, Commutable>;
+  def v4i32 : N3VW<op24, op23, 0b01, op11_8, op4,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v4i32, v4i16, OpNode, ExtOp, Commutable>;
+  def v2i64 : N3VW<op24, op23, 0b10, op11_8, op4,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v2i64, v2i32, OpNode, ExtOp, Commutable>;
 }
 
 
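Each defm of these multiclasses yields one instruction per source element size (the "8"/"16"/"32" suffixes appended to Dt). For example, the v8i16 flavor of the subtract-long multiclass is selected from IR of this shape, matching the updated vsub.ll test further below:

    %e1 = sext <8 x i8> %a to <8 x i16>
    %e2 = sext <8 x i8> %b to <8 x i16>
    %r  = sub <8 x i16> %e1, %e2        ; selects vsubl.s8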
@@ -2072,13 +2103,13 @@
 def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32",
                   v4f32, v4f32, fadd, 1>;
 // VADDL : Vector Add Long (Q = D + D)
-defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
-                          "vaddl", "s", int_arm_neon_vaddls, 1>;
-defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
-                          "vaddl", "u", int_arm_neon_vaddlu, 1>;
+defm VADDLs : N3VL_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
+                       "vaddl", "s", add, sext, 1>;
+defm VADDLu : N3VL_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
+                       "vaddl", "u", add, zext, 1>;
 // VADDW : Vector Add Wide (Q = Q + D)
-defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw", "s", int_arm_neon_vaddws, 0>;
-defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw", "u", int_arm_neon_vaddwu, 0>;
+defm VADDWs : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>;
+defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>;
 // VHADD : Vector Halving Add
 defm VHADDs : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm,
                          IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
@@ -2323,13 +2354,13 @@
 def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32",
                   v4f32, v4f32, fsub, 0>;
 // VSUBL : Vector Subtract Long (Q = D - D)
-defm VSUBLs : N3VLInt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
-                          "vsubl", "s", int_arm_neon_vsubls, 1>;
-defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
-                          "vsubl", "u", int_arm_neon_vsublu, 1>;
+defm VSUBLs : N3VL_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+                       "vsubl", "s", sub, sext, 0>;
+defm VSUBLu : N3VL_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+                       "vsubl", "u", sub, zext, 0>;
 // VSUBW : Vector Subtract Wide (Q = Q - D)
-defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw", "s", int_arm_neon_vsubws, 0>;
-defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw", "u", int_arm_neon_vsubwu, 0>;
+defm VSUBWs : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>;
+defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>;
 // VHSUB : Vector Halving Subtract
 defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm,
                          IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
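Note that the new vsubl defms also drop the commutable flag the old definitions passed (1 in the removed lines, 0 in the added ones). That flag tells the selector it may swap the two source operands, which is valid for add but not for sub:

    ; sub is not commutative; these differ unless %x == %y
    %d1 = sub <8 x i16> %x, %y
    %d2 = sub <8 x i16> %y, %x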
@@ -2558,7 +2589,7 @@
 defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q,
                           "vabdl", "s", int_arm_neon_vabdls, 0>;
 defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q,
-                          "vabdl", "u", int_arm_neon_vabdlu, 0>;
+                          "vabdl", "u", int_arm_neon_vabdlu, 0>;
 
 // VABA : Vector Absolute Difference and Accumulate
 defm VABAs : N3VInt3_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
lib/VMCore/AutoUpgrade.cpp

@@ -78,8 +78,17 @@
       return true;
     }
   } else if (Name.compare(5, 9, "arm.neon.", 9) == 0) {
-    if (Name.compare(14, 7, "vmovls.", 7) == 0 ||
-        Name.compare(14, 7, "vmovlu.", 7) == 0) {
+    if (((Name.compare(14, 5, "vmovl", 5) == 0 ||
+          Name.compare(14, 5, "vaddl", 5) == 0 ||
+          Name.compare(14, 5, "vsubl", 5) == 0) &&
+         (Name.compare(19, 2, "s.", 2) == 0 ||
+          Name.compare(19, 2, "u.", 2) == 0)) ||
+
+        ((Name.compare(14, 5, "vaddw", 5) == 0 ||
+          Name.compare(14, 5, "vsubw", 5) == 0) &&
+         (Name.compare(19, 2, "s.", 2) == 0 ||
+          Name.compare(19, 2, "u.", 2) == 0))) {
+
       // Calls to these are transformed into IR without intrinsics.
       NewFn = 0;
       return true;
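The offsets follow from the layout of the intrinsic name: "llvm." occupies indices [0,5), "arm.neon." occupies [5,14), the five-character mnemonic occupies [14,19), and the signedness letter plus trailing dot occupy [19,21). Name.at(18), used in the upgrade code below, is therefore the mnemonic's last letter: 'l' for the long forms and 'w' for the wide forms. The predicate accepts these families, each for any vector-type suffix:

    ; llvm.arm.neon.vmovls.*  llvm.arm.neon.vmovlu.*
    ; llvm.arm.neon.vaddls.*  llvm.arm.neon.vaddlu.*
    ; llvm.arm.neon.vsubls.*  llvm.arm.neon.vsublu.*
    ; llvm.arm.neon.vaddws.*  llvm.arm.neon.vaddwu.*
    ; llvm.arm.neon.vsubws.*  llvm.arm.neon.vsubwu.*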
@@ -370,6 +379,27 @@
     } else if (Name.compare(14, 7, "vmovlu.", 7) == 0) {
       NewI = new ZExtInst(CI->getArgOperand(0), CI->getType(),
                           "upgraded." + CI->getName(), CI);
+
+    } else if (Name.compare(14, 4, "vadd", 4) == 0 ||
+               Name.compare(14, 4, "vsub", 4) == 0) {
+      // Extend one (vaddw/vsubw) or both (vaddl/vsubl) operands.
+      Value *V0 = CI->getArgOperand(0);
+      Value *V1 = CI->getArgOperand(1);
+      if (Name.at(19) == 's') {
+        if (Name.at(18) == 'l')
+          V0 = new SExtInst(CI->getArgOperand(0), CI->getType(), "", CI);
+        V1 = new SExtInst(CI->getArgOperand(1), CI->getType(), "", CI);
+      } else {
+        assert(Name.at(19) == 'u' && "unexpected vadd/vsub intrinsic");
+        if (Name.at(18) == 'l')
+          V0 = new ZExtInst(CI->getArgOperand(0), CI->getType(), "", CI);
+        V1 = new ZExtInst(CI->getArgOperand(1), CI->getType(), "", CI);
+      }
+      if (Name.compare(14, 4, "vadd", 4) == 0)
+        NewI = BinaryOperator::CreateAdd(V0, V1,"upgraded."+CI->getName(),CI);
+      else
+        NewI = BinaryOperator::CreateSub(V0, V1,"upgraded."+CI->getName(),CI);
+
     } else {
       llvm_unreachable("Unknown arm.neon function for CallInst upgrade.");
     }
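So a long operation extends both operands while a wide operation extends only the second, and the opcode is chosen by the vadd/vsub prefix. A sketch of the rewrite for an unsigned vaddw call (names illustrative; the real code names the result "upgraded." plus the original call name):

    ; before (old bitcode)
    %r = call <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32> %acc, <4 x i16> %n)

    ; after: Name.at(18) == 'w', so only operand 1 is zero-extended
    %n.ext = zext <4 x i16> %n to <4 x i32>
    %r2 = add <4 x i32> %acc, %n.ext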
test/Bitcode/neon-intrinsics.ll

@@ -26,6 +26,54 @@
 ; CHECK: vmovlu32
 ; CHECK-NOT: arm.neon.vmovlu.v2i64
 ; CHECK: zext <2 x i32>
+
+; vaddl/vaddw should be auto-upgraded to add with sext/zext
+
+; CHECK: vaddls16
+; CHECK-NOT: arm.neon.vaddls.v4i32
+; CHECK: sext <4 x i16>
+; CHECK-NEXT: sext <4 x i16>
+; CHECK-NEXT: add <4 x i32>
+
+; CHECK: vaddlu32
+; CHECK-NOT: arm.neon.vaddlu.v2i64
+; CHECK: zext <2 x i32>
+; CHECK-NEXT: zext <2 x i32>
+; CHECK-NEXT: add <2 x i64>
+
+; CHECK: vaddws8
+; CHECK-NOT: arm.neon.vaddws.v8i16
+; CHECK: sext <8 x i8>
+; CHECK-NEXT: add <8 x i16>
+
+; CHECK: vaddwu16
+; CHECK-NOT: arm.neon.vaddwu.v4i32
+; CHECK: zext <4 x i16>
+; CHECK-NEXT: add <4 x i32>
+
+; vsubl/vsubw should be auto-upgraded to sub with sext/zext
+
+; CHECK: vsubls16
+; CHECK-NOT: arm.neon.vsubls.v4i32
+; CHECK: sext <4 x i16>
+; CHECK-NEXT: sext <4 x i16>
+; CHECK-NEXT: sub <4 x i32>
+
+; CHECK: vsublu32
+; CHECK-NOT: arm.neon.vsublu.v2i64
+; CHECK: zext <2 x i32>
+; CHECK-NEXT: zext <2 x i32>
+; CHECK-NEXT: sub <2 x i64>
+
+; CHECK: vsubws8
+; CHECK-NOT: arm.neon.vsubws.v8i16
+; CHECK: sext <8 x i8>
+; CHECK-NEXT: sub <8 x i16>
+
+; CHECK: vsubwu16
+; CHECK-NOT: arm.neon.vsubwu.v4i32
+; CHECK: zext <4 x i16>
+; CHECK-NEXT: sub <4 x i32>
 
 ; vld* and vst* intrinsic calls need an alignment argument (defaulted to 1)
 
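These FileCheck patterns run over the disassembly of a checked-in bitcode file built from the old intrinsics (the .bc file is the seventh changed file in this commit). The test's RUN line is not shown in this hunk, but is presumably of the form:

    ; RUN: llvm-dis < %s.bc | FileCheck %s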
test/CodeGen/ARM/vadd.ll

@@ -156,121 +156,123 @@
 ;CHECK: vaddl.s8
   %tmp1 = load <8 x i8>* %A
   %tmp2 = load <8 x i8>* %B
-  %tmp3 = call <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
-  ret <8 x i16> %tmp3
+  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = add <8 x i16> %tmp3, %tmp4
+  ret <8 x i16> %tmp5
 }
 
 define <4 x i32> @vaddls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vaddls16:
 ;CHECK: vaddl.s16
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
-  ret <4 x i32> %tmp3
+  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = add <4 x i32> %tmp3, %tmp4
+  ret <4 x i32> %tmp5
 }
 
 define <2 x i64> @vaddls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vaddls32:
 ;CHECK: vaddl.s32
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
-  ret <2 x i64> %tmp3
+  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = add <2 x i64> %tmp3, %tmp4
+  ret <2 x i64> %tmp5
 }
 
 define <8 x i16> @vaddlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vaddlu8:
 ;CHECK: vaddl.u8
   %tmp1 = load <8 x i8>* %A
   %tmp2 = load <8 x i8>* %B
-  %tmp3 = call <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
-  ret <8 x i16> %tmp3
+  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = add <8 x i16> %tmp3, %tmp4
+  ret <8 x i16> %tmp5
 }
 
 define <4 x i32> @vaddlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vaddlu16:
 ;CHECK: vaddl.u16
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
-  ret <4 x i32> %tmp3
+  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = add <4 x i32> %tmp3, %tmp4
+  ret <4 x i32> %tmp5
 }
 
 define <2 x i64> @vaddlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vaddlu32:
 ;CHECK: vaddl.u32
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
-  ret <2 x i64> %tmp3
-}
-
-declare <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
-
-declare <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = add <2 x i64> %tmp3, %tmp4
+  ret <2 x i64> %tmp5
+}
 
 define <8 x i16> @vaddws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vaddws8:
 ;CHECK: vaddw.s8
   %tmp1 = load <8 x i16>* %A
   %tmp2 = load <8 x i8>* %B
-  %tmp3 = call <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
-  ret <8 x i16> %tmp3
+  %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp4 = add <8 x i16> %tmp1, %tmp3
+  ret <8 x i16> %tmp4
 }
 
 define <4 x i32> @vaddws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vaddws16:
 ;CHECK: vaddw.s16
   %tmp1 = load <4 x i32>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
-  ret <4 x i32> %tmp3
+  %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp4 = add <4 x i32> %tmp1, %tmp3
+  ret <4 x i32> %tmp4
 }
 
 define <2 x i64> @vaddws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vaddws32:
 ;CHECK: vaddw.s32
   %tmp1 = load <2 x i64>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
-  ret <2 x i64> %tmp3
+  %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp4 = add <2 x i64> %tmp1, %tmp3
+  ret <2 x i64> %tmp4
 }
 
 define <8 x i16> @vaddwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vaddwu8:
 ;CHECK: vaddw.u8
   %tmp1 = load <8 x i16>* %A
   %tmp2 = load <8 x i8>* %B
-  %tmp3 = call <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
-  ret <8 x i16> %tmp3
+  %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp4 = add <8 x i16> %tmp1, %tmp3
+  ret <8 x i16> %tmp4
 }
 
 define <4 x i32> @vaddwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vaddwu16:
 ;CHECK: vaddw.u16
   %tmp1 = load <4 x i32>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
-  ret <4 x i32> %tmp3
+  %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp4 = add <4 x i32> %tmp1, %tmp3
+  ret <4 x i32> %tmp4
 }
 
 define <2 x i64> @vaddwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vaddwu32:
 ;CHECK: vaddw.u32
   %tmp1 = load <2 x i64>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
-  ret <2 x i64> %tmp3
-}
-
-declare <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
-
-declare <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
+  %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp4 = add <2 x i64> %tmp1, %tmp3
+  ret <2 x i64> %tmp4
+}
test/CodeGen/ARM/vsub.ll

@@ -156,121 +156,123 @@
 ;CHECK: vsubl.s8
   %tmp1 = load <8 x i8>* %A
   %tmp2 = load <8 x i8>* %B
-  %tmp3 = call <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
-  ret <8 x i16> %tmp3
+  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = sub <8 x i16> %tmp3, %tmp4
+  ret <8 x i16> %tmp5
 }
 
 define <4 x i32> @vsubls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vsubls16:
 ;CHECK: vsubl.s16
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
-  ret <4 x i32> %tmp3
+  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = sub <4 x i32> %tmp3, %tmp4
+  ret <4 x i32> %tmp5
 }
 
 define <2 x i64> @vsubls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vsubls32:
 ;CHECK: vsubl.s32
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
-  ret <2 x i64> %tmp3
+  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = sub <2 x i64> %tmp3, %tmp4
+  ret <2 x i64> %tmp5
 }
 
 define <8 x i16> @vsublu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vsublu8:
 ;CHECK: vsubl.u8
   %tmp1 = load <8 x i8>* %A
   %tmp2 = load <8 x i8>* %B
-  %tmp3 = call <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
-  ret <8 x i16> %tmp3
+  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = sub <8 x i16> %tmp3, %tmp4
+  ret <8 x i16> %tmp5
 }
 
 define <4 x i32> @vsublu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vsublu16:
 ;CHECK: vsubl.u16
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
-  ret <4 x i32> %tmp3
+  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = sub <4 x i32> %tmp3, %tmp4
+  ret <4 x i32> %tmp5
 }
 
 define <2 x i64> @vsublu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vsublu32:
 ;CHECK: vsubl.u32
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
-  ret <2 x i64> %tmp3
-}
-
-declare <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
-
-declare <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = sub <2 x i64> %tmp3, %tmp4
+  ret <2 x i64> %tmp5
+}
 
 define <8 x i16> @vsubws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vsubws8:
 ;CHECK: vsubw.s8
   %tmp1 = load <8 x i16>* %A
   %tmp2 = load <8 x i8>* %B
-  %tmp3 = call <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
-  ret <8 x i16> %tmp3
+  %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp4 = sub <8 x i16> %tmp1, %tmp3
+  ret <8 x i16> %tmp4
 }
 
 define <4 x i32> @vsubws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vsubws16:
 ;CHECK: vsubw.s16
   %tmp1 = load <4 x i32>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
-  ret <4 x i32> %tmp3
+  %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp4 = sub <4 x i32> %tmp1, %tmp3
+  ret <4 x i32> %tmp4
 }
 
 define <2 x i64> @vsubws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vsubws32:
 ;CHECK: vsubw.s32
   %tmp1 = load <2 x i64>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
-  ret <2 x i64> %tmp3
+  %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp4 = sub <2 x i64> %tmp1, %tmp3
+  ret <2 x i64> %tmp4
 }
 
 define <8 x i16> @vsubwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vsubwu8:
 ;CHECK: vsubw.u8
   %tmp1 = load <8 x i16>* %A
   %tmp2 = load <8 x i8>* %B
-  %tmp3 = call <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
-  ret <8 x i16> %tmp3
+  %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp4 = sub <8 x i16> %tmp1, %tmp3
+  ret <8 x i16> %tmp4
 }
 
 define <4 x i32> @vsubwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vsubwu16:
 ;CHECK: vsubw.u16
   %tmp1 = load <4 x i32>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
-  ret <4 x i32> %tmp3
+  %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp4 = sub <4 x i32> %tmp1, %tmp3
+  ret <4 x i32> %tmp4
 }
 
 define <2 x i64> @vsubwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vsubwu32:
 ;CHECK: vsubw.u32
   %tmp1 = load <2 x i64>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
-  ret <2 x i64> %tmp3
-}
-
-declare <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
-
-declare <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
+  %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp4 = sub <2 x i64> %tmp1, %tmp3
+  ret <2 x i64> %tmp4
+}