llvm.org GIT mirror llvm / b0abb4d
Use vAny type to get rid of Neon intrinsics that differed only in whether the overloaded vector types allowed floating-point or integer vector elements. Most of these operations actually depend on the element type, so bitcasting was not an option. If you include the vpadd intrinsics that I updated earlier, this gets rid of 20 intrinsics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@78646 91177308-0d34-0410-b5e6-96231b3b80d8 Bob Wilson 10 years ago
23 changed file(s) with 255 addition(s) and 310 deletion(s). Raw diff Collapse all Expand all
2626
2727 // The following classes do not correspond directly to GCC builtins.
2828 class Neon_1Arg_Intrinsic
29 : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
30 class Neon_1Arg_Float_Intrinsic
31 : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
29 : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
3230 class Neon_1Arg_Narrow_Intrinsic
33 : Intrinsic<[llvm_anyint_ty],
31 : Intrinsic<[llvm_anyvector_ty],
3432 [LLVMExtendedElementVectorType<0>], [IntrNoMem]>;
3533 class Neon_1Arg_Long_Intrinsic
36 : Intrinsic<[llvm_anyint_ty],
34 : Intrinsic<[llvm_anyvector_ty],
3735 [LLVMTruncatedElementVectorType<0>], [IntrNoMem]>;
3836 class Neon_2Arg_Intrinsic
39 : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
40 [IntrNoMem]>;
41 class Neon_2Arg_Float_Intrinsic
42 : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
43 [IntrNoMem]>;
44 class Neon_2Arg_Vector_Intrinsic
4537 : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
4638 [IntrNoMem]>;
4739 class Neon_2Arg_Narrow_Intrinsic
48 : Intrinsic<[llvm_anyint_ty],
40 : Intrinsic<[llvm_anyvector_ty],
4941 [LLVMExtendedElementVectorType<0>,
5042 LLVMExtendedElementVectorType<0>],
5143 [IntrNoMem]>;
5244 class Neon_2Arg_Long_Intrinsic
53 : Intrinsic<[llvm_anyint_ty],
45 : Intrinsic<[llvm_anyvector_ty],
5446 [LLVMTruncatedElementVectorType<0>,
5547 LLVMTruncatedElementVectorType<0>],
5648 [IntrNoMem]>;
5749 class Neon_2Arg_Wide_Intrinsic
58 : Intrinsic<[llvm_anyint_ty],
50 : Intrinsic<[llvm_anyvector_ty],
5951 [LLVMMatchType<0>, LLVMTruncatedElementVectorType<0>],
6052 [IntrNoMem]>;
6153 class Neon_3Arg_Intrinsic
62 : Intrinsic<[llvm_anyint_ty],
54 : Intrinsic<[llvm_anyvector_ty],
6355 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
6456 [IntrNoMem]>;
6557 class Neon_3Arg_Long_Intrinsic
66 : Intrinsic<[llvm_anyint_ty],
58 : Intrinsic<[llvm_anyvector_ty],
6759 [LLVMMatchType<0>,
6860 LLVMTruncatedElementVectorType<0>,
6961 LLVMTruncatedElementVectorType<0>],
7062 [IntrNoMem]>;
7163 class Neon_2Result_Intrinsic
72 : Intrinsic<[llvm_anyint_ty, LLVMMatchType<0>],
73 [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
74 class Neon_2Result_Float_Intrinsic
75 : Intrinsic<[llvm_anyfloat_ty, LLVMMatchType<0>],
64 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
7665 [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
7766 class Neon_CvtFxToFP_Intrinsic
7867 : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
140129 // Vector Maximum.
141130 def int_arm_neon_vmaxs : Neon_2Arg_Intrinsic;
142131 def int_arm_neon_vmaxu : Neon_2Arg_Intrinsic;
143 def int_arm_neon_vmaxf : Neon_2Arg_Float_Intrinsic;
144132
145133 // Vector Minimum.
146134 def int_arm_neon_vmins : Neon_2Arg_Intrinsic;
147135 def int_arm_neon_vminu : Neon_2Arg_Intrinsic;
148 def int_arm_neon_vminf : Neon_2Arg_Float_Intrinsic;
149136
150137 // Vector Reciprocal Step.
151 def int_arm_neon_vrecps : Neon_2Arg_Float_Intrinsic;
138 def int_arm_neon_vrecps : Neon_2Arg_Intrinsic;
152139
153140 // Vector Reciprocal Square Root Step.
154 def int_arm_neon_vrsqrts : Neon_2Arg_Float_Intrinsic;
141 def int_arm_neon_vrsqrts : Neon_2Arg_Intrinsic;
155142 }
156143
157144 // Vector Subtract.
185172 // Vector Absolute Differences.
186173 def int_arm_neon_vabds : Neon_2Arg_Intrinsic;
187174 def int_arm_neon_vabdu : Neon_2Arg_Intrinsic;
188 def int_arm_neon_vabdf : Neon_2Arg_Float_Intrinsic;
189175 def int_arm_neon_vabdls : Neon_2Arg_Long_Intrinsic;
190176 def int_arm_neon_vabdlu : Neon_2Arg_Long_Intrinsic;
191177
196182 def int_arm_neon_vabalu : Neon_3Arg_Long_Intrinsic;
197183
198184 // Vector Pairwise Add.
199 def int_arm_neon_vpadd : Neon_2Arg_Vector_Intrinsic;
185 def int_arm_neon_vpadd : Neon_2Arg_Intrinsic;
200186
201187 // Vector Pairwise Add Long.
202188 // Note: This is different than the other "long" NEON intrinsics because
203189 // the result vector has half as many elements as the source vector.
204190 // The source and destination vector types must be specified separately.
205191 let TargetPrefix = "arm" in {
206 def int_arm_neon_vpaddls : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty],
192 def int_arm_neon_vpaddls : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty],
207193 [IntrNoMem]>;
208 def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty],
194 def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty],
209195 [IntrNoMem]>;
210196 }
211197
213199 // Note: This is similar to vpaddl but the destination vector also appears
214200 // as the first argument.
215201 let TargetPrefix = "arm" in {
216 def int_arm_neon_vpadals : Intrinsic<[llvm_anyint_ty],
217 [LLVMMatchType<0>, llvm_anyint_ty],
202 def int_arm_neon_vpadals : Intrinsic<[llvm_anyvector_ty],
203 [LLVMMatchType<0>, llvm_anyvector_ty],
218204 [IntrNoMem]>;
219 def int_arm_neon_vpadalu : Intrinsic<[llvm_anyint_ty],
220 [LLVMMatchType<0>, llvm_anyint_ty],
205 def int_arm_neon_vpadalu : Intrinsic<[llvm_anyvector_ty],
206 [LLVMMatchType<0>, llvm_anyvector_ty],
221207 [IntrNoMem]>;
222208 }
223209
224210 // Vector Pairwise Maximum and Minimum.
225211 def int_arm_neon_vpmaxs : Neon_2Arg_Intrinsic;
226212 def int_arm_neon_vpmaxu : Neon_2Arg_Intrinsic;
227 def int_arm_neon_vpmaxf : Neon_2Arg_Float_Intrinsic;
228213 def int_arm_neon_vpmins : Neon_2Arg_Intrinsic;
229214 def int_arm_neon_vpminu : Neon_2Arg_Intrinsic;
230 def int_arm_neon_vpminf : Neon_2Arg_Float_Intrinsic;
231215
232216 // Vector Shifts:
233217 //
282266
283267 // Vector Absolute Value and Saturating Absolute Value.
284268 def int_arm_neon_vabs : Neon_1Arg_Intrinsic;
285 def int_arm_neon_vabsf : Neon_1Arg_Float_Intrinsic;
286269 def int_arm_neon_vqabs : Neon_1Arg_Intrinsic;
287270
288271 // Vector Saturating Negate.
297280
298281 // Vector Reciprocal Estimate.
299282 def int_arm_neon_vrecpe : Neon_1Arg_Intrinsic;
300 def int_arm_neon_vrecpef : Neon_1Arg_Float_Intrinsic;
301283
302284 // Vector Reciprocal Square Root Estimate.
303285 def int_arm_neon_vrsqrte : Neon_1Arg_Intrinsic;
304 def int_arm_neon_vrsqrtef : Neon_1Arg_Float_Intrinsic;
305286
306287 // Vector Conversions Between Floating-point and Fixed-point.
307288 def int_arm_neon_vcvtfp2fxs : Neon_CvtFPToFx_Intrinsic;
330311 def int_arm_neon_vtbx4 : Neon_Tbl6Arg_Intrinsic;
331312
332313 // Vector Transpose.
333 def int_arm_neon_vtrni : Neon_2Result_Intrinsic;
334 def int_arm_neon_vtrnf : Neon_2Result_Float_Intrinsic;
314 def int_arm_neon_vtrn : Neon_2Result_Intrinsic;
335315
336316 // Vector Interleave (vzip).
337 def int_arm_neon_vzipi : Neon_2Result_Intrinsic;
338 def int_arm_neon_vzipf : Neon_2Result_Float_Intrinsic;
317 def int_arm_neon_vzip : Neon_2Result_Intrinsic;
339318
340319 // Vector Deinterleave (vuzp).
341 def int_arm_neon_vuzpi : Neon_2Result_Intrinsic;
342 def int_arm_neon_vuzpf : Neon_2Result_Float_Intrinsic;
320 def int_arm_neon_vuzp : Neon_2Result_Intrinsic;
343321
344322 let TargetPrefix = "arm" in {
345323
346324 // De-interleaving vector loads from N-element structures.
347 def int_arm_neon_vld1i : Intrinsic<[llvm_anyint_ty],
348 [llvm_ptr_ty], [IntrReadArgMem]>;
349 def int_arm_neon_vld1f : Intrinsic<[llvm_anyfloat_ty],
350 [llvm_ptr_ty], [IntrReadArgMem]>;
351 def int_arm_neon_vld2i : Intrinsic<[llvm_anyint_ty, LLVMMatchType<0>],
352 [llvm_ptr_ty], [IntrReadArgMem]>;
353 def int_arm_neon_vld2f : Intrinsic<[llvm_anyfloat_ty, LLVMMatchType<0>],
354 [llvm_ptr_ty], [IntrReadArgMem]>;
355 def int_arm_neon_vld3i : Intrinsic<[llvm_anyint_ty, LLVMMatchType<0>,
356 LLVMMatchType<0>],
357 [llvm_ptr_ty], [IntrReadArgMem]>;
358 def int_arm_neon_vld3f : Intrinsic<[llvm_anyfloat_ty, LLVMMatchType<0>,
359 LLVMMatchType<0>],
360 [llvm_ptr_ty], [IntrReadArgMem]>;
361 def int_arm_neon_vld4i : Intrinsic<[llvm_anyint_ty, LLVMMatchType<0>,
362 LLVMMatchType<0>, LLVMMatchType<0>],
363 [llvm_ptr_ty], [IntrReadArgMem]>;
364 def int_arm_neon_vld4f : Intrinsic<[llvm_anyfloat_ty, LLVMMatchType<0>,
365 LLVMMatchType<0>, LLVMMatchType<0>],
366 [llvm_ptr_ty], [IntrReadArgMem]>;
325 def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty],
326 [llvm_ptr_ty], [IntrReadArgMem]>;
327 def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
328 [llvm_ptr_ty], [IntrReadArgMem]>;
329 def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
330 LLVMMatchType<0>],
331 [llvm_ptr_ty], [IntrReadArgMem]>;
332 def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
333 LLVMMatchType<0>, LLVMMatchType<0>],
334 [llvm_ptr_ty], [IntrReadArgMem]>;
367335
368336 // Interleaving vector stores from N-element structures.
369 def int_arm_neon_vst1i : Intrinsic<[llvm_void_ty],
370 [llvm_ptr_ty, llvm_anyint_ty],
337 def int_arm_neon_vst1 : Intrinsic<[llvm_void_ty],
338 [llvm_ptr_ty, llvm_anyvector_ty],
339 [IntrWriteArgMem]>;
340 def int_arm_neon_vst2 : Intrinsic<[llvm_void_ty],
341 [llvm_ptr_ty, llvm_anyvector_ty,
342 LLVMMatchType<0>], [IntrWriteArgMem]>;
343 def int_arm_neon_vst3 : Intrinsic<[llvm_void_ty],
344 [llvm_ptr_ty, llvm_anyvector_ty,
345 LLVMMatchType<0>, LLVMMatchType<0>],
371346 [IntrWriteArgMem]>;
372 def int_arm_neon_vst1f : Intrinsic<[llvm_void_ty],
373 [llvm_ptr_ty, llvm_anyfloat_ty],
374 [IntrWriteArgMem]>;
375 def int_arm_neon_vst2i : Intrinsic<[llvm_void_ty],
376 [llvm_ptr_ty, llvm_anyint_ty,
377 LLVMMatchType<0>], [IntrWriteArgMem]>;
378 def int_arm_neon_vst2f : Intrinsic<[llvm_void_ty],
379 [llvm_ptr_ty, llvm_anyfloat_ty,
380 LLVMMatchType<0>], [IntrWriteArgMem]>;
381 def int_arm_neon_vst3i : Intrinsic<[llvm_void_ty],
382 [llvm_ptr_ty, llvm_anyint_ty,
383 LLVMMatchType<0>, LLVMMatchType<0>],
384 [IntrWriteArgMem]>;
385 def int_arm_neon_vst3f : Intrinsic<[llvm_void_ty],
386 [llvm_ptr_ty, llvm_anyfloat_ty,
387 LLVMMatchType<0>, LLVMMatchType<0>],
388 [IntrWriteArgMem]>;
389 def int_arm_neon_vst4i : Intrinsic<[llvm_void_ty],
390 [llvm_ptr_ty, llvm_anyint_ty,
391 LLVMMatchType<0>, LLVMMatchType<0>,
392 LLVMMatchType<0>], [IntrWriteArgMem]>;
393 def int_arm_neon_vst4f : Intrinsic<[llvm_void_ty],
394 [llvm_ptr_ty, llvm_anyfloat_ty,
395 LLVMMatchType<0>, LLVMMatchType<0>,
396 LLVMMatchType<0>], [IntrWriteArgMem]>;
397 }
347 def int_arm_neon_vst4 : Intrinsic<[llvm_void_ty],
348 [llvm_ptr_ty, llvm_anyvector_ty,
349 LLVMMatchType<0>, LLVMMatchType<0>,
350 LLVMMatchType<0>], [IntrWriteArgMem]>;
351 }
14651465 switch (IntNo) {
14661466 default: break;
14671467
1468 case Intrinsic::arm_neon_vtrni:
1469 case Intrinsic::arm_neon_vtrnf:
1468 case Intrinsic::arm_neon_vtrn:
14701469 switch (VT.getSimpleVT()) {
14711470 default: return NULL;
14721471 case EVT::v8i8: Opc = ARM::VTRNd8; break;
14811480 return CurDAG->getTargetNode(Opc, dl, VT, VT, N->getOperand(1),
14821481 N->getOperand(2));
14831482
1484 case Intrinsic::arm_neon_vuzpi:
1485 case Intrinsic::arm_neon_vuzpf:
1483 case Intrinsic::arm_neon_vuzp:
14861484 switch (VT.getSimpleVT()) {
14871485 default: return NULL;
14881486 case EVT::v8i8: Opc = ARM::VUZPd8; break;
14971495 return CurDAG->getTargetNode(Opc, dl, VT, VT, N->getOperand(1),
14981496 N->getOperand(2));
14991497
1500 case Intrinsic::arm_neon_vzipi:
1501 case Intrinsic::arm_neon_vzipf:
1498 case Intrinsic::arm_neon_vzip:
15021499 switch (VT.getSimpleVT()) {
15031500 default: return NULL;
15041501 case EVT::v8i8: Opc = ARM::VZIPd8; break;
13591359 ARMTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
13601360 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13611361 switch (IntNo) {
1362 case Intrinsic::arm_neon_vld2i:
1363 case Intrinsic::arm_neon_vld2f:
1362 case Intrinsic::arm_neon_vld2:
13641363 return LowerNeonVLDIntrinsic(Op, DAG, ARMISD::VLD2D);
1365 case Intrinsic::arm_neon_vld3i:
1366 case Intrinsic::arm_neon_vld3f:
1364 case Intrinsic::arm_neon_vld3:
13671365 return LowerNeonVLDIntrinsic(Op, DAG, ARMISD::VLD3D);
1368 case Intrinsic::arm_neon_vld4i:
1369 case Intrinsic::arm_neon_vld4f:
1366 case Intrinsic::arm_neon_vld4:
13701367 return LowerNeonVLDIntrinsic(Op, DAG, ARMISD::VLD4D);
1371 case Intrinsic::arm_neon_vst2i:
1372 case Intrinsic::arm_neon_vst2f:
1368 case Intrinsic::arm_neon_vst2:
13731369 return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST2D, 2);
1374 case Intrinsic::arm_neon_vst3i:
1375 case Intrinsic::arm_neon_vst3f:
1370 case Intrinsic::arm_neon_vst3:
13761371 return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST3D, 3);
1377 case Intrinsic::arm_neon_vst4i:
1378 case Intrinsic::arm_neon_vst4f:
1372 case Intrinsic::arm_neon_vst4:
13791373 return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST4D, 4);
13801374 default: return SDValue(); // Don't custom lower most intrinsics.
13811375 }
172172 !strconcat(OpcodeStr, "\t${dst:dregpair}, $addr"),
173173 [(set QPR:$dst, (Ty (IntOp addrmode6:$addr)))]>;
174174
175 def VLD1d8 : VLD1D<"vld1.8", v8i8, int_arm_neon_vld1i>;
176 def VLD1d16 : VLD1D<"vld1.16", v4i16, int_arm_neon_vld1i>;
177 def VLD1d32 : VLD1D<"vld1.32", v2i32, int_arm_neon_vld1i>;
178 def VLD1df : VLD1D<"vld1.32", v2f32, int_arm_neon_vld1f>;
179 def VLD1d64 : VLD1D<"vld1.64", v1i64, int_arm_neon_vld1i>;
180
181 def VLD1q8 : VLD1Q<"vld1.8", v16i8, int_arm_neon_vld1i>;
182 def VLD1q16 : VLD1Q<"vld1.16", v8i16, int_arm_neon_vld1i>;
183 def VLD1q32 : VLD1Q<"vld1.32", v4i32, int_arm_neon_vld1i>;
184 def VLD1qf : VLD1Q<"vld1.32", v4f32, int_arm_neon_vld1f>;
185 def VLD1q64 : VLD1Q<"vld1.64", v2i64, int_arm_neon_vld1i>;
175 def VLD1d8 : VLD1D<"vld1.8", v8i8, int_arm_neon_vld1>;
176 def VLD1d16 : VLD1D<"vld1.16", v4i16, int_arm_neon_vld1>;
177 def VLD1d32 : VLD1D<"vld1.32", v2i32, int_arm_neon_vld1>;
178 def VLD1df : VLD1D<"vld1.32", v2f32, int_arm_neon_vld1>;
179 def VLD1d64 : VLD1D<"vld1.64", v1i64, int_arm_neon_vld1>;
180
181 def VLD1q8 : VLD1Q<"vld1.8", v16i8, int_arm_neon_vld1>;
182 def VLD1q16 : VLD1Q<"vld1.16", v8i16, int_arm_neon_vld1>;
183 def VLD1q32 : VLD1Q<"vld1.32", v4i32, int_arm_neon_vld1>;
184 def VLD1qf : VLD1Q<"vld1.32", v4f32, int_arm_neon_vld1>;
185 def VLD1q64 : VLD1Q<"vld1.64", v2i64, int_arm_neon_vld1>;
186186
187187 // VLD2 : Vector Load (multiple 2-element structures)
188188 class VLD2D
227227 !strconcat(OpcodeStr, "\t${src:dregpair}, $addr"),
228228 [(IntOp addrmode6:$addr, (Ty QPR:$src))]>;
229229
230 def VST1d8 : VST1D<"vst1.8", v8i8, int_arm_neon_vst1i>;
231 def VST1d16 : VST1D<"vst1.16", v4i16, int_arm_neon_vst1i>;
232 def VST1d32 : VST1D<"vst1.32", v2i32, int_arm_neon_vst1i>;
233 def VST1df : VST1D<"vst1.32", v2f32, int_arm_neon_vst1f>;
234 def VST1d64 : VST1D<"vst1.64", v1i64, int_arm_neon_vst1i>;
235
236 def VST1q8 : VST1Q<"vst1.8", v16i8, int_arm_neon_vst1i>;
237 def VST1q16 : VST1Q<"vst1.16", v8i16, int_arm_neon_vst1i>;
238 def VST1q32 : VST1Q<"vst1.32", v4i32, int_arm_neon_vst1i>;
239 def VST1qf : VST1Q<"vst1.32", v4f32, int_arm_neon_vst1f>;
240 def VST1q64 : VST1Q<"vst1.64", v2i64, int_arm_neon_vst1i>;
230 def VST1d8 : VST1D<"vst1.8", v8i8, int_arm_neon_vst1>;
231 def VST1d16 : VST1D<"vst1.16", v4i16, int_arm_neon_vst1>;
232 def VST1d32 : VST1D<"vst1.32", v2i32, int_arm_neon_vst1>;
233 def VST1df : VST1D<"vst1.32", v2f32, int_arm_neon_vst1>;
234 def VST1d64 : VST1D<"vst1.64", v1i64, int_arm_neon_vst1>;
235
236 def VST1q8 : VST1Q<"vst1.8", v16i8, int_arm_neon_vst1>;
237 def VST1q16 : VST1Q<"vst1.16", v8i16, int_arm_neon_vst1>;
238 def VST1q32 : VST1Q<"vst1.32", v4i32, int_arm_neon_vst1>;
239 def VST1qf : VST1Q<"vst1.32", v4f32, int_arm_neon_vst1>;
240 def VST1q64 : VST1Q<"vst1.64", v2i64, int_arm_neon_vst1>;
241241
242242 // VST2 : Vector Store (multiple 2-element structures)
243243 class VST2D
12221222 defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, "vabd.s", int_arm_neon_vabds, 0>;
12231223 defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, "vabd.u", int_arm_neon_vabdu, 0>;
12241224 def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v2f32, v2f32,
1225 int_arm_neon_vabdf, 0>;
1225 int_arm_neon_vabds, 0>;
12261226 def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v4f32, v4f32,
1227 int_arm_neon_vabdf, 0>;
1227 int_arm_neon_vabds, 0>;
12281228
12291229 // VABDL : Vector Absolute Difference Long (Q = | D - D |)
12301230 defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, "vabdl.s", int_arm_neon_vabdls, 0>;
12441244 defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, "vmax.s", int_arm_neon_vmaxs, 1>;
12451245 defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, "vmax.u", int_arm_neon_vmaxu, 1>;
12461246 def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v2f32, v2f32,
1247 int_arm_neon_vmaxf, 1>;
1247 int_arm_neon_vmaxs, 1>;
12481248 def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v4f32, v4f32,
1249 int_arm_neon_vmaxf, 1>;
1249 int_arm_neon_vmaxs, 1>;
12501250
12511251 // VMIN : Vector Minimum
12521252 defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, "vmin.s", int_arm_neon_vmins, 1>;
12531253 defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, "vmin.u", int_arm_neon_vminu, 1>;
12541254 def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v2f32, v2f32,
1255 int_arm_neon_vminf, 1>;
1255 int_arm_neon_vmins, 1>;
12561256 def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v4f32, v4f32,
1257 int_arm_neon_vminf, 1>;
1257 int_arm_neon_vmins, 1>;
12581258
12591259 // Vector Pairwise Operations.
12601260
12941294 def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, "vpmax.u32", v2i32, v2i32,
12951295 int_arm_neon_vpmaxu, 0>;
12961296 def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, "vpmax.f32", v2f32, v2f32,
1297 int_arm_neon_vpmaxf, 0>;
1297 int_arm_neon_vpmaxs, 0>;
12981298
12991299 // VPMIN : Vector Pairwise Minimum
13001300 def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, "vpmin.s8", v8i8, v8i8,
13101310 def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, "vpmin.u32", v2i32, v2i32,
13111311 int_arm_neon_vpminu, 0>;
13121312 def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, "vpmin.f32", v2f32, v2f32,
1313 int_arm_neon_vpminf, 0>;
1313 int_arm_neon_vpmins, 0>;
13141314
13151315 // Vector Reciprocal and Reciprocal Square Root Estimate and Step.
13161316
13201320 def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32",
13211321 v4i32, v4i32, int_arm_neon_vrecpe>;
13221322 def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
1323 v2f32, v2f32, int_arm_neon_vrecpef>;
1323 v2f32, v2f32, int_arm_neon_vrecpe>;
13241324 def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
1325 v4f32, v4f32, int_arm_neon_vrecpef>;
1325 v4f32, v4f32, int_arm_neon_vrecpe>;
13261326
13271327 // VRECPS : Vector Reciprocal Step
13281328 def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v2f32, v2f32,
13361336 def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32",
13371337 v4i32, v4i32, int_arm_neon_vrsqrte>;
13381338 def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
1339 v2f32, v2f32, int_arm_neon_vrsqrtef>;
1339 v2f32, v2f32, int_arm_neon_vrsqrte>;
13401340 def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
1341 v4f32, v4f32, int_arm_neon_vrsqrtef>;
1341 v4f32, v4f32, int_arm_neon_vrsqrte>;
13421342
13431343 // VRSQRTS : Vector Reciprocal Square Root Step
13441344 def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v2f32, v2f32,
14791479 defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, "vabs.s",
14801480 int_arm_neon_vabs>;
14811481 def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
1482 v2f32, v2f32, int_arm_neon_vabsf>;
1482 v2f32, v2f32, int_arm_neon_vabs>;
14831483 def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
1484 v4f32, v4f32, int_arm_neon_vabsf>;
1484 v4f32, v4f32, int_arm_neon_vabs>;
14851485
14861486 // VQABS : Vector Saturating Absolute Value
14871487 defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, "vqabs.s",
20162016 // Vector Absolute used for single-precision FP
20172017 let neverHasSideEffects = 1 in
20182018 def VABSfd_sfp : N2VDInts<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
2019 v2f32, v2f32, int_arm_neon_vabsf>;
2019 v2f32, v2f32, int_arm_neon_vabs>;
20202020 def : N2VDIntsPat<fabs, VABSfd_sfp>;
20212021
20222022 // Vector Negate used for single-precision FP
5858 ;CHECK: vabd.f32
5959 %tmp1 = load <2 x float>* %A
6060 %tmp2 = load <2 x float>* %B
61 %tmp3 = call <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
61 %tmp3 = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
6262 ret <2 x float> %tmp3
6363 }
6464
121121 ;CHECK: vabd.f32
122122 %tmp1 = load <4 x float>* %A
123123 %tmp2 = load <4 x float>* %B
124 %tmp3 = call <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
124 %tmp3 = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
125125 ret <4 x float> %tmp3
126126 }
127127
133133 declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
134134 declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
135135
136 declare <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float>, <2 x float>) nounwind readnone
136 declare <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float>, <2 x float>) nounwind readnone
137137
138138 declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
139139 declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
143143 declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
144144 declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
145145
146 declare <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float>, <4 x float>) nounwind readnone
146 declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>) nounwind readnone
2727 ;CHECK: vabsf32:
2828 ;CHECK: vabs.f32
2929 %tmp1 = load <2 x float>* %A
30 %tmp2 = call <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float> %tmp1)
30 %tmp2 = call <2 x float> @llvm.arm.neon.vabs.v2f32(<2 x float> %tmp1)
3131 ret <2 x float> %tmp2
3232 }
3333
5959 ;CHECK: vabsQf32:
6060 ;CHECK: vabs.f32
6161 %tmp1 = load <4 x float>* %A
62 %tmp2 = call <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float> %tmp1)
62 %tmp2 = call <4 x float> @llvm.arm.neon.vabs.v4f32(<4 x float> %tmp1)
6363 ret <4 x float> %tmp2
6464 }
6565
6666 declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) nounwind readnone
6767 declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) nounwind readnone
6868 declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) nounwind readnone
69 declare <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float>) nounwind readnone
69 declare <2 x float> @llvm.arm.neon.vabs.v2f32(<2 x float>) nounwind readnone
7070
7171 declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) nounwind readnone
7272 declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone
7373 declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) nounwind readnone
74 declare <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float>) nounwind readnone
74 declare <4 x float> @llvm.arm.neon.vabs.v4f32(<4 x float>) nounwind readnone
7575
22 define <8 x i8> @vld1i8(i8* %A) nounwind {
33 ;CHECK: vld1i8:
44 ;CHECK: vld1.8
5 %tmp1 = call <8 x i8> @llvm.arm.neon.vld1i.v8i8(i8* %A)
5 %tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %A)
66 ret <8 x i8> %tmp1
77 }
88
99 define <4 x i16> @vld1i16(i16* %A) nounwind {
1010 ;CHECK: vld1i16:
1111 ;CHECK: vld1.16
12 %tmp1 = call <4 x i16> @llvm.arm.neon.vld1i.v4i16(i16* %A)
12 %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i16* %A)
1313 ret <4 x i16> %tmp1
1414 }
1515
1616 define <2 x i32> @vld1i32(i32* %A) nounwind {
1717 ;CHECK: vld1i32:
1818 ;CHECK: vld1.32
19 %tmp1 = call <2 x i32> @llvm.arm.neon.vld1i.v2i32(i32* %A)
19 %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i32* %A)
2020 ret <2 x i32> %tmp1
2121 }
2222
2323 define <2 x float> @vld1f(float* %A) nounwind {
2424 ;CHECK: vld1f:
2525 ;CHECK: vld1.32
26 %tmp1 = call <2 x float> @llvm.arm.neon.vld1f.v2f32(float* %A)
26 %tmp1 = call <2 x float> @llvm.arm.neon.vld1.v2f32(float* %A)
2727 ret <2 x float> %tmp1
2828 }
2929
3030 define <1 x i64> @vld1i64(i64* %A) nounwind {
3131 ;CHECK: vld1i64:
3232 ;CHECK: vld1.64
33 %tmp1 = call <1 x i64> @llvm.arm.neon.vld1i.v1i64(i64* %A)
33 %tmp1 = call <1 x i64> @llvm.arm.neon.vld1.v1i64(i64* %A)
3434 ret <1 x i64> %tmp1
3535 }
3636
3737 define <16 x i8> @vld1Qi8(i8* %A) nounwind {
3838 ;CHECK: vld1Qi8:
3939 ;CHECK: vld1.8
40 %tmp1 = call <16 x i8> @llvm.arm.neon.vld1i.v16i8(i8* %A)
40 %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A)
4141 ret <16 x i8> %tmp1
4242 }
4343
4444 define <8 x i16> @vld1Qi16(i16* %A) nounwind {
4545 ;CHECK: vld1Qi16:
4646 ;CHECK: vld1.16
47 %tmp1 = call <8 x i16> @llvm.arm.neon.vld1i.v8i16(i16* %A)
47 %tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i16* %A)
4848 ret <8 x i16> %tmp1
4949 }
5050
5151 define <4 x i32> @vld1Qi32(i32* %A) nounwind {
5252 ;CHECK: vld1Qi32:
5353 ;CHECK: vld1.32
54 %tmp1 = call <4 x i32> @llvm.arm.neon.vld1i.v4i32(i32* %A)
54 %tmp1 = call <4 x i32> @llvm.arm.neon.vld1.v4i32(i32* %A)
5555 ret <4 x i32> %tmp1
5656 }
5757
5858 define <4 x float> @vld1Qf(float* %A) nounwind {
5959 ;CHECK: vld1Qf:
6060 ;CHECK: vld1.32
61 %tmp1 = call <4 x float> @llvm.arm.neon.vld1f.v4f32(float* %A)
61 %tmp1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(float* %A)
6262 ret <4 x float> %tmp1
6363 }
6464
6565 define <2 x i64> @vld1Qi64(i64* %A) nounwind {
6666 ;CHECK: vld1Qi64:
6767 ;CHECK: vld1.64
68 %tmp1 = call <2 x i64> @llvm.arm.neon.vld1i.v2i64(i64* %A)
68 %tmp1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i64* %A)
6969 ret <2 x i64> %tmp1
7070 }
7171
72 declare <8 x i8> @llvm.arm.neon.vld1i.v8i8(i8*) nounwind readonly
73 declare <4 x i16> @llvm.arm.neon.vld1i.v4i16(i8*) nounwind readonly
74 declare <2 x i32> @llvm.arm.neon.vld1i.v2i32(i8*) nounwind readonly
75 declare <2 x float> @llvm.arm.neon.vld1f.v2f32(i8*) nounwind readonly
76 declare <1 x i64> @llvm.arm.neon.vld1i.v1i64(i8*) nounwind readonly
72 declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*) nounwind readonly
73 declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*) nounwind readonly
74 declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*) nounwind readonly
75 declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*) nounwind readonly
76 declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*) nounwind readonly
7777
78 declare <16 x i8> @llvm.arm.neon.vld1i.v16i8(i8*) nounwind readonly
79 declare <8 x i16> @llvm.arm.neon.vld1i.v8i16(i8*) nounwind readonly
80 declare <4 x i32> @llvm.arm.neon.vld1i.v4i32(i8*) nounwind readonly
81 declare <4 x float> @llvm.arm.neon.vld1f.v4f32(i8*) nounwind readonly
82 declare <2 x i64> @llvm.arm.neon.vld1i.v2i64(i8*) nounwind readonly
78 declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*) nounwind readonly
79 declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*) nounwind readonly
80 declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*) nounwind readonly
81 declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*) nounwind readonly
82 declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*) nounwind readonly
77 define <8 x i8> @vld2i8(i8* %A) nounwind {
88 ;CHECK: vld2i8:
99 ;CHECK: vld2.8
10 %tmp1 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vld2i.v8i8(i8* %A)
10 %tmp1 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vld2.v8i8(i8* %A)
1111 %tmp2 = extractvalue %struct.__builtin_neon_v8qi2 %tmp1, 0
1212 %tmp3 = extractvalue %struct.__builtin_neon_v8qi2 %tmp1, 1
1313 %tmp4 = add <8 x i8> %tmp2, %tmp3
1717 define <4 x i16> @vld2i16(i16* %A) nounwind {
1818 ;CHECK: vld2i16:
1919 ;CHECK: vld2.16
20 %tmp1 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vld2i.v4i16(i16* %A)
20 %tmp1 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vld2.v4i16(i16* %A)
2121 %tmp2 = extractvalue %struct.__builtin_neon_v4hi2 %tmp1, 0
2222 %tmp3 = extractvalue %struct.__builtin_neon_v4hi2 %tmp1, 1
2323 %tmp4 = add <4 x i16> %tmp2, %tmp3
2727 define <2 x i32> @vld2i32(i32* %A) nounwind {
2828 ;CHECK: vld2i32:
2929 ;CHECK: vld2.32
30 %tmp1 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vld2i.v2i32(i32* %A)
30 %tmp1 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vld2.v2i32(i32* %A)
3131 %tmp2 = extractvalue %struct.__builtin_neon_v2si2 %tmp1, 0
3232 %tmp3 = extractvalue %struct.__builtin_neon_v2si2 %tmp1, 1
3333 %tmp4 = add <2 x i32> %tmp2, %tmp3
3737 define <2 x float> @vld2f(float* %A) nounwind {
3838 ;CHECK: vld2f:
3939 ;CHECK: vld2.32
40 %tmp1 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vld2f.v2f32(float* %A)
40 %tmp1 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vld2.v2f32(float* %A)
4141 %tmp2 = extractvalue %struct.__builtin_neon_v2sf2 %tmp1, 0
4242 %tmp3 = extractvalue %struct.__builtin_neon_v2sf2 %tmp1, 1
4343 %tmp4 = add <2 x float> %tmp2, %tmp3
4444 ret <2 x float> %tmp4
4545 }
4646
47 declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vld2i.v8i8(i8*) nounwind readonly
48 declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vld2i.v4i16(i8*) nounwind readonly
49 declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vld2i.v2i32(i8*) nounwind readonly
50 declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vld2f.v2f32(i8*) nounwind readonly
47 declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vld2.v8i8(i8*) nounwind readonly
48 declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vld2.v4i16(i8*) nounwind readonly
49 declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vld2.v2i32(i8*) nounwind readonly
50 declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vld2.v2f32(i8*) nounwind readonly
77 define <8 x i8> @vld3i8(i8* %A) nounwind {
88 ;CHECK: vld3i8:
99 ;CHECK: vld3.8
10 %tmp1 = call %struct.__builtin_neon_v8qi3 @llvm.arm.neon.vld3i.v8i8(i8* %A)
10 %tmp1 = call %struct.__builtin_neon_v8qi3 @llvm.arm.neon.vld3.v8i8(i8* %A)
1111 %tmp2 = extractvalue %struct.__builtin_neon_v8qi3 %tmp1, 0
1212 %tmp3 = extractvalue %struct.__builtin_neon_v8qi3 %tmp1, 2
1313 %tmp4 = add <8 x i8> %tmp2, %tmp3
1717 define <4 x i16> @vld3i16(i16* %A) nounwind {
1818 ;CHECK: vld3i16:
1919 ;CHECK: vld3.16
20 %tmp1 = call %struct.__builtin_neon_v4hi3 @llvm.arm.neon.vld3i.v4i16(i16* %A)
20 %tmp1 = call %struct.__builtin_neon_v4hi3 @llvm.arm.neon.vld3.v4i16(i16* %A)
2121 %tmp2 = extractvalue %struct.__builtin_neon_v4hi3 %tmp1, 0
2222 %tmp3 = extractvalue %struct.__builtin_neon_v4hi3 %tmp1, 2
2323 %tmp4 = add <4 x i16> %tmp2, %tmp3
2727 define <2 x i32> @vld3i32(i32* %A) nounwind {
2828 ;CHECK: vld3i32:
2929 ;CHECK: vld3.32
30 %tmp1 = call %struct.__builtin_neon_v2si3 @llvm.arm.neon.vld3i.v2i32(i32* %A)
30 %tmp1 = call %struct.__builtin_neon_v2si3 @llvm.arm.neon.vld3.v2i32(i32* %A)
3131 %tmp2 = extractvalue %struct.__builtin_neon_v2si3 %tmp1, 0
3232 %tmp3 = extractvalue %struct.__builtin_neon_v2si3 %tmp1, 2
3333 %tmp4 = add <2 x i32> %tmp2, %tmp3
3737 define <2 x float> @vld3f(float* %A) nounwind {
3838 ;CHECK: vld3f:
3939 ;CHECK: vld3.32
40 %tmp1 = call %struct.__builtin_neon_v2sf3 @llvm.arm.neon.vld3f.v2f32(float* %A)
40 %tmp1 = call %struct.__builtin_neon_v2sf3 @llvm.arm.neon.vld3.v2f32(float* %A)
4141 %tmp2 = extractvalue %struct.__builtin_neon_v2sf3 %tmp1, 0
4242 %tmp3 = extractvalue %struct.__builtin_neon_v2sf3 %tmp1, 2
4343 %tmp4 = add <2 x float> %tmp2, %tmp3
4444 ret <2 x float> %tmp4
4545 }
4646
47 declare %struct.__builtin_neon_v8qi3 @llvm.arm.neon.vld3i.v8i8(i8*) nounwind readonly
48 declare %struct.__builtin_neon_v4hi3 @llvm.arm.neon.vld3i.v4i16(i8*) nounwind readonly
49 declare %struct.__builtin_neon_v2si3 @llvm.arm.neon.vld3i.v2i32(i8*) nounwind readonly
50 declare %struct.__builtin_neon_v2sf3 @llvm.arm.neon.vld3f.v2f32(i8*) nounwind readonly
47 declare %struct.__builtin_neon_v8qi3 @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
48 declare %struct.__builtin_neon_v4hi3 @llvm.arm.neon.vld3.v4i16(i8*) nounwind readonly
49 declare %struct.__builtin_neon_v2si3 @llvm.arm.neon.vld3.v2i32(i8*) nounwind readonly
50 declare %struct.__builtin_neon_v2sf3 @llvm.arm.neon.vld3.v2f32(i8*) nounwind readonly
77 define <8 x i8> @vld4i8(i8* %A) nounwind {
88 ;CHECK: vld4i8:
99 ;CHECK: vld4.8
10 %tmp1 = call %struct.__builtin_neon_v8qi4 @llvm.arm.neon.vld4i.v8i8(i8* %A)
10 %tmp1 = call %struct.__builtin_neon_v8qi4 @llvm.arm.neon.vld4.v8i8(i8* %A)
1111 %tmp2 = extractvalue %struct.__builtin_neon_v8qi4 %tmp1, 0
1212 %tmp3 = extractvalue %struct.__builtin_neon_v8qi4 %tmp1, 2
1313 %tmp4 = add <8 x i8> %tmp2, %tmp3
1717 define <4 x i16> @vld4i16(i16* %A) nounwind {
1818 ;CHECK: vld4i16:
1919 ;CHECK: vld4.16
20 %tmp1 = call %struct.__builtin_neon_v4hi4 @llvm.arm.neon.vld4i.v4i16(i16* %A)
20 %tmp1 = call %struct.__builtin_neon_v4hi4 @llvm.arm.neon.vld4.v4i16(i16* %A)
2121 %tmp2 = extractvalue %struct.__builtin_neon_v4hi4 %tmp1, 0
2222 %tmp3 = extractvalue %struct.__builtin_neon_v4hi4 %tmp1, 2
2323 %tmp4 = add <4 x i16> %tmp2, %tmp3
2727 define <2 x i32> @vld4i32(i32* %A) nounwind {
2828 ;CHECK: vld4i32:
2929 ;CHECK: vld4.32
30 %tmp1 = call %struct.__builtin_neon_v2si4 @llvm.arm.neon.vld4i.v2i32(i32* %A)
30 %tmp1 = call %struct.__builtin_neon_v2si4 @llvm.arm.neon.vld4.v2i32(i32* %A)
3131 %tmp2 = extractvalue %struct.__builtin_neon_v2si4 %tmp1, 0
3232 %tmp3 = extractvalue %struct.__builtin_neon_v2si4 %tmp1, 2
3333 %tmp4 = add <2 x i32> %tmp2, %tmp3
3737 define <2 x float> @vld4f(float* %A) nounwind {
3838 ;CHECK: vld4f:
3939 ;CHECK: vld4.32
40 %tmp1 = call %struct.__builtin_neon_v2sf4 @llvm.arm.neon.vld4f.v2f32(float* %A)
40 %tmp1 = call %struct.__builtin_neon_v2sf4 @llvm.arm.neon.vld4.v2f32(float* %A)
4141 %tmp2 = extractvalue %struct.__builtin_neon_v2sf4 %tmp1, 0
4242 %tmp3 = extractvalue %struct.__builtin_neon_v2sf4 %tmp1, 2
4343 %tmp4 = add <2 x float> %tmp2, %tmp3
4444 ret <2 x float> %tmp4
4545 }
4646
47 declare %struct.__builtin_neon_v8qi4 @llvm.arm.neon.vld4i.v8i8(i8*) nounwind readonly
48 declare %struct.__builtin_neon_v4hi4 @llvm.arm.neon.vld4i.v4i16(i8*) nounwind readonly
49 declare %struct.__builtin_neon_v2si4 @llvm.arm.neon.vld4i.v2i32(i8*) nounwind readonly
50 declare %struct.__builtin_neon_v2sf4 @llvm.arm.neon.vld4f.v2f32(i8*) nounwind readonly
47 declare %struct.__builtin_neon_v8qi4 @llvm.arm.neon.vld4.v8i8(i8*) nounwind readonly
48 declare %struct.__builtin_neon_v4hi4 @llvm.arm.neon.vld4.v4i16(i8*) nounwind readonly
49 declare %struct.__builtin_neon_v2si4 @llvm.arm.neon.vld4.v2i32(i8*) nounwind readonly
50 declare %struct.__builtin_neon_v2sf4 @llvm.arm.neon.vld4.v2f32(i8*) nounwind readonly
5151 define <2 x float> @vmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind {
5252 %tmp1 = load <2 x float>* %A
5353 %tmp2 = load <2 x float>* %B
54 %tmp3 = call <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
54 %tmp3 = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
5555 ret <2 x float> %tmp3
5656 }
5757
100100 define <4 x float> @vmaxQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
101101 %tmp1 = load <4 x float>* %A
102102 %tmp2 = load <4 x float>* %B
103 %tmp3 = call <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
103 %tmp3 = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
104104 ret <4 x float> %tmp3
105105 }
106106
112112 declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
113113 declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
114114
115 declare <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float>, <2 x float>) nounwind readnone
115 declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
116116
117117 declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
118118 declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
122122 declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
123123 declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
124124
125 declare <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float>, <4 x float>) nounwind readnone
125 declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
5151 define <2 x float> @vminf32(<2 x float>* %A, <2 x float>* %B) nounwind {
5252 %tmp1 = load <2 x float>* %A
5353 %tmp2 = load <2 x float>* %B
54 %tmp3 = call <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
54 %tmp3 = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
5555 ret <2 x float> %tmp3
5656 }
5757
100100 define <4 x float> @vminQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
101101 %tmp1 = load <4 x float>* %A
102102 %tmp2 = load <4 x float>* %B
103 %tmp3 = call <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
103 %tmp3 = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
104104 ret <4 x float> %tmp3
105105 }
106106
112112 declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
113113 declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
114114
115 declare <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float>, <2 x float>) nounwind readnone
115 declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
116116
117117 declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
118118 declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
122122 declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
123123 declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
124124
125 declare <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float>, <4 x float>) nounwind readnone
125 declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
5151 define <2 x float> @vpmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind {
5252 %tmp1 = load <2 x float>* %A
5353 %tmp2 = load <2 x float>* %B
54 %tmp3 = call <2 x float> @llvm.arm.neon.vpmaxf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
54 %tmp3 = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
5555 ret <2 x float> %tmp3
5656 }
5757
6363 declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
6464 declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
6565
66 declare <2 x float> @llvm.arm.neon.vpmaxf.v2f32(<2 x float>, <2 x float>) nounwind readnone
66 declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
5151 define <2 x float> @vpminf32(<2 x float>* %A, <2 x float>* %B) nounwind {
5252 %tmp1 = load <2 x float>* %A
5353 %tmp2 = load <2 x float>* %B
54 %tmp3 = call <2 x float> @llvm.arm.neon.vpminf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
54 %tmp3 = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
5555 ret <2 x float> %tmp3
5656 }
5757
6363 declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
6464 declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
6565
66 declare <2 x float> @llvm.arm.neon.vpminf.v2f32(<2 x float>, <2 x float>) nounwind readnone
66 declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
1515
1616 define <2 x float> @vrecpef32(<2 x float>* %A) nounwind {
1717 %tmp1 = load <2 x float>* %A
18 %tmp2 = call <2 x float> @llvm.arm.neon.vrecpef.v2f32(<2 x float> %tmp1)
18 %tmp2 = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %tmp1)
1919 ret <2 x float> %tmp2
2020 }
2121
2222 define <4 x float> @vrecpeQf32(<4 x float>* %A) nounwind {
2323 %tmp1 = load <4 x float>* %A
24 %tmp2 = call <4 x float> @llvm.arm.neon.vrecpef.v4f32(<4 x float> %tmp1)
24 %tmp2 = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %tmp1)
2525 ret <4 x float> %tmp2
2626 }
2727
2828 declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) nounwind readnone
2929 declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) nounwind readnone
3030
31 declare <2 x float> @llvm.arm.neon.vrecpef.v2f32(<2 x float>) nounwind readnone
32 declare <4 x float> @llvm.arm.neon.vrecpef.v4f32(<4 x float>) nounwind readnone
31 declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone
32 declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
1515
1616 define <2 x float> @vrsqrtef32(<2 x float>* %A) nounwind {
1717 %tmp1 = load <2 x float>* %A
18 %tmp2 = call <2 x float> @llvm.arm.neon.vrsqrtef.v2f32(<2 x float> %tmp1)
18 %tmp2 = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %tmp1)
1919 ret <2 x float> %tmp2
2020 }
2121
2222 define <4 x float> @vrsqrteQf32(<4 x float>* %A) nounwind {
2323 %tmp1 = load <4 x float>* %A
24 %tmp2 = call <4 x float> @llvm.arm.neon.vrsqrtef.v4f32(<4 x float> %tmp1)
24 %tmp2 = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %tmp1)
2525 ret <4 x float> %tmp2
2626 }
2727
2828 declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>) nounwind readnone
2929 declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>) nounwind readnone
3030
31 declare <2 x float> @llvm.arm.neon.vrsqrtef.v2f32(<2 x float>) nounwind readnone
32 declare <4 x float> @llvm.arm.neon.vrsqrtef.v4f32(<4 x float>) nounwind readnone
31 declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone
32 declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
33 ;CHECK: vst1i8:
44 ;CHECK: vst1.8
55 %tmp1 = load <8 x i8>* %B
6 call void @llvm.arm.neon.vst1i.v8i8(i8* %A, <8 x i8> %tmp1)
6 call void @llvm.arm.neon.vst1.v8i8(i8* %A, <8 x i8> %tmp1)
77 ret void
88 }
99
1111 ;CHECK: vst1i16:
1212 ;CHECK: vst1.16
1313 %tmp1 = load <4 x i16>* %B
14 call void @llvm.arm.neon.vst1i.v4i16(i16* %A, <4 x i16> %tmp1)
14 call void @llvm.arm.neon.vst1.v4i16(i16* %A, <4 x i16> %tmp1)
1515 ret void
1616 }
1717
1919 ;CHECK: vst1i32:
2020 ;CHECK: vst1.32
2121 %tmp1 = load <2 x i32>* %B
22 call void @llvm.arm.neon.vst1i.v2i32(i32* %A, <2 x i32> %tmp1)
22 call void @llvm.arm.neon.vst1.v2i32(i32* %A, <2 x i32> %tmp1)
2323 ret void
2424 }
2525
2727 ;CHECK: vst1f:
2828 ;CHECK: vst1.32
2929 %tmp1 = load <2 x float>* %B
30 call void @llvm.arm.neon.vst1f.v2f32(float* %A, <2 x float> %tmp1)
30 call void @llvm.arm.neon.vst1.v2f32(float* %A, <2 x float> %tmp1)
3131 ret void
3232 }
3333
3535 ;CHECK: vst1i64:
3636 ;CHECK: vst1.64
3737 %tmp1 = load <1 x i64>* %B
38 call void @llvm.arm.neon.vst1i.v1i64(i64* %A, <1 x i64> %tmp1)
38 call void @llvm.arm.neon.vst1.v1i64(i64* %A, <1 x i64> %tmp1)
3939 ret void
4040 }
4141
4343 ;CHECK: vst1Qi8:
4444 ;CHECK: vst1.8
4545 %tmp1 = load <16 x i8>* %B
46 call void @llvm.arm.neon.vst1i.v16i8(i8* %A, <16 x i8> %tmp1)
46 call void @llvm.arm.neon.vst1.v16i8(i8* %A, <16 x i8> %tmp1)
4747 ret void
4848 }
4949
5151 ;CHECK: vst1Qi16:
5252 ;CHECK: vst1.16
5353 %tmp1 = load <8 x i16>* %B
54 call void @llvm.arm.neon.vst1i.v8i16(i16* %A, <8 x i16> %tmp1)
54 call void @llvm.arm.neon.vst1.v8i16(i16* %A, <8 x i16> %tmp1)
5555 ret void
5656 }
5757
5959 ;CHECK: vst1Qi32:
6060 ;CHECK: vst1.32
6161 %tmp1 = load <4 x i32>* %B
62 call void @llvm.arm.neon.vst1i.v4i32(i32* %A, <4 x i32> %tmp1)
62 call void @llvm.arm.neon.vst1.v4i32(i32* %A, <4 x i32> %tmp1)
6363 ret void
6464 }
6565
6767 ;CHECK: vst1Qf:
6868 ;CHECK: vst1.32
6969 %tmp1 = load <4 x float>* %B
70 call void @llvm.arm.neon.vst1f.v4f32(float* %A, <4 x float> %tmp1)
70 call void @llvm.arm.neon.vst1.v4f32(float* %A, <4 x float> %tmp1)
7171 ret void
7272 }
7373
7575 ;CHECK: vst1Qi64:
7676 ;CHECK: vst1.64
7777 %tmp1 = load <2 x i64>* %B
78 call void @llvm.arm.neon.vst1i.v2i64(i64* %A, <2 x i64> %tmp1)
78 call void @llvm.arm.neon.vst1.v2i64(i64* %A, <2 x i64> %tmp1)
7979 ret void
8080 }
8181
82 declare void @llvm.arm.neon.vst1i.v8i8(i8*, <8 x i8>) nounwind
83 declare void @llvm.arm.neon.vst1i.v4i16(i8*, <4 x i16>) nounwind
84 declare void @llvm.arm.neon.vst1i.v2i32(i8*, <2 x i32>) nounwind
85 declare void @llvm.arm.neon.vst1f.v2f32(i8*, <2 x float>) nounwind
86 declare void @llvm.arm.neon.vst1i.v1i64(i8*, <1 x i64>) nounwind
82 declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>) nounwind
83 declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>) nounwind
84 declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>) nounwind
85 declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>) nounwind
86 declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>) nounwind
8787
88 declare void @llvm.arm.neon.vst1i.v16i8(i8*, <16 x i8>) nounwind
89 declare void @llvm.arm.neon.vst1i.v8i16(i8*, <8 x i16>) nounwind
90 declare void @llvm.arm.neon.vst1i.v4i32(i8*, <4 x i32>) nounwind
91 declare void @llvm.arm.neon.vst1f.v4f32(i8*, <4 x float>) nounwind
92 declare void @llvm.arm.neon.vst1i.v2i64(i8*, <2 x i64>) nounwind
88 declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>) nounwind
89 declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>) nounwind
90 declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>) nounwind
91 declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>) nounwind
92 declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>) nounwind
33 ;CHECK: vst2i8:
44 ;CHECK: vst2.8
55 %tmp1 = load <8 x i8>* %B
6 call void @llvm.arm.neon.vst2i.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1)
6 call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1)
77 ret void
88 }
99
1111 ;CHECK: vst2i16:
1212 ;CHECK: vst2.16
1313 %tmp1 = load <4 x i16>* %B
14 call void @llvm.arm.neon.vst2i.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1)
14 call void @llvm.arm.neon.vst2.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1)
1515 ret void
1616 }
1717
1919 ;CHECK: vst2i32:
2020 ;CHECK: vst2.32
2121 %tmp1 = load <2 x i32>* %B
22 call void @llvm.arm.neon.vst2i.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1)
22 call void @llvm.arm.neon.vst2.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1)
2323 ret void
2424 }
2525
2727 ;CHECK: vst2f:
2828 ;CHECK: vst2.32
2929 %tmp1 = load <2 x float>* %B
30 call void @llvm.arm.neon.vst2f.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1)
30 call void @llvm.arm.neon.vst2.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1)
3131 ret void
3232 }
3333
34 declare void @llvm.arm.neon.vst2i.v8i8(i8*, <8 x i8>, <8 x i8>) nounwind
35 declare void @llvm.arm.neon.vst2i.v4i16(i8*, <4 x i16>, <4 x i16>) nounwind
36 declare void @llvm.arm.neon.vst2i.v2i32(i8*, <2 x i32>, <2 x i32>) nounwind
37 declare void @llvm.arm.neon.vst2f.v2f32(i8*, <2 x float>, <2 x float>) nounwind
34 declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>) nounwind
35 declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>) nounwind
36 declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>) nounwind
37 declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>) nounwind
33 ;CHECK: vst3i8:
44 ;CHECK: vst3.8
55 %tmp1 = load <8 x i8>* %B
6 call void @llvm.arm.neon.vst3i.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1)
6 call void @llvm.arm.neon.vst3.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1)
77 ret void
88 }
99
1111 ;CHECK: vst3i16:
1212 ;CHECK: vst3.16
1313 %tmp1 = load <4 x i16>* %B
14 call void @llvm.arm.neon.vst3i.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1)
14 call void @llvm.arm.neon.vst3.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1)
1515 ret void
1616 }
1717
1919 ;CHECK: vst3i32:
2020 ;CHECK: vst3.32
2121 %tmp1 = load <2 x i32>* %B
22 call void @llvm.arm.neon.vst3i.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1)
22 call void @llvm.arm.neon.vst3.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1)
2323 ret void
2424 }
2525
2727 ;CHECK: vst3f:
2828 ;CHECK: vst3.32
2929 %tmp1 = load <2 x float>* %B
30 call void @llvm.arm.neon.vst3f.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1)
30 call void @llvm.arm.neon.vst3.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1)
3131 ret void
3232 }
3333
34 declare void @llvm.arm.neon.vst3i.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
35 declare void @llvm.arm.neon.vst3i.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>) nounwind
36 declare void @llvm.arm.neon.vst3i.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>) nounwind
37 declare void @llvm.arm.neon.vst3f.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>) nounwind
34 declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
35 declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>) nounwind
36 declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>) nounwind
37 declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>) nounwind
33 ;CHECK: vst4i8:
44 ;CHECK: vst4.8
55 %tmp1 = load <8 x i8>* %B
6 call void @llvm.arm.neon.vst4i.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1)
6 call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1)
77 ret void
88 }
99
1111 ;CHECK: vst4i16:
1212 ;CHECK: vst4.16
1313 %tmp1 = load <4 x i16>* %B
14 call void @llvm.arm.neon.vst4i.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1)
14 call void @llvm.arm.neon.vst4.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1)
1515 ret void
1616 }
1717
1919 ;CHECK: vst4i32:
2020 ;CHECK: vst4.32
2121 %tmp1 = load <2 x i32>* %B
22 call void @llvm.arm.neon.vst4i.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1)
22 call void @llvm.arm.neon.vst4.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1)
2323 ret void
2424 }
2525
2727 ;CHECK: vst4f:
2828 ;CHECK: vst4.32
2929 %tmp1 = load <2 x float>* %B
30 call void @llvm.arm.neon.vst4f.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1)
30 call void @llvm.arm.neon.vst4.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1)
3131 ret void
3232 }
3333
34 declare void @llvm.arm.neon.vst4i.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
35 declare void @llvm.arm.neon.vst4i.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>) nounwind
36 declare void @llvm.arm.neon.vst4i.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>) nounwind
37 declare void @llvm.arm.neon.vst4f.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>) nounwind
34 declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
35 declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>) nounwind
36 declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>) nounwind
37 declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>) nounwind
1414 ;CHECK: vtrn.8
1515 %tmp1 = load <8 x i8>* %A
1616 %tmp2 = load <8 x i8>* %B
17 %tmp3 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vtrni.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
17 %tmp3 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vtrn.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1818 %tmp4 = extractvalue %struct.__builtin_neon_v8qi2 %tmp3, 0
1919 %tmp5 = extractvalue %struct.__builtin_neon_v8qi2 %tmp3, 1
2020 %tmp6 = add <8 x i8> %tmp4, %tmp5
2626 ;CHECK: vtrn.16
2727 %tmp1 = load <4 x i16>* %A
2828 %tmp2 = load <4 x i16>* %B
29 %tmp3 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vtrni.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
29 %tmp3 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vtrn.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
3030 %tmp4 = extractvalue %struct.__builtin_neon_v4hi2 %tmp3, 0
3131 %tmp5 = extractvalue %struct.__builtin_neon_v4hi2 %tmp3, 1
3232 %tmp6 = add <4 x i16> %tmp4, %tmp5
3838 ;CHECK: vtrn.32
3939 %tmp1 = load <2 x i32>* %A
4040 %tmp2 = load <2 x i32>* %B
41 %tmp3 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vtrni.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
41 %tmp3 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vtrn.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
4242 %tmp4 = extractvalue %struct.__builtin_neon_v2si2 %tmp3, 0
4343 %tmp5 = extractvalue %struct.__builtin_neon_v2si2 %tmp3, 1
4444 %tmp6 = add <2 x i32> %tmp4, %tmp5
5050 ;CHECK: vtrn.32
5151 %tmp1 = load <2 x float>* %A
5252 %tmp2 = load <2 x float>* %B
53 %tmp3 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vtrnf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
53 %tmp3 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vtrn.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
5454 %tmp4 = extractvalue %struct.__builtin_neon_v2sf2 %tmp3, 0
5555 %tmp5 = extractvalue %struct.__builtin_neon_v2sf2 %tmp3, 1
5656 %tmp6 = add <2 x float> %tmp4, %tmp5
6262 ;CHECK: vtrn.8
6363 %tmp1 = load <16 x i8>* %A
6464 %tmp2 = load <16 x i8>* %B
65 %tmp3 = call %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vtrni.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
65 %tmp3 = call %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vtrn.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
6666 %tmp4 = extractvalue %struct.__builtin_neon_v16qi2 %tmp3, 0
6767 %tmp5 = extractvalue %struct.__builtin_neon_v16qi2 %tmp3, 1
6868 %tmp6 = add <16 x i8> %tmp4, %tmp5
7474 ;CHECK: vtrn.16
7575 %tmp1 = load <8 x i16>* %A
7676 %tmp2 = load <8 x i16>* %B
77 %tmp3 = call %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vtrni.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
77 %tmp3 = call %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vtrn.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
7878 %tmp4 = extractvalue %struct.__builtin_neon_v8hi2 %tmp3, 0
7979 %tmp5 = extractvalue %struct.__builtin_neon_v8hi2 %tmp3, 1
8080 %tmp6 = add <8 x i16> %tmp4, %tmp5
8686 ;CHECK: vtrn.32
8787 %tmp1 = load <4 x i32>* %A
8888 %tmp2 = load <4 x i32>* %B
89 %tmp3 = call %struct.__builtin_neon_v4si2 @llvm.arm.neon.vtrni.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
89 %tmp3 = call %struct.__builtin_neon_v4si2 @llvm.arm.neon.vtrn.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
9090 %tmp4 = extractvalue %struct.__builtin_neon_v4si2 %tmp3, 0
9191 %tmp5 = extractvalue %struct.__builtin_neon_v4si2 %tmp3, 1
9292 %tmp6 = add <4 x i32> %tmp4, %tmp5
9898 ;CHECK: vtrn.32
9999 %tmp1 = load <4 x float>* %A
100100 %tmp2 = load <4 x float>* %B
101 %tmp3 = call %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vtrnf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
101 %tmp3 = call %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vtrn.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
102102 %tmp4 = extractvalue %struct.__builtin_neon_v4sf2 %tmp3, 0
103103 %tmp5 = extractvalue %struct.__builtin_neon_v4sf2 %tmp3, 1
104104 %tmp6 = add <4 x float> %tmp4, %tmp5
105105 ret <4 x float> %tmp6
106106 }
107107
108 declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vtrni.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
109 declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vtrni.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
110 declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vtrni.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
111 declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vtrnf.v2f32(<2 x float>, <2 x float>) nounwind readnone
108 declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vtrn.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
109 declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vtrn.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
110 declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vtrn.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
111 declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vtrn.v2f32(<2 x float>, <2 x float>) nounwind readnone
112112
113 declare %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vtrni.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
114 declare %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vtrni.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
115 declare %struct.__builtin_neon_v4si2 @llvm.arm.neon.vtrni.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
116 declare %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vtrnf.v4f32(<4 x float>, <4 x float>) nounwind readnone
113 declare %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vtrn.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
114 declare %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vtrn.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
115 declare %struct.__builtin_neon_v4si2 @llvm.arm.neon.vtrn.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
116 declare %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vtrn.v4f32(<4 x float>, <4 x float>) nounwind readnone
1414 ;CHECK: vuzp.8
1515 %tmp1 = load <8 x i8>* %A
1616 %tmp2 = load <8 x i8>* %B
17 %tmp3 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vuzpi.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
17 %tmp3 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vuzp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1818 %tmp4 = extractvalue %struct.__builtin_neon_v8qi2 %tmp3, 0
1919 %tmp5 = extractvalue %struct.__builtin_neon_v8qi2 %tmp3, 1
2020 %tmp6 = add <8 x i8> %tmp4, %tmp5
2626 ;CHECK: vuzp.16
2727 %tmp1 = load <4 x i16>* %A
2828 %tmp2 = load <4 x i16>* %B
29 %tmp3 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vuzpi.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
29 %tmp3 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vuzp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
3030 %tmp4 = extractvalue %struct.__builtin_neon_v4hi2 %tmp3, 0
3131 %tmp5 = extractvalue %struct.__builtin_neon_v4hi2 %tmp3, 1
3232 %tmp6 = add <4 x i16> %tmp4, %tmp5
3838 ;CHECK: vuzp.32
3939 %tmp1 = load <2 x i32>* %A
4040 %tmp2 = load <2 x i32>* %B
41 %tmp3 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vuzpi.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
41 %tmp3 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vuzp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
4242 %tmp4 = extractvalue %struct.__builtin_neon_v2si2 %tmp3, 0
4343 %tmp5 = extractvalue %struct.__builtin_neon_v2si2 %tmp3, 1
4444 %tmp6 = add <2 x i32> %tmp4, %tmp5
5050 ;CHECK: vuzp.32
5151 %tmp1 = load <2 x float>* %A
5252 %tmp2 = load <2 x float>* %B
53 %tmp3 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vuzpf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
53 %tmp3 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vuzp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
5454 %tmp4 = extractvalue %struct.__builtin_neon_v2sf2 %tmp3, 0
5555 %tmp5 = extractvalue %struct.__builtin_neon_v2sf2 %tmp3, 1
5656 %tmp6 = add <2 x float> %tmp4, %tmp5
6262 ;CHECK: vuzp.8
6363 %tmp1 = load <16 x i8>* %A
6464 %tmp2 = load <16 x i8>* %B
65 %tmp3 = call %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vuzpi.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
65 %tmp3 = call %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vuzp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
6666 %tmp4 = extractvalue %struct.__builtin_neon_v16qi2 %tmp3, 0
6767 %tmp5 = extractvalue %struct.__builtin_neon_v16qi2 %tmp3, 1
6868 %tmp6 = add <16 x i8> %tmp4, %tmp5
7474 ;CHECK: vuzp.16
7575 %tmp1 = load <8 x i16>* %A
7676 %tmp2 = load <8 x i16>* %B
77 %tmp3 = call %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vuzpi.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
77 %tmp3 = call %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vuzp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
7878 %tmp4 = extractvalue %struct.__builtin_neon_v8hi2 %tmp3, 0
7979 %tmp5 = extractvalue %struct.__builtin_neon_v8hi2 %tmp3, 1
8080 %tmp6 = add <8 x i16> %tmp4, %tmp5
8686 ;CHECK: vuzp.32
8787 %tmp1 = load <4 x i32>* %A
8888 %tmp2 = load <4 x i32>* %B
89 %tmp3 = call %struct.__builtin_neon_v4si2 @llvm.arm.neon.vuzpi.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
89 %tmp3 = call %struct.__builtin_neon_v4si2 @llvm.arm.neon.vuzp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
9090 %tmp4 = extractvalue %struct.__builtin_neon_v4si2 %tmp3, 0
9191 %tmp5 = extractvalue %struct.__builtin_neon_v4si2 %tmp3, 1
9292 %tmp6 = add <4 x i32> %tmp4, %tmp5
9898 ;CHECK: vuzp.32
9999 %tmp1 = load <4 x float>* %A
100100 %tmp2 = load <4 x float>* %B
101 %tmp3 = call %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vuzpf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
101 %tmp3 = call %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vuzp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
102102 %tmp4 = extractvalue %struct.__builtin_neon_v4sf2 %tmp3, 0
103103 %tmp5 = extractvalue %struct.__builtin_neon_v4sf2 %tmp3, 1
104104 %tmp6 = add <4 x float> %tmp4, %tmp5
105105 ret <4 x float> %tmp6
106106 }
107107
108 declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vuzpi.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
109 declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vuzpi.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
110 declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vuzpi.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
111 declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vuzpf.v2f32(<2 x float>, <2 x float>) nounwind readnone
108 declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vuzp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
109 declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vuzp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
110 declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vuzp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
111 declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vuzp.v2f32(<2 x float>, <2 x float>) nounwind readnone
112112
113 declare %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vuzpi.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
114 declare %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vuzpi.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
115 declare %struct.__builtin_neon_v4si2 @llvm.arm.neon.vuzpi.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
116 declare %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vuzpf.v4f32(<4 x float>, <4 x float>) nounwind readnone
113 declare %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vuzp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
114 declare %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vuzp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
115 declare %struct.__builtin_neon_v4si2 @llvm.arm.neon.vuzp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
116 declare %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vuzp.v4f32(<4 x float>, <4 x float>) nounwind readnone
1414 ;CHECK: vzip.8
1515 %tmp1 = load <8 x i8>* %A
1616 %tmp2 = load <8 x i8>* %B
17 %tmp3 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vzipi.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
17 %tmp3 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vzip.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1818 %tmp4 = extractvalue %struct.__builtin_neon_v8qi2 %tmp3, 0
1919 %tmp5 = extractvalue %struct.__builtin_neon_v8qi2 %tmp3, 1
2020 %tmp6 = add <8 x i8> %tmp4, %tmp5
2626 ;CHECK: vzip.16
2727 %tmp1 = load <4 x i16>* %A
2828 %tmp2 = load <4 x i16>* %B
29 %tmp3 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vzipi.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
29 %tmp3 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vzip.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
3030 %tmp4 = extractvalue %struct.__builtin_neon_v4hi2 %tmp3, 0
3131 %tmp5 = extractvalue %struct.__builtin_neon_v4hi2 %tmp3, 1
3232 %tmp6 = add <4 x i16> %tmp4, %tmp5
3838 ;CHECK: vzip.32
3939 %tmp1 = load <2 x i32>* %A
4040 %tmp2 = load <2 x i32>* %B
41 %tmp3 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vzipi.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
41 %tmp3 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vzip.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
4242 %tmp4 = extractvalue %struct.__builtin_neon_v2si2 %tmp3, 0
4343 %tmp5 = extractvalue %struct.__builtin_neon_v2si2 %tmp3, 1
4444 %tmp6 = add <2 x i32> %tmp4, %tmp5
5050 ;CHECK: vzip.32
5151 %tmp1 = load <2 x float>* %A
5252 %tmp2 = load <2 x float>* %B
53 %tmp3 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vzipf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
53 %tmp3 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vzip.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
5454 %tmp4 = extractvalue %struct.__builtin_neon_v2sf2 %tmp3, 0
5555 %tmp5 = extractvalue %struct.__builtin_neon_v2sf2 %tmp3, 1
5656 %tmp6 = add <2 x float> %tmp4, %tmp5
6262 ;CHECK: vzip.8
6363 %tmp1 = load <16 x i8>* %A
6464 %tmp2 = load <16 x i8>* %B
65 %tmp3 = call %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vzipi.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
65 %tmp3 = call %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vzip.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
6666 %tmp4 = extractvalue %struct.__builtin_neon_v16qi2 %tmp3, 0
6767 %tmp5 = extractvalue %struct.__builtin_neon_v16qi2 %tmp3, 1
6868 %tmp6 = add <16 x i8> %tmp4, %tmp5
7474 ;CHECK: vzip.16
7575 %tmp1 = load <8 x i16>* %A
7676 %tmp2 = load <8 x i16>* %B
77 %tmp3 = call %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vzipi.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
77 %tmp3 = call %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vzip.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
7878 %tmp4 = extractvalue %struct.__builtin_neon_v8hi2 %tmp3, 0
7979 %tmp5 = extractvalue %struct.__builtin_neon_v8hi2 %tmp3, 1
8080 %tmp6 = add <8 x i16> %tmp4, %tmp5
8686 ;CHECK: vzip.32
8787 %tmp1 = load <4 x i32>* %A
8888 %tmp2 = load <4 x i32>* %B
89 %tmp3 = call %struct.__builtin_neon_v4si2 @llvm.arm.neon.vzipi.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
89 %tmp3 = call %struct.__builtin_neon_v4si2 @llvm.arm.neon.vzip.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
9090 %tmp4 = extractvalue %struct.__builtin_neon_v4si2 %tmp3, 0
9191 %tmp5 = extractvalue %struct.__builtin_neon_v4si2 %tmp3, 1
9292 %tmp6 = add <4 x i32> %tmp4, %tmp5
9898 ;CHECK: vzip.32
9999 %tmp1 = load <4 x float>* %A
100100 %tmp2 = load <4 x float>* %B
101 %tmp3 = call %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vzipf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
101 %tmp3 = call %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vzip.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
102102 %tmp4 = extractvalue %struct.__builtin_neon_v4sf2 %tmp3, 0
103103 %tmp5 = extractvalue %struct.__builtin_neon_v4sf2 %tmp3, 1
104104 %tmp6 = add <4 x float> %tmp4, %tmp5
105105 ret <4 x float> %tmp6
106106 }
107107
108 declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vzipi.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
109 declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vzipi.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
110 declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vzipi.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
111 declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vzipf.v2f32(<2 x float>, <2 x float>) nounwind readnone
108 declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vzip.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
109 declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vzip.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
110 declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vzip.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
111 declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vzip.v2f32(<2 x float>, <2 x float>) nounwind readnone
112112
113 declare %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vzipi.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
114 declare %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vzipi.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
115 declare %struct.__builtin_neon_v4si2 @llvm.arm.neon.vzipi.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
116 declare %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vzipf.v4f32(<4 x float>, <4 x float>) nounwind readnone
113 declare %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vzip.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
114 declare %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vzip.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
115 declare %struct.__builtin_neon_v4si2 @llvm.arm.neon.vzip.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
116 declare %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vzip.v4f32(<4 x float>, <4 x float>) nounwind readnone