llvm.org Git mirror — llvm / commit 874371d
[ARM][NEON] Use address space in vld([1234]|[234]lane) and vst([1234]|[234]lane) instructions

This commit changes the interface of the vld[1234], vld[234]lane, vst[1234], and vst[234]lane ARM NEON intrinsics and associates an address space with the pointer that these intrinsics take. This changes, e.g.,

    <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)

to

    <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8*, i32)

This change ensures that address spaces are fully taken into account in the ARM target during lowering of interleaved loads and stores.

Differential Revision: http://reviews.llvm.org/D12985

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@248887 91177308-0d34-0410-b5e6-96231b3b80d8

Jeroen Ketema · 4 years ago
45 changed files with 774 additions and 558 deletions.
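Concretely, the pointer parameter becomes an overloaded operand (llvm_anyptr_ty, as the hunks below show), so the pointee type, including its address space, is mangled into the intrinsic name, and each address space gets a distinct declaration. For example (the addrspace(1) variant is an illustrative sketch, not taken from this patch):

    declare <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8*, i32) nounwind readonly
    declare <2 x i32> @llvm.arm.neon.vld1.v2i32.p1i8(i8 addrspace(1)*, i32) nounwind readonly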
404404 // De-interleaving vector loads from N-element structures.
405405 // Source operands are the address and alignment.
406406 def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty],
407 [llvm_ptr_ty, llvm_i32_ty],
407 [llvm_anyptr_ty, llvm_i32_ty],
408408 [IntrReadArgMem]>;
409409 def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
410 [llvm_ptr_ty, llvm_i32_ty],
410 [llvm_anyptr_ty, llvm_i32_ty],
411411 [IntrReadArgMem]>;
412412 def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
413413 LLVMMatchType<0>],
414 [llvm_ptr_ty, llvm_i32_ty],
414 [llvm_anyptr_ty, llvm_i32_ty],
415415 [IntrReadArgMem]>;
416416 def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
417417 LLVMMatchType<0>, LLVMMatchType<0>],
418 [llvm_ptr_ty, llvm_i32_ty],
418 [llvm_anyptr_ty, llvm_i32_ty],
419419 [IntrReadArgMem]>;
420420
421421 // Vector load N-element structure to one lane.
422422 // Source operands are: the address, the N input vectors (since only one
423423 // lane is assigned), the lane number, and the alignment.
424424 def int_arm_neon_vld2lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
425 [llvm_ptr_ty, LLVMMatchType<0>,
425 [llvm_anyptr_ty, LLVMMatchType<0>,
426426 LLVMMatchType<0>, llvm_i32_ty,
427427 llvm_i32_ty], [IntrReadArgMem]>;
428428 def int_arm_neon_vld3lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
429429 LLVMMatchType<0>],
430 [llvm_ptr_ty, LLVMMatchType<0>,
430 [llvm_anyptr_ty, LLVMMatchType<0>,
431431 LLVMMatchType<0>, LLVMMatchType<0>,
432432 llvm_i32_ty, llvm_i32_ty],
433433 [IntrReadArgMem]>;
434434 def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
435435 LLVMMatchType<0>, LLVMMatchType<0>],
436 [llvm_ptr_ty, LLVMMatchType<0>,
436 [llvm_anyptr_ty, LLVMMatchType<0>,
437437 LLVMMatchType<0>, LLVMMatchType<0>,
438438 LLVMMatchType<0>, llvm_i32_ty,
439439 llvm_i32_ty], [IntrReadArgMem]>;
441441 // Interleaving vector stores from N-element structures.
442442 // Source operands are: the address, the N vectors, and the alignment.
443443 def int_arm_neon_vst1 : Intrinsic<[],
444 [llvm_ptr_ty, llvm_anyvector_ty,
444 [llvm_anyptr_ty, llvm_anyvector_ty,
445445 llvm_i32_ty], [IntrReadWriteArgMem]>;
446446 def int_arm_neon_vst2 : Intrinsic<[],
447 [llvm_ptr_ty, llvm_anyvector_ty,
448 LLVMMatchType<0>, llvm_i32_ty],
447 [llvm_anyptr_ty, llvm_anyvector_ty,
448 LLVMMatchType<1>, llvm_i32_ty],
449449 [IntrReadWriteArgMem]>;
450450 def int_arm_neon_vst3 : Intrinsic<[],
451 [llvm_ptr_ty, llvm_anyvector_ty,
452 LLVMMatchType<0>, LLVMMatchType<0>,
451 [llvm_anyptr_ty, llvm_anyvector_ty,
452 LLVMMatchType<1>, LLVMMatchType<1>,
453453 llvm_i32_ty], [IntrReadWriteArgMem]>;
454454 def int_arm_neon_vst4 : Intrinsic<[],
455 [llvm_ptr_ty, llvm_anyvector_ty,
456 LLVMMatchType<0>, LLVMMatchType<0>,
457 LLVMMatchType<0>, llvm_i32_ty],
455 [llvm_anyptr_ty, llvm_anyvector_ty,
456 LLVMMatchType<1>, LLVMMatchType<1>,
457 LLVMMatchType<1>, llvm_i32_ty],
458458 [IntrReadWriteArgMem]>;
459459
460460 // Vector store N-element structure from one lane.
461461 // Source operands are: the address, the N vectors, the lane number, and
462462 // the alignment.
463463 def int_arm_neon_vst2lane : Intrinsic<[],
464 [llvm_ptr_ty, llvm_anyvector_ty,
465 LLVMMatchType<0>, llvm_i32_ty,
464 [llvm_anyptr_ty, llvm_anyvector_ty,
465 LLVMMatchType<1>, llvm_i32_ty,
466466 llvm_i32_ty], [IntrReadWriteArgMem]>;
467467 def int_arm_neon_vst3lane : Intrinsic<[],
468 [llvm_ptr_ty, llvm_anyvector_ty,
469 LLVMMatchType<0>, LLVMMatchType<0>,
468 [llvm_anyptr_ty, llvm_anyvector_ty,
469 LLVMMatchType<1>, LLVMMatchType<1>,
470470 llvm_i32_ty, llvm_i32_ty],
471471 [IntrReadWriteArgMem]>;
472472 def int_arm_neon_vst4lane : Intrinsic<[],
473 [llvm_ptr_ty, llvm_anyvector_ty,
474 LLVMMatchType<0>, LLVMMatchType<0>,
475 LLVMMatchType<0>, llvm_i32_ty,
473 [llvm_anyptr_ty, llvm_anyvector_ty,
474 LLVMMatchType<1>, LLVMMatchType<1>,
475 LLVMMatchType<1>, llvm_i32_ty,
476476 llvm_i32_ty], [IntrReadWriteArgMem]>;
477477
478478 // Vector bitwise select.
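Note that overload suffixes are appended in order of appearance: the vldN intrinsics mangle the result vector type before the pointer (vld1.v8i16.p0i8), while the vstN intrinsics mangle the pointer first (vst1.p0i8.v8i16), since llvm_anyptr_ty is the first overloaded type in their parameter lists; this is also why the vector matches above change from LLVMMatchType<0> to LLVMMatchType<1>. An illustrative store declaration for a non-default address space (hypothetical, not part of this diff):

    declare void @llvm.arm.neon.vst2.p1i8.v4i32(i8 addrspace(1)*, <4 x i32>, <4 x i32>, i32) nounwind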
2626 #include "llvm/IR/LLVMContext.h"
2727 #include "llvm/IR/Module.h"
2828 #include "llvm/Support/ErrorHandling.h"
29 #include "llvm/Support/Regex.h"
2930 #include <cstring>
3031 using namespace llvm;
3132
9192 F->arg_begin()->getType());
9293 return true;
9394 }
95 Regex vldRegex("^arm\\.neon\\.vld([1234]|[234]lane)\\.v[a-z0-9]*$");
96 if (vldRegex.match(Name)) {
97 auto fArgs = F->getFunctionType()->params();
98 SmallVector<Type *, 4> Tys(fArgs.begin(), fArgs.end());
99 // Can't use Intrinsic::getDeclaration here as the return types might
100 // then only be structurally equal.
101 FunctionType* fType = FunctionType::get(F->getReturnType(), Tys, false);
102 NewFn = Function::Create(fType, F->getLinkage(),
103 "llvm." + Name + ".p0i8", F->getParent());
104 return true;
105 }
106 Regex vstRegex("^arm\\.neon\\.vst([1234]|[234]lane)\\.v[a-z0-9]*$");
107 if (vstRegex.match(Name)) {
108 static Intrinsic::ID StoreInts[] = {Intrinsic::arm_neon_vst1,
109 Intrinsic::arm_neon_vst2,
110 Intrinsic::arm_neon_vst3,
111 Intrinsic::arm_neon_vst4};
112
113 static Intrinsic::ID StoreLaneInts[] = {Intrinsic::arm_neon_vst2lane,
114 Intrinsic::arm_neon_vst3lane,
115 Intrinsic::arm_neon_vst4lane};
116
117 auto fArgs = F->getFunctionType()->params();
118 Type *Tys[] = {fArgs[0], fArgs[1]};
119 if (Name.find("lane") == StringRef::npos)
120 NewFn = Intrinsic::getDeclaration(F->getParent(),
121 StoreInts[fArgs.size() - 3], Tys);
122 else
123 NewFn = Intrinsic::getDeclaration(F->getParent(),
124 StoreLaneInts[fArgs.size() - 5], Tys);
125 return true;
126 }
94127 break;
95128 }
129
96130 case 'c': {
97131 if (Name.startswith("ctlz.") && F->arg_size() == 1) {
98132 F->setName(Name + ".old");
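The vld/vst upgrade paths above take effect whenever a module that still uses the old, unmangled names is parsed or loaded from bitcode: each matching declaration and call is rewritten to the mangled form. A minimal before/after sketch (illustrative):

    ; before auto-upgrade
    %v = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %p, i32 1)
    ; after auto-upgrade
    %v = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* %p, i32 1)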
650684 default:
651685 llvm_unreachable("Unknown function for CallInst upgrade.");
652686
687 case Intrinsic::arm_neon_vld1:
688 case Intrinsic::arm_neon_vld2:
689 case Intrinsic::arm_neon_vld3:
690 case Intrinsic::arm_neon_vld4:
691 case Intrinsic::arm_neon_vld2lane:
692 case Intrinsic::arm_neon_vld3lane:
693 case Intrinsic::arm_neon_vld4lane:
694 case Intrinsic::arm_neon_vst1:
695 case Intrinsic::arm_neon_vst2:
696 case Intrinsic::arm_neon_vst3:
697 case Intrinsic::arm_neon_vst4:
698 case Intrinsic::arm_neon_vst2lane:
699 case Intrinsic::arm_neon_vst3lane:
700 case Intrinsic::arm_neon_vst4lane: {
701 SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
702 CI->arg_operands().end());
703 CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args));
704 CI->eraseFromParent();
705 return;
706 }
707
653708 case Intrinsic::ctlz:
654709 case Intrinsic::cttz:
655710 assert(CI->getNumArgOperands() == 1 &&
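Because the old and new intrinsics take identical argument lists, the NEON cases above can simply rebuild the call against NewFn with the original operands. For a store, the rebuilt call picks up the doubly mangled name obtained from Intrinsic::getDeclaration; sketched before/after (illustrative):

    ; before auto-upgrade
    call void @llvm.arm.neon.vst2.v8i8(i8* %p, <8 x i8> %a, <8 x i8> %b, i32 1)
    ; after auto-upgrade
    call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %p, <8 x i8> %a, <8 x i8> %b, i32 1)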
1180111801 Intrinsic::arm_neon_vld3,
1180211802 Intrinsic::arm_neon_vld4};
1180311803
11804 Function *VldnFunc =
11805 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy);
11806
1180711804 IRBuilder<> Builder(LI);
1180811805 SmallVector<Value *, 2> Ops;
1180911806
1181111808 Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
1181211809 Ops.push_back(Builder.getInt32(LI->getAlignment()));
1181311810
11811 Type *Tys[] = { VecTy, Int8Ptr };
11812 Function *VldnFunc =
11813 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
1181411814 CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
1181511815
1181611816 // Replace uses of each shufflevector with the corresponding vector loaded
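Creating the vldN declaration only after Int8Ptr has been computed lets it be instantiated with the pointer type of the original load, preserving the load's address space. For a factor-2 interleaved load from address space 1 the lowering now emits roughly (illustrative IR; the names and alignment are made up):

    %ptr = bitcast <4 x i32> addrspace(1)* %base to i8 addrspace(1)*
    %vldN = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p1i8(i8 addrspace(1)* %ptr, i32 4)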
1190211902 static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
1190311903 Intrinsic::arm_neon_vst3,
1190411904 Intrinsic::arm_neon_vst4};
11905 Function *VstNFunc = Intrinsic::getDeclaration(
11906 SI->getModule(), StoreInts[Factor - 2], SubVecTy);
11907
1190811905 SmallVector<Value *, 6> Ops;
1190911906
1191011907 Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
1191111908 Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
11909
11910 Type *Tys[] = { Int8Ptr, SubVecTy };
11911 Function *VstNFunc = Intrinsic::getDeclaration(
11912 SI->getModule(), StoreInts[Factor - 2], Tys);
1191211913
1191311914 // Split the shufflevector operands into sub vectors for the new vstN call.
1191411915 for (unsigned i = 0; i < Factor; i++)
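The store side mirrors the load side: the vstN declaration is now created from { Int8Ptr, SubVecTy }, so a factor-2 interleaved store to address space 1 would lower to roughly (illustrative):

    %ptr = bitcast <4 x i32> addrspace(1)* %base to i8 addrspace(1)*
    call void @llvm.arm.neon.vst2.p1i8.v2i32(i8 addrspace(1)* %ptr, <2 x i32> %v0, <2 x i32> %v1, i32 4)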
11 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
22 target triple = "arm-apple-ios"
33
4 declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
5 declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
4 declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
5 declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
66
77 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
88 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
1212 define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
1313 entry:
1414 %q = getelementptr i8, i8* %p, i64 16
15 %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
16 call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
17 %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
15 %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
16 call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
17 %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
1818 %c = add <8 x i16> %a, %b
1919 ret <8 x i16> %c
2020
2121 ; CHECK-LABEL: Function: test1:
2222
2323 ; CHECK: NoAlias: i8* %p, i8* %q
24 ; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
25 ; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
26 ; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
27 ; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
28 ; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
29 ; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
30 ; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
31 ; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
32 ; CHECK: NoModRef: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
33 ; CHECK: NoModRef: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
34 ; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
35 ; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
24 ; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
25 ; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
26 ; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
27 ; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
28 ; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
29 ; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
30 ; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4 <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
31 ; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4 <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
32 ; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
33 ; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
34 ; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4 <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
35 ; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4 <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
3636 }
3737
3838 define void @test2(i8* %P, i8* %Q) nounwind ssp {
66
77 ; CHECK: define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) {
88 ; CHECK-NEXT: entry:
9 ; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) [[ATTR:#[0-9]+]]
10 ; CHECK-NEXT: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
9 ; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR:#[0-9]+]]
10 ; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
1111 ; CHECK-NEXT: %c = add <8 x i16> %a, %a
1212 define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) {
1313 entry:
14 %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
15 call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
16 %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
14 %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
15 call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
16 %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
1717 %c = add <8 x i16> %a, %b
1818 ret <8 x i16> %c
1919 }
2121 ; CHECK: define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
2222 ; CHECK-NEXT: entry:
2323 ; CHECK-NEXT: %q = getelementptr i8, i8* %p, i64 16
24 ; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) [[ATTR]]
25 ; CHECK-NEXT: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
24 ; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR]]
25 ; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
2626 ; CHECK-NEXT: %c = add <8 x i16> %a, %a
2727 define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
2828 entry:
2929 %q = getelementptr i8, i8* %p, i64 16
30 %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
31 call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
32 %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
30 %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
31 call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
32 %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
3333 %c = add <8 x i16> %a, %b
3434 ret <8 x i16> %c
3535 }
3636
37 declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
38 declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
37 declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
38 declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
3939
4040 ; CHECK: attributes #0 = { nounwind readonly argmemonly }
4141 ; CHECK: attributes #1 = { nounwind argmemonly }
66
77 ; CHECK: define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) {
88 ; CHECK-NEXT: entry:
9 ; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) [[NUW:#[0-9]+]]
10 ; CHECK-NEXT: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
9 ; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[NUW:#[0-9]+]]
10 ; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
1111 ; CHECK-NEXT: %c = add <8 x i16> %a, %a
1212 define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) {
1313 entry:
14 %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind, !tbaa !2
15 call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16), !tbaa !1
16 %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind, !tbaa !2
14 %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind, !tbaa !2
15 call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16), !tbaa !1
16 %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind, !tbaa !2
1717 %c = add <8 x i16> %a, %b
1818 ret <8 x i16> %c
1919 }
2020
21 declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
22 declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
21 declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
22 declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
2323
2424 ; CHECK: attributes #0 = { nounwind readonly argmemonly }
2525 ; CHECK: attributes #1 = { nounwind argmemonly }
0 ; RUN: llc -mtriple=arm-eabi -mattr=+neon -O0 -optimize-regalloc -regalloc=basic %s -o /dev/null
11
22 ; This test would crash the rewriter when trying to handle a spill after one of
3 ; the @llvm.arm.neon.vld3.v8i8 defined three parts of a register.
3 ; the @llvm.arm.neon.vld3.v8i8.p0i8 defined three parts of a register.
44
55 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
66
7 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly
7 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8*, i32) nounwind readonly
88
9 declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
9 declare void @llvm.arm.neon.vst3.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
1010
1111 define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
12 %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
12 %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
1313 %tmp2b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 0 ; <<8 x i8>> [#uses=1]
1414 %tmp4b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 1 ; <<8 x i8>> [#uses=1]
15 %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
15 %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
1616 %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
1717 %tmp4d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 1 ; <<8 x i8>> [#uses=1]
18 %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
18 %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
1919 %tmp2e = extractvalue %struct.__neon_int8x8x3_t %tmp1e, 0 ; <<8 x i8>> [#uses=1]
20 %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
20 %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
2121 %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
22 %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
22 %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
2323 %tmp2g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 0 ; <<8 x i8>> [#uses=1]
2424 %tmp4g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 1 ; <<8 x i8>> [#uses=1]
25 %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
25 %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
2626 %tmp2h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 0 ; <<8 x i8>> [#uses=1]
2727 %tmp3h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 2 ; <<8 x i8>> [#uses=1]
2828 %tmp2bd = add <8 x i8> %tmp2b, %tmp2d ; <<8 x i8>> [#uses=1]
2929 %tmp4bd = add <8 x i8> %tmp4b, %tmp4d ; <<8 x i8>> [#uses=1]
3030 %tmp2abcd = mul <8 x i8> undef, %tmp2bd ; <<8 x i8>> [#uses=1]
3131 %tmp4abcd = mul <8 x i8> undef, %tmp4bd ; <<8 x i8>> [#uses=2]
32 call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1)
32 call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1)
3333 %tmp2ef = sub <8 x i8> %tmp2e, %tmp2f ; <<8 x i8>> [#uses=1]
3434 %tmp2gh = sub <8 x i8> %tmp2g, %tmp2h ; <<8 x i8>> [#uses=1]
3535 %tmp3gh = sub <8 x i8> zeroinitializer, %tmp3h ; <<8 x i8>> [#uses=1]
3737 %tmp2efgh = mul <8 x i8> %tmp2ef, %tmp2gh ; <<8 x i8>> [#uses=1]
3838 %tmp3efgh = mul <8 x i8> undef, %tmp3gh ; <<8 x i8>> [#uses=1]
3939 %tmp4efgh = mul <8 x i8> %tmp4ef, undef ; <<8 x i8>> [#uses=2]
40 call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1)
40 call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1)
4141 %tmp4 = sub <8 x i8> %tmp4efgh, %tmp4abcd ; <<8 x i8>> [#uses=1]
42 tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1)
42 tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1)
4343 ret <8 x i8> %tmp4
4444 }
3535 %tmp5 = insertelement <4 x float> %tmp7, float %18, i32 3
3636 %19 = fmul <4 x float> %tmp5, %2
3737 %20 = bitcast float* %fltp to i8*
38 tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19, i32 1)
38 tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %20, <4 x float> %19, i32 1)
3939 ret void
4040 }
4141
42 declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
42 declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
1111 %tmp9 = trunc i128 %tmp8 to i64 ; [#uses=1]
1212 %tmp16.i = bitcast i64 %tmp6 to <8 x i8> ; <<8 x i8>> [#uses=1]
1313 %tmp20.i = bitcast i64 %tmp9 to <8 x i8> ; <<8 x i8>> [#uses=1]
14 tail call void @llvm.arm.neon.vst2.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind
14 tail call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind
1515 ret void
1616 }
1717
18 declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
18 declare void @llvm.arm.neon.vst2.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
1515
1616 define i32 @test(i8* %arg) nounwind {
1717 entry:
18 %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %arg, i32 1)
18 %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* %arg, i32 1)
1919 %1 = shufflevector <2 x i64> undef, <2 x i64> %0, <2 x i32>
2020 store <2 x i64> %1, <2 x i64>* undef, align 16
2121 ret i32 undef
2222 }
2323
24 declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly
24 declare <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8*, i32) nounwind readonly
33
44 define void @test_vmovqqqq_pseudo() nounwind ssp {
55 entry:
6 %vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2)
6 %vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2)
77 store { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, { <8 x i16>, <8 x i16>, <8 x i16> }* undef
88 ret void
99 }
1010
11 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
11 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
5151 %shuffle.i35.i.i = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
5252 %shuffle.i34.i.i = shufflevector <1 x i64> %shuffle.i36.i.i, <1 x i64> %shuffle.i35.i.i, <2 x i32>
5353 %2 = bitcast <2 x i64> %shuffle.i34.i.i to <4 x float>
54 tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind
55 tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind
54 tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind
55 tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind
5656 unreachable
5757
5858 for.end: ; preds = %entry
6262 ; Check that pseudo-expansion preserves flags.
6363 define void @foo3(i8* %p) nounwind ssp {
6464 entry:
65 tail call void @llvm.arm.neon.vst2.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4)
65 tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4)
6666 ret void
6767 }
6868
6969 declare arm_aapcs_vfpcc void @bar(i8*, float, float, float)
70 declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
71 declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
70 declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
71 declare void @llvm.arm.neon.vst2.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
66 %vecinit.i = insertelement <2 x i32> undef, i32 %x, i32 0
77 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %x, i32 1
88 %0 = bitcast i32* %p to i8*
9 tail call void @llvm.arm.neon.vst1.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4)
9 tail call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4)
1010 ret void
1111 }
1212
13 declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind
13 declare void @llvm.arm.neon.vst1.p0i8.v2i32(i8*, <2 x i32>, i32) nounwind
44 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
55 target triple = "thumbv7-apple-ios5.1.0"
66
7 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly
7 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8*, i32) nounwind readonly
88
9 declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind
9 declare void @llvm.arm.neon.vst1.p0i8.v16i8(i8*, <16 x i8>, i32) nounwind
1010
1111 define void @findEdges(i8*) nounwind ssp {
1212 %2 = icmp sgt i32 undef, 0
1818
1919 ;
2020 %6 = phi i8* [ %19, %5 ], [ %0, %1 ]
21 %7 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* null, i32 1)
21 %7 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* null, i32 1)
2222 %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, 0
2323 %9 = getelementptr inbounds i8, i8* null, i32 3
24 %10 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %9, i32 1)
24 %10 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %9, i32 1)
2525 %11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %10, 2
26 %12 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %6, i32 1)
26 %12 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %6, i32 1)
2727 %13 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 0
2828 %14 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 1
2929 %15 = getelementptr inbounds i8, i8* %6, i32 3
30 %16 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %15, i32 1)
30 %16 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %15, i32 1)
3131 %17 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 1
3232 %18 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 2
3333 %19 = getelementptr inbounds i8, i8* %6, i32 48
110110 %96 = bitcast <8 x i8> %94 to <1 x i64>
111111 %97 = shufflevector <1 x i64> %95, <1 x i64> %96, <2 x i32>
112112 %98 = bitcast <2 x i64> %97 to <16 x i8>
113 tail call void @llvm.arm.neon.vst1.v16i8(i8* null, <16 x i8> %98, i32 1)
113 tail call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* null, <16 x i8> %98, i32 1)
114114 %99 = icmp slt i32 undef, undef
115115 br i1 %99, label %5, label %3
116116 }
99 ; CHECK-NOT: Number of pipeline stalls
1010 define <16 x i8> @multiselect(i32 %avail, i8* %foo, i8* %bar) {
1111 entry:
12 %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %foo, i32 1)
13 %vld2 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1)
12 %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %foo, i32 1)
13 %vld2 = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %bar, i32 1)
1414 %and = and i32 %avail, 3
1515 %tobool = icmp eq i32 %and, 0
1616 %retv = select i1 %tobool, <16 x i8> %vld1, <16 x i8> %vld2
1717 ret <16 x i8> %retv
1818 }
1919
20 declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 )
20 declare <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* , i32 )
2626 %n0 = insertelement <2 x i64> undef, i64 %tmp0, i32 0
2727 %n1 = insertelement <2 x i64> %n0, i64 %tmp1, i32 1
2828
29 call void @llvm.arm.neon.vst4.v1i64(i8* %m, <1 x i64> %s0, <1 x i64> %s1, <1 x i64> %s2, <1 x i64> %s3, i32 8)
29 call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* %m, <1 x i64> %s0, <1 x i64> %s1, <1 x i64> %s2, <1 x i64> %s3, i32 8)
3030
3131 call void @bar(<2 x i64> %n1)
3232
4949 ret <8 x i8> %tmp8
5050 }
5151
52 declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
52 declare void @llvm.arm.neon.vst4.p0i8.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
5353 declare <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
5454 declare void @bar2(%struct.__neon_int8x8x4_t, <8 x i8>)
5555 declare void @bar(<2 x i64> %arg)
201201 store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
202202 ret void
203203 }
204
205 ; The following test cases check that address spaces are properly handled
206
207 ; CHECK-LABEL: load_address_space
208 ; CHECK: vld3.32
209 define void @load_address_space(<4 x i32> addrspace(1)* %A, <2 x i32>* %B) {
210 %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %A
211 %interleaved = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32>
212 store <2 x i32> %interleaved, <2 x i32>* %B
213 ret void
214 }
215
216 ; CHECK-LABEL: store_address_space
217 ; CHECK: vst2.32
218 define void @store_address_space(<2 x i32>* %A, <2 x i32>* %B, <4 x i32> addrspace(1)* %C) {
219 %tmp0 = load <2 x i32>, <2 x i32>* %A
220 %tmp1 = load <2 x i32>, <2 x i32>* %B
221 %interleaved = shufflevector <2 x i32> %tmp0, <2 x i32> %tmp1, <4 x i32>
222 store <4 x i32> %interleaved, <4 x i32> addrspace(1)* %C
223 ret void
224 }
1313 define void @f(float* %p, i32 %c) nounwind ssp {
1414 entry:
1515 %0 = bitcast float* %p to i8*
16 %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4)
16 %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* %0, i32 4)
1717 %vld221 = extractvalue { <4 x float>, <4 x float> } %vld2, 1
1818 %add.ptr = getelementptr inbounds float, float* %p, i32 8
1919 %1 = bitcast float* %add.ptr to i8*
20 tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %vld221, <4 x float> undef, i32 4)
20 tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %1, <4 x float> %vld221, <4 x float> undef, i32 4)
2121 ret void
2222 }
2323
2626 define void @f1(float* %p, i32 %c) nounwind ssp {
2727 entry:
2828 %0 = bitcast float* %p to i8*
29 %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4)
29 %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* %0, i32 4)
3030 %vld221 = extractvalue { <4 x float>, <4 x float> } %vld2, 1
3131 %add.ptr = getelementptr inbounds float, float* %p, i32 8
3232 %1 = bitcast float* %add.ptr to i8*
33 %vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
33 %vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* %1, i32 4)
3434 %vld2215 = extractvalue { <4 x float>, <4 x float> } %vld22, 0
35 tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %vld221, <4 x float> %vld2215, i32 4)
35 tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %1, <4 x float> %vld221, <4 x float> %vld2215, i32 4)
3636 ret void
3737 }
3838
4141 define void @f2(float* %p, i32 %c) nounwind ssp {
4242 entry:
4343 %0 = bitcast float* %p to i8*
44 %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4)
44 %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* %0, i32 4)
4545 %vld224 = extractvalue { <4 x float>, <4 x float> } %vld2, 1
4646 br label %do.body
4747
5151 %p.addr.0 = phi float* [ %p, %entry ], [ %add.ptr, %do.body ]
5252 %add.ptr = getelementptr inbounds float, float* %p.addr.0, i32 8
5353 %1 = bitcast float* %add.ptr to i8*
54 %vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
54 %vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* %1, i32 4)
5555 %vld2215 = extractvalue { <4 x float>, <4 x float> } %vld22, 0
5656 %vld2216 = extractvalue { <4 x float>, <4 x float> } %vld22, 1
57 tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %qq0.0.1.0, <4 x float> %vld2215, i32 4)
57 tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %1, <4 x float> %qq0.0.1.0, <4 x float> %vld2215, i32 4)
5858 %dec = add nsw i32 %c.addr.0, -1
5959 %tobool = icmp eq i32 %dec, 0
6060 br i1 %tobool, label %do.end, label %do.body
6363 ret void
6464 }
6565
66 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly
67 declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
66 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8*, i32) nounwind readonly
67 declare void @llvm.arm.neon.vst2.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
6868
6969 ; CHECK: f3
7070 ; This function has lane insertions that span basic blocks.
108108 %x.0 = phi <2 x float> [ %vecins3, %if.then ], [ %vecins5, %if.else ]
109109 %add.ptr = getelementptr inbounds float, float* %p, i32 4
110110 %4 = bitcast float* %add.ptr to i8*
111 tail call void @llvm.arm.neon.vst1.v2f32(i8* %4, <2 x float> %x.0, i32 4)
112 ret void
113 }
114
115 declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) nounwind
116 declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) nounwind readonly
111 tail call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %4, <2 x float> %x.0, i32 4)
112 ret void
113 }
114
115 declare void @llvm.arm.neon.vst1.p0i8.v2f32(i8*, <2 x float>, i32) nounwind
116 declare <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8*, i32) nounwind readonly
117117
118118 ; CHECK: f4
119119 ; This function inserts a lane into a fully defined vector.
123123 define void @f4(float* %p, float* %q) nounwind ssp {
124124 entry:
125125 %0 = bitcast float* %p to i8*
126 %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %0, i32 4)
126 %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8* %0, i32 4)
127127 %tobool = icmp eq float* %q, null
128128 br i1 %tobool, label %if.end, label %if.then
129129
137137
138138 if.end: ; preds = %entry, %if.then
139139 %x.0 = phi <2 x float> [ %vecins, %if.then ], [ %vld1, %entry ]
140 tail call void @llvm.arm.neon.vst1.v2f32(i8* %0, <2 x float> %x.0, i32 4)
140 tail call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %0, <2 x float> %x.0, i32 4)
141141 ret void
142142 }
143143
153153 define void @f5(float* %p, float* %q) nounwind ssp {
154154 entry:
155155 %0 = bitcast float* %p to i8*
156 %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %0, i32 4)
156 %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %0, i32 4)
157157 %vecext = extractelement <4 x float> %vld1, i32 0
158158 %vecext1 = extractelement <4 x float> %vld1, i32 1
159159 %vecext2 = extractelement <4 x float> %vld1, i32 2
181181 %vecinit9 = insertelement <4 x float> %vecinit, float %b.0, i32 1
182182 %vecinit10 = insertelement <4 x float> %vecinit9, float %c.0, i32 2
183183 %vecinit11 = insertelement <4 x float> %vecinit10, float %add, i32 3
184 tail call void @llvm.arm.neon.vst1.v4f32(i8* %0, <4 x float> %vecinit11, i32 4)
185 ret void
186 }
187
188 declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
189
190 declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
184 tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %0, <4 x float> %vecinit11, i32 4)
185 ret void
186 }
187
188 declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
189
190 declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
191191
192192 ; CHECK: pr13999
193193 define void @pr13999() nounwind readonly {
1818 %tmp5 = bitcast i64 %tmp4 to <8 x i8>
1919 %tmp6 = shufflevector <8 x i8> %tmp5, <8 x i8> undef, <16 x i32>
2020 %tmp7 = shufflevector <16 x i8> %tmp6, <16 x i8> %tmp3, <16 x i32>
21 tail call void @llvm.arm.neon.vst1.v16i8(i8* %arg, <16 x i8> %tmp7, i32 2)
21 tail call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %arg, <16 x i8> %tmp7, i32 2)
2222 ret void
2323 }
2424
25 declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32)
25 declare void @llvm.arm.neon.vst1.p0i8.v16i8(i8*, <16 x i8>, i32)
2121 declare arm_aapcs_vfpcc %2** @func4()
2222
2323 define arm_aapcs_vfpcc void @foo(%3* nocapture) nounwind align 2 {
24 call void @llvm.arm.neon.vst4.v4i32(i8* undef, <4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> , i32 16) nounwind
24 call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* undef, <4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> , i32 16) nounwind
2525 %2 = call arm_aapcs_vfpcc %0** @func2() nounwind
2626 %3 = load %0*, %0** %2, align 4
2727 store float 0.000000e+00, float* undef, align 4
3939 %10 = fmul float undef, 2.000000e+05
4040 %11 = fadd float %10, -1.000000e+05
4141 store float %11, float* undef, align 4
42 call void @llvm.arm.neon.vst4.v4i32(i8* undef, <4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> , i32 16) nounwind
42 call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* undef, <4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> , i32 16) nounwind
4343 ret void
4444 }
4545
46 declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
46 declare void @llvm.arm.neon.vst4.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
4747
4848 declare arm_aapcs_vfpcc i32 @rand()
77
88 define void @foo(float* nocapture %A) #0 {
99 %1= bitcast float* %A to i8*
10 %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
10 %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0i8(i8* %1, i32 4)
1111 %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
1212 %divp_vec = fdiv <4 x float> , %3
1313 %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1
1616 %div8p_vec = fdiv <4 x float> , %5
1717 %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3
1818 %div13p_vec = fdiv <4 x float> , %6
19 tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %divp_vec, <4 x float> %div3p_vec, <4 x float> %div8p_vec, <4 x float> %div13p_vec, i32 4)
19 tail call void @llvm.arm.neon.vst4.p0i8.v4f32(i8* %1, <4 x float> %divp_vec, <4 x float> %div3p_vec, <4 x float> %div8p_vec, <4 x float> %div13p_vec, i32 4)
2020 ret void
2121 }
2222
2626 ; Function Attrs: nounwind readonly
2727
2828 ; Function Attrs: nounwind
29 declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #1
30 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32) #2
29 declare void @llvm.arm.neon.vst4.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #1
30 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0i8(i8*, i32) #2
3131
3232 ; Function Attrs: nounwind
3333
2323 %2 = getelementptr inbounds %struct.int32x4_t, %struct.int32x4_t* %vT1ptr, i32 0, i32 0 ; <<4 x i32>*> [#uses=1]
2424 %3 = load <4 x i32>, <4 x i32>* %2, align 16 ; <<4 x i32>> [#uses=1]
2525 %4 = bitcast i16* %i_ptr to i8* ; [#uses=1]
26 %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4, i32 1) ; <<8 x i16>> [#uses=1]
26 %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %4, i32 1) ; <<8 x i16>> [#uses=1]
2727 %6 = bitcast <8 x i16> %5 to <2 x double> ; <<2 x double>> [#uses=2]
2828 %7 = extractelement <2 x double> %6, i32 0 ; [#uses=1]
2929 %8 = bitcast double %7 to <4 x i16> ; <<4 x i16>> [#uses=1]
3939 %trunc_16 = trunc <4 x i32> %16 to <4 x i16>
4040 %17 = shufflevector <4 x i16> %trunc_15, <4 x i16> %trunc_16, <8 x i32> ; <<8 x i16>> [#uses=1]
4141 %18 = bitcast i16* %o_ptr to i8* ; [#uses=1]
42 tail call void @llvm.arm.neon.vst1.v8i16(i8* %18, <8 x i16> %17, i32 1)
42 tail call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %18, <8 x i16> %17, i32 1)
4343 ret void
4444 }
4545
5959 %2 = getelementptr inbounds %struct.int16x8_t, %struct.int16x8_t* %vT1ptr, i32 0, i32 0 ; <<8 x i16>*> [#uses=1]
6060 %3 = load <8 x i16>, <8 x i16>* %2, align 16 ; <<8 x i16>> [#uses=1]
6161 %4 = bitcast i16* %i_ptr to i8* ; [#uses=1]
62 %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4, i32 1) ; <<8 x i16>> [#uses=1]
62 %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %4, i32 1) ; <<8 x i16>> [#uses=1]
6363 %6 = getelementptr inbounds i16, i16* %i_ptr, i32 8 ; [#uses=1]
6464 %7 = bitcast i16* %6 to i8* ; [#uses=1]
65 %8 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %7, i32 1) ; <<8 x i16>> [#uses=1]
65 %8 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %7, i32 1) ; <<8 x i16>> [#uses=1]
6666 %9 = mul <8 x i16> %1, %5 ; <<8 x i16>> [#uses=1]
6767 %10 = mul <8 x i16> %3, %8 ; <<8 x i16>> [#uses=1]
6868 %11 = bitcast i16* %o_ptr to i8* ; [#uses=1]
69 tail call void @llvm.arm.neon.vst1.v8i16(i8* %11, <8 x i16> %9, i32 1)
69 tail call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %11, <8 x i16> %9, i32 1)
7070 %12 = getelementptr inbounds i16, i16* %o_ptr, i32 8 ; [#uses=1]
7171 %13 = bitcast i16* %12 to i8* ; [#uses=1]
72 tail call void @llvm.arm.neon.vst1.v8i16(i8* %13, <8 x i16> %10, i32 1)
72 tail call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %13, <8 x i16> %10, i32 1)
7373 ret void
7474 }
7575
8080 ; CHECK: vmov r
8181 ; CHECK-NOT: vmov d
8282 ; CHECK: vst3.8
83 %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
83 %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
8484 %tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0 ; <<8 x i8>> [#uses=1]
8585 %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2 ; <<8 x i8>> [#uses=1]
8686 %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 1 ; <<8 x i8>> [#uses=1]
8787 %tmp5 = sub <8 x i8> %tmp3, %tmp4
8888 %tmp6 = add <8 x i8> %tmp2, %tmp3 ; <<8 x i8>> [#uses=1]
8989 %tmp7 = mul <8 x i8> %tmp4, %tmp2
90 tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7, i32 1)
90 tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %B, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7, i32 1)
9191 ret <8 x i8> %tmp4
9292 }
9393
100100 ; CHECK-NOT: vmov
101101 ; CHECK: bne
102102 %tmp1 = bitcast i32* %in to i8* ; [#uses=1]
103 %tmp2 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp1, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
103 %tmp2 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8* %tmp1, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
104104 %tmp3 = getelementptr inbounds i32, i32* %in, i32 8 ; [#uses=1]
105105 %tmp4 = bitcast i32* %tmp3 to i8* ; [#uses=1]
106 %tmp5 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp4, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
106 %tmp5 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8* %tmp4, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
107107 %tmp8 = bitcast i32* %out to i8* ; [#uses=1]
108108 br i1 undef, label %return1, label %return2
109109
119119 %tmp39 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
120120 %tmp6 = add <4 x i32> %tmp52, %tmp ; <<4 x i32>> [#uses=1]
121121 %tmp7 = add <4 x i32> %tmp57, %tmp39 ; <<4 x i32>> [#uses=1]
122 tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp6, <4 x i32> %tmp7, i32 1)
122 tail call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* %tmp8, <4 x i32> %tmp6, <4 x i32> %tmp7, i32 1)
123123 ret void
124124
125125 return2:
130130 %tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
131131 %tmp101 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
132132 %tmp102 = add <4 x i32> %tmp100, %tmp101 ; <<4 x i32>> [#uses=1]
133 tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp102, <4 x i32> %tmp101, i32 1)
133 tail call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* %tmp8, <4 x i32> %tmp102, <4 x i32> %tmp101, i32 1)
134134 call void @llvm.trap()
135135 unreachable
136136 }
146146 ; CHECK: vadd.i16
147147 %tmp0 = bitcast i16* %A to i8* ; [#uses=1]
148148 %tmp1 = load <8 x i16>, <8 x i16>* %B ; <<8 x i16>> [#uses=2]
149 %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 1) ; <%struct.__neon_int16x8x2_t> [#uses=2]
149 %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 1) ; <%struct.__neon_int16x8x2_t> [#uses=2]
150150 %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0 ; <<8 x i16>> [#uses=1]
151151 %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1 ; <<8 x i16>> [#uses=1]
152152 %tmp5 = add <8 x i16> %tmp3, %tmp4 ; <<8 x i16>> [#uses=1]
159159 ; CHECK: vorr d[[D0:[0-9]+]], d[[D1:[0-9]+]]
160160 ; CHECK-NEXT: vld2.8 {d[[D1]][1], d[[D0]][1]}
161161 %tmp1 = load <8 x i8>, <8 x i8>* %B ; <<8 x i8>> [#uses=2]
162 %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
162 %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
163163 %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 ; <<8 x i8>> [#uses=1]
164164 %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1 ; <<8 x i8>> [#uses=1]
165165 %tmp5 = add <8 x i8> %tmp3, %tmp4 ; <<8 x i8>> [#uses=1]
177177 ; CHECK: vuzp.32 q[[Q1]], q[[Q0]]
178178 ; CHECK: vst1.32
179179 %0 = bitcast i32* %iptr to i8* ; [#uses=2]
180 %1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
180 %1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8* %0, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
181181 %tmp57 = extractvalue %struct.__neon_int32x4x2_t %1, 0 ; <<4 x i32>> [#uses=1]
182182 %tmp60 = extractvalue %struct.__neon_int32x4x2_t %1, 1 ; <<4 x i32>> [#uses=1]
183183 %2 = bitcast i32* %optr to i8* ; [#uses=2]
184 tail call void @llvm.arm.neon.vst2.v4i32(i8* %2, <4 x i32> %tmp57, <4 x i32> %tmp60, i32 1)
185 %3 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %0, i32 1) ; <<4 x i32>> [#uses=1]
184 tail call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* %2, <4 x i32> %tmp57, <4 x i32> %tmp60, i32 1)
185 %3 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* %0, i32 1) ; <<4 x i32>> [#uses=1]
186186 %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> ; <<4 x i32>> [#uses=1]
187 tail call void @llvm.arm.neon.vst1.v4i32(i8* %2, <4 x i32> %4, i32 1)
187 tail call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %2, <4 x i32> %4, i32 1)
188188 ret void
189189 }
190190
306306
307307 ; This test crashes the coalescer because live variables were not updated properly.
308308 define <8 x i8> @t11(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
309 %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
309 %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
310310 %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
311 %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
311 %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
312312 %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
313313 %tmp2bd = add <8 x i8> zeroinitializer, %tmp2d ; <<8 x i8>> [#uses=1]
314314 %tmp2abcd = mul <8 x i8> zeroinitializer, %tmp2bd ; <<8 x i8>> [#uses=1]
315315 %tmp2ef = sub <8 x i8> zeroinitializer, %tmp2f ; <<8 x i8>> [#uses=1]
316316 %tmp2efgh = mul <8 x i8> %tmp2ef, undef ; <<8 x i8>> [#uses=2]
317 call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp2efgh, i32 1)
317 call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A2, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp2efgh, i32 1)
318318 %tmp2 = sub <8 x i8> %tmp2efgh, %tmp2abcd ; <<8 x i8>> [#uses=1]
319319 %tmp7 = mul <8 x i8> undef, %tmp2 ; <<8 x i8>> [#uses=1]
320 tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp7, i32 1)
320 tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %B, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp7, i32 1)
321321 ret <8 x i8> undef
322322 }
323323
324 declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32) nounwind readonly
325
326 declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
324 declare <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8*, i32) nounwind readonly
325
326 declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
327327
328328 declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
329329
330 declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32) nounwind
331
332 declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
333
334 declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
330 declare void @llvm.arm.neon.vst1.p0i8.v4i32(i8*, <4 x i32>, i32) nounwind
331
332 declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
333
334 declare void @llvm.arm.neon.vst3.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
335335 nounwind
336336
337 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly
338
339 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*, i32) nounwind readonly
340
341 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
342
343 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
344
345 declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind
337 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8*, i32) nounwind readonly
338
339 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8*, i32) nounwind readonly
340
341 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
342
343 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
344
345 declare void @llvm.arm.neon.vst2.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind
346346
347347 declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
348348
66 %quux = type { i32 (...)**, %baz*, i32 }
77 %quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo }
88
9 declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
9 declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
1010
1111 define void @aaa(%quuz* %this, i8* %block) {
1212 ; CHECK-LABEL: aaa:
1717 %aligned_vec = alloca <4 x float>, align 16
1818 %"alloca point" = bitcast i32 0 to i32
1919 %vecptr = bitcast <4 x float>* %aligned_vec to i8*
20 %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %vecptr, i32 1) nounwind ; <<4 x float>> [#uses=1]
20 %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %vecptr, i32 1) nounwind ; <<4 x float>> [#uses=1]
2121 store float 6.300000e+01, float* undef, align 4
22 %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
22 %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
2323 store float 0.000000e+00, float* undef, align 4
24 %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
25 %ld3 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
24 %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
25 %ld3 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
2626 store float 0.000000e+00, float* undef, align 4
27 %ld4 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
27 %ld4 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
2828 store float 0.000000e+00, float* undef, align 4
29 %ld5 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
29 %ld5 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
3030 store float 0.000000e+00, float* undef, align 4
31 %ld6 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
31 %ld6 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
3232 store float 0.000000e+00, float* undef, align 4
33 %ld7 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
33 %ld7 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
3434 store float 0.000000e+00, float* undef, align 4
35 %ld8 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
35 %ld8 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
3636 store float 0.000000e+00, float* undef, align 4
37 %ld9 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
37 %ld9 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
3838 store float 0.000000e+00, float* undef, align 4
39 %ld10 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
39 %ld10 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
4040 store float 0.000000e+00, float* undef, align 4
41 %ld11 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
41 %ld11 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
4242 store float 0.000000e+00, float* undef, align 4
43 %ld12 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
43 %ld12 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
4444 store float 0.000000e+00, float* undef, align 4
4545 %val173 = load <4 x float>, <4 x float>* undef ; <<4 x float>> [#uses=1]
4646 br label %bb4
195195 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32>
196196 %4 = add <8 x i16> %3,
197197 %5 = trunc <8 x i16> %4 to <8 x i8>
198 tail call void @llvm.arm.neon.vst1.v8i8(i8* undef, <8 x i8> %5, i32 1)
198 tail call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* undef, <8 x i8> %5, i32 1)
199199 unreachable
200200 }
201201
202 declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) nounwind
202 declare void @llvm.arm.neon.vst1.p0i8.v8i8(i8*, <8 x i8>, i32) nounwind
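The new names encode each overloaded type in order: loads mangle the result vector type first and the pointer type second (vld1.v4f32.p0i8), while stores mangle the pointer type first (vst1.p0i8.v8i8), matching the operand order of the intrinsics. Here p0i8 stands for an i8 pointer in address space 0; other address spaces mangle the same way. A minimal hypothetical sketch, not part of this diff, of a vst1 through an addrspace(1) pointer:

declare void @llvm.arm.neon.vst1.p1i8.v8i8(i8 addrspace(1)*, <8 x i8>, i32) nounwind

define void @store_as1(i8 addrspace(1)* %p, <8 x i8> %v) {
  ; p1i8: the pointer operand carries address space 1 in the mangled name
  call void @llvm.arm.neon.vst1.p1i8.v8i8(i8 addrspace(1)* %p, <8 x i8> %v, i32 1)
  ret void
}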
7777 %2 = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
7878 %3 = add <8 x i16> %2,
7979 %4 = trunc <8 x i16> %3 to <8 x i8>
80 tail call void @llvm.arm.neon.vst1.v8i8(i8* undef, <8 x i8> %4, i32 1)
80 tail call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* undef, <8 x i8> %4, i32 1)
8181 unreachable
8282 }
8383
84 declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) nounwind
84 declare void @llvm.arm.neon.vst1.p0i8.v8i8(i8*, <8 x i8>, i32) nounwind
8585
8686 ; Test that loads and stores of i64 vector elements are handled as f64 values
8787 ; so they are not split up into i32 values. Radar 8755338.
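A stripped-down sketch of that pattern (hypothetical; the test body falls outside this hunk): copying a <1 x i64> should select a single 64-bit d-register load/store rather than a pair of i32 operations.

define void @copy_v1i64(<1 x i64>* %src, <1 x i64>* %dst) {
  ; expected to lower to one 64-bit load/store, not two i32 accesses
  %v = load <1 x i64>, <1 x i64>* %src, align 8
  store <1 x i64> %v, <1 x i64>* %dst, align 8
  ret void
}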
0 ; RUN: llc -mtriple=arm-eabi -mattr=+neon < %s | FileCheck %s
1
2 %struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
3 %struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
4 %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
5
6 ; vld[1234] auto-upgrade tests
7
8 ; CHECK-LABEL: test_vld1_upgrade:
9 ; CHECK: vld1.32 {d16}, [r0]
10 define <2 x i32> @test_vld1_upgrade(i8* %ptr) {
11 %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %ptr, i32 1)
12 ret <2 x i32> %tmp1
13 }
14
15 declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32) nounwind readonly
16
17 ; CHECK-LABEL: test_vld2_upgrade:
18 ; CHECK: vld2.32 {d16, d17}, [r0]
19 define %struct.__neon_int32x2x2_t @test_vld2_upgrade(i8* %ptr) {
20 %tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8* %ptr, i32 1)
21 ret %struct.__neon_int32x2x2_t %tmp1
22 }
23
24 declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8*, i32) nounwind readonly
25
26 ; CHECK-LABEL: test_vld3_upgrade:
27 ; CHECK: vld3.32 {d16, d17, d18}, [r1]
28 define %struct.__neon_int32x2x3_t @test_vld3_upgrade(i8* %ptr) {
29 %tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8* %ptr, i32 1)
30 ret %struct.__neon_int32x2x3_t %tmp1
31 }
32
33 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8*, i32) nounwind readonly
34
35 ; CHECK-LABEL: test_vld4_upgrade:
36 ; CHECK: vld4.32 {d16, d17, d18, d19}, [r1]
37 define %struct.__neon_int32x2x4_t @test_vld4_upgrade(i8* %ptr) {
38 %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* %ptr, i32 1)
39 ret %struct.__neon_int32x2x4_t %tmp1
40 }
41
42 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8*, i32) nounwind readonly
43
44 ; vld[234]lane auto-upgrade tests
45
46 ; CHECK-LABEL: test_vld2lane_upgrade:
47 ; CHECK: vld2.32 {d16[1], d17[1]}, [r0]
48 define %struct.__neon_int32x2x2_t @test_vld2lane_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B) {
49 %tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, i32 1, i32 1)
50 ret %struct.__neon_int32x2x2_t %tmp1
51 }
52
53 declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
54
55 ; CHECK-LABEL: test_vld3lane_upgrade:
56 ; CHECK: vld3.32 {d16[1], d17[1], d18[1]}, [r1]
57 define %struct.__neon_int32x2x3_t @test_vld3lane_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
58 %tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32 1, i32 1)
59 ret %struct.__neon_int32x2x3_t %tmp1
60 }
61
62 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
63
64 ; CHECK-LABEL: test_vld4lane_upgrade:
65 ; CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r1]
66 define %struct.__neon_int32x2x4_t @test_vld4lane_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) {
67 %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32 1, i32 1)
68 ret %struct.__neon_int32x2x4_t %tmp1
69 }
70
71 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
72
73 ; vst[1234] auto-upgrade tests
74
75 ; CHECK-LABEL: test_vst1_upgrade:
76 ; CHECK: vst1.32 {d16}, [r0]
77 define void @test_vst1_upgrade(i8* %ptr, <2 x i32> %A) {
78 call void @llvm.arm.neon.vst1.v2i32(i8* %ptr, <2 x i32> %A, i32 1)
79 ret void
80 }
81
82 declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind
83
84 ; CHECK-LABEL: test_vst2_upgrade:
85 ; CHECK: vst2.32 {d16, d17}, [r0]
86 define void @test_vst2_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B) {
87 call void @llvm.arm.neon.vst2.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, i32 1)
88 ret void
89 }
90
91 declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind
92
93 ; CHECK-LABEL: test_vst3_upgrade:
94 ; CHECK: vst3.32 {d16, d17, d18}, [r0]
95 define void @test_vst3_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
96 call void @llvm.arm.neon.vst3.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32 1)
97 ret void
98 }
99
100 declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
101
102 ; CHECK-LABEL: test_vst4_upgrade:
103 ; CHECK: vst4.32 {d16, d17, d18, d19}, [r0]
104 define void @test_vst4_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) {
105 call void @llvm.arm.neon.vst4.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32 1)
106 ret void
107 }
108
109 declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
110
111 ; vst[234]lane auto-upgrade tests
112
113 ; CHECK-LABEL: test_vst2lane_upgrade:
114 ; CHECK: vst2.32 {d16[1], d17[1]}, [r0]
115 define void @test_vst2lane_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B) {
116 call void @llvm.arm.neon.vst2lane.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, i32 1, i32 1)
117 ret void
118 }
119
120 declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind
121
122 ; CHECK-LABEL: test_vst3lane_upgrade:
123 ; CHECK: vst3.32 {d16[1], d17[1], d18[1]}, [r0]
124 define void @test_vst3lane_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
125 call void @llvm.arm.neon.vst3lane.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32 1, i32 1)
126 ret void
127 }
128
129 declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
130
131 ; CHECK-LABEL: test_vst4lane_upgrade:
132 ; CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0]
133 define void @test_vst4lane_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) {
134 call void @llvm.arm.neon.vst4lane.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32 1, i32 1)
135 ret void
136 }
137
138 declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
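All of the calls in this file deliberately use the old, unsuffixed intrinsic names; the IR auto-upgrader is expected to rewrite them to the address-space-mangled overloads before instruction selection, which is why the CHECK lines still match the usual vld/vst encodings. As a sketch, the first load above should be upgraded to the equivalent of:

  %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* %ptr, i32 1)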
66 ;CHECK-LABEL: vld1i8:
77 ;Check the alignment value. Max for this instruction is 64 bits:
88 ;CHECK: vld1.8 {d16}, [r0:64]
9 %tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %A, i32 16)
9 %tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %A, i32 16)
1010 ret <8 x i8> %tmp1
1111 }
1212
1414 ;CHECK-LABEL: vld1i16:
1515 ;CHECK: vld1.16
1616 %tmp0 = bitcast i16* %A to i8*
17 %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 1)
17 %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* %tmp0, i32 1)
1818 ret <4 x i16> %tmp1
1919 }
2020
2424 ;CHECK: vld1.16 {d16}, [{{r[0-9]+}}]!
2525 %A = load i16*, i16** %ptr
2626 %tmp0 = bitcast i16* %A to i8*
27 %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 1)
27 %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* %tmp0, i32 1)
2828 %tmp2 = getelementptr i16, i16* %A, i32 4
2929 store i16* %tmp2, i16** %ptr
3030 ret <4 x i16> %tmp1
3434 ;CHECK-LABEL: vld1i32:
3535 ;CHECK: vld1.32
3636 %tmp0 = bitcast i32* %A to i8*
37 %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 1)
37 %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* %tmp0, i32 1)
3838 ret <2 x i32> %tmp1
3939 }
4040
4444 ;CHECK: vld1.32 {d16}, [{{r[0-9]+}}], {{r[0-9]+}}
4545 %A = load i32*, i32** %ptr
4646 %tmp0 = bitcast i32* %A to i8*
47 %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 1)
47 %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* %tmp0, i32 1)
4848 %tmp2 = getelementptr i32, i32* %A, i32 %inc
4949 store i32* %tmp2, i32** %ptr
5050 ret <2 x i32> %tmp1
5454 ;CHECK-LABEL: vld1f:
5555 ;CHECK: vld1.32
5656 %tmp0 = bitcast float* %A to i8*
57 %tmp1 = call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %tmp0, i32 1)
57 %tmp1 = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8* %tmp0, i32 1)
5858 ret <2 x float> %tmp1
5959 }
6060
6262 ;CHECK-LABEL: vld1i64:
6363 ;CHECK: vld1.64
6464 %tmp0 = bitcast i64* %A to i8*
65 %tmp1 = call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %tmp0, i32 1)
65 %tmp1 = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* %tmp0, i32 1)
6666 ret <1 x i64> %tmp1
6767 }
6868
7070 ;CHECK-LABEL: vld1Qi8:
7171 ;Check the alignment value. Max for this instruction is 128 bits:
7272 ;CHECK: vld1.8 {d16, d17}, [r0:64]
73 %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 8)
73 %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %A, i32 8)
7474 ret <16 x i8> %tmp1
7575 }
7676
7979 ;CHECK-LABEL: vld1Qi8_update:
8080 ;CHECK: vld1.8 {d16, d17}, [{{r[0-9]+}}:64]!
8181 %A = load i8*, i8** %ptr
82 %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 8)
82 %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %A, i32 8)
8383 %tmp2 = getelementptr i8, i8* %A, i32 16
8484 store i8* %tmp2, i8** %ptr
8585 ret <16 x i8> %tmp1
9090 ;Check the alignment value. Max for this instruction is 128 bits:
9191 ;CHECK: vld1.16 {d16, d17}, [r0:128]
9292 %tmp0 = bitcast i16* %A to i8*
93 %tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %tmp0, i32 32)
93 %tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %tmp0, i32 32)
9494 ret <8 x i16> %tmp1
9595 }
9696
9898 ;CHECK-LABEL: vld1Qi32:
9999 ;CHECK: vld1.32
100100 %tmp0 = bitcast i32* %A to i8*
101 %tmp1 = call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %tmp0, i32 1)
101 %tmp1 = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* %tmp0, i32 1)
102102 ret <4 x i32> %tmp1
103103 }
104104
106106 ;CHECK-LABEL: vld1Qf:
107107 ;CHECK: vld1.32
108108 %tmp0 = bitcast float* %A to i8*
109 %tmp1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %tmp0, i32 1)
109 %tmp1 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %tmp0, i32 1)
110110 ret <4 x float> %tmp1
111111 }
112112
114114 ;CHECK-LABEL: vld1Qi64:
115115 ;CHECK: vld1.64
116116 %tmp0 = bitcast i64* %A to i8*
117 %tmp1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %tmp0, i32 1)
117 %tmp1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* %tmp0, i32 1)
118118 ret <2 x i64> %tmp1
119119 }
120120
122122 ;CHECK-LABEL: vld1Qf64:
123123 ;CHECK: vld1.64
124124 %tmp0 = bitcast double* %A to i8*
125 %tmp1 = call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %tmp0, i32 1)
125 %tmp1 = call <2 x double> @llvm.arm.neon.vld1.v2f64.p0i8(i8* %tmp0, i32 1)
126126 ret <2 x double> %tmp1
127127 }
128128
129 declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32) nounwind readonly
130 declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32) nounwind readonly
131 declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32) nounwind readonly
132 declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) nounwind readonly
133 declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32) nounwind readonly
129 declare <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8*, i32) nounwind readonly
130 declare <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8*, i32) nounwind readonly
131 declare <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8*, i32) nounwind readonly
132 declare <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8*, i32) nounwind readonly
133 declare <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8*, i32) nounwind readonly
134134
135 declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly
136 declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
137 declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32) nounwind readonly
138 declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
139 declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly
140 declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32) nounwind readonly
135 declare <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8*, i32) nounwind readonly
136 declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
137 declare <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8*, i32) nounwind readonly
138 declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
139 declare <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8*, i32) nounwind readonly
140 declare <2 x double> @llvm.arm.neon.vld1.v2f64.p0i8(i8*, i32) nounwind readonly
141141
142142 ; Radar 8355607
143143 ; Do not crash if the vld1 result is not used.
144144 define void @unused_vld1_result() {
145145 entry:
146 %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1)
146 %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1)
147147 call void @llvm.trap()
148148 unreachable
149149 }
1414 ;CHECK-LABEL: vld2i8:
1515 ;Check the alignment value. Max for this instruction is 128 bits:
1616 ;CHECK: vld2.8 {d16, d17}, [r0:64]
17 %tmp1 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8* %A, i32 8)
17 %tmp1 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8.p0i8(i8* %A, i32 8)
1818 %tmp2 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 0
1919 %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 1
2020 %tmp4 = add <8 x i8> %tmp2, %tmp3
2626 ;Check the alignment value. Max for this instruction is 128 bits:
2727 ;CHECK: vld2.16 {d16, d17}, [r0:128]
2828 %tmp0 = bitcast i16* %A to i8*
29 %tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8* %tmp0, i32 32)
29 %tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16.p0i8(i8* %tmp0, i32 32)
3030 %tmp2 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 0
3131 %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 1
3232 %tmp4 = add <4 x i16> %tmp2, %tmp3
3737 ;CHECK-LABEL: vld2i32:
3838 ;CHECK: vld2.32
3939 %tmp0 = bitcast i32* %A to i8*
40 %tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8* %tmp0, i32 1)
40 %tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32.p0i8(i8* %tmp0, i32 1)
4141 %tmp2 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 0
4242 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 1
4343 %tmp4 = add <2 x i32> %tmp2, %tmp3
4848 ;CHECK-LABEL: vld2f:
4949 ;CHECK: vld2.32
5050 %tmp0 = bitcast float* %A to i8*
51 %tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8* %tmp0, i32 1)
51 %tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32.p0i8(i8* %tmp0, i32 1)
5252 %tmp2 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 0
5353 %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 1
5454 %tmp4 = fadd <2 x float> %tmp2, %tmp3
6161 ;CHECK: vld2.32 {d16, d17}, [r1]!
6262 %A = load float*, float** %ptr
6363 %tmp0 = bitcast float* %A to i8*
64 %tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8* %tmp0, i32 1)
64 %tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32.p0i8(i8* %tmp0, i32 1)
6565 %tmp2 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 0
6666 %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 1
6767 %tmp4 = fadd <2 x float> %tmp2, %tmp3
7575 ;Check the alignment value. Max for this instruction is 128 bits:
7676 ;CHECK: vld1.64 {d16, d17}, [r0:128]
7777 %tmp0 = bitcast i64* %A to i8*
78 %tmp1 = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8* %tmp0, i32 32)
78 %tmp1 = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64.p0i8(i8* %tmp0, i32 32)
7979 %tmp2 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 0
8080 %tmp3 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 1
8181 %tmp4 = add <1 x i64> %tmp2, %tmp3
8686 ;CHECK-LABEL: vld2Qi8:
8787 ;Check the alignment value. Max for this instruction is 256 bits:
8888 ;CHECK: vld2.8 {d16, d17, d18, d19}, [r0:64]
89 %tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 8)
89 %tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8.p0i8(i8* %A, i32 8)
9090 %tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0
9191 %tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1
9292 %tmp4 = add <16 x i8> %tmp2, %tmp3
9898 ;CHECK-LABEL: vld2Qi8_update:
9999 ;CHECK: vld2.8 {d16, d17, d18, d19}, [r2:128], r1
100100 %A = load i8*, i8** %ptr
101 %tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 16)
101 %tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8.p0i8(i8* %A, i32 16)
102102 %tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0
103103 %tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1
104104 %tmp4 = add <16 x i8> %tmp2, %tmp3
112112 ;Check the alignment value. Max for this instruction is 256 bits:
113113 ;CHECK: vld2.16 {d16, d17, d18, d19}, [r0:128]
114114 %tmp0 = bitcast i16* %A to i8*
115 %tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8* %tmp0, i32 16)
115 %tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16.p0i8(i8* %tmp0, i32 16)
116116 %tmp2 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 0
117117 %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 1
118118 %tmp4 = add <8 x i16> %tmp2, %tmp3
124124 ;Check the alignment value. Max for this instruction is 256 bits:
125125 ;CHECK: vld2.32 {d16, d17, d18, d19}, [r0:256]
126126 %tmp0 = bitcast i32* %A to i8*
127 %tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp0, i32 64)
127 %tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8* %tmp0, i32 64)
128128 %tmp2 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 0
129129 %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 1
130130 %tmp4 = add <4 x i32> %tmp2, %tmp3
135135 ;CHECK-LABEL: vld2Qf:
136136 ;CHECK: vld2.32
137137 %tmp0 = bitcast float* %A to i8*
138 %tmp1 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32(i8* %tmp0, i32 1)
138 %tmp1 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32.p0i8(i8* %tmp0, i32 1)
139139 %tmp2 = extractvalue %struct.__neon_float32x4x2_t %tmp1, 0
140140 %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp1, 1
141141 %tmp4 = fadd <4 x float> %tmp2, %tmp3
142142 ret <4 x float> %tmp4
143143 }
144144
145 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8*, i32) nounwind readonly
146 declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8*, i32) nounwind readonly
147 declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8*, i32) nounwind readonly
148 declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8*, i32) nounwind readonly
149 declare %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8*, i32) nounwind readonly
145 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8.p0i8(i8*, i32) nounwind readonly
146 declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16.p0i8(i8*, i32) nounwind readonly
147 declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32.p0i8(i8*, i32) nounwind readonly
148 declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32.p0i8(i8*, i32) nounwind readonly
149 declare %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64.p0i8(i8*, i32) nounwind readonly
150150
151 declare %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8*, i32) nounwind readonly
152 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8*, i32) nounwind readonly
153 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*, i32) nounwind readonly
154 declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly
151 declare %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8.p0i8(i8*, i32) nounwind readonly
152 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16.p0i8(i8*, i32) nounwind readonly
153 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8*, i32) nounwind readonly
154 declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32.p0i8(i8*, i32) nounwind readonly
1515 ;CHECK-LABEL: vld3i8:
1616 ;Check the alignment value. Max for this instruction is 64 bits:
1717 ;CHECK: vld3.8 {d16, d17, d18}, [r0:64]
18 %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A, i32 32)
18 %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A, i32 32)
1919 %tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0
2020 %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2
2121 %tmp4 = add <8 x i8> %tmp2, %tmp3
2626 ;CHECK-LABEL: vld3i16:
2727 ;CHECK: vld3.16
2828 %tmp0 = bitcast i16* %A to i8*
29 %tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 1)
29 %tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16.p0i8(i8* %tmp0, i32 1)
3030 %tmp2 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 0
3131 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 2
3232 %tmp4 = add <4 x i16> %tmp2, %tmp3
3939 ;CHECK: vld3.16 {d16, d17, d18}, [{{r[0-9]+}}], {{r[0-9]+}}
4040 %A = load i16*, i16** %ptr
4141 %tmp0 = bitcast i16* %A to i8*
42 %tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 1)
42 %tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16.p0i8(i8* %tmp0, i32 1)
4343 %tmp2 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 0
4444 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 2
4545 %tmp4 = add <4 x i16> %tmp2, %tmp3
5252 ;CHECK-LABEL: vld3i32:
5353 ;CHECK: vld3.32
5454 %tmp0 = bitcast i32* %A to i8*
55 %tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8* %tmp0, i32 1)
55 %tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32.p0i8(i8* %tmp0, i32 1)
5656 %tmp2 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 0
5757 %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 2
5858 %tmp4 = add <2 x i32> %tmp2, %tmp3
6363 ;CHECK-LABEL: vld3f:
6464 ;CHECK: vld3.32
6565 %tmp0 = bitcast float* %A to i8*
66 %tmp1 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(i8* %tmp0, i32 1)
66 %tmp1 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32.p0i8(i8* %tmp0, i32 1)
6767 %tmp2 = extractvalue %struct.__neon_float32x2x3_t %tmp1, 0
6868 %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp1, 2
6969 %tmp4 = fadd <2 x float> %tmp2, %tmp3
7575 ;Check the alignment value. Max for this instruction is 64 bits:
7676 ;CHECK: vld1.64 {d16, d17, d18}, [r0:64]
7777 %tmp0 = bitcast i64* %A to i8*
78 %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0, i32 16)
78 %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64.p0i8(i8* %tmp0, i32 16)
7979 %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
8080 %tmp3 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 2
8181 %tmp4 = add <1 x i64> %tmp2, %tmp3
8686 ;CHECK-LABEL: vld3i64_update:
8787 ;CHECK: vld1.64 {d16, d17, d18}, [r1:64]!
8888 %tmp0 = bitcast i64* %A to i8*
89 %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0, i32 16)
89 %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64.p0i8(i8* %tmp0, i32 16)
9090 %tmp5 = getelementptr i64, i64* %A, i32 3
9191 store i64* %tmp5, i64** %ptr
9292 %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
100100 ;Check the alignment value. Max for this instruction is 64 bits:
101101 ;CHECK: vld3.8 {d16, d18, d20}, [r0:64]!
102102 ;CHECK: vld3.8 {d17, d19, d21}, [r0:64]
103 %tmp1 = call %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8* %A, i32 32)
103 %tmp1 = call %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8.p0i8(i8* %A, i32 32)
104104 %tmp2 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 0
105105 %tmp3 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 2
106106 %tmp4 = add <16 x i8> %tmp2, %tmp3
112112 ;CHECK: vld3.16
113113 ;CHECK: vld3.16
114114 %tmp0 = bitcast i16* %A to i8*
115 %tmp1 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i8* %tmp0, i32 1)
115 %tmp1 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16.p0i8(i8* %tmp0, i32 1)
116116 %tmp2 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 0
117117 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 2
118118 %tmp4 = add <8 x i16> %tmp2, %tmp3
124124 ;CHECK: vld3.32
125125 ;CHECK: vld3.32
126126 %tmp0 = bitcast i32* %A to i8*
127 %tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8* %tmp0, i32 1)
127 %tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32.p0i8(i8* %tmp0, i32 1)
128128 %tmp2 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 0
129129 %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 2
130130 %tmp4 = add <4 x i32> %tmp2, %tmp3
138138 ;CHECK: vld3.32 {d17, d19, d21}, [r[[R]]]!
139139 %A = load i32*, i32** %ptr
140140 %tmp0 = bitcast i32* %A to i8*
141 %tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8* %tmp0, i32 1)
141 %tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32.p0i8(i8* %tmp0, i32 1)
142142 %tmp2 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 0
143143 %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 2
144144 %tmp4 = add <4 x i32> %tmp2, %tmp3
152152 ;CHECK: vld3.32
153153 ;CHECK: vld3.32
154154 %tmp0 = bitcast float* %A to i8*
155 %tmp1 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(i8* %tmp0, i32 1)
155 %tmp1 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32.p0i8(i8* %tmp0, i32 1)
156156 %tmp2 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 0
157157 %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 2
158158 %tmp4 = fadd <4 x float> %tmp2, %tmp3
159159 ret <4 x float> %tmp4
160160 }
161161
162 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly
163 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8*, i32) nounwind readonly
164 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8*, i32) nounwind readonly
165 declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(i8*, i32) nounwind readonly
166 declare %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8*, i32) nounwind readonly
162 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8*, i32) nounwind readonly
163 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16.p0i8(i8*, i32) nounwind readonly
164 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32.p0i8(i8*, i32) nounwind readonly
165 declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32.p0i8(i8*, i32) nounwind readonly
166 declare %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64.p0i8(i8*, i32) nounwind readonly
167167
168 declare %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly
169 declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i8*, i32) nounwind readonly
170 declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8*, i32) nounwind readonly
171 declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(i8*, i32) nounwind readonly
168 declare %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8.p0i8(i8*, i32) nounwind readonly
169 declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16.p0i8(i8*, i32) nounwind readonly
170 declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32.p0i8(i8*, i32) nounwind readonly
171 declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32.p0i8(i8*, i32) nounwind readonly
1414 ;CHECK-LABEL: vld4i8:
1515 ;Check the alignment value. Max for this instruction is 256 bits:
1616 ;CHECK: vld4.8 {d16, d17, d18, d19}, [r0:64]
17 %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A, i32 8)
17 %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8* %A, i32 8)
1818 %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
1919 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2
2020 %tmp4 = add <8 x i8> %tmp2, %tmp3
2626 ;CHECK-LABEL: vld4i8_update:
2727 ;CHECK: vld4.8 {d16, d17, d18, d19}, [r2:128], r1
2828 %A = load i8*, i8** %ptr
29 %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A, i32 16)
29 %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8* %A, i32 16)
3030 %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
3131 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2
3232 %tmp4 = add <8 x i8> %tmp2, %tmp3
4040 ;Check the alignment value. Max for this instruction is 256 bits:
4141 ;CHECK: vld4.16 {d16, d17, d18, d19}, [r0:128]
4242 %tmp0 = bitcast i16* %A to i8*
43 %tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8* %tmp0, i32 16)
43 %tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16.p0i8(i8* %tmp0, i32 16)
4444 %tmp2 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 0
4545 %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 2
4646 %tmp4 = add <4 x i16> %tmp2, %tmp3
5252 ;Check the alignment value. Max for this instruction is 256 bits:
5353 ;CHECK: vld4.32 {d16, d17, d18, d19}, [r0:256]
5454 %tmp0 = bitcast i32* %A to i8*
55 %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* %tmp0, i32 32)
55 %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32.p0i8(i8* %tmp0, i32 32)
5656 %tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0
5757 %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 2
5858 %tmp4 = add <2 x i32> %tmp2, %tmp3
6363 ;CHECK-LABEL: vld4f:
6464 ;CHECK: vld4.32
6565 %tmp0 = bitcast float* %A to i8*
66 %tmp1 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32(i8* %tmp0, i32 1)
66 %tmp1 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32.p0i8(i8* %tmp0, i32 1)
6767 %tmp2 = extractvalue %struct.__neon_float32x2x4_t %tmp1, 0
6868 %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp1, 2
6969 %tmp4 = fadd <2 x float> %tmp2, %tmp3
7575 ;Check the alignment value. Max for this instruction is 256 bits:
7676 ;CHECK: vld1.64 {d16, d17, d18, d19}, [r0:256]
7777 %tmp0 = bitcast i64* %A to i8*
78 %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0, i32 64)
78 %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64.p0i8(i8* %tmp0, i32 64)
7979 %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
8080 %tmp3 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 2
8181 %tmp4 = add <1 x i64> %tmp2, %tmp3
8686 ;CHECK-LABEL: vld4i64_update:
8787 ;CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]!
8888 %tmp0 = bitcast i64* %A to i8*
89 %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0, i32 64)
89 %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64.p0i8(i8* %tmp0, i32 64)
9090 %tmp5 = getelementptr i64, i64* %A, i32 4
9191 store i64* %tmp5, i64** %ptr
9292 %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
100100 ;Check the alignment value. Max for this instruction is 256 bits:
101101 ;CHECK: vld4.8 {d16, d18, d20, d22}, [r0:256]!
102102 ;CHECK: vld4.8 {d17, d19, d21, d23}, [r0:256]
103 %tmp1 = call %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8* %A, i32 64)
103 %tmp1 = call %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8.p0i8(i8* %A, i32 64)
104104 %tmp2 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 0
105105 %tmp3 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 2
106106 %tmp4 = add <16 x i8> %tmp2, %tmp3
113113 ;CHECK: vld4.16 {d16, d18, d20, d22}, [r0]!
114114 ;CHECK: vld4.16 {d17, d19, d21, d23}, [r0]
115115 %tmp0 = bitcast i16* %A to i8*
116 %tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 1)
116 %tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16.p0i8(i8* %tmp0, i32 1)
117117 %tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0
118118 %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 2
119119 %tmp4 = add <8 x i16> %tmp2, %tmp3
127127 ;CHECK: vld4.16 {d17, d19, d21, d23}, [r1:64]!
128128 %A = load i16*, i16** %ptr
129129 %tmp0 = bitcast i16* %A to i8*
130 %tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 8)
130 %tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16.p0i8(i8* %tmp0, i32 8)
131131 %tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0
132132 %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 2
133133 %tmp4 = add <8 x i16> %tmp2, %tmp3
141141 ;CHECK: vld4.32
142142 ;CHECK: vld4.32
143143 %tmp0 = bitcast i32* %A to i8*
144 %tmp1 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32(i8* %tmp0, i32 1)
144 %tmp1 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32.p0i8(i8* %tmp0, i32 1)
145145 %tmp2 = extractvalue %struct.__neon_int32x4x4_t %tmp1, 0
146146 %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp1, 2
147147 %tmp4 = add <4 x i32> %tmp2, %tmp3
153153 ;CHECK: vld4.32
154154 ;CHECK: vld4.32
155155 %tmp0 = bitcast float* %A to i8*
156 %tmp1 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32(i8* %tmp0, i32 1)
156 %tmp1 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32.p0i8(i8* %tmp0, i32 1)
157157 %tmp2 = extractvalue %struct.__neon_float32x4x4_t %tmp1, 0
158158 %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp1, 2
159159 %tmp4 = fadd <4 x float> %tmp2, %tmp3
160160 ret <4 x float> %tmp4
161161 }
162162
163 declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8*, i32) nounwind readonly
164 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8*, i32) nounwind readonly
165 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8*, i32) nounwind readonly
166 declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32(i8*, i32) nounwind readonly
167 declare %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8*, i32) nounwind readonly
163 declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8*, i32) nounwind readonly
164 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16.p0i8(i8*, i32) nounwind readonly
165 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32.p0i8(i8*, i32) nounwind readonly
166 declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32.p0i8(i8*, i32) nounwind readonly
167 declare %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64.p0i8(i8*, i32) nounwind readonly
168168
169 declare %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8*, i32) nounwind readonly
170 declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8*, i32) nounwind readonly
171 declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32(i8*, i32) nounwind readonly
172 declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32(i8*, i32) nounwind readonly
169 declare %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8.p0i8(i8*, i32) nounwind readonly
170 declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16.p0i8(i8*, i32) nounwind readonly
171 declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32.p0i8(i8*, i32) nounwind readonly
172 declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32.p0i8(i8*, i32) nounwind readonly
6565 ;CHECK-LABEL: vld2dupi8:
6666 ;Check the (default) alignment value.
6767 ;CHECK: vld2.8 {d16[], d17[]}, [r0]
68 %tmp0 = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
68 %tmp0 = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %A, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
6969 %tmp1 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 0
7070 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
7171 %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 1
7979 ;Check that a power-of-two alignment smaller than the total size of the memory
8080 ;being loaded is ignored.
8181 ;CHECK: vld2.16 {d16[], d17[]}, [r0]
82 %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
82 %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
8383 %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
8484 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
8585 %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
9494 ;CHECK: vld2.16 {d16[], d17[]}, [r1]!
9595 %A = load i16*, i16** %ptr
9696 %A2 = bitcast i16* %A to i8*
97 %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %A2, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
97 %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %A2, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
9898 %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
9999 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
100100 %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
109109 ;CHECK-LABEL: vld2dupi32:
110110 ;Check the alignment value. Max for this instruction is 64 bits:
111111 ;CHECK: vld2.32 {d16[], d17[]}, [r0:64]
112 %tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
112 %tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
113113 %tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0
114114 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
115115 %tmp3 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 1
118118 ret <2 x i32> %tmp5
119119 }
120120
121 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
122 declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
123 declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
121 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
122 declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
123 declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
124124
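The vld2dup tests above build a load-and-duplicate from pieces: a vld2lane into undef vectors at lane 0 followed by a zeroinitializer shufflevector splat, which the backend is expected to fold into a single vld2.{8,16,32} {d16[], d17[]} form. A self-contained sketch of the idiom (hypothetical function name):

%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }

declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly

define <8 x i8> @dup2_sketch(i8* %p) {
  ; load one pair of bytes into lane 0 of two otherwise-undef vectors
  %s = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %p, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
  %v0 = extractvalue %struct.__neon_int8x8x2_t %s, 0
  ; splat lane 0 across all lanes
  %d0 = shufflevector <8 x i8> %v0, <8 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %d0
}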
125125 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
126126 %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
130130 ;CHECK-LABEL: vld3dupi8_update:
131131 ;CHECK: vld3.8 {d16[], d17[], d18[]}, [r2], r1
132132 %A = load i8*, i8** %ptr
133 %tmp0 = tail call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 8)
133 %tmp0 = tail call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 8)
134134 %tmp1 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 0
135135 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
136136 %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 1
148148 ;CHECK-LABEL: vld3dupi16:
149149 ;Check the (default) alignment value. VLD3 does not support alignment.
150150 ;CHECK: vld3.16 {d16[], d17[], d18[]}, [r0]
151 %tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8)
151 %tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8)
152152 %tmp1 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 0
153153 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
154154 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 1
160160 ret <4 x i16> %tmp8
161161 }
162162
163 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
164 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
163 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
164 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
165165
166166 %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
167167 %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
172172 ;CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]!
173173 %A = load i16*, i16** %ptr
174174 %A2 = bitcast i16* %A to i8*
175 %tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %A2, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
175 %tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* %A2, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
176176 %tmp1 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 0
177177 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
178178 %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 1
194194 ;Check the alignment value. An 8-byte alignment is allowed here even though
195195 ;it is smaller than the total size of the memory being loaded.
196196 ;CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r0:64]
197 %tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8)
197 %tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8)
198198 %tmp1 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 0
199199 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
200200 %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 1
209209 ret <2 x i32> %tmp11
210210 }
211211
212 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
213 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
212 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
213 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
101101 ;Check the alignment value. Max for this instruction is 16 bits:
102102 ;CHECK: vld2.8 {d16[1], d17[1]}, [r0:16]
103103 %tmp1 = load <8 x i8>, <8 x i8>* %B
104 %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
104 %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
105105 %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
106106 %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
107107 %tmp5 = add <8 x i8> %tmp3, %tmp4
114114 ;CHECK: vld2.16 {d16[1], d17[1]}, [r0:32]
115115 %tmp0 = bitcast i16* %A to i8*
116116 %tmp1 = load <4 x i16>, <4 x i16>* %B
117 %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
117 %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
118118 %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
119119 %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
120120 %tmp5 = add <4 x i16> %tmp3, %tmp4
126126 ;CHECK: vld2.32
127127 %tmp0 = bitcast i32* %A to i8*
128128 %tmp1 = load <2 x i32>, <2 x i32>* %B
129 %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
129 %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
130130 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
131131 %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
132132 %tmp5 = add <2 x i32> %tmp3, %tmp4
140140 %A = load i32*, i32** %ptr
141141 %tmp0 = bitcast i32* %A to i8*
142142 %tmp1 = load <2 x i32>, <2 x i32>* %B
143 %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
143 %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
144144 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
145145 %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
146146 %tmp5 = add <2 x i32> %tmp3, %tmp4
154154 ;CHECK: vld2.32
155155 %tmp0 = bitcast float* %A to i8*
156156 %tmp1 = load <2 x float>, <2 x float>* %B
157 %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
157 %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
158158 %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
159159 %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
160160 %tmp5 = fadd <2 x float> %tmp3, %tmp4
167167 ;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
168168 %tmp0 = bitcast i16* %A to i8*
169169 %tmp1 = load <8 x i16>, <8 x i16>* %B
170 %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
170 %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
171171 %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
172172 %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
173173 %tmp5 = add <8 x i16> %tmp3, %tmp4
180180 ;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}:64]
181181 %tmp0 = bitcast i32* %A to i8*
182182 %tmp1 = load <4 x i32>, <4 x i32>* %B
183 %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
183 %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
184184 %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
185185 %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
186186 %tmp5 = add <4 x i32> %tmp3, %tmp4
192192 ;CHECK: vld2.32
193193 %tmp0 = bitcast float* %A to i8*
194194 %tmp1 = load <4 x float>, <4 x float>* %B
195 %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
195 %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
196196 %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
197197 %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
198198 %tmp5 = fadd <4 x float> %tmp3, %tmp4
199199 ret <4 x float> %tmp5
200200 }
201201
202 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
203 declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
204 declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
205 declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly
206
207 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
208 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
209 declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly
202 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
203 declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
204 declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
205 declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly
206
207 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
208 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
209 declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly
210210
211211 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
212212 %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
221221 ;CHECK-LABEL: vld3lanei8:
222222 ;CHECK: vld3.8
223223 %tmp1 = load <8 x i8>, <8 x i8>* %B
224 %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
224 %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
225225 %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
226226 %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
227227 %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
236236 ;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
237237 %tmp0 = bitcast i16* %A to i8*
238238 %tmp1 = load <4 x i16>, <4 x i16>* %B
239 %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
239 %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
240240 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
241241 %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
242242 %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
250250 ;CHECK: vld3.32
251251 %tmp0 = bitcast i32* %A to i8*
252252 %tmp1 = load <2 x i32>, <2 x i32>* %B
253 %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
253 %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
254254 %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
255255 %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
256256 %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
264264 ;CHECK: vld3.32
265265 %tmp0 = bitcast float* %A to i8*
266266 %tmp1 = load <2 x float>, <2 x float>* %B
267 %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
267 %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
268268 %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
269269 %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
270270 %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
279279 ;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
280280 %tmp0 = bitcast i16* %A to i8*
281281 %tmp1 = load <8 x i16>, <8 x i16>* %B
282 %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
282 %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
283283 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
284284 %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
285285 %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
295295 %A = load i16*, i16** %ptr
296296 %tmp0 = bitcast i16* %A to i8*
297297 %tmp1 = load <8 x i16>, <8 x i16>* %B
298 %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
298 %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
299299 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
300300 %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
301301 %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
311311 ;CHECK: vld3.32
312312 %tmp0 = bitcast i32* %A to i8*
313313 %tmp1 = load <4 x i32>, <4 x i32>* %B
314 %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
314 %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
315315 %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
316316 %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
317317 %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
325325 ;CHECK: vld3.32
326326 %tmp0 = bitcast float* %A to i8*
327327 %tmp1 = load <4 x float>, <4 x float>* %B
328 %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
328 %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
329329 %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
330330 %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
331331 %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
334334 ret <4 x float> %tmp7
335335 }
336336
337 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
338 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
339 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
340 declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
341
342 declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
343 declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
344 declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
337 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
338 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
339 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
340 declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
341
342 declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
343 declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
344 declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
345345
346346 %struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
347347 %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
357357 ;Check the alignment value. Max for this instruction is 32 bits:
358358 ;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}:32]
359359 %tmp1 = load <8 x i8>, <8 x i8>* %B
360 %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
360 %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
361361 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
362362 %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
363363 %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
374374 ;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:32]!
375375 %A = load i8*, i8** %ptr
376376 %tmp1 = load <8 x i8>, <8 x i8>* %B
377 %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
377 %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
378378 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
379379 %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
380380 %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
394394 ;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
395395 %tmp0 = bitcast i16* %A to i8*
396396 %tmp1 = load <4 x i16>, <4 x i16>* %B
397 %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
397 %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
398398 %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
399399 %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
400400 %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
412412 ;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:64]
413413 %tmp0 = bitcast i32* %A to i8*
414414 %tmp1 = load <2 x i32>, <2 x i32>* %B
415 %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
415 %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
416416 %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
417417 %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
418418 %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
428428 ;CHECK: vld4.32
429429 %tmp0 = bitcast float* %A to i8*
430430 %tmp1 = load <2 x float>, <2 x float>* %B
431 %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
431 %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
432432 %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
433433 %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
434434 %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
445445 ;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}:64]
446446 %tmp0 = bitcast i16* %A to i8*
447447 %tmp1 = load <8 x i16>, <8 x i16>* %B
448 %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
448 %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
449449 %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
450450 %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
451451 %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
462462 ;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
463463 %tmp0 = bitcast i32* %A to i8*
464464 %tmp1 = load <4 x i32>, <4 x i32>* %B
465 %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
465 %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
466466 %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
467467 %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
468468 %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
478478 ;CHECK: vld4.32
479479 %tmp0 = bitcast float* %A to i8*
480480 %tmp1 = load <4 x float>, <4 x float>* %B
481 %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
481 %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
482482 %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
483483 %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
484484 %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
489489 ret <4 x float> %tmp9
490490 }
491491
492 declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
493 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
494 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
495 declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
496
497 declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
498 declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
499 declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
492 declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
493 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
494 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
495 declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
496
497 declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
498 declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
499 declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
500500
501501 ; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
502502 ; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
510510 %tmp65 = shl i128 %tmp64, 64
511511 %ins67 = or i128 %tmp65, 0
512512 %tmp78 = bitcast i128 %ins67 to <8 x i16>
513 %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
513 %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
514514 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
515515 %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
516516 %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
392392 %sub.i = sub <4 x i32> %add.i185, zeroinitializer
393393 %add.i = add <4 x i32> %sub.i, zeroinitializer
394394 %vmovn.i = trunc <4 x i32> %add.i to <4 x i16>
395 tail call void @llvm.arm.neon.vst1.v4i16(i8* undef, <4 x i16> %vmovn.i, i32 2)
395 tail call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* undef, <4 x i16> %vmovn.i, i32 2)
396396 unreachable
397397 }
398398
399 declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32) nounwind
399 declare void @llvm.arm.neon.vst1.p0i8.v4i16(i8*, <4 x i16>, i32) nounwind
446446 %0 = trunc i32 %mul to i8
447447 %1 = insertelement <8 x i8> undef, i8 %0, i32 0
448448 %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
449 %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1)
449 %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %src, i32 1)
450450 %4 = bitcast <16 x i8> %3 to <2 x double>
451451 %5 = extractelement <2 x double> %4, i32 1
452452 %6 = bitcast double %5 to <8 x i8>
458458 %12 = add <8 x i16> %7, %11
459459 %13 = mul <8 x i16> %12, %8
460460 %14 = bitcast i16* %dst to i8*
461 tail call void @llvm.arm.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
461 tail call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %14, <8 x i16> %13, i32 2)
462462 ret void
463463 }
464464
465 declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly
466
467 declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
465 declare <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8*, i32) nounwind readonly
466
467 declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
468468
469469 ; Take advantage of the Cortex-A8 multiplier accumulator forwarding.
470470
479479 %0 = trunc i32 %mul to i8
480480 %1 = insertelement <8 x i8> undef, i8 %0, i32 0
481481 %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
482 %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1)
482 %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %src, i32 1)
483483 %4 = bitcast <16 x i8> %3 to <2 x double>
484484 %5 = extractelement <2 x double> %4, i32 1
485485 %6 = bitcast double %5 to <8 x i8>
501501 %0 = trunc i32 %mul to i8
502502 %1 = insertelement <8 x i8> undef, i8 %0, i32 0
503503 %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
504 %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1)
504 %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %src, i32 1)
505505 %4 = bitcast <16 x i8> %3 to <2 x double>
506506 %5 = extractelement <2 x double> %4, i32 1
507507 %6 = bitcast double %5 to <8 x i8>
558558
559559 for.body33: ; preds = %for.body33, %for.body33.lr.ph
560560 %add45 = add i32 undef, undef
561 %vld155 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* undef, i32 1)
561 %vld155 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* undef, i32 1)
562562 %0 = load i32*, i32** undef, align 4
563563 %shuffle.i250 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
564564 %1 = bitcast <1 x i64> %shuffle.i250 to <8 x i8>
44 ;Check the alignment value. Max for this instruction is 64 bits:
55 ;CHECK: vst1.8 {d16}, [r0:64]
66 %tmp1 = load <8 x i8>, <8 x i8>* %B
7 call void @llvm.arm.neon.vst1.v8i8(i8* %A, <8 x i8> %tmp1, i32 16)
7 call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, i32 16)
88 ret void
99 }
1010
1313 ;CHECK: vst1.16
1414 %tmp0 = bitcast i16* %A to i8*
1515 %tmp1 = load <4 x i16>, <4 x i16>* %B
16 call void @llvm.arm.neon.vst1.v4i16(i8* %tmp0, <4 x i16> %tmp1, i32 1)
16 call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, i32 1)
1717 ret void
1818 }
1919
2222 ;CHECK: vst1.32
2323 %tmp0 = bitcast i32* %A to i8*
2424 %tmp1 = load <2 x i32>, <2 x i32>* %B
25 call void @llvm.arm.neon.vst1.v2i32(i8* %tmp0, <2 x i32> %tmp1, i32 1)
25 call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, i32 1)
2626 ret void
2727 }
2828
3131 ;CHECK: vst1.32
3232 %tmp0 = bitcast float* %A to i8*
3333 %tmp1 = load <2 x float>, <2 x float>* %B
34 call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %tmp1, i32 1)
34 call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, i32 1)
3535 ret void
3636 }
3737
4242 %A = load float*, float** %ptr
4343 %tmp0 = bitcast float* %A to i8*
4444 %tmp1 = load <2 x float>, <2 x float>* %B
45 call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %tmp1, i32 1)
45 call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, i32 1)
4646 %tmp2 = getelementptr float, float* %A, i32 2
4747 store float* %tmp2, float** %ptr
4848 ret void
5353 ;CHECK: vst1.64
5454 %tmp0 = bitcast i64* %A to i8*
5555 %tmp1 = load <1 x i64>, <1 x i64>* %B
56 call void @llvm.arm.neon.vst1.v1i64(i8* %tmp0, <1 x i64> %tmp1, i32 1)
56 call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, i32 1)
5757 ret void
5858 }
5959
6262 ;Check the alignment value. Max for this instruction is 128 bits:
6363 ;CHECK: vst1.8 {d16, d17}, [r0:64]
6464 %tmp1 = load <16 x i8>, <16 x i8>* %B
65 call void @llvm.arm.neon.vst1.v16i8(i8* %A, <16 x i8> %tmp1, i32 8)
65 call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %A, <16 x i8> %tmp1, i32 8)
6666 ret void
6767 }
6868
7272 ;CHECK: vst1.16 {d16, d17}, [r0:128]
7373 %tmp0 = bitcast i16* %A to i8*
7474 %tmp1 = load <8 x i16>, <8 x i16>* %B
75 call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 32)
75 call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 32)
7676 ret void
7777 }
7878
8383 %A = load i16*, i16** %ptr
8484 %tmp0 = bitcast i16* %A to i8*
8585 %tmp1 = load <8 x i16>, <8 x i16>* %B
86 call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 8)
86 call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 8)
8787 %tmp2 = getelementptr i16, i16* %A, i32 %inc
8888 store i16* %tmp2, i16** %ptr
8989 ret void
9494 ;CHECK: vst1.32
9595 %tmp0 = bitcast i32* %A to i8*
9696 %tmp1 = load <4 x i32>, <4 x i32>* %B
97 call void @llvm.arm.neon.vst1.v4i32(i8* %tmp0, <4 x i32> %tmp1, i32 1)
97 call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, i32 1)
9898 ret void
9999 }
100100
103103 ;CHECK: vst1.32
104104 %tmp0 = bitcast float* %A to i8*
105105 %tmp1 = load <4 x float>, <4 x float>* %B
106 call void @llvm.arm.neon.vst1.v4f32(i8* %tmp0, <4 x float> %tmp1, i32 1)
106 call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, i32 1)
107107 ret void
108108 }
109109
112112 ;CHECK: vst1.64
113113 %tmp0 = bitcast i64* %A to i8*
114114 %tmp1 = load <2 x i64>, <2 x i64>* %B
115 call void @llvm.arm.neon.vst1.v2i64(i8* %tmp0, <2 x i64> %tmp1, i32 1)
115 call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %tmp0, <2 x i64> %tmp1, i32 1)
116116 ret void
117117 }
118118
121121 ;CHECK: vst1.64
122122 %tmp0 = bitcast double* %A to i8*
123123 %tmp1 = load <2 x double>, <2 x double>* %B
124 call void @llvm.arm.neon.vst1.v2f64(i8* %tmp0, <2 x double> %tmp1, i32 1)
124 call void @llvm.arm.neon.vst1.p0i8.v2f64(i8* %tmp0, <2 x double> %tmp1, i32 1)
125125 ret void
126126 }
127127
128 declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) nounwind
129 declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32) nounwind
130 declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind
131 declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) nounwind
132 declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32) nounwind
128 declare void @llvm.arm.neon.vst1.p0i8.v8i8(i8*, <8 x i8>, i32) nounwind
129 declare void @llvm.arm.neon.vst1.p0i8.v4i16(i8*, <4 x i16>, i32) nounwind
130 declare void @llvm.arm.neon.vst1.p0i8.v2i32(i8*, <2 x i32>, i32) nounwind
131 declare void @llvm.arm.neon.vst1.p0i8.v2f32(i8*, <2 x float>, i32) nounwind
132 declare void @llvm.arm.neon.vst1.p0i8.v1i64(i8*, <1 x i64>, i32) nounwind
133133
134 declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind
135 declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
136 declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32) nounwind
137 declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
138 declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32) nounwind
139 declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32) nounwind
134 declare void @llvm.arm.neon.vst1.p0i8.v16i8(i8*, <16 x i8>, i32) nounwind
135 declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
136 declare void @llvm.arm.neon.vst1.p0i8.v4i32(i8*, <4 x i32>, i32) nounwind
137 declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
138 declare void @llvm.arm.neon.vst1.p0i8.v2i64(i8*, <2 x i64>, i32) nounwind
139 declare void @llvm.arm.neon.vst1.p0i8.v2f64(i8*, <2 x double>, i32) nounwind
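Note the operand-order difference in the mangled names: the stores encode the pointer type before the vector type (vst1.p0i8.v8i16), since the pointer is the first overloaded parameter, while the loads encode the overloaded result vector first (vld1.v16i8.p0i8). A hypothetical declaration for a store through an address space 1 pointer, shown only to illustrate the scheme and not part of this test:

; illustration only: the same store intrinsic specialized for addrspace(1)
declare void @llvm.arm.neon.vst1.p1i8.v8i16(i8 addrspace(1)*, <8 x i16>, i32) nounwind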
44 ;Check the alignment value. Max for this instruction is 128 bits:
55 ;CHECK: vst2.8 {d16, d17}, [r0:64]
66 %tmp1 = load <8 x i8>, <8 x i8>* %B
7 call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8)
7 call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8)
88 ret void
99 }
1010
1414 ;CHECK: vst2.8 {d16, d17}, [r1], r2
1515 %A = load i8*, i8** %ptr
1616 %tmp1 = load <8 x i8>, <8 x i8>* %B
17 call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 4)
17 call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 4)
1818 %tmp2 = getelementptr i8, i8* %A, i32 %inc
1919 store i8* %tmp2, i8** %ptr
2020 ret void
2626 ;CHECK: vst2.16 {d16, d17}, [r0:128]
2727 %tmp0 = bitcast i16* %A to i8*
2828 %tmp1 = load <4 x i16>, <4 x i16>* %B
29 call void @llvm.arm.neon.vst2.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 32)
29 call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 32)
3030 ret void
3131 }
3232
3535 ;CHECK: vst2.32
3636 %tmp0 = bitcast i32* %A to i8*
3737 %tmp1 = load <2 x i32>, <2 x i32>* %B
38 call void @llvm.arm.neon.vst2.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
38 call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
3939 ret void
4040 }
4141
4444 ;CHECK: vst2.32
4545 %tmp0 = bitcast float* %A to i8*
4646 %tmp1 = load <2 x float>, <2 x float>* %B
47 call void @llvm.arm.neon.vst2.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
47 call void @llvm.arm.neon.vst2.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
4848 ret void
4949 }
5050
5454 ;CHECK: vst1.64 {d16, d17}, [r0:128]
5555 %tmp0 = bitcast i64* %A to i8*
5656 %tmp1 = load <1 x i64>, <1 x i64>* %B
57 call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 32)
57 call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 32)
5858 ret void
5959 }
6060
6565 %A = load i64*, i64** %ptr
6666 %tmp0 = bitcast i64* %A to i8*
6767 %tmp1 = load <1 x i64>, <1 x i64>* %B
68 call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 8)
68 call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 8)
6969 %tmp2 = getelementptr i64, i64* %A, i32 2
7070 store i64* %tmp2, i64** %ptr
7171 ret void
7676 ;Check the alignment value. Max for this instruction is 256 bits:
7777 ;CHECK: vst2.8 {d16, d17, d18, d19}, [r0:64]
7878 %tmp1 = load <16 x i8>, <16 x i8>* %B
79 call void @llvm.arm.neon.vst2.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 8)
79 call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 8)
8080 ret void
8181 }
8282
8686 ;CHECK: vst2.16 {d16, d17, d18, d19}, [r0:128]
8787 %tmp0 = bitcast i16* %A to i8*
8888 %tmp1 = load <8 x i16>, <8 x i16>* %B
89 call void @llvm.arm.neon.vst2.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 16)
89 call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 16)
9090 ret void
9191 }
9292
9696 ;CHECK: vst2.32 {d16, d17, d18, d19}, [r0:256]
9797 %tmp0 = bitcast i32* %A to i8*
9898 %tmp1 = load <4 x i32>, <4 x i32>* %B
99 call void @llvm.arm.neon.vst2.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 64)
99 call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 64)
100100 ret void
101101 }
102102
105105 ;CHECK: vst2.32
106106 %tmp0 = bitcast float* %A to i8*
107107 %tmp1 = load <4 x float>, <4 x float>* %B
108 call void @llvm.arm.neon.vst2.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
108 call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
109109 ret void
110110 }
111111
113113 ;CHECK-LABEL: vst2update:
114114 ;CHECK: vst2.16 {d16, d17}, [r0]!
115115 %tmp1 = load <4 x i16>, <4 x i16>* %B
116 tail call void @llvm.arm.neon.vst2.v4i16(i8* %out, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 2)
116 tail call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* %out, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 2)
117117 %t5 = getelementptr inbounds i8, i8* %out, i32 16
118118 ret i8* %t5
119119 }
122122 ;CHECK-LABEL: vst2update2:
123123 ;CHECK: vst2.32 {d16, d17, d18, d19}, [r0]!
124124 %tmp1 = load <4 x float>, <4 x float>* %this
125 call void @llvm.arm.neon.vst2.v4f32(i8* %out, <4 x float> %tmp1, <4 x float> %tmp1, i32 4) nounwind
125 call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %out, <4 x float> %tmp1, <4 x float> %tmp1, i32 4) nounwind
126126 %tmp2 = getelementptr inbounds i8, i8* %out, i32 32
127127 ret i8* %tmp2
128128 }
129129
130 declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
131 declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32) nounwind
132 declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind
133 declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32) nounwind
134 declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32) nounwind
130 declare void @llvm.arm.neon.vst2.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
131 declare void @llvm.arm.neon.vst2.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, i32) nounwind
132 declare void @llvm.arm.neon.vst2.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind
133 declare void @llvm.arm.neon.vst2.p0i8.v2f32(i8*, <2 x float>, <2 x float>, i32) nounwind
134 declare void @llvm.arm.neon.vst2.p0i8.v1i64(i8*, <1 x i64>, <1 x i64>, i32) nounwind
135135
136 declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32) nounwind
137 declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind
138 declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind
139 declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
136 declare void @llvm.arm.neon.vst2.p0i8.v16i8(i8*, <16 x i8>, <16 x i8>, i32) nounwind
137 declare void @llvm.arm.neon.vst2.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind
138 declare void @llvm.arm.neon.vst2.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind
139 declare void @llvm.arm.neon.vst2.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
55 ;This test runs at -O0 so do not check for specific register numbers.
66 ;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]
77 %tmp1 = load <8 x i8>, <8 x i8>* %B
8 call void @llvm.arm.neon.vst3.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 32)
8 call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 32)
99 ret void
1010 }
1111
1414 ;CHECK: vst3.16
1515 %tmp0 = bitcast i16* %A to i8*
1616 %tmp1 = load <4 x i16>, <4 x i16>* %B
17 call void @llvm.arm.neon.vst3.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
17 call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
1818 ret void
1919 }
2020
2323 ;CHECK: vst3.32
2424 %tmp0 = bitcast i32* %A to i8*
2525 %tmp1 = load <2 x i32>, <2 x i32>* %B
26 call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
26 call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
2727 ret void
2828 }
2929
3434 %A = load i32*, i32** %ptr
3535 %tmp0 = bitcast i32* %A to i8*
3636 %tmp1 = load <2 x i32>, <2 x i32>* %B
37 call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
37 call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
3838 %tmp2 = getelementptr i32, i32* %A, i32 6
3939 store i32* %tmp2, i32** %ptr
4040 ret void
4545 ;CHECK: vst3.32
4646 %tmp0 = bitcast float* %A to i8*
4747 %tmp1 = load <2 x float>, <2 x float>* %B
48 call void @llvm.arm.neon.vst3.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
48 call void @llvm.arm.neon.vst3.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
4949 ret void
5050 }
5151
5656 ;CHECK: vst1.64 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]
5757 %tmp0 = bitcast i64* %A to i8*
5858 %tmp1 = load <1 x i64>, <1 x i64>* %B
59 call void @llvm.arm.neon.vst3.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 16)
59 call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 16)
6060 ret void
6161 }
6262
6666 %A = load i64*, i64** %ptr
6767 %tmp0 = bitcast i64* %A to i8*
6868 %tmp1 = load <1 x i64>, <1 x i64>* %B
69 call void @llvm.arm.neon.vst3.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
69 call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
7070 %tmp2 = getelementptr i64, i64* %A, i32 3
7171 store i64* %tmp2, i64** %ptr
7272 ret void
7979 ;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]!
8080 ;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]
8181 %tmp1 = load <16 x i8>, <16 x i8>* %B
82 call void @llvm.arm.neon.vst3.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 32)
82 call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 32)
8383 ret void
8484 }
8585
8989 ;CHECK: vst3.16
9090 %tmp0 = bitcast i16* %A to i8*
9191 %tmp1 = load <8 x i16>, <8 x i16>* %B
92 call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
92 call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
9393 ret void
9494 }
9595
101101 %A = load i16*, i16** %ptr
102102 %tmp0 = bitcast i16* %A to i8*
103103 %tmp1 = load <8 x i16>, <8 x i16>* %B
104 call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
104 call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
105105 %tmp2 = getelementptr i16, i16* %A, i32 24
106106 store i16* %tmp2, i16** %ptr
107107 ret void
113113 ;CHECK: vst3.32
114114 %tmp0 = bitcast i32* %A to i8*
115115 %tmp1 = load <4 x i32>, <4 x i32>* %B
116 call void @llvm.arm.neon.vst3.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1)
116 call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1)
117117 ret void
118118 }
119119
123123 ;CHECK: vst3.32
124124 %tmp0 = bitcast float* %A to i8*
125125 %tmp1 = load <4 x float>, <4 x float>* %B
126 call void @llvm.arm.neon.vst3.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
126 call void @llvm.arm.neon.vst3.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
127127 ret void
128128 }
129129
130 declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
131 declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
132 declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
133 declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) nounwind
134 declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32) nounwind
130 declare void @llvm.arm.neon.vst3.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
131 declare void @llvm.arm.neon.vst3.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
132 declare void @llvm.arm.neon.vst3.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
133 declare void @llvm.arm.neon.vst3.p0i8.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) nounwind
134 declare void @llvm.arm.neon.vst3.p0i8.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32) nounwind
135135
136 declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32) nounwind
137 declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind
138 declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
139 declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) nounwind
136 declare void @llvm.arm.neon.vst3.p0i8.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32) nounwind
137 declare void @llvm.arm.neon.vst3.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind
138 declare void @llvm.arm.neon.vst3.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
139 declare void @llvm.arm.neon.vst3.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) nounwind
44 ;Check the alignment value. Max for this instruction is 256 bits:
55 ;CHECK: vst4.8 {d16, d17, d18, d19}, [r0:64]
66 %tmp1 = load <8 x i8>, <8 x i8>* %B
7 call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8)
7 call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8)
88 ret void
99 }
1010
1414 ;CHECK: vst4.8 {d16, d17, d18, d19}, [r1:128], r2
1515 %A = load i8*, i8** %ptr
1616 %tmp1 = load <8 x i8>, <8 x i8>* %B
17 call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 16)
17 call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 16)
1818 %tmp2 = getelementptr i8, i8* %A, i32 %inc
1919 store i8* %tmp2, i8** %ptr
2020 ret void
2626 ;CHECK: vst4.16 {d16, d17, d18, d19}, [r0:128]
2727 %tmp0 = bitcast i16* %A to i8*
2828 %tmp1 = load <4 x i16>, <4 x i16>* %B
29 call void @llvm.arm.neon.vst4.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 16)
29 call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 16)
3030 ret void
3131 }
3232
3636 ;CHECK: vst4.32 {d16, d17, d18, d19}, [r0:256]
3737 %tmp0 = bitcast i32* %A to i8*
3838 %tmp1 = load <2 x i32>, <2 x i32>* %B
39 call void @llvm.arm.neon.vst4.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 32)
39 call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 32)
4040 ret void
4141 }
4242
4545 ;CHECK: vst4.32
4646 %tmp0 = bitcast float* %A to i8*
4747 %tmp1 = load <2 x float>, <2 x float>* %B
48 call void @llvm.arm.neon.vst4.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
48 call void @llvm.arm.neon.vst4.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
4949 ret void
5050 }
5151
5555 ;CHECK: vst1.64 {d16, d17, d18, d19}, [r0:256]
5656 %tmp0 = bitcast i64* %A to i8*
5757 %tmp1 = load <1 x i64>, <1 x i64>* %B
58 call void @llvm.arm.neon.vst4.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 64)
58 call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 64)
5959 ret void
6060 }
6161
6565 %A = load i64*, i64** %ptr
6666 %tmp0 = bitcast i64* %A to i8*
6767 %tmp1 = load <1 x i64>, <1 x i64>* %B
68 call void @llvm.arm.neon.vst4.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
68 call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
6969 %tmp2 = getelementptr i64, i64* %A, i32 4
7070 store i64* %tmp2, i64** %ptr
7171 ret void
7777 ;CHECK: vst4.8 {d16, d18, d20, d22}, [r0:256]!
7878 ;CHECK: vst4.8 {d17, d19, d21, d23}, [r0:256]
7979 %tmp1 = load <16 x i8>, <16 x i8>* %B
80 call void @llvm.arm.neon.vst4.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 64)
80 call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 64)
8181 ret void
8282 }
8383
8888 ;CHECK: vst4.16 {d17, d19, d21, d23}, [r0]
8989 %tmp0 = bitcast i16* %A to i8*
9090 %tmp1 = load <8 x i16>, <8 x i16>* %B
91 call void @llvm.arm.neon.vst4.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
91 call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
9292 ret void
9393 }
9494
9898 ;CHECK: vst4.32
9999 %tmp0 = bitcast i32* %A to i8*
100100 %tmp1 = load <4 x i32>, <4 x i32>* %B
101 call void @llvm.arm.neon.vst4.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1)
101 call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1)
102102 ret void
103103 }
104104
108108 ;CHECK: vst4.32
109109 %tmp0 = bitcast float* %A to i8*
110110 %tmp1 = load <4 x float>, <4 x float>* %B
111 call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
111 call void @llvm.arm.neon.vst4.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
112112 ret void
113113 }
114114
120120 %A = load float*, float** %ptr
121121 %tmp0 = bitcast float* %A to i8*
122122 %tmp1 = load <4 x float>, <4 x float>* %B
123 call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
123 call void @llvm.arm.neon.vst4.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
124124 %tmp2 = getelementptr float, float* %A, i32 16
125125 store float* %tmp2, float** %ptr
126126 ret void
127127 }
128128
129 declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
130 declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
131 declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
132 declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) nounwind
133 declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32) nounwind
129 declare void @llvm.arm.neon.vst4.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
130 declare void @llvm.arm.neon.vst4.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
131 declare void @llvm.arm.neon.vst4.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
132 declare void @llvm.arm.neon.vst4.p0i8.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) nounwind
133 declare void @llvm.arm.neon.vst4.p0i8.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32) nounwind
134134
135 declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32) nounwind
136 declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind
137 declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
138 declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) nounwind
135 declare void @llvm.arm.neon.vst4.p0i8.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32) nounwind
136 declare void @llvm.arm.neon.vst4.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind
137 declare void @llvm.arm.neon.vst4.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
138 declare void @llvm.arm.neon.vst4.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) nounwind
109109 ;Check the alignment value. Max for this instruction is 16 bits:
110110 ;CHECK: vst2.8 {d16[1], d17[1]}, [r0:16]
111111 %tmp1 = load <8 x i8>, <8 x i8>* %B
112 call void @llvm.arm.neon.vst2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
112 call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
113113 ret void
114114 }
115115
119119 ;CHECK: vst2.16 {d16[1], d17[1]}, [r0:32]
120120 %tmp0 = bitcast i16* %A to i8*
121121 %tmp1 = load <4 x i16>, <4 x i16>* %B
122 call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
122 call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
123123 ret void
124124 }
125125
130130 %A = load i16*, i16** %ptr
131131 %tmp0 = bitcast i16* %A to i8*
132132 %tmp1 = load <4 x i16>, <4 x i16>* %B
133 call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 2)
133 call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 2)
134134 %tmp2 = getelementptr i16, i16* %A, i32 %inc
135135 store i16* %tmp2, i16** %ptr
136136 ret void
141141 ;CHECK: vst2.32
142142 %tmp0 = bitcast i32* %A to i8*
143143 %tmp1 = load <2 x i32>, <2 x i32>* %B
144 call void @llvm.arm.neon.vst2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
144 call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
145145 ret void
146146 }
147147
150150 ;CHECK: vst2.32
151151 %tmp0 = bitcast float* %A to i8*
152152 %tmp1 = load <2 x float>, <2 x float>* %B
153 call void @llvm.arm.neon.vst2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
153 call void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
154154 ret void
155155 }
156156
160160 ;CHECK: vst2.16 {d17[1], d19[1]}, [r0]
161161 %tmp0 = bitcast i16* %A to i8*
162162 %tmp1 = load <8 x i16>, <8 x i16>* %B
163 call void @llvm.arm.neon.vst2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
163 call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
164164 ret void
165165 }
166166
170170 ;CHECK: vst2.32 {d17[0], d19[0]}, [r0:64]
171171 %tmp0 = bitcast i32* %A to i8*
172172 %tmp1 = load <4 x i32>, <4 x i32>* %B
173 call void @llvm.arm.neon.vst2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
173 call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
174174 ret void
175175 }
176176
179179 ;CHECK: vst2.32
180180 %tmp0 = bitcast float* %A to i8*
181181 %tmp1 = load <4 x float>, <4 x float>* %B
182 call void @llvm.arm.neon.vst2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 3, i32 1)
183 ret void
184 }
185
186 declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind
187 declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind
188 declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind
189 declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind
190
191 declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind
192 declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind
193 declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind
182 call void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 3, i32 1)
183 ret void
184 }
185
186 declare void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind
187 declare void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind
188 declare void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind
189 declare void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind
190
191 declare void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind
192 declare void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind
193 declare void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind
194194
195195 define void @vst3lanei8(i8* %A, <8 x i8>* %B) nounwind {
196196 ;CHECK-LABEL: vst3lanei8:
197197 ;CHECK: vst3.8
198198 %tmp1 = load <8 x i8>, <8 x i8>* %B
199 call void @llvm.arm.neon.vst3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
199 call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
200200 ret void
201201 }
202202
206206 ;CHECK: vst3.16 {d16[1], d17[1], d18[1]}, [r0]
207207 %tmp0 = bitcast i16* %A to i8*
208208 %tmp1 = load <4 x i16>, <4 x i16>* %B
209 call void @llvm.arm.neon.vst3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
209 call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
210210 ret void
211211 }
212212
215215 ;CHECK: vst3.32
216216 %tmp0 = bitcast i32* %A to i8*
217217 %tmp1 = load <2 x i32>, <2 x i32>* %B
218 call void @llvm.arm.neon.vst3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
218 call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
219219 ret void
220220 }
221221
224224 ;CHECK: vst3.32
225225 %tmp0 = bitcast float* %A to i8*
226226 %tmp1 = load <2 x float>, <2 x float>* %B
227 call void @llvm.arm.neon.vst3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
227 call void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
228228 ret void
229229 }
230230
234234 ;CHECK: vst3.16 {d17[2], d19[2], d21[2]}, [r0]
235235 %tmp0 = bitcast i16* %A to i8*
236236 %tmp1 = load <8 x i16>, <8 x i16>* %B
237 call void @llvm.arm.neon.vst3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6, i32 8)
237 call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6, i32 8)
238238 ret void
239239 }
240240
243243 ;CHECK: vst3.32
244244 %tmp0 = bitcast i32* %A to i8*
245245 %tmp1 = load <4 x i32>, <4 x i32>* %B
246 call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
246 call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
247247 ret void
248248 }
249249
254254 %A = load i32*, i32** %ptr
255255 %tmp0 = bitcast i32* %A to i8*
256256 %tmp1 = load <4 x i32>, <4 x i32>* %B
257 call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
257 call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
258258 %tmp2 = getelementptr i32, i32* %A, i32 3
259259 store i32* %tmp2, i32** %ptr
260260 ret void
265265 ;CHECK: vst3.32
266266 %tmp0 = bitcast float* %A to i8*
267267 %tmp1 = load <4 x float>, <4 x float>* %B
268 call void @llvm.arm.neon.vst3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
269 ret void
270 }
271
272 declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
273 declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
274 declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
275 declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind
276
277 declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
278 declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
279 declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind
268 call void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
269 ret void
270 }
271
272 declare void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
273 declare void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
274 declare void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
275 declare void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind
276
277 declare void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
278 declare void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
279 declare void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind
280280
281281
282282 define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind {
284284 ;Check the alignment value. Max for this instruction is 32 bits:
285285 ;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32]
286286 %tmp1 = load <8 x i8>, <8 x i8>* %B
287 call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
287 call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
288288 ret void
289289 }
290290
294294 ;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1:32]!
295295 %A = load i8*, i8** %ptr
296296 %tmp1 = load <8 x i8>, <8 x i8>* %B
297 call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
297 call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
298298 %tmp2 = getelementptr i8, i8* %A, i32 4
299299 store i8* %tmp2, i8** %ptr
300300 ret void
305305 ;CHECK: vst4.16
306306 %tmp0 = bitcast i16* %A to i8*
307307 %tmp1 = load <4 x i16>, <4 x i16>* %B
308 call void @llvm.arm.neon.vst4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
308 call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
309309 ret void
310310 }
311311
315315 ;CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128]
316316 %tmp0 = bitcast i32* %A to i8*
317317 %tmp1 = load <2 x i32>, <2 x i32>* %B
318 call void @llvm.arm.neon.vst4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 16)
318 call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 16)
319319 ret void
320320 }
321321
324324 ;CHECK: vst4.32
325325 %tmp0 = bitcast float* %A to i8*
326326 %tmp1 = load <2 x float>, <2 x float>* %B
327 call void @llvm.arm.neon.vst4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
327 call void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
328328 ret void
329329 }
330330
334334 ;CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0:64]
335335 %tmp0 = bitcast i16* %A to i8*
336336 %tmp1 = load <8 x i16>, <8 x i16>* %B
337 call void @llvm.arm.neon.vst4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 16)
337 call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 16)
338338 ret void
339339 }
340340
344344 ;CHECK: vst4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
345345 %tmp0 = bitcast i32* %A to i8*
346346 %tmp1 = load <4 x i32>, <4 x i32>* %B
347 call void @llvm.arm.neon.vst4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
347 call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
348348 ret void
349349 }
350350
353353 ;CHECK: vst4.32
354354 %tmp0 = bitcast float* %A to i8*
355355 %tmp1 = load <4 x float>, <4 x float>* %B
356 call void @llvm.arm.neon.vst4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
356 call void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
357357 ret void
358358 }
359359
364364 ret <8 x i16> %r
365365 }
366366
367 declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
368 declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
369 declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
370 declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind
371
372 declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
373 declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
374 declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind
367 declare void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
368 declare void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
369 declare void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
370 declare void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind
371
372 declare void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
373 declare void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
374 declare void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind
1414 %6 = bitcast i32* %sp3 to <4 x i32>* ; <<4 x i32>*> [#uses=1]
1515 %7 = load <4 x i32>, <4 x i32>* %6, align 16 ; <<4 x i32>> [#uses=1]
1616 %8 = bitcast i32* %dp to i8* ; <i8*> [#uses=1]
17 tail call void @llvm.arm.neon.vst4.v4i32(i8* %8, <4 x i32> %1, <4 x i32> %3, <4 x i32> %5, <4 x i32> %7, i32 1)
17 tail call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* %8, <4 x i32> %1, <4 x i32> %3, <4 x i32> %5, <4 x i32> %7, i32 1)
1818 ret void
1919 }
2020
21 declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
21 declare void @llvm.arm.neon.vst4.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
2222
2323 @sbuf = common global [16 x i32] zeroinitializer, align 16 ; <[16 x i32]*> [#uses=5]
2424 @dbuf = common global [16 x i32] zeroinitializer ; <[16 x i32]*> [#uses=2]
4444 %3 = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @sbuf, i32 0, i32 4) to <4 x i32>*), align 16 ; <<4 x i32>> [#uses=1]
4545 %4 = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @sbuf, i32 0, i32 8) to <4 x i32>*), align 16 ; <<4 x i32>> [#uses=1]
4646 %5 = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @sbuf, i32 0, i32 12) to <4 x i32>*), align 16 ; <<4 x i32>> [#uses=1]
47 tail call void @llvm.arm.neon.vst4.v4i32(i8* bitcast ([16 x i32]* @dbuf to i8*), <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, i32 1) nounwind
47 tail call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* bitcast ([16 x i32]* @dbuf to i8*), <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, i32 1) nounwind
4848 ret i32 0
4949 }
5050
5252 ; Make sure the DPair register class can spill.
5353 define void @pr12389(i8* %p) nounwind ssp {
5454 entry:
55 %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %p, i32 1)
55 %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %p, i32 1)
5656 tail call void asm sideeffect "", "~{q0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7},~{q8},~{q9},~{q10},~{q11},~{q12},~{q13},~{q14},~{q15}"() nounwind
57 tail call void @llvm.arm.neon.vst1.v4f32(i8* %p, <4 x float> %vld1, i32 1)
57 tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> %vld1, i32 1)
5858 ret void
5959 }
6060
61 declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
61 declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
6262
63 declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
63 declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
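For reference, a minimal sketch of the same load/store round trip through a pointer in a non-default address space (hypothetical; the function name, %p, and the use of address space 1 are assumptions for illustration, not part of this test suite):

define void @roundtrip_as1(i8 addrspace(1)* %p) nounwind {
  ; both names mangle the addrspace(1) pointer as p1i8
  %v = tail call <4 x float> @llvm.arm.neon.vld1.v4f32.p1i8(i8 addrspace(1)* %p, i32 1)
  tail call void @llvm.arm.neon.vst1.p1i8.v4f32(i8 addrspace(1)* %p, <4 x float> %v, i32 1)
  ret void
}

declare <4 x float> @llvm.arm.neon.vld1.v4f32.p1i8(i8 addrspace(1)*, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.p1i8.v4f32(i8 addrspace(1)*, <4 x float>, i32) nounwind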
6464
6565 ;
6666 ; When an strd is expanded into two str instructions, make sure the first str
5858 %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ]
5959 %tmp1 = shl i32 %indvar, 2
6060 %gep1 = getelementptr i8, i8* %ptr1, i32 %tmp1
61 %tmp2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %gep1, i32 1)
61 %tmp2 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %gep1, i32 1)
6262 %tmp3 = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> , <4 x float> %tmp2)
6363 %gep2 = getelementptr i8, i8* %ptr2, i32 %tmp1
64 call void @llvm.arm.neon.vst1.v4f32(i8* %gep2, <4 x float> %tmp3, i32 1)
64 call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %gep2, <4 x float> %tmp3, i32 1)
6565 %indvar.next = add i32 %indvar, 1
6666 %cond = icmp eq i32 %indvar.next, 10
6767 br i1 %cond, label %bb2, label %bb1
7272
7373 ; CHECK-NOT: LCPI1_0:
7474
75 declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
75 declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
7676
77 declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
77 declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
7878
7979 declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
8080
66 %quux = type { i32 (...)**, %baz*, i32 }
77 %quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo }
88
9 declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
9 declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
1010
1111 define void @aaa(%quuz* %this, i8* %block) {
1212 ; CHECK-LABEL: aaa:
1717 %aligned_vec = alloca <4 x float>, align 16
1818 %"alloca point" = bitcast i32 0 to i32
1919