llvm.org GIT mirror llvm / 80ce670
Support arbitrary addrspace pointers in masked load/store intrinsics

This is a resubmission of r263158. This patch fixes the problem that occurs when loop-vectorize tries to use the @llvm.masked.load/store intrinsics for a non-default addrspace pointer. It fails with a "Calling a function with a bad signature!" assertion in the CallInst constructor, because it tries to pass a non-default addrspace pointer to a pointer argument that has the default addrspace. The fix is to add the pointer type as another overloaded type of the @llvm.masked.load/store intrinsics.

Reviewed By: reames
Differential Revision: http://reviews.llvm.org/D17270

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@266086 91177308-0d34-0410-b5e6-96231b3b80d8

Artur Pilipenko
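For illustration (not part of the committed diff), a minimal sketch of the case the extra overload is meant to support: a masked load through an addrspace(1) pointer, using the two-type mangled name this patch introduces. The function name, operand values and alignment below are made up for the example:

    ; the second mangling suffix (p1v4i32) now encodes the pointer type, including its addrspace
    declare <4 x i32> @llvm.masked.load.v4i32.p1v4i32(<4 x i32> addrspace(1)*, i32, <4 x i1>, <4 x i32>)

    define <4 x i32> @illustrative_load_as1(<4 x i32> addrspace(1)* %p, <4 x i1> %mask, <4 x i32> %passthru) {
      %res = call <4 x i32> @llvm.masked.load.v4i32.p1v4i32(<4 x i32> addrspace(1)* %p, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
      ret <4 x i32> %res
    }

With the old single-overload naming (@llvm.masked.load.v4i32), the intrinsic declaration always used a default-addrspace pointer argument, so building this call hit the "bad signature" assertion described above.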
12 changed files with 339 additions and 206 deletions.
1139911399
1140011400 ::
1140111401
11402 declare <16 x float> @llvm.masked.load.v16f32 (<16 x float>* , i32 , <16 x i1> , <16 x float> )
11403 declare <2 x double> @llvm.masked.load.v2f64 (<2 x double>* , i32 , <2 x i1> , <2 x double> )
11402 declare <16 x float> @llvm.masked.load.v16f32.p0v16f32 (<16 x float>* , i32 , <16 x i1> , <16 x float> )
11403 declare <2 x double> @llvm.masked.load.v2f64.p0v2f64 (<2 x double>* , i32 , <2 x i1> , <2 x double> )
1140411404 ;; The data is a vector of pointers to double
11405 declare <8 x double*> @llvm.masked.load.v8p0f64 (<8 x double*>* , i32 , <8 x i1> , <8 x double*> )
11405 declare <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64 (<8 x double*>* , i32 , <8 x i1> , <8 x double*> )
1140611406 ;; The data is a vector of function pointers
11407 declare <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f (<8 x i32 ()*>* , i32 , <8 x i1> , <8 x i32 ()*> )
11407 declare <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f (<8 x i32 ()*>* , i32 , <8 x i1> , <8 x i32 ()*> )
1140811408
1140911409 Overview:
1141011410 """""""""
1142711427
1142811428 ::
1142911429
11430 %res = call <16 x float> @llvm.masked.load.v16f32 (<16 x float>* %ptr, i32 4, <16 x i1>%mask, <16 x float> %passthru)
11430 %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32 (<16 x float>* %ptr, i32 4, <16 x i1>%mask, <16 x float> %passthru)
1143111431
1143211432 ;; The result of the two following instructions is identical aside from potential memory access exception
1143311433 %loadlal = load <16 x float>, <16 x float>* %ptr, align 4
1144411444
1144511445 ::
1144611446
11447 declare void @llvm.masked.store.v8i32 (<8 x i32> , <8 x i32>* , i32 , <8 x i1> )
11448 declare void @llvm.masked.store.v16f32 (<16 x float> , <16 x float>* , i32 , <16 x i1> )
11447 declare void @llvm.masked.store.v8i32.p0v8i32 (<8 x i32> , <8 x i32>* , i32 , <8 x i1> )
11448 declare void @llvm.masked.store.v16f32.p0v16f32 (<16 x float> , <16 x float>* , i32 , <16 x i1> )
1144911449 ;; The data is a vector of pointers to double
11450 declare void @llvm.masked.store.v8p0f64 (<8 x double*> , <8 x double*>* , i32 , <8 x i1> )
11450 declare void @llvm.masked.store.v8p0f64.p0v8p0f64 (<8 x double*> , <8 x double*>* , i32 , <8 x i1> )
1145111451 ;; The data is a vector of function pointers
11452 declare void @llvm.masked.store.v4p0f_i32f (<4 x i32 ()*> , <4 x i32 ()*>* , i32 , <4 x i1> )
11452 declare void @llvm.masked.store.v4p0f_i32f.p0v4p0f_i32f (<4 x i32 ()*> , <4 x i32 ()*>* , i32 , <4 x i1> )
1145311453
1145411454 Overview:
1145511455 """""""""
1147011470
1147111471 ::
1147211472
11473 call void @llvm.masked.store.v16f32(<16 x float> %value, <16 x float>* %ptr, i32 4, <16 x i1> %mask)
11473 call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %value, <16 x float>* %ptr, i32 4, <16 x i1> %mask)
1147411474
1147511475 ;; The result of the following instructions is identical aside from potential data races and memory access exceptions
1147611476 %oldval = load <16 x float>, <16 x float>* %ptr, align 4
517517
518518 private:
519519 /// \brief Create a call to a masked intrinsic with given Id.
520 /// Masked intrinsic has only one overloaded type - data type.
521520 CallInst *CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef<Value *> Ops,
522 Type *DataTy, const Twine &Name = "");
521 ArrayRef<Type *> OverloadedTypes,
522 const Twine &Name = "");
523523
524524 Value *getCastedInt8PtrValue(Value *Ptr);
525525 };
637637
638638 //===-------------------------- Masked Intrinsics -------------------------===//
639639 //
640 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>,
640 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty,
641 LLVMAnyPointerType<LLVMMatchType<0>>,
641642 llvm_i32_ty,
642643 LLVMVectorSameWidth<0, llvm_i1_ty>],
643644 [IntrReadWriteArgMem]>;
644645
645646 def int_masked_load : Intrinsic<[llvm_anyvector_ty],
646 [LLVMPointerTo<0>, llvm_i32_ty,
647 [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty,
647648 LLVMVectorSameWidth<0, llvm_i1_ty>, LLVMMatchType<0>],
648649 [IntrReadArgMem]>;
649650
140140 NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::cttz,
141141 F->arg_begin()->getType());
142142 return true;
143 }
144 break;
145 }
146
147 case 'm': {
148 if (Name.startswith("masked.load.")) {
149 Type *Tys[] = { F->getReturnType(), F->arg_begin()->getType() };
150 if (F->getName() != Intrinsic::getName(Intrinsic::masked_load, Tys)) {
151 F->setName(Name + ".old");
152 NewFn = Intrinsic::getDeclaration(F->getParent(),
153 Intrinsic::masked_load,
154 Tys);
155 return true;
156 }
157 }
158 if (Name.startswith("masked.store.")) {
159 auto Args = F->getFunctionType()->params();
160 Type *Tys[] = { Args[0], Args[1] };
161 if (F->getName() != Intrinsic::getName(Intrinsic::masked_store, Tys)) {
162 F->setName(Name + ".old");
163 NewFn = Intrinsic::getDeclaration(F->getParent(),
164 Intrinsic::masked_store,
165 Tys);
166 return true;
167 }
143168 }
144169 break;
145170 }
798823 CI->eraseFromParent();
799824 return;
800825 }
826
827 case Intrinsic::masked_load:
828 case Intrinsic::masked_store: {
829 SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
830 CI->arg_operands().end());
831 CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args));
832 CI->eraseFromParent();
833 return;
834 }
801835 }
802836 }
803837
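As an illustrative sketch (not part of the diff) of what this upgrade path does when IR written against the old naming is loaded — the function name and operand values below are made up:

    ; input written against the old, single-overload name
    declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)

    define <2 x double> @old_style_caller(<2 x double>* %p, <2 x i1> %m, <2 x double> %pt) {
      %v = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %p, i32 8, <2 x i1> %m, <2 x double> %pt)
      ret <2 x double> %v
    }

    ; after auto-upgrade the call is re-bound to the two-type name:
    ;   %v = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %p, i32 8, <2 x i1> %m, <2 x double> %pt)

The auto-upgrade test added further down checks this renaming for both masked.load and masked.store.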
211211 CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align,
212212 Value *Mask, Value *PassThru,
213213 const Twine &Name) {
214 // DataTy is the overloaded type
215 Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
214 PointerType *PtrTy = cast<PointerType>(Ptr->getType());
215 Type *DataTy = PtrTy->getElementType();
216216 assert(DataTy->isVectorTy() && "Ptr should point to a vector");
217217 if (!PassThru)
218218 PassThru = UndefValue::get(DataTy);
219 Type *OverloadedTypes[] = { DataTy, PtrTy };
219220 Value *Ops[] = { Ptr, getInt32(Align), Mask, PassThru};
220 return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, DataTy, Name);
221 return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops,
222 OverloadedTypes, Name);
221223 }
222224
223225 /// \brief Create a call to a Masked Store intrinsic.
228230 /// be accessed in memory
229231 CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr,
230232 unsigned Align, Value *Mask) {
233 PointerType *PtrTy = cast<PointerType>(Ptr->getType());
234 Type *DataTy = PtrTy->getElementType();
235 assert(DataTy->isVectorTy() && "Ptr should point to a vector");
236 Type *OverloadedTypes[] = { DataTy, PtrTy };
231237 Value *Ops[] = { Val, Ptr, getInt32(Align), Mask };
232 // Type of the data to be stored - the only one overloaded type
233 return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, Val->getType());
238 return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, OverloadedTypes);
234239 }
235240
236241 /// Create a call to a Masked intrinsic, with given intrinsic Id,
237 /// an array of operands - Ops, and one overloaded type - DataTy
242 /// an array of operands - Ops, and an array of overloaded types -
243 /// OverloadedTypes.
238244 CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id,
239245 ArrayRef<Value *> Ops,
240 Type *DataTy,
246 ArrayRef<Type *> OverloadedTypes,
241247 const Twine &Name) {
242248 Module *M = BB->getParent()->getParent();
243 Type *OverloadedTypes[] = { DataTy };
244249 Value *TheFn = Intrinsic::getDeclaration(M, Id, OverloadedTypes);
245250 return createCallHelper(TheFn, Ops, this, Name);
246251 }
269274
270275 // We specify only one type when we create this intrinsic. Types of other
271276 // arguments are derived from this type.
272 return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, DataTy, Name);
277 return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, { DataTy }, Name);
273278 }
274279
275280 /// \brief Create a call to a Masked Scatter intrinsic.
299304
300305 // We specify only one type when we create this intrinsic. Types of other
301306 // arguments are derived from this type.
302 return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, DataTy);
307 return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, { DataTy });
303308 }
304309
305310 template
66 ; AVX2: Found an estimated cost of 4 {{.*}}.masked
77 define <2 x double> @test1(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
88 %mask = icmp eq <2 x i64> %trigger, zeroinitializer
9 %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
9 %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
1010 ret <2 x double> %res
1111 }
1212
1414 ; AVX2: Found an estimated cost of 4 {{.*}}.masked
1515 define <4 x i32> @test2(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
1616 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
17 %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
17 %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
1818 ret <4 x i32> %res
1919 }
2020
2222 ; AVX2: Found an estimated cost of 4 {{.*}}.masked
2323 define void @test3(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
2424 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
25 call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
25 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
2626 ret void
2727 }
2828
3030 ; AVX2: Found an estimated cost of 4 {{.*}}.masked
3131 define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
3232 %mask = icmp eq <8 x i32> %trigger, zeroinitializer
33 %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 4, <8 x i1>%mask, <8 x float>%dst)
33 %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1>%mask, <8 x float>%dst)
3434 ret <8 x float> %res
3535 }
3636
3838 ; AVX2: Found an estimated cost of 5 {{.*}}.masked
3939 define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
4040 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
41 call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
41 call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
4242 ret void
4343 }
4444
4646 ; AVX2: Found an estimated cost of 6 {{.*}}.masked
4747 define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
4848 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
49 call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
49 call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
5050 ret void
5151 }
5252
5454 ; AVX2: Found an estimated cost of 5 {{.*}}.masked
5555 define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
5656 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
57 %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
57 %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
5858 ret <2 x float> %res
5959 }
6060
6262 ; AVX2: Found an estimated cost of 6 {{.*}}.masked
6363 define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
6464 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
65 %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
65 %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
6666 ret <2 x i32> %res
6767 }
6868
278278 declare void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask)
279279 declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>)
280280
281 declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
282 declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
283 declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
284 declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
285 declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
286 declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
287 declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
288 declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
289 declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
290 declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
291 declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
292 declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
293 declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
294 declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
295 declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
296 declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
297 declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
298 declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
299 declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
300 declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
301
281 declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
282 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
283 declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
284 declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
285 declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
286 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
287 declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
288 declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
289 declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
290 declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
291 declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
292 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
293 declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
294 declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
295 declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
296 declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
297 declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
298 declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
299 declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
5757 ret i32 %s
5858 }
5959
60 declare <2 x double> @llvm.masked.load.v2f64(<2 x double>* %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
61
62 define <2 x double> @tests.masked.load(<2 x double>* %ptr, <2 x i1> %mask, <2 x double> %passthru) {
63 ; CHECK-LABEL: @tests.masked.load(
64 ; CHECK: @llvm.masked.load.v2f64.p0v2f64
65 %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %ptr, i32 1, <2 x i1> %mask, <2 x double> %passthru)
66 ret <2 x double> %res
67 }
68
69 declare void @llvm.masked.store.v2f64(<2 x double> %val, <2 x double>* %ptrs, i32, <2 x i1> %mask)
70
71 define void @tests.masked.store(<2 x double>* %ptr, <2 x i1> %mask, <2 x double> %val) {
72 ; CHECK-LABEL: @tests.masked.store(
73 ; CHECK: @llvm.masked.store.v2f64.p0v2f64
74 call void @llvm.masked.store.v2f64(<2 x double> %val, <2 x double>* %ptr, i32 3, <2 x i1> %mask)
75 ret void
76 }
77
6078 @__stack_chk_guard = external global i8*
6179 declare void @llvm.stackprotectorcheck(i8**)
6280
7088 ; This is part of @test.objectsize(), since llvm.objectsize declaration gets
7189 ; emitted at the end.
7290 ; CHECK: declare i32 @llvm.objectsize.i32.p0i8
91
1717 ; AVX512BW-NEXT: vmovaps %zmm3, %zmm2
1818 ; AVX512BW-NEXT: vmovaps %zmm4, %zmm3
1919 ; AVX512BW-NEXT: retq
20 %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
20 %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
2121 ret <32 x double> %res
2222 }
2323
3838 ; AVX512BW-NEXT: vmovaps %zmm3, %zmm2
3939 ; AVX512BW-NEXT: vmovaps %zmm4, %zmm3
4040 ; AVX512BW-NEXT: retq
41 %res = call <32 x i64> @llvm.masked.load.v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)
41 %res = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)
4242 ret <32 x i64> %res
4343 }
4444
45 declare <32 x i64> @llvm.masked.load.v32i64(<32 x i64>* %ptrs, i32, <32 x i1> %mask, <32 x i64> %src0)
46 declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
45 declare <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32, <32 x i1> %mask, <32 x i64> %src0)
46 declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
3939 ; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
4040 ; AVX512-NEXT: retq
4141 %mask = icmp eq <16 x i32> %trigger, zeroinitializer
42 %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef)
42 %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef)
4343 ret <16 x i32> %res
4444 }
4545
7575 ; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
7676 ; AVX512-NEXT: retq
7777 %mask = icmp eq <16 x i32> %trigger, zeroinitializer
78 %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer)
78 %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer)
7979 ret <16 x i32> %res
8080 }
8181
113113 ; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
114114 ; AVX512-NEXT: retq
115115 %mask = icmp eq <16 x i32> %trigger, zeroinitializer
116 call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
116 call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
117117 ret void
118118 }
119119
154154 ; AVX512-NEXT: vmovaps %zmm1, %zmm0
155155 ; AVX512-NEXT: retq
156156 %mask = icmp eq <16 x i32> %trigger, zeroinitializer
157 %res = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst)
157 %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst)
158158 ret <16 x float> %res
159159 }
160160
209209 ; SKX-NEXT: vmovaps %zmm1, %zmm0
210210 ; SKX-NEXT: retq
211211 %mask = icmp eq <8 x i32> %trigger, zeroinitializer
212 %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst)
212 %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst)
213213 ret <8 x double> %res
214214 }
215215
238238 ; SKX-NEXT: vmovaps %zmm1, %zmm0
239239 ; SKX-NEXT: retq
240240 %mask = icmp eq <2 x i64> %trigger, zeroinitializer
241 %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
241 %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
242242 ret <2 x double> %res
243243 }
244244
267267 ; SKX-NEXT: vmovaps %zmm1, %zmm0
268268 ; SKX-NEXT: retq
269269 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
270 %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
270 %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
271271 ret <4 x float> %res
272272 }
273273
304304 ; SKX-NEXT: vmovaps %zmm1, %zmm0
305305 ; SKX-NEXT: retq
306306 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
307 %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
307 %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
308308 ret <4 x i32> %res
309309 }
310310
337337 ; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
338338 ; SKX-NEXT: retq
339339 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
340 call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
340 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
341341 ret void
342342 }
343343
380380 ; SKX-NEXT: vmovaps %zmm1, %zmm0
381381 ; SKX-NEXT: retq
382382 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
383 %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
383 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
384384 ret <4 x double> %res
385385 }
386386
419419 ; SKX-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z}
420420 ; SKX-NEXT: retq
421421 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
422 %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer)
422 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer)
423423 ret <4 x double> %res
424424 }
425425
461461 ; SKX-NEXT: vmovaps %zmm1, %zmm0
462462 ; SKX-NEXT: retq
463463 %mask = icmp eq <8 x i32> %trigger, zeroinitializer
464 %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
464 %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
465465 ret <8 x float> %res
466466 }
467467
506506 ; SKX-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1}
507507 ; SKX-NEXT: vmovaps %zmm1, %zmm0
508508 ; SKX-NEXT: retq
509 %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
509 %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
510510 ret <8 x i32> %res
511511 }
512512
547547 ; SKX-NEXT: vpmovw2m %xmm0, %k1
548548 ; SKX-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
549549 ; SKX-NEXT: retq
550 %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
550 %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
551551 ret <8 x float> %res
552552 }
553553
588588 ; SKX-NEXT: vpmovw2m %xmm0, %k1
589589 ; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
590590 ; SKX-NEXT: retq
591 %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
591 %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
592592 ret <8 x i32> %res
593593 }
594594
628628 ; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1}
629629 ; SKX-NEXT: retq
630630 %mask = icmp eq <8 x i32> %trigger, zeroinitializer
631 call void @llvm.masked.store.v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
631 call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
632632 ret void
633633 }
634634
666666 ; AVX512-NEXT: vmovups %zmm1, (%rdi) {%k1}
667667 ; AVX512-NEXT: retq
668668 %mask = icmp eq <16 x i32> %trigger, zeroinitializer
669 call void @llvm.masked.store.v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask)
669 call void @llvm.masked.store.v16f32.p0v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask)
670670 ret void
671671 }
672672
711711 ; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1}
712712 ; SKX-NEXT: retq
713713 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
714 call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
714 call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
715715 ret void
716716 }
717717
757757 ; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1}
758758 ; SKX-NEXT: retq
759759 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
760 call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
760 call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
761761 ret void
762762 }
763763
806806 ; SKX-NEXT: vmovaps %zmm1, %zmm0
807807 ; SKX-NEXT: retq
808808 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
809 %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
809 %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
810810 ret <2 x float> %res
811811 }
812812
862862 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
863863 ; SKX-NEXT: retq
864864 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
865 %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
865 %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
866866 ret <2 x i32> %res
867867 }
868868
907907 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
908908 ; SKX-NEXT: retq
909909 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
910 %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
910 %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
911911 ret <2 x float> %res
912912 }
913913
929929 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
930930 ; SKX-NEXT: retq
931931 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
932 %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float>undef)
932 %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float>undef)
933933 ret <4 x float> %res
934934 }
935935
956956 ; SKX-NEXT: kmovw %eax, %k1
957957 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1}
958958 ; SKX-NEXT: retq
959 %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1> , <4 x float> %dst)
959 %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> , <4 x float> %dst)
960960 ret <4 x float> %res
961961 }
962962
990990 ; SKX-NEXT: kmovw %eax, %k1
991991 ; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
992992 ; SKX-NEXT: retq
993 %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1> , <4 x i32> %dst)
993 %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> , <4 x i32> %dst)
994994 ret <4 x i32> %res
995995 }
996996
10171017 ; SKX-NEXT: kmovw %eax, %k1
10181018 ; SKX-NEXT: vmovups (%rdi), %ymm0 {%k1}
10191019 ; SKX-NEXT: retq
1020 %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 4, <8 x i1> , <8 x float> %dst)
1020 %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> , <8 x float> %dst)
10211021 ret <8 x float> %res
10221022 }
10231023
10421042 ; SKX-NEXT: kmovw %eax, %k1
10431043 ; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1}
10441044 ; SKX-NEXT: retq
1045 %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 4, <4 x i1> , <4 x double> %dst)
1045 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> , <4 x double> %dst)
10461046 ret <4 x double> %res
10471047 }
10481048
10721072 ; SKX-NEXT: kmovw %eax, %k1
10731073 ; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
10741074 ; SKX-NEXT: retq
1075 %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1> , <8 x i32> %dst)
1075 %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> , <8 x i32> %dst)
10761076 ret <8 x i32> %res
10771077 }
10781078
11001100 ; SKX-NEXT: kmovw %eax, %k1
11011101 ; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
11021102 ; SKX-NEXT: retq
1103 %res = call <4 x i64> @llvm.masked.load.v4i64(<4 x i64>* %addr, i32 4, <4 x i1> , <4 x i64> %dst)
1103 %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> , <4 x i64> %dst)
11041104 ret <4 x i64> %res
11051105 }
11061106
11191119 ; AVX512-NEXT: kmovw %eax, %k1
11201120 ; AVX512-NEXT: vmovupd (%rdi), %zmm0 {%k1}
11211121 ; AVX512-NEXT: retq
1122 %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1> , <8 x double> %dst)
1122 %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> , <8 x double> %dst)
11231123 ret <8 x double> %res
11241124 }
11251125
11441144 ; SKX-NEXT: kmovw %eax, %k1
11451145 ; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
11461146 ; SKX-NEXT: retq
1147 %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 4, <4 x i1> , <4 x double> undef)
1147 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> , <4 x double> undef)
11481148 ret <4 x double> %res
11491149 }
11501150
11731173 ; SKX-NEXT: kmovw %eax, %k1
11741174 ; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
11751175 ; SKX-NEXT: retq
1176 %res = call <4 x i64> @llvm.masked.load.v4i64(<4 x i64>* %addr, i32 4, <4 x i1> , <4 x i64> undef)
1176 %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> , <4 x i64> undef)
11771177 ret <4 x i64> %res
11781178 }
11791179
12021202 ; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
12031203 ; SKX-NEXT: retq
12041204 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
1205 call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>)
1205 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>)
12061206 ret void
12071207 }
12081208
12181218 ; AVX512: ## BB#0:
12191219 ; AVX512-NEXT: vmovd %xmm0, (%rdi)
12201220 ; AVX512-NEXT: retq
1221 call void @llvm.masked.store.v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1>)
1221 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1>)
12221222 ret void
12231223 }
12241224
12341234 ; AVX512: ## BB#0:
12351235 ; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi)
12361236 ; AVX512-NEXT: retq
1237 call void @llvm.masked.store.v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1>)
1237 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1>)
12381238 ret void
12391239 }
12401240
12531253 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
12541254 ; AVX512-NEXT: vmovq %xmm0, 16(%rdi)
12551255 ; AVX512-NEXT: retq
1256 call void @llvm.masked.store.v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1>)
1256 call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1>)
12571257 ret void
12581258 }
12591259
12721272 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
12731273 ; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi)
12741274 ; AVX512-NEXT: retq
1275 call void @llvm.masked.store.v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1>)
1275 call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1>)
12761276 ret void
12771277 }
12781278
12911291 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
12921292 ; AVX512-NEXT: vmovlpd %xmm0, 48(%rdi)
12931293 ; AVX512-NEXT: retq
1294 call void @llvm.masked.store.v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1>)
1294 call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1>)
12951295 ret void
12961296 }
12971297
13071307 ; AVX512: ## BB#0:
13081308 ; AVX512-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
13091309 ; AVX512-NEXT: retq
1310 %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>, <4 x i32> %val)
1310 %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>, <4 x i32> %val)
13111311 ret <4 x i32> %res
13121312 }
13131313
13231323 ; AVX512: ## BB#0:
13241324 ; AVX512-NEXT: vinsertps $32, 8(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
13251325 ; AVX512-NEXT: retq
1326 %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float> %val)
1326 %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float> %val)
13271327 ret <4 x float> %res
13281328 }
13291329
13571357 ; SKX-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
13581358 ; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
13591359 ; SKX-NEXT: retq
1360 %res = call <4 x i64> @llvm.masked.load.v4i64(<4 x i64>* %addr, i32 4, <4 x i1>, <4 x i64> %val)
1360 %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1>, <4 x i64> %val)
13611361 ret <4 x i64> %res
13621362 }
13631363
13841384 ; SKX-NEXT: vmovhpd 24(%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0],mem[0]
13851385 ; SKX-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
13861386 ; SKX-NEXT: retq
1387 %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 4, <4 x i1>, <4 x double> %val)
1387 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1>, <4 x double> %val)
13881388 ret <4 x double> %res
13891389 }
13901390
14051405 ; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
14061406 ; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
14071407 ; AVX512-NEXT: retq
1408 %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1>, <8 x double> %val)
1408 %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1>, <8 x double> %val)
14091409 ret <8 x double> %res
14101410 }
14111411
1412 declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
1413 declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
1414 declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
1415 declare <4 x i64> @llvm.masked.load.v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
1416 declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
1417 declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
1418 declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
1419 declare void @llvm.masked.store.v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)
1420 declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
1421 declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
1422 declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
1423 declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
1424 declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
1425 declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
1426 declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
1427 declare <8 x i32> @llvm.masked.load.v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
1428 declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
1429 declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
1430 declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
1431 declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
1432 declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
1433 declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
1434 declare void @llvm.masked.store.v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
1435 declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
1436 declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
1437
1438 declare <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>)
1412 declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
1413 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
1414 declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
1415 declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
1416 declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
1417 declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
1418 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
1419 declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)
1420 declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
1421 declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
1422 declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
1423 declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
1424 declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
1425 declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
1426 declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
1427 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
1428 declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
1429 declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
1430 declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
1431 declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
1432 declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
1433 declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
1434 declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
1435 declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
1436
1437 declare <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>)
14391438
14401439 define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) {
14411440 ; AVX1-LABEL: test23:
14851484 ; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
14861485 ; AVX512-NEXT: retq
14871486 %mask = icmp eq <16 x i32*> %trigger, zeroinitializer
1488 %res = call <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer)
1487 %res = call <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer)
14891488 ret <16 x i32*> %res
14901489 }
14911490
14921491 %mystruct = type { i16, i16, [1 x i8*] }
14931492
1494 declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>)
1493 declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>)
14951494
14961495 define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
14971496 ; AVX1-LABEL: test24:
15801579 ; SKX-NEXT: kshiftrw $8, %k1, %k1
15811580 ; SKX-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
15821581 ; SKX-NEXT: retq
1583 %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer)
1582 %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer)
15841583 ret <16 x %mystruct*> %res
15851584 }
15861585
16711670 ; SKX-NEXT: kshiftrw $8, %k1, %k1
16721671 ; SKX-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
16731672 ; SKX-NEXT: retq
1674 call void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
1673 call void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
16751674 ret void
16761675 }
1677 declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)
1676 declare void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)
16781677
16791678 define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
16801679 ; AVX1-LABEL: test_store_16f64:
17631762 ; SKX-NEXT: kshiftrw $8, %k1, %k1
17641763 ; SKX-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
17651764 ; SKX-NEXT: retq
1766 call void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
1765 call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
17671766 ret void
17681767 }
1769 declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)
1768 declare void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)
17701769
17711770 define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
17721771 ; AVX1-LABEL: test_load_16i64:
18671866 ; SKX-NEXT: vmovaps %zmm1, %zmm0
18681867 ; SKX-NEXT: vmovaps %zmm2, %zmm1
18691868 ; SKX-NEXT: retq
1870 %res = call <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
1869 %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
18711870 ret <16 x i64> %res
18721871 }
1873 declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
1872 declare <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
18741873
18751874 define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
18761875 ; AVX1-LABEL: test_load_16f64:
19711970 ; SKX-NEXT: vmovaps %zmm1, %zmm0
19721971 ; SKX-NEXT: vmovaps %zmm2, %zmm1
19731972 ; SKX-NEXT: retq
1974 %res = call <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
1973 %res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
19751974 ret <16 x double> %res
19761975 }
1977 declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
1976 declare <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
19781977
19791978 define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
19801979 ; AVX1-LABEL: test_load_32f64:
22022201 ; SKX-NEXT: vmovaps %zmm3, %zmm2
22032202 ; SKX-NEXT: vmovaps %zmm4, %zmm3
22042203 ; SKX-NEXT: retq
2205 %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
2204 %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
22062205 ret <32 x double> %res
22072206 }
2208 declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
2207
2208 declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
22092209
22102210 define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
22112211 ; SKX-LABEL: test_mask_load_16xi8:
22142214 ; SKX-NEXT: vpmovb2m %xmm0, %k1
22152215 ; SKX-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} {z}
22162216 ; SKX-NEXT: retq
2217 %res = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %addr, i32 4, <16 x i1>%mask, <16 x i8> undef)
2217 %res = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %addr, i32 4, <16 x i1>%mask, <16 x i8> undef)
22182218 ret <16 x i8> %res
22192219 }
2220 declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
2220 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
22212221
22222222 define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
22232223 ; SKX-LABEL: test_mask_load_32xi8:
22262226 ; SKX-NEXT: vpmovb2m %ymm0, %k1
22272227 ; SKX-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} {z}
22282228 ; SKX-NEXT: retq
2229 %res = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> zeroinitializer)
2229 %res = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> zeroinitializer)
22302230 ret <32 x i8> %res
22312231 }
2232 declare <32 x i8> @llvm.masked.load.v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
2232 declare <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
22332233
22342234 define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) {
22352235 ; SKX-LABEL: test_mask_load_64xi8:
22392239 ; SKX-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1}
22402240 ; SKX-NEXT: vmovaps %zmm1, %zmm0
22412241 ; SKX-NEXT: retq
2242 %res = call <64 x i8> @llvm.masked.load.v64i8(<64 x i8>* %addr, i32 4, <64 x i1>%mask, <64 x i8> %val)
2242 %res = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %addr, i32 4, <64 x i1>%mask, <64 x i8> %val)
22432243 ret <64 x i8> %res
22442244 }
2245 declare <64 x i8> @llvm.masked.load.v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
2245 declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
22462246
22472247 define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
22482248 ; SKX-LABEL: test_mask_load_8xi16:
22512251 ; SKX-NEXT: vpmovw2m %xmm0, %k1
22522252 ; SKX-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
22532253 ; SKX-NEXT: retq
2254 %res = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %addr, i32 4, <8 x i1>%mask, <8 x i16> undef)
2254 %res = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %addr, i32 4, <8 x i1>%mask, <8 x i16> undef)
22552255 ret <8 x i16> %res
22562256 }
2257 declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
2257 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
22582258
22592259 define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
22602260 ; SKX-LABEL: test_mask_load_16xi16:
22632263 ; SKX-NEXT: vpmovb2m %xmm0, %k1
22642264 ; SKX-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
22652265 ; SKX-NEXT: retq
2266 %res = call <16 x i16> @llvm.masked.load.v16i16(<16 x i16>* %addr, i32 4, <16 x i1>%mask, <16 x i16> zeroinitializer)
2266 %res = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* %addr, i32 4, <16 x i1>%mask, <16 x i16> zeroinitializer)
22672267 ret <16 x i16> %res
22682268 }
2269 declare <16 x i16> @llvm.masked.load.v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
2269 declare <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
22702270
22712271 define <32 x i16> @test_mask_load_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) {
22722272 ; SKX-LABEL: test_mask_load_32xi16:
22762276 ; SKX-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1}
22772277 ; SKX-NEXT: vmovaps %zmm1, %zmm0
22782278 ; SKX-NEXT: retq
2279 %res = call <32 x i16> @llvm.masked.load.v32i16(<32 x i16>* %addr, i32 4, <32 x i1>%mask, <32 x i16> %val)
2279 %res = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %addr, i32 4, <32 x i1>%mask, <32 x i16> %val)
22802280 ret <32 x i16> %res
22812281 }
2282 declare <32 x i16> @llvm.masked.load.v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>)
2282 declare <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>)
22832283
22842284 define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
22852285 ; SKX-LABEL: test_mask_store_16xi8:
22882288 ; SKX-NEXT: vpmovb2m %xmm0, %k1
22892289 ; SKX-NEXT: vmovdqu8 %xmm1, (%rdi) {%k1}
22902290 ; SKX-NEXT: retq
2291 call void @llvm.masked.store.v16i8(<16 x i8> %val, <16 x i8>* %addr, i32 4, <16 x i1>%mask)
2291 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %val, <16 x i8>* %addr, i32 4, <16 x i1>%mask)
22922292 ret void
22932293 }
2294 declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
2294 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
22952295
22962296 define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
22972297 ; SKX-LABEL: test_mask_store_32xi8:
23002300 ; SKX-NEXT: vpmovb2m %ymm0, %k1
23012301 ; SKX-NEXT: vmovdqu8 %ymm1, (%rdi) {%k1}
23022302 ; SKX-NEXT: retq
2303 call void @llvm.masked.store.v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask)
2303 call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask)
23042304 ret void
23052305 }
2306 declare void @llvm.masked.store.v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>)
2306 declare void @llvm.masked.store.v32i8.p0v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>)
23072307
23082308 define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) {
23092309 ; SKX-LABEL: test_mask_store_64xi8:
23122312 ; SKX-NEXT: vpmovb2m %zmm0, %k1
23132313 ; SKX-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1}
23142314 ; SKX-NEXT: retq
2315 call void @llvm.masked.store.v64i8(<64 x i8> %val, <64 x i8>* %addr, i32 4, <64 x i1>%mask)
2315 call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %val, <64 x i8>* %addr, i32 4, <64 x i1>%mask)
23162316 ret void
23172317 }
2318 declare void @llvm.masked.store.v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>)
2318 declare void @llvm.masked.store.v64i8.p0v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>)
23192319
23202320 define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
23212321 ; SKX-LABEL: test_mask_store_8xi16:
23242324 ; SKX-NEXT: vpmovw2m %xmm0, %k1
23252325 ; SKX-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1}
23262326 ; SKX-NEXT: retq
2327 call void @llvm.masked.store.v8i16(<8 x i16> %val, <8 x i16>* %addr, i32 4, <8 x i1>%mask)
2327 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %val, <8 x i16>* %addr, i32 4, <8 x i1>%mask)
23282328 ret void
23292329 }
2330 declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
2330 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
23312331
23322332 define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
23332333 ; SKX-LABEL: test_mask_store_16xi16:
23362336 ; SKX-NEXT: vpmovb2m %xmm0, %k1
23372337 ; SKX-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
23382338 ; SKX-NEXT: retq
2339 call void @llvm.masked.store.v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask)
2339 call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask)
23402340 ret void
23412341 }
2342 declare void @llvm.masked.store.v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>)
2342 declare void @llvm.masked.store.v16i16.p0v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>)
23432343
23442344 define void @test_mask_store_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) {
23452345 ; SKX-LABEL: test_mask_store_32xi16:
23482348 ; SKX-NEXT: vpmovb2m %ymm0, %k1
23492349 ; SKX-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
23502350 ; SKX-NEXT: retq
2351 call void @llvm.masked.store.v32i16(<32 x i16> %val, <32 x i16>* %addr, i32 4, <32 x i1>%mask)
2351 call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> %val, <32 x i16>* %addr, i32 4, <32 x i1>%mask)
23522352 ret void
23532353 }
2354 declare void @llvm.masked.store.v32i16(<32 x i16>, <32 x i16>*, i32, <32 x i1>)
2354
2355 declare void @llvm.masked.store.v32i16.p0v32i16(<32 x i16>, <32 x i16>*, i32, <32 x i1>)
0 ; RUN: opt -instcombine -S < %s | FileCheck %s
11
2 declare <2 x double> @llvm.masked.load.v2f64(<2 x double>* %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
3 declare void @llvm.masked.store.v2f64(<2 x double> %val, <2 x double>* %ptrs, i32, <2 x i1> %mask)
2 declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
3 declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptrs, i32, <2 x i1> %mask)
44 declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %passthru)
55 declare void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptrs, i32, <2 x i1> %mask)
66
77 define <2 x double> @load_zeromask(<2 x double>* %ptr, <2 x double> %passthru) {
8 %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %ptr, i32 1, <2 x i1> zeroinitializer, <2 x double> %passthru)
8 %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 1, <2 x i1> zeroinitializer, <2 x double> %passthru)
99 ret <2 x double> %res
1010
1111 ; CHECK-LABEL: @load_zeromask(
1313 }
1414
1515 define <2 x double> @load_onemask(<2 x double>* %ptr, <2 x double> %passthru) {
16 %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %ptr, i32 2, <2 x i1> , <2 x double> %passthru)
16 %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 2, <2 x i1> , <2 x double> %passthru)
1717 ret <2 x double> %res
1818
1919 ; CHECK-LABEL: @load_onemask(
2222 }
2323
2424 define void @store_zeromask(<2 x double>* %ptr, <2 x double> %val) {
25 call void @llvm.masked.store.v2f64(<2 x double> %val, <2 x double>* %ptr, i32 3, <2 x i1> zeroinitializer)
25 call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptr, i32 3, <2 x i1> zeroinitializer)
2626 ret void
2727
2828 ; CHECK-LABEL: @store_zeromask(
3030 }
3131
3232 define void @store_onemask(<2 x double>* %ptr, <2 x double> %val) {
33 call void @llvm.masked.store.v2f64(<2 x double> %val, <2 x double>* %ptr, i32 4, <2 x i1> )
33 call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptr, i32 4, <2 x i1> )
3434 ret void
3535
3636 ; CHECK-LABEL: @store_onemask(
5252
5353 ; CHECK-LABEL: @mload_one_one(
5454 ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x float>*
55 ; CHECK-NEXT: %1 = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %castvec, i32 1, <4 x i1> , <4 x float> undef)
55 ; CHECK-NEXT: %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %castvec, i32 1, <4 x i1> , <4 x float> undef)
5656 ; CHECK-NEXT: ret <4 x float> %1
5757 }
5858
6464
6565 ; CHECK-LABEL: @mload_one_one_double(
6666 ; CHECK-NEXT: %castvec = bitcast i8* %f to <2 x double>*
67 ; CHECK-NEXT: %1 = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %castvec, i32 1, <2 x i1> , <2 x double> undef)
67 ; CHECK-NEXT: %1 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %castvec, i32 1, <2 x i1> , <2 x double> undef)
6868 ; CHECK-NEXT: ret <2 x double> %1
6969 }
7070
7676
7777 ; CHECK-LABEL: @mload_v8f32(
7878 ; CHECK-NEXT: %castvec = bitcast i8* %f to <8 x float>*
79 ; CHECK-NEXT: %1 = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %castvec, i32 1, <8 x i1> , <8 x float> undef)
79 ; CHECK-NEXT: %1 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %castvec, i32 1, <8 x i1> , <8 x float> undef)
8080 ; CHECK-NEXT: ret <8 x float> %1
8181 }
8282
8686
8787 ; CHECK-LABEL: @mload_v4f64(
8888 ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x double>*
89 ; CHECK-NEXT: %1 = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %castvec, i32 1, <4 x i1> , <4 x double> undef)
89 ; CHECK-NEXT: %1 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %castvec, i32 1, <4 x i1> , <4 x double> undef)
9090 ; CHECK-NEXT: ret <4 x double> %1
9191 }
9292
9898
9999 ; CHECK-LABEL: @mload_v4i32(
100100 ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x i32>*
101 ; CHECK-NEXT: %1 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %castvec, i32 1, <4 x i1> , <4 x i32> undef)
101 ; CHECK-NEXT: %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %castvec, i32 1, <4 x i1> , <4 x i32> undef)
102102 ; CHECK-NEXT: ret <4 x i32> %1
103103 }
104104
108108
109109 ; CHECK-LABEL: @mload_v2i64(
110110 ; CHECK-NEXT: %castvec = bitcast i8* %f to <2 x i64>*
111 ; CHECK-NEXT: %1 = call <2 x i64> @llvm.masked.load.v2i64(<2 x i64>* %castvec, i32 1, <2 x i1> , <2 x i64> undef)
111 ; CHECK-NEXT: %1 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %castvec, i32 1, <2 x i1> , <2 x i64> undef)
112112 ; CHECK-NEXT: ret <2 x i64> %1
113113 }
114114
118118
119119 ; CHECK-LABEL: @mload_v8i32(
120120 ; CHECK-NEXT: %castvec = bitcast i8* %f to <8 x i32>*
121 ; CHECK-NEXT: %1 = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %castvec, i32 1, <8 x i1> , <8 x i32> undef)
121 ; CHECK-NEXT: %1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %castvec, i32 1, <8 x i1> , <8 x i32> undef)
122122 ; CHECK-NEXT: ret <8 x i32> %1
123123 }
124124
128128
129129 ; CHECK-LABEL: @mload_v4i64(
130130 ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x i64>*
131 ; CHECK-NEXT: %1 = call <4 x i64> @llvm.masked.load.v4i64(<4 x i64>* %castvec, i32 1, <4 x i1> , <4 x i64> undef)
131 ; CHECK-NEXT: %1 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %castvec, i32 1, <4 x i1> , <4 x i64> undef)
132132 ; CHECK-NEXT: ret <4 x i64> %1
133133 }
134134
186186
187187 ; CHECK-LABEL: @mstore_one_one(
188188 ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x float>*
189 ; CHECK-NEXT: call void @llvm.masked.store.v4f32(<4 x float> %v, <4 x float>* %castvec, i32 1, <4 x i1> )
189 ; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %v, <4 x float>* %castvec, i32 1, <4 x i1> )
190190 ; CHECK-NEXT: ret void
191191 }
192192
198198
199199 ; CHECK-LABEL: @mstore_one_one_double(
200200 ; CHECK-NEXT: %castvec = bitcast i8* %f to <2 x double>*
201 ; CHECK-NEXT: call void @llvm.masked.store.v2f64(<2 x double> %v, <2 x double>* %castvec, i32 1, <2 x i1> )
201 ; CHECK-NEXT: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %v, <2 x double>* %castvec, i32 1, <2 x i1> )
202202 ; CHECK-NEXT: ret void
203203 }
204204
210210
211211 ; CHECK-LABEL: @mstore_v8f32(
212212 ; CHECK-NEXT: %castvec = bitcast i8* %f to <8 x float>*
213 ; CHECK-NEXT: call void @llvm.masked.store.v8f32(<8 x float> %v, <8 x float>* %castvec, i32 1, <8 x i1> )
213 ; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> %v, <8 x float>* %castvec, i32 1, <8 x i1> )
214214 ; CHECK-NEXT: ret void
215215 }
216216
220220
221221 ; CHECK-LABEL: @mstore_v4f64(
222222 ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x double>*
223 ; CHECK-NEXT: call void @llvm.masked.store.v4f64(<4 x double> %v, <4 x double>* %castvec, i32 1, <4 x i1> )
223 ; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %v, <4 x double>* %castvec, i32 1, <4 x i1> )
224224 ; CHECK-NEXT: ret void
225225 }
226226
232232
233233 ; CHECK-LABEL: @mstore_v4i32(
234234 ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x i32>*
235 ; CHECK-NEXT: call void @llvm.masked.store.v4i32(<4 x i32> %v, <4 x i32>* %castvec, i32 1, <4 x i1> )
235 ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v, <4 x i32>* %castvec, i32 1, <4 x i1> )
236236 ; CHECK-NEXT: ret void
237237 }
238238
242242
243243 ; CHECK-LABEL: @mstore_v2i64(
244244 ; CHECK-NEXT: %castvec = bitcast i8* %f to <2 x i64>*
245 ; CHECK-NEXT: call void @llvm.masked.store.v2i64(<2 x i64> %v, <2 x i64>* %castvec, i32 1, <2 x i1> )
245 ; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %v, <2 x i64>* %castvec, i32 1, <2 x i1> )
246246 ; CHECK-NEXT: ret void
247247 }
248248
252252
253253 ; CHECK-LABEL: @mstore_v8i32(
254254 ; CHECK-NEXT: %castvec = bitcast i8* %f to <8 x i32>*
255 ; CHECK-NEXT: call void @llvm.masked.store.v8i32(<8 x i32> %v, <8 x i32>* %castvec, i32 1, <8 x i1> )
255 ; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v, <8 x i32>* %castvec, i32 1, <8 x i1> )
256256 ; CHECK-NEXT: ret void
257257 }
258258
262262
263263 ; CHECK-LABEL: @mstore_v4i64(
264264 ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x i64>*
265 ; CHECK-NEXT: call void @llvm.masked.store.v4i64(<4 x i64> %v, <4 x i64>* %castvec, i32 1, <4 x i1> )
265 ; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %v, <4 x i64>* %castvec, i32 1, <4 x i1> )
266266 ; CHECK-NEXT: ret void
267267 }
268268
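The remaining hunks are from a loop-vectorization test. The new @foo1_addrspace1 case below checks that a loop operating on addrspace(1) pointers is vectorized using the p1-mangled masked load/store intrinsics. A minimal sketch of the calls the checks expect, using hypothetical value names that do not appear in the test:

  %v = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* %p, i32 4, <8 x i1> %m, <8 x i32> undef)
  call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> %v, <8 x i32> addrspace(1)* %p, i32 4, <8 x i1> %m)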
1717
1818 ;AVX-LABEL: @foo1
1919 ;AVX: icmp slt <8 x i32> %wide.load,
20 ;AVX: call <8 x i32> @llvm.masked.load.v8i32
20 ;AVX: call <8 x i32> @llvm.masked.load.v8i32.p0v8i32
2121 ;AVX: add nsw <8 x i32>
22 ;AVX: call void @llvm.masked.store.v8i32
22 ;AVX: call void @llvm.masked.store.v8i32.p0v8i32
2323 ;AVX: ret void
2424
2525 ;AVX512-LABEL: @foo1
2626 ;AVX512: icmp slt <16 x i32> %wide.load,
27 ;AVX512: call <16 x i32> @llvm.masked.load.v16i32
27 ;AVX512: call <16 x i32> @llvm.masked.load.v16i32.p0v16i32
2828 ;AVX512: add nsw <16 x i32>
29 ;AVX512: call void @llvm.masked.store.v16i32
29 ;AVX512: call void @llvm.masked.store.v16i32.p0v16i32
3030 ;AVX512: ret void
3131
3232 ; Function Attrs: nounwind uwtable
8888 ret void
8989 }
9090
91 ; The same as @foo1, but with all pointers in address space 1.
92
93 ;AVX-LABEL: @foo1_addrspace1
94 ;AVX: icmp slt <8 x i32> %wide.load,
95 ;AVX: call <8 x i32> @llvm.masked.load.v8i32.p1v8i32
96 ;AVX: add nsw <8 x i32>
97 ;AVX: call void @llvm.masked.store.v8i32.p1v8i32
98 ;AVX: ret void
99
100 ;AVX512-LABEL: @foo1_addrspace1
101 ;AVX512: icmp slt <16 x i32> %wide.load,
102 ;AVX512: call <16 x i32> @llvm.masked.load.v16i32.p1v16i32
103 ;AVX512: add nsw <16 x i32>
104 ;AVX512: call void @llvm.masked.store.v16i32.p1v16i32
105 ;AVX512: ret void
106
107 ; Function Attrs: nounwind uwtable
108 define void @foo1_addrspace1(i32 addrspace(1)* %A, i32 addrspace(1)* %B, i32 addrspace(1)* %trigger) {
109 entry:
110 %A.addr = alloca i32 addrspace(1)*, align 8
111 %B.addr = alloca i32 addrspace(1)*, align 8
112 %trigger.addr = alloca i32 addrspace(1)*, align 8
113 %i = alloca i32, align 4
114 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 8
115 store i32 addrspace(1)* %B, i32 addrspace(1)** %B.addr, align 8
116 store i32 addrspace(1)* %trigger, i32 addrspace(1)** %trigger.addr, align 8
117 store i32 0, i32* %i, align 4
118 br label %for.cond
119
120 for.cond: ; preds = %for.inc, %entry
121 %0 = load i32, i32* %i, align 4
122 %cmp = icmp slt i32 %0, 10000
123 br i1 %cmp, label %for.body, label %for.end
124
125 for.body: ; preds = %for.cond
126 %1 = load i32, i32* %i, align 4
127 %idxprom = sext i32 %1 to i64
128 %2 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8
129 %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %idxprom
130 %3 = load i32, i32 addrspace(1)* %arrayidx, align 4
131 %cmp1 = icmp slt i32 %3, 100
132 br i1 %cmp1, label %if.then, label %if.end
133
134 if.then: ; preds = %for.body
135 %4 = load i32, i32* %i, align 4
136 %idxprom2 = sext i32 %4 to i64
137 %5 = load i32 addrspace(1)*, i32 addrspace(1)** %B.addr, align 8
138 %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %idxprom2
139 %6 = load i32, i32 addrspace(1)* %arrayidx3, align 4
140 %7 = load i32, i32* %i, align 4
141 %idxprom4 = sext i32 %7 to i64
142 %8 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8
143 %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %8, i64 %idxprom4
144 %9 = load i32, i32 addrspace(1)* %arrayidx5, align 4
145 %add = add nsw i32 %6, %9
146 %10 = load i32, i32* %i, align 4
147 %idxprom6 = sext i32 %10 to i64
148 %11 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 8
149 %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %11, i64 %idxprom6
150 store i32 %add, i32 addrspace(1)* %arrayidx7, align 4
151 br label %if.end
152
153 if.end: ; preds = %if.then, %for.body
154 br label %for.inc
155
156 for.inc: ; preds = %if.end
157 %12 = load i32, i32* %i, align 4
158 %inc = add nsw i32 %12, 1
159 store i32 %inc, i32* %i, align 4
160 br label %for.cond
161
162 for.end: ; preds = %for.cond
163 ret void
164 }
165
91166 ; The source code:
92167 ;
93168 ;void foo2(float *A, float *B, int *trigger) {
101176
102177 ;AVX-LABEL: @foo2
103178 ;AVX: icmp slt <8 x i32> %wide.load,
104 ;AVX: call <8 x float> @llvm.masked.load.v8f32
179 ;AVX: call <8 x float> @llvm.masked.load.v8f32.p0v8f32
105180 ;AVX: fadd <8 x float>
106 ;AVX: call void @llvm.masked.store.v8f32
181 ;AVX: call void @llvm.masked.store.v8f32.p0v8f32
107182 ;AVX: ret void
108183
109184 ;AVX512-LABEL: @foo2
110185 ;AVX512: icmp slt <16 x i32> %wide.load,
111 ;AVX512: call <16 x float> @llvm.masked.load.v16f32
186 ;AVX512: call <16 x float> @llvm.masked.load.v16f32.p0v16f32
112187 ;AVX512: fadd <16 x float>
113 ;AVX512: call void @llvm.masked.store.v16f32
188 ;AVX512: call void @llvm.masked.store.v16f32.p0v16f32
114189 ;AVX512: ret void
115190
116191 ; Function Attrs: nounwind uwtable
186261
187262 ;AVX-LABEL: @foo3
188263 ;AVX: icmp slt <4 x i32> %wide.load,
189 ;AVX: call <4 x double> @llvm.masked.load.v4f64
264 ;AVX: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
190265 ;AVX: sitofp <4 x i32> %wide.load to <4 x double>
191266 ;AVX: fadd <4 x double>
192 ;AVX: call void @llvm.masked.store.v4f64
267 ;AVX: call void @llvm.masked.store.v4f64.p0v4f64
193268 ;AVX: ret void
194269
195270 ;AVX512-LABEL: @foo3
196271 ;AVX512: icmp slt <8 x i32> %wide.load,
197 ;AVX512: call <8 x double> @llvm.masked.load.v8f64
272 ;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
198273 ;AVX512: sitofp <8 x i32> %wide.load to <8 x double>
199274 ;AVX512: fadd <8 x double>
200 ;AVX512: call void @llvm.masked.store.v8f64
275 ;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
201276 ;AVX512: ret void
202277
203278
428503 ;AVX2-LABEL: @foo6
429504 ;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer
430505 ;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32>
431 ;AVX2: call <4 x double> @llvm.masked.load.v4f64
506 ;AVX2: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
432507 ;AVX2: fadd <4 x double>
433 ;AVX2: call void @llvm.masked.store.v4f64
508 ;AVX2: call void @llvm.masked.store.v4f64.p0v4f64
434509 ;AVX2: ret void
435510
436511 ;AVX512-LABEL: @foo6
437512 ;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer
438513 ;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32>
439 ;AVX512: call <8 x double> @llvm.masked.load.v8f64
514 ;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
440515 ;AVX512: fadd <8 x double>
441 ;AVX512: call void @llvm.masked.store.v8f64
516 ;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
442517 ;AVX512: ret void
443518
444519
506581 ; }
507582
508583 ;AVX512-LABEL: @foo7
509 ;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64(<8 x double*>*
510 ;AVX512: call void @llvm.masked.store.v8f64
584 ;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>*
585 ;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
511586 ;AVX512: ret void
512587
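; In @foo7 the loaded data is a vector of pointers to double, so the intrinsic name
; mangles both the data type, v8p0f64 (i.e. <8 x double*>), and the pointer operand
; type, p0v8p0f64 (a default-address-space pointer to that vector).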
513588 define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 {
578653 ;}
579654
580655 ;AVX512-LABEL: @foo8
581 ;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f(<8 x i32 ()*>* %
582 ;AVX512: call void @llvm.masked.store.v8f64
656 ;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* %
657 ;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
583658 ;AVX512: ret void
584659
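; In @foo8 the loaded data is a vector of function pointers: v8p0f_i32f mangles
; <8 x i32 ()*> (eight pointers to functions of type i32 ()), and p0v8p0f_i32f is
; the default-address-space pointer to that vector.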
585660 define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 {