llvm.org GIT mirror llvm / b23b2fb
AVX-512: Added all SKX forms of GATHER instructions. Added intrinsics. Added encoding and tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@240905 91177308-0d34-0410-b5e6-96231b3b80d8 Elena Demikhovsky 4 years ago
11 changed files, with 940 additions and 125 deletions.
42634263 llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
42644264 [IntrReadArgMem]>;
42654265
4266 def int_x86_avx512_gather3div2_df :
4267 GCCBuiltin<"__builtin_ia32_gather3div2df">,
4268 Intrinsic<[llvm_v2f64_ty],
4269 [llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
4270 [IntrReadArgMem]>;
4271
4272 def int_x86_avx512_gather3div2_di :
4273 GCCBuiltin<"__builtin_ia32_gather3div2di">,
4274 Intrinsic<[llvm_v4i32_ty],
4275 [llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
4276 [IntrReadArgMem]>;
4277
4278 def int_x86_avx512_gather3div4_df :
4279 GCCBuiltin<"__builtin_ia32_gather3div4df">,
4280 Intrinsic<[llvm_v4f64_ty],
4281 [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
4282 [IntrReadArgMem]>;
4283
4284 def int_x86_avx512_gather3div4_di :
4285 GCCBuiltin<"__builtin_ia32_gather3div4di">,
4286 Intrinsic<[llvm_v8i32_ty],
4287 [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
4288 [IntrReadArgMem]>;
4289
4290 def int_x86_avx512_gather3div4_sf :
4291 GCCBuiltin<"__builtin_ia32_gather3div4sf">,
4292 Intrinsic<[llvm_v4f32_ty],
4293 [llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
4294 [IntrReadArgMem]>;
4295
4296 def int_x86_avx512_gather3div4_si :
4297 GCCBuiltin<"__builtin_ia32_gather3div4si">,
4298 Intrinsic<[llvm_v4i32_ty],
4299 [llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
4300 [IntrReadArgMem]>;
4301
4302 def int_x86_avx512_gather3div8_sf :
4303 GCCBuiltin<"__builtin_ia32_gather3div8sf">,
4304 Intrinsic<[llvm_v4f32_ty],
4305 [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
4306 [IntrReadArgMem]>;
4307
4308 def int_x86_avx512_gather3div8_si :
4309 GCCBuiltin<"__builtin_ia32_gather3div8si">,
4310 Intrinsic<[llvm_v4i32_ty],
4311 [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
4312 [IntrReadArgMem]>;
4313
4314 def int_x86_avx512_gather3siv2_df :
4315 GCCBuiltin<"__builtin_ia32_gather3siv2df">,
4316 Intrinsic<[llvm_v2f64_ty],
4317 [llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
4318 [IntrReadArgMem]>;
4319
4320 def int_x86_avx512_gather3siv2_di :
4321 GCCBuiltin<"__builtin_ia32_gather3siv2di">,
4322 Intrinsic<[llvm_v4i32_ty],
4323 [llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
4324 [IntrReadArgMem]>;
4325
4326 def int_x86_avx512_gather3siv4_df :
4327 GCCBuiltin<"__builtin_ia32_gather3siv4df">,
4328 Intrinsic<[llvm_v4f64_ty],
4329 [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
4330 [IntrReadArgMem]>;
4331
4332 def int_x86_avx512_gather3siv4_di :
4333 GCCBuiltin<"__builtin_ia32_gather3siv4di">,
4334 Intrinsic<[llvm_v8i32_ty],
4335 [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
4336 [IntrReadArgMem]>;
4337
4338 def int_x86_avx512_gather3siv4_sf :
4339 GCCBuiltin<"__builtin_ia32_gather3siv4sf">,
4340 Intrinsic<[llvm_v4f32_ty],
4341 [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
4342 [IntrReadArgMem]>;
4343
4344 def int_x86_avx512_gather3siv4_si :
4345 GCCBuiltin<"__builtin_ia32_gather3siv4si">,
4346 Intrinsic<[llvm_v4i32_ty],
4347 [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
4348 [IntrReadArgMem]>;
4349
4350 def int_x86_avx512_gather3siv8_sf :
4351 GCCBuiltin<"__builtin_ia32_gather3siv8sf">,
4352 Intrinsic<[llvm_v8f32_ty],
4353 [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
4354 [IntrReadArgMem]>;
4355
4356 def int_x86_avx512_gather3siv8_si :
4357 GCCBuiltin<"__builtin_ia32_gather3siv8si">,
4358 Intrinsic<[llvm_v8i32_ty],
4359 [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
4360 [IntrReadArgMem]>;
4361
42664362 // scatter
42674363 def int_x86_avx512_scatter_dpd_512 : GCCBuiltin<"__builtin_ia32_scattersiv8df">,
42684364 Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
237237 return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
238238 getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
239239 }
240 bool isMemVX32X() const {
241 return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
242 getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31;
243 }
240244 bool isMemVY32() const {
241245 return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
242246 getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
243247 }
248 bool isMemVY32X() const {
249 return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
250 getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31;
251 }
244252 bool isMemVX64() const {
245253 return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
246254 getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
247255 }
256 bool isMemVX64X() const {
257 return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
258 getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31;
259 }
248260 bool isMemVY64() const {
249261 return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
250262 getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
263 }
264 bool isMemVY64X() const {
265 return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
266 getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31;
251267 }
252268 bool isMemVZ32() const {
253269 return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
1542315423 const X86Subtarget * Subtarget) {
1542415424 SDLoc dl(Op);
1542515425 ConstantSDNode *C = dyn_cast(ScaleOp);
15426 assert(C && "Invalid scale type");
15426 if (!C)
15427 llvm_unreachable("Invalid scale type");
15428 unsigned ScaleVal = C->getZExtValue();
15429 if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8)
15430 llvm_unreachable("Valid scale values are 1, 2, 4, 8");
15431
1542715432 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
1542815433 EVT MaskVT = MVT::getVectorVT(MVT::i1,
1542915434 Index.getSimpleValueType().getVectorNumElements());
1543115436 ConstantSDNode *MaskC = dyn_cast(Mask);
1543215437 if (MaskC)
1543315438 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
15434 else
15435 MaskInReg = DAG.getBitcast(MaskVT, Mask);
15439 else {
15440 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
15441 Mask.getValueType().getSizeInBits());
15442
15443 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
15444 // are extracted by EXTRACT_SUBVECTOR.
15445 MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
15446 DAG.getBitcast(BitcastVT, Mask),
15447 DAG.getIntPtrConstant(0, dl));
15448 }
1543615449 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
1543715450 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
1543815451 SDValue Segment = DAG.getRegister(0, MVT::i32);
54395439
54405440 multiclass avx512_gather opc, string OpcodeStr, X86VectorVTInfo _,
54415441 X86MemOperand memop, PatFrag GatherNode> {
5442 let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in
5442 let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb",
5443 ExeDomain = _.ExeDomain in
54435444 def rm : AVX5128I
54445445 (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2),
5445 !strconcat(OpcodeStr,
5446 !strconcat(OpcodeStr#_.Suffix,
54465447 "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
54475448 [(set _.RC:$dst, _.KRCWM:$mask_wb,
54485449 (GatherNode (_.VT _.RC:$src1), _.KRCWM:$mask,
54505451 EVEX_CD8<_.EltSize, CD8VT1>;
54515452 }
54525453
5453 let ExeDomain = SSEPackedDouble in {
5454 defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", v8f64_info, vy64xmem,
5455 mgatherv8i32>, EVEX_V512, VEX_W;
5456 defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", v8f64_info, vz64mem,
5457 mgatherv8i64>, EVEX_V512, VEX_W;
5458 }
5459
5460 let ExeDomain = SSEPackedSingle in {
5461 defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", v16f32_info, vz32mem,
5462 mgatherv16i32>, EVEX_V512;
5463 defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", v8f32x_info, vz64mem,
5464 mgatherv8i64>, EVEX_V512;
5465 }
5466
5467 defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", v8i64_info, vy64xmem,
5468 mgatherv8i32>, EVEX_V512, VEX_W;
5469 defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", v16i32_info, vz32mem,
5470 mgatherv16i32>, EVEX_V512;
5471
5472 defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", v8i64_info, vz64mem,
5473 mgatherv8i64>, EVEX_V512, VEX_W;
5474 defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", v8i32x_info, vz64mem,
5475 mgatherv8i64>, EVEX_V512;
5454 multiclass avx512_gather_q_pd dopc, bits<8> qopc,
5455 AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
5456 defm NAME##D##SUFF##Z: avx512_gather
5457 vy32xmem, mgatherv8i32>, EVEX_V512, VEX_W;
5458 defm NAME##Q##SUFF##Z: avx512_gather
5459 vz64mem, mgatherv8i64>, EVEX_V512, VEX_W;
5460 let Predicates = [HasVLX] in {
5461 defm NAME##D##SUFF##Z256: avx512_gather
5462 vx32xmem, mgatherv4i32>, EVEX_V256, VEX_W;
5463 defm NAME##Q##SUFF##Z256: avx512_gather
5464 vy64xmem, mgatherv4i64>, EVEX_V256, VEX_W;
5465 defm NAME##D##SUFF##Z128: avx512_gather
5466 vx32xmem, mgatherv4i32>, EVEX_V128, VEX_W;
5467 defm NAME##Q##SUFF##Z128: avx512_gather
5468 vx64xmem, mgatherv2i64>, EVEX_V128, VEX_W;
5469 }
5470 }
5471
5472 multiclass avx512_gather_d_ps dopc, bits<8> qopc,
5473 AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
5474 defm NAME##D##SUFF##Z: avx512_gather
5475 mgatherv16i32>, EVEX_V512;
5476 defm NAME##Q##SUFF##Z: avx512_gather
5477 mgatherv8i64>, EVEX_V512;
5478 let Predicates = [HasVLX] in {
5479 defm NAME##D##SUFF##Z256: avx512_gather
5480 vy32xmem, mgatherv8i32>, EVEX_V256;
5481 defm NAME##Q##SUFF##Z256: avx512_gather
5482 vy64xmem, mgatherv4i64>, EVEX_V256;
5483 defm NAME##D##SUFF##Z128: avx512_gather
5484 vx32xmem, mgatherv4i32>, EVEX_V128;
5485 defm NAME##Q##SUFF##Z128: avx512_gather
5486 vx64xmem, mgatherv2i64>, EVEX_V128;
5487 }
5488 }
5489
5490
5491 defm VGATHER : avx512_gather_q_pd<0x92, 0x93, avx512vl_f64_info, "vgather", "PD">,
5492 avx512_gather_d_ps<0x92, 0x93, avx512vl_f32_info, "vgather", "PS">;
5493
5494 defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q">,
5495 avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">;
54765496
54775497 multiclass avx512_scatter opc, string OpcodeStr, X86VectorVTInfo _,
54785498 X86MemOperand memop, PatFrag ScatterNode> {
559559 return false;
560560 }]>;
561561
562 def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
563 (masked_gather node:$src1, node:$src2, node:$src3) , [{
564 if (MaskedGatherSDNode *Mgt = dyn_cast(N))
565 return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
566 Mgt->getBasePtr().getValueType() == MVT::v4i32);
567 return false;
568 }]>;
569
562570 def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
563571 (masked_gather node:$src1, node:$src2, node:$src3) , [{
564572 if (MaskedGatherSDNode *Mgt = dyn_cast(N))
567575 return false;
568576 }]>;
569577
578 def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
579 (masked_gather node:$src1, node:$src2, node:$src3) , [{
580 if (MaskedGatherSDNode *Mgt = dyn_cast(N))
581 return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
582 Mgt->getBasePtr().getValueType() == MVT::v2i64);
583 return false;
584 }]>;
585 def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
586 (masked_gather node:$src1, node:$src2, node:$src3) , [{
587 if (MaskedGatherSDNode *Mgt = dyn_cast(N))
588 return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
589 Mgt->getBasePtr().getValueType() == MVT::v4i64);
590 return false;
591 }]>;
570592 def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
571593 (masked_gather node:$src1, node:$src2, node:$src3) , [{
572594 if (MaskedGatherSDNode *Mgt = dyn_cast(N))
281281 def X86MemVX64Operand : AsmOperandClass { let Name = "MemVX64"; }
282282 def X86MemVY64Operand : AsmOperandClass { let Name = "MemVY64"; }
283283 def X86MemVZ64Operand : AsmOperandClass { let Name = "MemVZ64"; }
284 def X86MemVX32XOperand : AsmOperandClass { let Name = "MemVX32X"; }
285 def X86MemVY32XOperand : AsmOperandClass { let Name = "MemVY32X"; }
286 def X86MemVX64XOperand : AsmOperandClass { let Name = "MemVX64X"; }
287 def X86MemVY64XOperand : AsmOperandClass { let Name = "MemVY64X"; }
284288 }
285289
286290 def X86AbsMemAsmOperand : AsmOperandClass {
331335 def vy32mem : X86VMemOperand;
332336 def vx64mem : X86VMemOperand;
333337 def vy64mem : X86VMemOperand;
334 def vy64xmem : X86VMemOperand;
338
339 def vx32xmem : X86VMemOperand;
340 def vx64xmem : X86VMemOperand;
341 def vy32xmem : X86VMemOperand;
342 def vy64xmem : X86VMemOperand;
335343 def vz32mem : X86VMemOperand;
336344 def vz64mem : X86VMemOperand;
337345
5555 X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
5656 X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),
5757
58 X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
59 X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
60 X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
61 X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0),
62 X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0),
63 X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0),
64 X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0),
65 X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0),
66 X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0),
67 X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0),
68 X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0),
69 X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0),
70 X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0),
71 X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0),
72 X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0),
73 X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0),
5874 X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
5975 X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
6076 X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
None ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
0 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
11
22 declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
33 declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
99 declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
1010 declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
1111
12 ;CHECK-LABEL: gather_mask_dps
13 ;CHECK: kmovw
14 ;CHECK: vgatherdps
15 ;CHECK: vpadd
16 ;CHECK: vscatterdps
17 ;CHECK: ret
1812 define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
13 ; CHECK-LABEL: gather_mask_dps:
14 ; CHECK: ## BB#0:
15 ; CHECK-NEXT: kmovw %edi, %k1
16 ; CHECK-NEXT: kmovw %k1, %k2
17 ; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
18 ; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
19 ; CHECK-NEXT: vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
20 ; CHECK-NEXT: retq
1921 %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
2022 %ind2 = add <16 x i32> %ind,
2123 call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
2224 ret void
2325 }
2426
25 ;CHECK-LABEL: gather_mask_dpd
26 ;CHECK: kmovw
27 ;CHECK: vgatherdpd
28 ;CHECK: vpadd
29 ;CHECK: vscatterdpd
30 ;CHECK: ret
3127 define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
28 ; CHECK-LABEL: gather_mask_dpd:
29 ; CHECK: ## BB#0:
30 ; CHECK-NEXT: kmovb %edi, %k1
31 ; CHECK-NEXT: kmovw %k1, %k2
32 ; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
33 ; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
34 ; CHECK-NEXT: vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
35 ; CHECK-NEXT: retq
3236 %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
3337 %ind2 = add <8 x i32> %ind,
3438 call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
3539 ret void
3640 }
3741
38 ;CHECK-LABEL: gather_mask_qps
39 ;CHECK: kmovw
40 ;CHECK: vgatherqps
41 ;CHECK: vpadd
42 ;CHECK: vscatterqps
43 ;CHECK: ret
4442 define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
43 ; CHECK-LABEL: gather_mask_qps:
44 ; CHECK: ## BB#0:
45 ; CHECK-NEXT: kmovb %edi, %k1
46 ; CHECK-NEXT: kmovw %k1, %k2
47 ; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
48 ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
49 ; CHECK-NEXT: vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
50 ; CHECK-NEXT: retq
4551 %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
4652 %ind2 = add <8 x i64> %ind,
4753 call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
4854 ret void
4955 }
5056
51 ;CHECK-LABEL: gather_mask_qpd
52 ;CHECK: kmovw
53 ;CHECK: vgatherqpd
54 ;CHECK: vpadd
55 ;CHECK: vscatterqpd
56 ;CHECK: ret
5757 define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
58 ; CHECK-LABEL: gather_mask_qpd:
59 ; CHECK: ## BB#0:
60 ; CHECK-NEXT: kmovb %edi, %k1
61 ; CHECK-NEXT: kmovw %k1, %k2
62 ; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
63 ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
64 ; CHECK-NEXT: vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
65 ; CHECK-NEXT: retq
5866 %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
5967 %ind2 = add <8 x i64> %ind,
6068 call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
7381 declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32)
7482 declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
7583
76 ;CHECK-LABEL: gather_mask_dd
77 ;CHECK: kmovw
78 ;CHECK: vpgatherdd
79 ;CHECK: vpadd
80 ;CHECK: vpscatterdd
81 ;CHECK: ret
8284 define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
85 ; CHECK-LABEL: gather_mask_dd:
86 ; CHECK: ## BB#0:
87 ; CHECK-NEXT: kmovw %edi, %k1
88 ; CHECK-NEXT: kmovw %k1, %k2
89 ; CHECK-NEXT: vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
90 ; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
91 ; CHECK-NEXT: vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
92 ; CHECK-NEXT: retq
8393 %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
8494 %ind2 = add <16 x i32> %ind,
8595 call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
8696 ret void
8797 }
8898
89 ;CHECK-LABEL: gather_mask_qd
90 ;CHECK: kmovw
91 ;CHECK: vpgatherqd
92 ;CHECK: vpadd
93 ;CHECK: vpscatterqd
94 ;CHECK: ret
9599 define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
100 ; CHECK-LABEL: gather_mask_qd:
101 ; CHECK: ## BB#0:
102 ; CHECK-NEXT: kmovb %edi, %k1
103 ; CHECK-NEXT: kmovw %k1, %k2
104 ; CHECK-NEXT: vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
105 ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
106 ; CHECK-NEXT: vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
107 ; CHECK-NEXT: retq
96108 %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
97109 %ind2 = add <8 x i64> %ind,
98110 call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
99111 ret void
100112 }
101113
102 ;CHECK-LABEL: gather_mask_qq
103 ;CHECK: kmovw
104 ;CHECK: vpgatherqq
105 ;CHECK: vpadd
106 ;CHECK: vpscatterqq
107 ;CHECK: ret
108114 define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
115 ; CHECK-LABEL: gather_mask_qq:
116 ; CHECK: ## BB#0:
117 ; CHECK-NEXT: kmovb %edi, %k1
118 ; CHECK-NEXT: kmovw %k1, %k2
119 ; CHECK-NEXT: vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
120 ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
121 ; CHECK-NEXT: vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
122 ; CHECK-NEXT: retq
109123 %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
110124 %ind2 = add <8 x i64> %ind,
111125 call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
112126 ret void
113127 }
114128
115 ;CHECK-LABEL: gather_mask_dq
116 ;CHECK: kmovw
117 ;CHECK: vpgatherdq
118 ;CHECK: vpadd
119 ;CHECK: vpscatterdq
120 ;CHECK: ret
121129 define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
130 ; CHECK-LABEL: gather_mask_dq:
131 ; CHECK: ## BB#0:
132 ; CHECK-NEXT: kmovb %edi, %k1
133 ; CHECK-NEXT: kmovw %k1, %k2
134 ; CHECK-NEXT: vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
135 ; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
136 ; CHECK-NEXT: vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
137 ; CHECK-NEXT: retq
122138 %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
123139 %ind2 = add <8 x i32> %ind,
124140 call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
125141 ret void
126142 }
127143
128
129 ;CHECK-LABEL: gather_mask_dpd_execdomain
130 ;CHECK: vgatherdpd
131 ;CHECK: vmovapd
132 ;CHECK: ret
133144 define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
145 ; CHECK-LABEL: gather_mask_dpd_execdomain:
146 ; CHECK: ## BB#0:
147 ; CHECK-NEXT: kmovb %edi, %k1
148 ; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
149 ; CHECK-NEXT: vmovapd %zmm1, (%rdx)
150 ; CHECK-NEXT: retq
134151 %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
135152 store <8 x double> %x, <8 x double>* %stbuf
136153 ret void
137154 }
138155
139 ;CHECK-LABEL: gather_mask_qpd_execdomain
140 ;CHECK: vgatherqpd
141 ;CHECK: vmovapd
142 ;CHECK: ret
143156 define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
157 ; CHECK-LABEL: gather_mask_qpd_execdomain:
158 ; CHECK: ## BB#0:
159 ; CHECK-NEXT: kmovb %edi, %k1
160 ; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
161 ; CHECK-NEXT: vmovapd %zmm1, (%rdx)
162 ; CHECK-NEXT: retq
144163 %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
145164 store <8 x double> %x, <8 x double>* %stbuf
146165 ret void
147166 }
148167
149 ;CHECK-LABEL: gather_mask_dps_execdomain
150 ;CHECK: vgatherdps
151 ;CHECK: vmovaps
152 ;CHECK: ret
153168 define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
169 ; CHECK-LABEL: gather_mask_dps_execdomain:
170 ; CHECK: ## BB#0:
171 ; CHECK-NEXT: kmovw %edi, %k1
172 ; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
173 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
174 ; CHECK-NEXT: retq
154175 %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
155176 ret <16 x float> %res;
156177 }
157178
158 ;CHECK-LABEL: gather_mask_qps_execdomain
159 ;CHECK: vgatherqps
160 ;CHECK: vmovaps
161 ;CHECK: ret
162179 define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
180 ; CHECK-LABEL: gather_mask_qps_execdomain:
181 ; CHECK: ## BB#0:
182 ; CHECK-NEXT: kmovb %edi, %k1
183 ; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
184 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
185 ; CHECK-NEXT: retq
163186 %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
164187 ret <8 x float> %res;
165188 }
166189
167 ;CHECK-LABEL: scatter_mask_dpd_execdomain
168 ;CHECK: vmovapd
169 ;CHECK: vscatterdpd
170 ;CHECK: ret
171190 define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
172 %x = load <8 x double>, <8 x double>* %src, align 64
191 ; CHECK-LABEL: scatter_mask_dpd_execdomain:
192 ; CHECK: ## BB#0:
193 ; CHECK-NEXT: kmovb %esi, %k1
194 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
195 ; CHECK-NEXT: vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
196 ; CHECK-NEXT: retq
197 %x = load <8 x double>, <8 x double>* %src, align 64
173198 call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
174199 ret void
175200 }
176201
177 ;CHECK-LABEL: scatter_mask_qpd_execdomain
178 ;CHECK: vmovapd
179 ;CHECK: vscatterqpd
180 ;CHECK: ret
181202 define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
203 ; CHECK-LABEL: scatter_mask_qpd_execdomain:
204 ; CHECK: ## BB#0:
205 ; CHECK-NEXT: kmovb %esi, %k1
206 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
207 ; CHECK-NEXT: vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
208 ; CHECK-NEXT: retq
182209 %x = load <8 x double>, <8 x double>* %src, align 64
183210 call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
184211 ret void
185212 }
186213
187 ;CHECK-LABEL: scatter_mask_dps_execdomain
188 ;CHECK: vmovaps
189 ;CHECK: vscatterdps
190 ;CHECK: ret
191214 define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
215 ; CHECK-LABEL: scatter_mask_dps_execdomain:
216 ; CHECK: ## BB#0:
217 ; CHECK-NEXT: kmovw %esi, %k1
218 ; CHECK-NEXT: vmovaps (%rdi), %zmm1
219 ; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
220 ; CHECK-NEXT: retq
192221 %x = load <16 x float>, <16 x float>* %src, align 64
193222 call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
194223 ret void
195224 }
196225
197 ;CHECK-LABEL: scatter_mask_qps_execdomain
198 ;CHECK: vmovaps
199 ;CHECK: vscatterqps
200 ;CHECK: ret
201226 define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
202 %x = load <8 x float>, <8 x float>* %src, align 32
227 ; CHECK-LABEL: scatter_mask_qps_execdomain:
228 ; CHECK: ## BB#0:
229 ; CHECK-NEXT: kmovb %esi, %k1
230 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
231 ; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
232 ; CHECK-NEXT: retq
233 %x = load <8 x float>, <8 x float>* %src, align 32
203234 call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
204235 ret void
205236 }
206237
207 ;CHECK-LABEL: gather_qps
208 ;CHECK: kxnorw
209 ;CHECK: vgatherqps
210 ;CHECK: vpadd
211 ;CHECK: vscatterqps
212 ;CHECK: ret
213238 define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
239 ; CHECK-LABEL: gather_qps:
240 ; CHECK: ## BB#0:
241 ; CHECK-NEXT: kxnorw %k1, %k1, %k1
242 ; CHECK-NEXT: kxnorw %k2, %k2, %k2
243 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
244 ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
245 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
246 ; CHECK-NEXT: retq
214247 %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
215248 %ind2 = add <8 x i64> %ind,
216249 call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
217250 ret void
218251 }
219252
220 ;CHECK-LABEL: prefetch
221 ;CHECK: gatherpf0
222 ;CHECK: gatherpf1
223 ;CHECK: scatterpf0
224 ;CHECK: scatterpf1
225 ;CHECK: ret
226253 declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
227254 declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
228255 define void @prefetch(<8 x i64> %ind, i8* %base) {
256 ; CHECK-LABEL: prefetch:
257 ; CHECK: ## BB#0:
258 ; CHECK-NEXT: kxnorw %k1, %k1, %k1
259 ; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
260 ; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1}
261 ; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1}
262 ; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
263 ; CHECK-NEXT: retq
229264 call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0)
230265 call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 1)
231266 call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 0)
232267 call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 1)
233268 ret void
234269 }
270
271
272 declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32)
273
274 define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
275 ; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
276 ; CHECK: ## BB#0:
277 ; CHECK-NEXT: kmovb %esi, %k1
278 ; CHECK-NEXT: vmovaps %zmm0, %zmm2
279 ; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,4), %xmm2 {%k1}
280 ; CHECK-NEXT: kxnorw %k1, %k1, %k1
281 ; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,0), %xmm0 {%k1}
282 ; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
283 ; CHECK-NEXT: retq
284 %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
285 %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 0)
286 %res2 = fadd <2 x double> %res, %res1
287 ret <2 x double> %res2
288 }
289
290 declare <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8, i32)
291
292 define <4 x i32>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
293 ; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
294 ; CHECK: ## BB#0:
295 ; CHECK-NEXT: kmovb %esi, %k1
296 ; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
297 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
298 ; CHECK-NEXT: retq
299 %res = call <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
300 %res1 = call <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
301 %res2 = add <4 x i32> %res, %res1
302 ret <4 x i32> %res2
303 }
304
305 declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64>, i8, i32)
306
307 define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
308 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
309 ; CHECK: ## BB#0:
310 ; CHECK-NEXT: kmovb %esi, %k1
311 ; CHECK-NEXT: vmovaps %zmm0, %zmm2
312 ; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,4), %ymm2 {%k1}
313 ; CHECK-NEXT: kxnorw %k1, %k1, %k1
314 ; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,0), %ymm0 {%k1}
315 ; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
316 ; CHECK-NEXT: retq
317 %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
318 %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 0)
319 %res2 = fadd <4 x double> %res, %res1
320 ret <4 x double> %res2
321 }
322
; gather3div4.di: 256-bit vpgatherqq with qword indices, scale 8.
; First call masked by %x3, second by all-ones (-1 -> kxnorw); both use
; the same scale, and the two gathers land in separate destinations.
; NOTE(review): return type is an <8 x i32> view of the <4 x i64> data.
323 declare <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8, i32)
324
325 define <8 x i32>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
326 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
327 ; CHECK: ## BB#0:
328 ; CHECK-NEXT: kmovb %esi, %k1
329 ; CHECK-NEXT: vmovaps %zmm0, %zmm2
330 ; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
331 ; CHECK-NEXT: kxnorw %k1, %k1, %k1
332 ; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
333 ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
334 ; CHECK-NEXT: retq
335 %res = call <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
336 %res1 = call <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
337 %res2 = add <8 x i32> %res, %res1
338 ret <8 x i32> %res2
339 }
340
; gather3div4.sf: 128-bit vgatherqps with qword indices.
; Variable mask + scale 4 first, then all-ones mask (kxnorw) + scale 0.
341 declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>, i8, i32)
342
343 define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
344 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
345 ; CHECK: ## BB#0:
346 ; CHECK-NEXT: kmovb %esi, %k1
347 ; CHECK-NEXT: vmovaps %zmm0, %zmm2
348 ; CHECK-NEXT: vgatherqps (%rdi,%xmm1,4), %xmm2 {%k1}
349 ; CHECK-NEXT: kxnorw %k1, %k1, %k1
350 ; CHECK-NEXT: vgatherqps (%rdi,%xmm1,0), %xmm0 {%k1}
351 ; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
352 ; CHECK-NEXT: retq
353 %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
354 %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 0)
355 %res2 = fadd <4 x float> %res, %res1
356 ret <4 x float> %res2
357 }
358
; gather3div4.si: 128-bit vpgatherqd with qword indices, both calls at
; scale 4. Here the ORDER is flipped vs. the other tests: the first call
; uses the all-ones mask (kxnorw into %k2), the second the variable mask
; %x3 (kmovb into %k1).
359 declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8, i32)
360
361 define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
362 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
363 ; CHECK: ## BB#0:
364 ; CHECK-NEXT: kmovb %esi, %k1
365 ; CHECK-NEXT: kxnorw %k2, %k2, %k2
366 ; CHECK-NEXT: vmovaps %zmm0, %zmm2
367 ; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
368 ; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
369 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
370 ; CHECK-NEXT: retq
371 %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 4)
372 %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
373 %res2 = add <4 x i32> %res, %res1
374 ret <4 x i32> %res2
375 }
376
; gather3div8.sf: vgatherqps with 256-bit qword indices producing a
; 128-bit float result. Variable mask + scale 4, then all-ones + scale 0.
377 declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>, i8, i32)
378
379 define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
380 ; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
381 ; CHECK: ## BB#0:
382 ; CHECK-NEXT: kmovb %esi, %k1
383 ; CHECK-NEXT: vmovaps %zmm0, %zmm2
384 ; CHECK-NEXT: vgatherqps (%rdi,%ymm1,4), %xmm2 {%k1}
385 ; CHECK-NEXT: kxnorw %k1, %k1, %k1
386 ; CHECK-NEXT: vgatherqps (%rdi,%ymm1,0), %xmm0 {%k1}
387 ; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
388 ; CHECK-NEXT: retq
389 %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
390 %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 0)
391 %res2 = fadd <4 x float> %res, %res1
392 ret <4 x float> %res2
393 }
394
; gather3div8.si: vpgatherqd with 256-bit qword indices, 128-bit result.
; Both calls use the SAME variable mask %x3 but DIFFERENT scales (4 vs 2),
; so the mask is duplicated with kmovw %k1, %k2 rather than rebuilt.
395 declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8, i32)
396
397 define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
398 ; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
399 ; CHECK: ## BB#0:
400 ; CHECK-NEXT: kmovb %esi, %k1
401 ; CHECK-NEXT: vmovaps %zmm0, %zmm2
402 ; CHECK-NEXT: kmovw %k1, %k2
403 ; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
404 ; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
405 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
406 ; CHECK-NEXT: retq
407 %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
408 %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2)
409 %res2 = add <4 x i32> %res, %res1
410 ret <4 x i32> %res2
411 }
412
; gather3siv2.df: 128-bit vgatherdpd with dword indices.
; Variable mask + scale 4, then all-ones mask (kxnorw) + scale 0.
413 declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32>, i8, i32)
414
415 define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
416 ; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
417 ; CHECK: ## BB#0:
418 ; CHECK-NEXT: kmovb %esi, %k1
419 ; CHECK-NEXT: vmovaps %zmm0, %zmm2
420 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %xmm2 {%k1}
421 ; CHECK-NEXT: kxnorw %k1, %k1, %k1
422 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,0), %xmm0 {%k1}
423 ; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
424 ; CHECK-NEXT: retq
425 %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
426 %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 0)
427 %res2 = fadd <2 x double> %res, %res1
428 ret <2 x double> %res2
429 }
430
; gather3siv2.di: 128-bit vpgatherdq with dword indices. The two calls
; are identical (same mask, same scale 8), so a single gather followed by
; a doubling vpaddd is expected.
431 declare <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8, i32)
432
433 define <4 x i32>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
434 ; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
435 ; CHECK: ## BB#0:
436 ; CHECK-NEXT: kmovb %esi, %k1
437 ; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
438 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
439 ; CHECK-NEXT: retq
440 %res = call <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
441 %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
442 %res2 = add <4 x i32> %res, %res1
443 ret <4 x i32> %res2
444 }
445
; gather3siv4.df: 256-bit vgatherdpd with 128-bit dword indices.
; Variable mask + scale 4, then all-ones mask (kxnorw) + scale 0.
446 declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32>, i8, i32)
447
448 define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
449 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
450 ; CHECK: ## BB#0:
451 ; CHECK-NEXT: kmovb %esi, %k1
452 ; CHECK-NEXT: vmovaps %zmm0, %zmm2
453 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %ymm2 {%k1}
454 ; CHECK-NEXT: kxnorw %k1, %k1, %k1
455 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,0), %ymm0 {%k1}
456 ; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
457 ; CHECK-NEXT: retq
458 %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
459 %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 0)
460 %res2 = fadd <4 x double> %res, %res1
461 ret <4 x double> %res2
462 }
463
; gather3siv4.di: 256-bit vpgatherdq with dword indices. Identical calls
; (same mask %x3, scale 8) collapse to one gather plus a doubling add.
464 declare <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8, i32)
465
466 define <8 x i32>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
467 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
468 ; CHECK: ## BB#0:
469 ; CHECK-NEXT: kmovb %esi, %k1
470 ; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
471 ; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0
472 ; CHECK-NEXT: retq
473 %res = call <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
474 %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
475 %res2 = add <8 x i32> %res, %res1
476 ret <8 x i32> %res2
477 }
478
; gather3siv4.sf: 128-bit vgatherdps with dword indices.
; Variable mask + scale 4, then all-ones mask (kxnorw) + scale 0.
479 declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, i8, i32)
480
481 define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
482 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
483 ; CHECK: ## BB#0:
484 ; CHECK-NEXT: kmovb %esi, %k1
485 ; CHECK-NEXT: vmovaps %zmm0, %zmm2
486 ; CHECK-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm2 {%k1}
487 ; CHECK-NEXT: kxnorw %k1, %k1, %k1
488 ; CHECK-NEXT: vgatherdps (%rdi,%xmm1,0), %xmm0 {%k1}
489 ; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
490 ; CHECK-NEXT: retq
491 %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
492 %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 0)
493 %res2 = fadd <4 x float> %res, %res1
494 ret <4 x float> %res2
495 }
496
; gather3siv4.si: 128-bit vpgatherdd with dword indices. Order flipped:
; first call all-ones mask (kxnorw into %k2) + scale 4, second call
; variable mask %x3 (%k1) + scale 0.
497 declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8, i32)
498
499 define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
500 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
501 ; CHECK: ## BB#0:
502 ; CHECK-NEXT: kmovb %esi, %k1
503 ; CHECK-NEXT: kxnorw %k2, %k2, %k2
504 ; CHECK-NEXT: vmovaps %zmm0, %zmm2
505 ; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
506 ; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,0), %xmm0 {%k1}
507 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
508 ; CHECK-NEXT: retq
509 %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 4)
510 %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 0)
511 %res2 = add <4 x i32> %res, %res1
512 ret <4 x i32> %res2
513 }
514
; gather3siv8.sf: 256-bit vgatherdps with dword indices.
; Variable mask + scale 4, then all-ones mask (kxnorw) + scale 0.
515 declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, i8, i32)
516
517 define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
518 ; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
519 ; CHECK: ## BB#0:
520 ; CHECK-NEXT: kmovb %esi, %k1
521 ; CHECK-NEXT: vmovaps %zmm0, %zmm2
522 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm2 {%k1}
523 ; CHECK-NEXT: kxnorw %k1, %k1, %k1
524 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,0), %ymm0 {%k1}
525 ; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
526 ; CHECK-NEXT: retq
527 %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
528 %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 0)
529 %res2 = fadd <8 x float> %res, %res1
530 ret <8 x float> %res2
531 }
532
; gather3siv8.si: 256-bit vpgatherdd with dword indices. Both calls use
; the SAME variable mask %x3 but different scales (4 vs 0), so the mask
; is duplicated with kmovw %k1, %k2 instead of being re-materialized.
533 declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8, i32)
534
535 define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
536 ; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
537 ; CHECK: ## BB#0:
538 ; CHECK-NEXT: kmovb %esi, %k1
539 ; CHECK-NEXT: vmovaps %zmm0, %zmm2
540 ; CHECK-NEXT: kmovw %k1, %k2
541 ; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
542 ; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,0), %ymm0 {%k1}
543 ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
544 ; CHECK-NEXT: retq
545 %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
546 %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 0)
547 %res2 = add <8 x i32> %res, %res1
548 ret <8 x i32> %res2
549 }
97929792 // CHECK: vpabsq -1032(%rdx){1to8}, %zmm5
97939793 // CHECK: encoding: [0x62,0xf2,0xfd,0x58,0x1f,0xaa,0xf8,0xfb,0xff,0xff]
97949794 vpabsq -1032(%rdx){1to8}, %zmm5
9795
9796 // CHECK: vpgatherdd 123(%r14,%zmm11,8), %zmm17 {%k1}
9797 // CHECK: encoding: [0x62,0x82,0x7d,0x49,0x90,0x8c,0xde,0x7b,0x00,0x00,0x00]
9798 vpgatherdd 123(%r14, %zmm11,8), %zmm17 {%k1}
9799
9800 // CHECK: vpgatherdd 256(%r9,%zmm11), %zmm17 {%k1}
9801 // CHECK: encoding: [0x62,0x82,0x7d,0x49,0x90,0x4c,0x19,0x40]
9802 vpgatherdd 256(%r9,%zmm11), %zmm17 {%k1}
9803
9804 // CHECK: vpgatherdd 1024(%rcx,%zmm11,4), %zmm17 {%k1}
9805 // CHECK: encoding: [0x62,0xa2,0x7d,0x49,0x90,0x8c,0x99,0x00,0x04,0x00,0x00]
9806 vpgatherdd 1024(%rcx, %zmm11,4), %zmm17 {%k1}
9807
9808 // CHECK: vpgatherdq 123(%r14,%ymm14,8), %zmm8 {%k1}
9809 // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x90,0x84,0xf6,0x7b,0x00,0x00,0x00]
9810 vpgatherdq 123(%r14, %ymm14,8), %zmm8 {%k1}
9811
9812 // CHECK: vpgatherdq 256(%r9,%ymm14), %zmm8 {%k1}
9813 // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x90,0x44,0x31,0x20]
9814 vpgatherdq 256(%r9, %ymm14), %zmm8 {%k1}
9815
9816 // CHECK: vpgatherdq 1024(%rcx,%ymm14,4), %zmm8 {%k1}
9817 // CHECK: encoding: [0x62,0x32,0xfd,0x49,0x90,0x84,0xb1,0x00,0x04,0x00,0x00]
9818 vpgatherdq 1024(%rcx, %ymm14,4), %zmm8 {%k1}
9819
9820 // CHECK: vpgatherqd 123(%r14,%zmm17,8), %ymm3 {%k1}
9821 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x91,0x9c,0xce,0x7b,0x00,0x00,0x00]
9822 vpgatherqd 123(%r14, %zmm17,8), %ymm3 {%k1}
9823
9824 // CHECK: vpgatherqd 256(%r9,%zmm17), %ymm3 {%k1}
9825 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x91,0x5c,0x09,0x40]
9826 vpgatherqd 256(%r9,%zmm17), %ymm3 {%k1}
9827
9828 // CHECK: vpgatherqd 1024(%rcx,%zmm17,4), %ymm3 {%k1}
9829 // CHECK: encoding: [0x62,0xf2,0x7d,0x41,0x91,0x9c,0x89,0x00,0x04,0x00,0x00]
9830 vpgatherqd 1024(%rcx, %zmm17,4), %ymm3 {%k1}
9831
9832 // CHECK: vpgatherqq 123(%r14,%zmm21,8), %zmm17 {%k1}
9833 // CHECK: encoding: [0x62,0xc2,0xfd,0x41,0x91,0x8c,0xee,0x7b,0x00,0x00,0x00]
9834 vpgatherqq 123(%r14, %zmm21,8), %zmm17 {%k1}
9835
9836 // CHECK: vpgatherqq 256(%r9,%zmm21), %zmm17 {%k1}
9837 // CHECK: encoding: [0x62,0xc2,0xfd,0x41,0x91,0x4c,0x29,0x20]
9838 vpgatherqq 256(%r9,%zmm21), %zmm17 {%k1}
9839
9840 // CHECK: vpgatherqq 1024(%rcx,%zmm21,4), %zmm17 {%k1}
9841 // CHECK: encoding: [0x62,0xe2,0xfd,0x41,0x91,0x8c,0xa9,0x00,0x04,0x00,0x00]
9842 vpgatherqq 1024(%rcx, %zmm21,4), %zmm17 {%k1}
9843
9844 // CHECK: vpscatterdd %zmm19, 123(%r14,%zmm16,8) {%k1}
9845 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x9c,0xc6,0x7b,0x00,0x00,0x00]
9846 vpscatterdd %zmm19, 123(%r14,%zmm16,8) {%k1}
9847
9848 // CHECK: vpscatterdd %zmm19, 123(%r14,%zmm16,8) {%k1}
9849 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x9c,0xc6,0x7b,0x00,0x00,0x00]
9850 vpscatterdd %zmm19, 123(%r14,%zmm16,8) {%k1}
9851
9852 // CHECK: vpscatterdd %zmm19, 256(%r9,%zmm16) {%k1}
9853 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x5c,0x01,0x40]
9854 vpscatterdd %zmm19, 256(%r9,%zmm16) {%k1}
9855
9856 // CHECK: vpscatterdd %zmm19, 1024(%rcx,%zmm16,4) {%k1}
9857 // CHECK: encoding: [0x62,0xe2,0x7d,0x41,0xa0,0x9c,0x81,0x00,0x04,0x00,0x00]
9858 vpscatterdd %zmm19, 1024(%rcx,%zmm16,4) {%k1}
9859
9860 // CHECK: vpscatterdq %zmm5, 123(%r14,%ymm6,8) {%k1}
9861 // CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0xac,0xf6,0x7b,0x00,0x00,0x00]
9862 vpscatterdq %zmm5, 123(%r14,%ymm6,8) {%k1}
9863
9864 // CHECK: vpscatterdq %zmm5, 123(%r14,%ymm6,8) {%k1}
9865 // CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0xac,0xf6,0x7b,0x00,0x00,0x00]
9866 vpscatterdq %zmm5, 123(%r14,%ymm6,8) {%k1}
9867
9868 // CHECK: vpscatterdq %zmm5, 256(%r9,%ymm6) {%k1}
9869 // CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0x6c,0x31,0x20]
9870 vpscatterdq %zmm5, 256(%r9,%ymm6) {%k1}
9871
9872 // CHECK: vpscatterdq %zmm5, 1024(%rcx,%ymm6,4) {%k1}
9873 // CHECK: encoding: [0x62,0xf2,0xfd,0x49,0xa0,0xac,0xb1,0x00,0x04,0x00,0x00]
9874 vpscatterdq %zmm5, 1024(%rcx,%ymm6,4) {%k1}
9875
9876 // CHECK: vpscatterqd %ymm20, 123(%r14,%zmm2,8) {%k1}
9877 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0xa4,0xd6,0x7b,0x00,0x00,0x00]
9878 vpscatterqd %ymm20, 123(%r14,%zmm2,8) {%k1}
9879
9880 // CHECK: vpscatterqd %ymm20, 123(%r14,%zmm2,8) {%k1}
9881 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0xa4,0xd6,0x7b,0x00,0x00,0x00]
9882 vpscatterqd %ymm20, 123(%r14,%zmm2,8) {%k1}
9883
9884 // CHECK: vpscatterqd %ymm20, 256(%r9,%zmm2) {%k1}
9885 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0x64,0x11,0x40]
9886 vpscatterqd %ymm20, 256(%r9,%zmm2) {%k1}
9887
9888 // CHECK: vpscatterqd %ymm20, 1024(%rcx,%zmm2,4) {%k1}
9889 // CHECK: encoding: [0x62,0xe2,0x7d,0x49,0xa1,0xa4,0x91,0x00,0x04,0x00,0x00]
9890 vpscatterqd %ymm20, 1024(%rcx,%zmm2,4) {%k1}
9891
9892 // CHECK: vpscatterqq %zmm14, 123(%r14,%zmm20,8) {%k1}
9893 // CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0xb4,0xe6,0x7b,0x00,0x00,0x00]
9894 vpscatterqq %zmm14, 123(%r14,%zmm20,8) {%k1}
9895
9896 // CHECK: vpscatterqq %zmm14, 123(%r14,%zmm20,8) {%k1}
9897 // CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0xb4,0xe6,0x7b,0x00,0x00,0x00]
9898 vpscatterqq %zmm14, 123(%r14,%zmm20,8) {%k1}
9899
9900 // CHECK: vpscatterqq %zmm14, 256(%r9,%zmm20) {%k1}
9901 // CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0x74,0x21,0x20]
9902 vpscatterqq %zmm14, 256(%r9,%zmm20) {%k1}
9903
9904 // CHECK: vpscatterqq %zmm14, 1024(%rcx,%zmm20,4) {%k1}
9905 // CHECK: encoding: [0x62,0x72,0xfd,0x41,0xa1,0xb4,0xa1,0x00,0x04,0x00,0x00]
9906 vpscatterqq %zmm14, 1024(%rcx,%zmm20,4) {%k1}
14511451 // CHECK: encoding: [0x62,0xe2,0xfd,0x38,0x1f,0xb2,0xf8,0xfb,0xff,0xff]
14521452 vpabsq -1032(%rdx){1to4}, %ymm22
14531453
1454 // CHECK: vpgatherdd 123(%r14,%xmm31,8), %xmm17 {%k1}
1455 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0x8c,0xfe,0x7b,0x00,0x00,0x00]
1456 vpgatherdd 123(%r14,%xmm31,8), %xmm17 {%k1}
1457
1458 // CHECK: vpgatherdd 256(%r9,%xmm31), %xmm17 {%k1}
1459 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0x4c,0x39,0x40]
1460 vpgatherdd 256(%r9,%xmm31), %xmm17 {%k1}
1461
1462 // CHECK: vpgatherdd 1024(%rcx,%xmm31,4), %xmm17 {%k1}
1463 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
1464 vpgatherdd 1024(%rcx,%xmm31,4), %xmm17 {%k1}
1465
1466 // CHECK: vpgatherdd 123(%r14,%ymm31,8), %ymm19 {%k1}
1467 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x90,0x9c,0xfe,0x7b,0x00,0x00,0x00]
1468 vpgatherdd 123(%r14,%ymm31,8), %ymm19 {%k1}
1469
1470 // CHECK: vpgatherdd 256(%r9,%ymm31), %ymm19 {%k1}
1471 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x90,0x5c,0x39,0x40]
1472 vpgatherdd 256(%r9,%ymm31), %ymm19 {%k1}
1473
1474 // CHECK: vpgatherdd 1024(%rcx,%ymm31,4), %ymm19 {%k1}
1475 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0x90,0x9c,0xb9,0x00,0x04,0x00,0x00]
1476 vpgatherdd 1024(%rcx,%ymm31,4), %ymm19 {%k1}
1477
1478 // CHECK: vpgatherdq 123(%r14,%xmm31,8), %xmm17 {%k1}
1479 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x90,0x8c,0xfe,0x7b,0x00,0x00,0x00]
1480 vpgatherdq 123(%r14,%xmm31,8), %xmm17 {%k1}
1481
1482 // CHECK: vpgatherdq 256(%r9,%xmm31), %xmm17 {%k1}
1483 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x90,0x4c,0x39,0x20]
1484 vpgatherdq 256(%r9,%xmm31), %xmm17 {%k1}
1485
1486 // CHECK: vpgatherdq 1024(%rcx,%xmm31,4), %xmm17 {%k1}
1487 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
1488 vpgatherdq 1024(%rcx,%xmm31,4), %xmm17 {%k1}
1489
1490 // CHECK: vpgatherdq 123(%r14,%xmm31,8), %ymm26 {%k1}
1491 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x90,0x94,0xfe,0x7b,0x00,0x00,0x00]
1492 vpgatherdq 123(%r14,%xmm31,8), %ymm26 {%k1}
1493
1494 // CHECK: vpgatherdq 256(%r9,%xmm31), %ymm26 {%k1}
1495 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x90,0x54,0x39,0x20]
1496 vpgatherdq 256(%r9,%xmm31), %ymm26 {%k1}
1497
1498 // CHECK: vpgatherdq 1024(%rcx,%xmm31,4), %ymm26 {%k1}
1499 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0x90,0x94,0xb9,0x00,0x04,0x00,0x00]
1500 vpgatherdq 1024(%rcx,%xmm31,4), %ymm26 {%k1}
1501
1502 // CHECK: vpgatherqd 123(%r14,%xmm31,8), %xmm21 {%k1}
1503 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x91,0xac,0xfe,0x7b,0x00,0x00,0x00]
1504 vpgatherqd 123(%r14,%xmm31,8), %xmm21 {%k1}
1505
1506 // CHECK: vpgatherqd 256(%r9,%xmm31), %xmm21 {%k1}
1507 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x91,0x6c,0x39,0x40]
1508 vpgatherqd 256(%r9,%xmm31), %xmm21 {%k1}
1509
1510 // CHECK: vpgatherqd 1024(%rcx,%xmm31,4), %xmm21 {%k1}
1511 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x91,0xac,0xb9,0x00,0x04,0x00,0x00]
1512 vpgatherqd 1024(%rcx,%xmm31,4), %xmm21 {%k1}
1513
1514 // CHECK: vpgatherqd 123(%r14,%ymm31,8), %xmm25 {%k1}
1515 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0x8c,0xfe,0x7b,0x00,0x00,0x00]
1516 vpgatherqd 123(%r14,%ymm31,8), %xmm25 {%k1}
1517
1518 // CHECK: vpgatherqd 256(%r9,%ymm31), %xmm25 {%k1}
1519 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0x4c,0x39,0x40]
1520 vpgatherqd 256(%r9,%ymm31), %xmm25 {%k1}
1521
1522 // CHECK: vpgatherqd 1024(%rcx,%ymm31,4), %xmm25 {%k1}
1523 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x91,0x8c,0xb9,0x00,0x04,0x00,0x00]
1524 vpgatherqd 1024(%rcx,%ymm31,4), %xmm25 {%k1}
1525
1526 // CHECK: vpgatherqq 123(%r14,%xmm31,8), %xmm18 {%k1}
1527 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0x94,0xfe,0x7b,0x00,0x00,0x00]
1528 vpgatherqq 123(%r14,%xmm31,8), %xmm18 {%k1}
1529
1530 // CHECK: vpgatherqq 256(%r9,%xmm31), %xmm18 {%k1}
1531 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0x54,0x39,0x20]
1532 vpgatherqq 256(%r9,%xmm31), %xmm18 {%k1}
1533
1534 // CHECK: vpgatherqq 1024(%rcx,%xmm31,4), %xmm18 {%k1}
1535 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x91,0x94,0xb9,0x00,0x04,0x00,0x00]
1536 vpgatherqq 1024(%rcx,%xmm31,4), %xmm18 {%k1}
1537
1538 // CHECK: vpgatherqq 123(%r14,%ymm31,8), %ymm19 {%k1}
1539 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x91,0x9c,0xfe,0x7b,0x00,0x00,0x00]
1540 vpgatherqq 123(%r14,%ymm31,8), %ymm19 {%k1}
1541
1542 // CHECK: vpgatherqq 256(%r9,%ymm31), %ymm19 {%k1}
1543 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x91,0x5c,0x39,0x20]
1544 vpgatherqq 256(%r9,%ymm31), %ymm19 {%k1}
1545
1546 // CHECK: vpgatherqq 1024(%rcx,%ymm31,4), %ymm19 {%k1}
1547 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x91,0x9c,0xb9,0x00,0x04,0x00,0x00]
1548 vpgatherqq 1024(%rcx,%ymm31,4), %ymm19 {%k1}
1549
1550 // CHECK: vgatherdpd 123(%r14,%xmm31,8), %xmm17 {%k1}
1551 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0x8c,0xfe,0x7b,0x00,0x00,0x00]
1552 vgatherdpd 123(%r14,%xmm31,8), %xmm17 {%k1}
1553
1554 // CHECK: vgatherdpd 256(%r9,%xmm31), %xmm17 {%k1}
1555 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0x4c,0x39,0x20]
1556 vgatherdpd 256(%r9,%xmm31), %xmm17 {%k1}
1557
1558 // CHECK: vgatherdpd 1024(%rcx,%xmm31,4), %xmm17 {%k1}
1559 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x92,0x8c,0xb9,0x00,0x04,0x00,0x00]
1560 vgatherdpd 1024(%rcx,%xmm31,4), %xmm17 {%k1}
1561
1562 // CHECK: vgatherdpd 123(%r14,%xmm31,8), %ymm23 {%k1}
1563 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0xbc,0xfe,0x7b,0x00,0x00,0x00]
1564 vgatherdpd 123(%r14,%xmm31,8), %ymm23 {%k1}
1565
1566 // CHECK: vgatherdpd 256(%r9,%xmm31), %ymm23 {%k1}
1567 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0x7c,0x39,0x20]
1568 vgatherdpd 256(%r9,%xmm31), %ymm23 {%k1}
1569
1570 // CHECK: vgatherdpd 1024(%rcx,%xmm31,4), %ymm23 {%k1}
1571 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x92,0xbc,0xb9,0x00,0x04,0x00,0x00]
1572 vgatherdpd 1024(%rcx,%xmm31,4), %ymm23 {%k1}
1573
1574 // CHECK: vgatherdps 123(%r14,%xmm31,8), %xmm18 {%k1}
1575 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x92,0x94,0xfe,0x7b,0x00,0x00,0x00]
1576 vgatherdps 123(%r14,%xmm31,8), %xmm18 {%k1}
1577
1578 // CHECK: vgatherdps 256(%r9,%xmm31), %xmm18 {%k1}
1579 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x92,0x54,0x39,0x40]
1580 vgatherdps 256(%r9,%xmm31), %xmm18 {%k1}
1581
1582 // CHECK: vgatherdps 1024(%rcx,%xmm31,4), %xmm18 {%k1}
1583 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x92,0x94,0xb9,0x00,0x04,0x00,0x00]
1584 vgatherdps 1024(%rcx,%xmm31,4), %xmm18 {%k1}
1585
1586 // CHECK: vgatherdps 123(%r14,%ymm31,8), %ymm27 {%k1}
1587 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x92,0x9c,0xfe,0x7b,0x00,0x00,0x00]
1588 vgatherdps 123(%r14,%ymm31,8), %ymm27 {%k1}
1589
1590 // CHECK: vgatherdps 256(%r9,%ymm31), %ymm27 {%k1}
1591 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x92,0x5c,0x39,0x40]
1592 vgatherdps 256(%r9,%ymm31), %ymm27 {%k1}
1593
1594 // CHECK: vgatherdps 1024(%rcx,%ymm31,4), %ymm27 {%k1}
1595 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x92,0x9c,0xb9,0x00,0x04,0x00,0x00]
1596 vgatherdps 1024(%rcx,%ymm31,4), %ymm27 {%k1}
1597
1598 // CHECK: vgatherqpd 123(%r14,%xmm31,8), %xmm17 {%k1}
1599 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x8c,0xfe,0x7b,0x00,0x00,0x00]
1600 vgatherqpd 123(%r14,%xmm31,8), %xmm17 {%k1}
1601
1602 // CHECK: vgatherqpd 256(%r9,%xmm31), %xmm17 {%k1}
1603 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x4c,0x39,0x20]
1604 vgatherqpd 256(%r9,%xmm31), %xmm17 {%k1}
1605
1606 // CHECK: vgatherqpd 1024(%rcx,%xmm31,4), %xmm17 {%k1}
1607 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x93,0x8c,0xb9,0x00,0x04,0x00,0x00]
1608 vgatherqpd 1024(%rcx,%xmm31,4), %xmm17 {%k1}
1609
1610 // CHECK: vgatherqpd 123(%r14,%ymm31,8), %ymm29 {%k1}
1611 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x93,0xac,0xfe,0x7b,0x00,0x00,0x00]
1612 vgatherqpd 123(%r14,%ymm31,8), %ymm29 {%k1}
1613
1614 // CHECK: vgatherqpd 256(%r9,%ymm31), %ymm29 {%k1}
1615 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x93,0x6c,0x39,0x20]
1616 vgatherqpd 256(%r9,%ymm31), %ymm29 {%k1}
1617
1618 // CHECK: vgatherqpd 1024(%rcx,%ymm31,4), %ymm29 {%k1}
1619 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0x93,0xac,0xb9,0x00,0x04,0x00,0x00]
1620 vgatherqpd 1024(%rcx,%ymm31,4), %ymm29 {%k1}
1621
1622 // CHECK: vgatherqps 123(%r14,%xmm31,8), %xmm21 {%k1}
1623 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0xac,0xfe,0x7b,0x00,0x00,0x00]
1624 vgatherqps 123(%r14,%xmm31,8), %xmm21 {%k1}
1625
1626 // CHECK: vgatherqps 256(%r9,%xmm31), %xmm21 {%k1}
1627 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0x6c,0x39,0x40]
1628 vgatherqps 256(%r9,%xmm31), %xmm21 {%k1}
1629
1630 // CHECK: vgatherqps 1024(%rcx,%xmm31,4), %xmm21 {%k1}
1631 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x93,0xac,0xb9,0x00,0x04,0x00,0x00]
1632 vgatherqps 1024(%rcx,%xmm31,4), %xmm21 {%k1}
1633
1634 // CHECK: vgatherqps 123(%r14,%ymm31,8), %xmm19 {%k1}
1635 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x93,0x9c,0xfe,0x7b,0x00,0x00,0x00]
1636 vgatherqps 123(%r14,%ymm31,8), %xmm19 {%k1}
1637
1638 // CHECK: vgatherqps 256(%r9,%ymm31), %xmm19 {%k1}
1639 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x93,0x5c,0x39,0x40]
1640 vgatherqps 256(%r9,%ymm31), %xmm19 {%k1}
1641
1642 // CHECK: vgatherqps 1024(%rcx,%ymm31,4), %xmm19 {%k1}
1643 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0x93,0x9c,0xb9,0x00,0x04,0x00,0x00]
1644 vgatherqps 1024(%rcx,%ymm31,4), %xmm19 {%k1}
10261026 TYPE("GR32_NOAX", TYPE_Rv)
10271027 TYPE("GR64_NOAX", TYPE_R64)
10281028 TYPE("vx32mem", TYPE_M32)
1029 TYPE("vx32xmem", TYPE_M32)
10291030 TYPE("vy32mem", TYPE_M32)
1031 TYPE("vy32xmem", TYPE_M32)
10301032 TYPE("vz32mem", TYPE_M32)
10311033 TYPE("vx64mem", TYPE_M64)
1034 TYPE("vx64xmem", TYPE_M64)
10321035 TYPE("vy64mem", TYPE_M64)
10331036 TYPE("vy64xmem", TYPE_M64)
10341037 TYPE("vz64mem", TYPE_M64)
12121215 ENCODING("opaque80mem", ENCODING_RM)
12131216 ENCODING("opaque512mem", ENCODING_RM)
12141217 ENCODING("vx32mem", ENCODING_RM)
1218 ENCODING("vx32xmem", ENCODING_RM)
12151219 ENCODING("vy32mem", ENCODING_RM)
1220 ENCODING("vy32xmem", ENCODING_RM)
12161221 ENCODING("vz32mem", ENCODING_RM)
12171222 ENCODING("vx64mem", ENCODING_RM)
1223 ENCODING("vx64xmem", ENCODING_RM)
12181224 ENCODING("vy64mem", ENCODING_RM)
12191225 ENCODING("vy64xmem", ENCODING_RM)
12201226 ENCODING("vz64mem", ENCODING_RM)