llvm.org GIT mirror — llvm / a15cc59
[X86] Add versions of the avx512 gather intrinsics that take the mask as a vXi1 vector instead of a scalar

In keeping with our general direction of having the vXi1 type present in IR, this patch converts the mask argument for avx512 gather to vXi1. This can avoid k-register to GPR to k-register transitions late in codegen.

I left the existing intrinsics behind because they have many out-of-tree users such as ISPC. They generate their own code and don't go through the autoupgrade path, which only works for bitcode and ll parsing. Ideally we will get them to migrate to target-independent intrinsics, but it might be easier for them to migrate to these new intrinsics.

I'll work on scatter and gatherpf/scatterpf next.

Differential Revision: https://reviews.llvm.org/D56527

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@351234 91177308-0d34-0410-b5e6-96231b3b80d8

Craig Topper, 1 year, 10 months ago
5 changed files with 1224 additions and 179 deletions.
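For orientation, a minimal IR sketch of the old and new forms side by side (the function name and the i32 4 scale operand are illustrative; both intrinsic signatures are taken from this patch, and the i16-to-<16 x i1> bitcast mirrors what the updated tests below do):

declare <16 x float> @llvm.x86.avx512.gather.dps.512(<16 x float>, i8*, <16 x i32>, i16, i32)
declare <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float>, i8*, <16 x i32>, <16 x i1>, i32)

define <16 x float> @gather_both_forms(<16 x float> %src, i8* %base, <16 x i32> %ind, i16 %mask) {
  ; Old form: a vXi1 mask already in a k-register has to round-trip
  ; through a GPR to become the scalar i16 argument.
  %old = call <16 x float> @llvm.x86.avx512.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, i16 %mask, i32 4)
  ; New form: the mask is a <16 x i1> vector that can stay in a k-register.
  %m = bitcast i16 %mask to <16 x i1>
  %new = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %old, i8* %base, <16 x i32> %ind, <16 x i1> %m, i32 4)
  ret <16 x float> %new
}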
35683568
35693569 // Gather and Scatter ops
35703570 let TargetPrefix = "x86" in {
3571 def int_x86_avx512_gather_dpd_512 : GCCBuiltin<"__builtin_ia32_gathersiv8df">,
3571 // These are gather intrinsics that use a scalar integer for the mask. They
3572 // have been superseded by new versions that use a vXi1 mask. Leaving these
3573 // for now as they have multiple out of tree users that need to migrate.
3574 // TODO: Remove when we can confirm out of tree migration.
3575 def int_x86_avx512_gather_dpd_512 :
35723576 Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
35733577 llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
35743578 [IntrReadMem, IntrArgMemOnly]>;
3575 def int_x86_avx512_gather_dps_512 : GCCBuiltin<"__builtin_ia32_gathersiv16sf">,
3579 def int_x86_avx512_gather_dps_512 :
35763580 Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_ptr_ty,
35773581 llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
35783582 [IntrReadMem, IntrArgMemOnly]>;
3579 def int_x86_avx512_gather_qpd_512 : GCCBuiltin<"__builtin_ia32_gatherdiv8df">,
3583 def int_x86_avx512_gather_qpd_512 :
35803584 Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
35813585 llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
35823586 [IntrReadMem, IntrArgMemOnly]>;
3583 def int_x86_avx512_gather_qps_512 : GCCBuiltin<"__builtin_ia32_gatherdiv16sf">,
3587 def int_x86_avx512_gather_qps_512 :
35843588 Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_ptr_ty,
35853589 llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
35863590 [IntrReadMem, IntrArgMemOnly]>;
35873591
35883592
3589 def int_x86_avx512_gather_dpq_512 : GCCBuiltin<"__builtin_ia32_gathersiv8di">,
3593 def int_x86_avx512_gather_dpq_512 :
35903594 Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
35913595 llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
35923596 [IntrReadMem, IntrArgMemOnly]>;
3593 def int_x86_avx512_gather_dpi_512 : GCCBuiltin<"__builtin_ia32_gathersiv16si">,
3597 def int_x86_avx512_gather_dpi_512 :
35943598 Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty,
35953599 llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
35963600 [IntrReadMem, IntrArgMemOnly]>;
3597 def int_x86_avx512_gather_qpq_512 : GCCBuiltin<"__builtin_ia32_gatherdiv8di">,
3601 def int_x86_avx512_gather_qpq_512 :
35983602 Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
35993603 llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
36003604 [IntrReadMem, IntrArgMemOnly]>;
3601 def int_x86_avx512_gather_qpi_512 : GCCBuiltin<"__builtin_ia32_gatherdiv16si">,
3605 def int_x86_avx512_gather_qpi_512 :
36023606 Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty,
36033607 llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
36043608 [IntrReadMem, IntrArgMemOnly]>;
36053609
36063610 def int_x86_avx512_gather3div2_df :
3607 GCCBuiltin<"__builtin_ia32_gather3div2df">,
36083611 Intrinsic<[llvm_v2f64_ty],
36093612 [llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
36103613 [IntrReadMem, IntrArgMemOnly]>;
36113614
36123615 def int_x86_avx512_gather3div2_di :
3613 GCCBuiltin<"__builtin_ia32_gather3div2di">,
36143616 Intrinsic<[llvm_v2i64_ty],
36153617 [llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
36163618 [IntrReadMem, IntrArgMemOnly]>;
36173619
36183620 def int_x86_avx512_gather3div4_df :
3619 GCCBuiltin<"__builtin_ia32_gather3div4df">,
36203621 Intrinsic<[llvm_v4f64_ty],
36213622 [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
36223623 [IntrReadMem, IntrArgMemOnly]>;
36233624
36243625 def int_x86_avx512_gather3div4_di :
3625 GCCBuiltin<"__builtin_ia32_gather3div4di">,
36263626 Intrinsic<[llvm_v4i64_ty],
36273627 [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
36283628 [IntrReadMem, IntrArgMemOnly]>;
36293629
36303630 def int_x86_avx512_gather3div4_sf :
3631 GCCBuiltin<"__builtin_ia32_gather3div4sf">,
36323631 Intrinsic<[llvm_v4f32_ty],
36333632 [llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
36343633 [IntrReadMem, IntrArgMemOnly]>;
36353634
36363635 def int_x86_avx512_gather3div4_si :
3637 GCCBuiltin<"__builtin_ia32_gather3div4si">,
36383636 Intrinsic<[llvm_v4i32_ty],
36393637 [llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
36403638 [IntrReadMem, IntrArgMemOnly]>;
36413639
36423640 def int_x86_avx512_gather3div8_sf :
3643 GCCBuiltin<"__builtin_ia32_gather3div8sf">,
36443641 Intrinsic<[llvm_v4f32_ty],
36453642 [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
36463643 [IntrReadMem, IntrArgMemOnly]>;
36473644
36483645 def int_x86_avx512_gather3div8_si :
3649 GCCBuiltin<"__builtin_ia32_gather3div8si">,
36503646 Intrinsic<[llvm_v4i32_ty],
36513647 [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
36523648 [IntrReadMem, IntrArgMemOnly]>;
36533649
36543650 def int_x86_avx512_gather3siv2_df :
3655 GCCBuiltin<"__builtin_ia32_gather3siv2df">,
36563651 Intrinsic<[llvm_v2f64_ty],
36573652 [llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
36583653 [IntrReadMem, IntrArgMemOnly]>;
36593654
36603655 def int_x86_avx512_gather3siv2_di :
3661 GCCBuiltin<"__builtin_ia32_gather3siv2di">,
36623656 Intrinsic<[llvm_v2i64_ty],
36633657 [llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
36643658 [IntrReadMem, IntrArgMemOnly]>;
36653659
36663660 def int_x86_avx512_gather3siv4_df :
3667 GCCBuiltin<"__builtin_ia32_gather3siv4df">,
36683661 Intrinsic<[llvm_v4f64_ty],
36693662 [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
36703663 [IntrReadMem, IntrArgMemOnly]>;
36713664
36723665 def int_x86_avx512_gather3siv4_di :
3673 GCCBuiltin<"__builtin_ia32_gather3siv4di">,
36743666 Intrinsic<[llvm_v4i64_ty],
36753667 [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
36763668 [IntrReadMem, IntrArgMemOnly]>;
36773669
36783670 def int_x86_avx512_gather3siv4_sf :
3679 GCCBuiltin<"__builtin_ia32_gather3siv4sf">,
36803671 Intrinsic<[llvm_v4f32_ty],
36813672 [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
36823673 [IntrReadMem, IntrArgMemOnly]>;
36833674
36843675 def int_x86_avx512_gather3siv4_si :
3685 GCCBuiltin<"__builtin_ia32_gather3siv4si">,
36863676 Intrinsic<[llvm_v4i32_ty],
36873677 [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
36883678 [IntrReadMem, IntrArgMemOnly]>;
36893679
36903680 def int_x86_avx512_gather3siv8_sf :
3691 GCCBuiltin<"__builtin_ia32_gather3siv8sf">,
36923681 Intrinsic<[llvm_v8f32_ty],
36933682 [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
36943683 [IntrReadMem, IntrArgMemOnly]>;
36953684
36963685 def int_x86_avx512_gather3siv8_si :
3697 GCCBuiltin<"__builtin_ia32_gather3siv8si">,
36983686 Intrinsic<[llvm_v8i32_ty],
36993687 [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
37003688 [IntrReadMem, IntrArgMemOnly]>;
38583846 def int_x86_avx512_scatterpf_qps_512 : GCCBuiltin<"__builtin_ia32_scatterpfqps">,
38593847 Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
38603848 llvm_i32_ty, llvm_i32_ty], [IntrArgMemOnly]>;
3849 }
3850
3851 // AVX512 gather intrinsics that use vXi1 masks.
3852 let TargetPrefix = "x86" in {
3853 def int_x86_avx512_mask_gather_dpd_512 :
3854 Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
3855 llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty],
3856 [IntrReadMem, IntrArgMemOnly]>;
3857 def int_x86_avx512_mask_gather_dps_512 :
3858 Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_ptr_ty,
3859 llvm_v16i32_ty, llvm_v16i1_ty, llvm_i32_ty],
3860 [IntrReadMem, IntrArgMemOnly]>;
3861 def int_x86_avx512_mask_gather_qpd_512 :
3862 Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
3863 llvm_v8i64_ty, llvm_v8i1_ty, llvm_i32_ty],
3864 [IntrReadMem, IntrArgMemOnly]>;
3865 def int_x86_avx512_mask_gather_qps_512 :
3866 Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_ptr_ty,
3867 llvm_v8i64_ty, llvm_v8i1_ty, llvm_i32_ty],
3868 [IntrReadMem, IntrArgMemOnly]>;
3869
3870
3871 def int_x86_avx512_mask_gather_dpq_512 :
3872 Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
3873 llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty],
3874 [IntrReadMem, IntrArgMemOnly]>;
3875 def int_x86_avx512_mask_gather_dpi_512 :
3876 Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty,
3877 llvm_v16i32_ty, llvm_v16i1_ty, llvm_i32_ty],
3878 [IntrReadMem, IntrArgMemOnly]>;
3879 def int_x86_avx512_mask_gather_qpq_512 :
3880 Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
3881 llvm_v8i64_ty, llvm_v8i1_ty, llvm_i32_ty],
3882 [IntrReadMem, IntrArgMemOnly]>;
3883 def int_x86_avx512_mask_gather_qpi_512 :
3884 Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty,
3885 llvm_v8i64_ty, llvm_v8i1_ty, llvm_i32_ty],
3886 [IntrReadMem, IntrArgMemOnly]>;
3887
3888 def int_x86_avx512_mask_gather3div2_df :
3889 Intrinsic<[llvm_v2f64_ty],
3890 [llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i1_ty, llvm_i32_ty],
3891 [IntrReadMem, IntrArgMemOnly]>;
3892
3893 def int_x86_avx512_mask_gather3div2_di :
3894 Intrinsic<[llvm_v2i64_ty],
3895 [llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i1_ty, llvm_i32_ty],
3896 [IntrReadMem, IntrArgMemOnly]>;
3897
3898 def int_x86_avx512_mask_gather3div4_df :
3899 Intrinsic<[llvm_v4f64_ty],
3900 [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i1_ty, llvm_i32_ty],
3901 [IntrReadMem, IntrArgMemOnly]>;
3902
3903 def int_x86_avx512_mask_gather3div4_di :
3904 Intrinsic<[llvm_v4i64_ty],
3905 [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i1_ty, llvm_i32_ty],
3906 [IntrReadMem, IntrArgMemOnly]>;
3907
3908 def int_x86_avx512_mask_gather3div4_sf :
3909 Intrinsic<[llvm_v4f32_ty],
3910 [llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i1_ty, llvm_i32_ty],
3911 [IntrReadMem, IntrArgMemOnly]>;
3912
3913 def int_x86_avx512_mask_gather3div4_si :
3914 Intrinsic<[llvm_v4i32_ty],
3915 [llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i1_ty, llvm_i32_ty],
3916 [IntrReadMem, IntrArgMemOnly]>;
3917
3918 def int_x86_avx512_mask_gather3div8_sf :
3919 Intrinsic<[llvm_v4f32_ty],
3920 [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i1_ty, llvm_i32_ty],
3921 [IntrReadMem, IntrArgMemOnly]>;
3922
3923 def int_x86_avx512_mask_gather3div8_si :
3924 Intrinsic<[llvm_v4i32_ty],
3925 [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i1_ty, llvm_i32_ty],
3926 [IntrReadMem, IntrArgMemOnly]>;
3927
3928 def int_x86_avx512_mask_gather3siv2_df :
3929 Intrinsic<[llvm_v2f64_ty],
3930 [llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2i1_ty, llvm_i32_ty],
3931 [IntrReadMem, IntrArgMemOnly]>;
3932
3933 def int_x86_avx512_mask_gather3siv2_di :
3934 Intrinsic<[llvm_v2i64_ty],
3935 [llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2i1_ty, llvm_i32_ty],
3936 [IntrReadMem, IntrArgMemOnly]>;
3937
3938 def int_x86_avx512_mask_gather3siv4_df :
3939 Intrinsic<[llvm_v4f64_ty],
3940 [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i1_ty, llvm_i32_ty],
3941 [IntrReadMem, IntrArgMemOnly]>;
3942
3943 def int_x86_avx512_mask_gather3siv4_di :
3944 Intrinsic<[llvm_v4i64_ty],
3945 [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i1_ty, llvm_i32_ty],
3946 [IntrReadMem, IntrArgMemOnly]>;
3947
3948 def int_x86_avx512_mask_gather3siv4_sf :
3949 Intrinsic<[llvm_v4f32_ty],
3950 [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i1_ty, llvm_i32_ty],
3951 [IntrReadMem, IntrArgMemOnly]>;
3952
3953 def int_x86_avx512_mask_gather3siv4_si :
3954 Intrinsic<[llvm_v4i32_ty],
3955 [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i1_ty, llvm_i32_ty],
3956 [IntrReadMem, IntrArgMemOnly]>;
3957
3958 def int_x86_avx512_mask_gather3siv8_sf :
3959 Intrinsic<[llvm_v8f32_ty],
3960 [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty],
3961 [IntrReadMem, IntrArgMemOnly]>;
3962
3963 def int_x86_avx512_mask_gather3siv8_si :
3964 Intrinsic<[llvm_v8i32_ty],
3965 [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty],
3966 [IntrReadMem, IntrArgMemOnly]>;
38613967 }
38623968
38633969 // AVX-512 conflict detection instruction
2232522325 VT.getVectorNumElements());
2232622326 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
2232722327
22328 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
22328 // We support two versions of the gather intrinsics. One with scalar mask and
22329 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
22330 if (Mask.getValueType() != MaskVT)
22331 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
22332
2232922333 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
2233022334 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
2233122335 SDValue Segment = DAG.getRegister(0, MVT::i32);
2233222336 // If source is undef or we know it won't be used, use a zero vector
2233322337 // to break register dependency.
2233422338 // TODO: use undef instead and let BreakFalseDeps deal with it?
22335 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
22339 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
2233622340 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
22337 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
22341 SDValue Ops[] = {Src, Mask, Base, Scale, Index, Disp, Segment, Chain};
2233822342 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
2233922343 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
2234022344 return DAG.getMergeValues(RetOps, dl);
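A usage-level sketch of the all-ones-mask case handled above (illustrative function name; it mirrors the gather_qps test later on this page, where an i8 -1 mask is materialized with kxnorw and the destination is zeroed with vxorps to break the false dependency):

declare <8 x float> @llvm.x86.avx512.gather.qps.512(<8 x float>, i8*, <8 x i64>, i8, i32)

define <8 x float> @gather_all_ones_mask(<8 x i64> %ind, i8* %base) {
  ; With an all-ones mask every lane is overwritten, so the undef
  ; pass-through is replaced by a zero vector during lowering.
  %res = call <8 x float> @llvm.x86.avx512.gather.qps.512(<8 x float> undef, i8* %base, <8 x i64> %ind, i8 -1, i32 4)
  ret <8 x float> %res
}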
113113 X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm),
114114 X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
115115 X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
116
117 X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
118 X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
119 X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
120 X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
121 X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
122 X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
123 X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
124 X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
125 X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
126 X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
127 X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
128 X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0),
129 X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0),
130 X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0),
131 X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0),
132 X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0),
133 X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0),
134 X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0),
135 X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0),
136 X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0),
137 X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0),
138 X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0),
139 X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0),
140 X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0),
116141
117142 X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
118143 X86ISD::VTRUNC, 0),
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s
2
3 declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
4 declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
5 declare <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double>, i8*, <8 x i32>, i8, i32)
6 declare void @llvm.x86.avx512.scatter.dpd.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
7
8 declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, i8*, <8 x i64>, i8, i32)
9 declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
10 declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
11 declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
12
13 define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
14 ; CHECK-LABEL: gather_mask_dps:
15 ; CHECK: ## %bb.0:
16 ; CHECK-NEXT: kmovd %edi, %k1
17 ; CHECK-NEXT: kmovq %k1, %k2
18 ; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
19 ; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
20 ; CHECK-NEXT: vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
21 ; CHECK-NEXT: vzeroupper
22 ; CHECK-NEXT: retq
23 %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
24 %ind2 = add <16 x i32> %ind,
25 call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
26 ret void
27 }
28
29 define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
30 ; CHECK-LABEL: gather_mask_dpd:
31 ; CHECK: ## %bb.0:
32 ; CHECK-NEXT: kmovd %edi, %k1
33 ; CHECK-NEXT: kmovq %k1, %k2
34 ; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
35 ; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
36 ; CHECK-NEXT: vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
37 ; CHECK-NEXT: vzeroupper
38 ; CHECK-NEXT: retq
39 %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
40 %ind2 = add <8 x i32> %ind,
41 call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
42 ret void
43 }
44
45 define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
46 ; CHECK-LABEL: gather_mask_qps:
47 ; CHECK: ## %bb.0:
48 ; CHECK-NEXT: kmovd %edi, %k1
49 ; CHECK-NEXT: kmovq %k1, %k2
50 ; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
51 ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
52 ; CHECK-NEXT: vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
53 ; CHECK-NEXT: vzeroupper
54 ; CHECK-NEXT: retq
55 %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
56 %ind2 = add <8 x i64> %ind,
57 call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
58 ret void
59 }
60
61 define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
62 ; CHECK-LABEL: gather_mask_qpd:
63 ; CHECK: ## %bb.0:
64 ; CHECK-NEXT: kmovd %edi, %k1
65 ; CHECK-NEXT: kmovq %k1, %k2
66 ; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
67 ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
68 ; CHECK-NEXT: vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
69 ; CHECK-NEXT: vzeroupper
70 ; CHECK-NEXT: retq
71 %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
72 %ind2 = add <8 x i64> %ind,
73 call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
74 ret void
75 }
76 ;;
77 ;; Integer Gather/Scatter
78 ;;
79 declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, <16 x i32>, i16, i32)
80 declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
81 declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64>, i8*, <8 x i32>, i8, i32)
82 declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
83
84 declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, i8*, <8 x i64>, i8, i32)
85 declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
86 declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32)
87 declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
88
89 define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
90 ; CHECK-LABEL: gather_mask_dd:
91 ; CHECK: ## %bb.0:
92 ; CHECK-NEXT: kmovd %edi, %k1
93 ; CHECK-NEXT: kmovq %k1, %k2
94 ; CHECK-NEXT: vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
95 ; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
96 ; CHECK-NEXT: vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
97 ; CHECK-NEXT: vzeroupper
98 ; CHECK-NEXT: retq
99 %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
100 %ind2 = add <16 x i32> %ind,
101 call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
102 ret void
103 }
104
105 define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
106 ; CHECK-LABEL: gather_mask_qd:
107 ; CHECK: ## %bb.0:
108 ; CHECK-NEXT: kmovd %edi, %k1
109 ; CHECK-NEXT: kmovq %k1, %k2
110 ; CHECK-NEXT: vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
111 ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
112 ; CHECK-NEXT: vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
113 ; CHECK-NEXT: vzeroupper
114 ; CHECK-NEXT: retq
115 %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
116 %ind2 = add <8 x i64> %ind,
117 call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
118 ret void
119 }
120
121 define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
122 ; CHECK-LABEL: gather_mask_qq:
123 ; CHECK: ## %bb.0:
124 ; CHECK-NEXT: kmovd %edi, %k1
125 ; CHECK-NEXT: kmovq %k1, %k2
126 ; CHECK-NEXT: vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
127 ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
128 ; CHECK-NEXT: vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
129 ; CHECK-NEXT: vzeroupper
130 ; CHECK-NEXT: retq
131 %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
132 %ind2 = add <8 x i64> %ind,
133 call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
134 ret void
135 }
136
137 define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
138 ; CHECK-LABEL: gather_mask_dq:
139 ; CHECK: ## %bb.0:
140 ; CHECK-NEXT: kmovd %edi, %k1
141 ; CHECK-NEXT: kmovq %k1, %k2
142 ; CHECK-NEXT: vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
143 ; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
144 ; CHECK-NEXT: vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
145 ; CHECK-NEXT: vzeroupper
146 ; CHECK-NEXT: retq
147 %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
148 %ind2 = add <8 x i32> %ind,
149 call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
150 ret void
151 }
152
153 define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
154 ; CHECK-LABEL: gather_mask_dpd_execdomain:
155 ; CHECK: ## %bb.0:
156 ; CHECK-NEXT: kmovd %edi, %k1
157 ; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
158 ; CHECK-NEXT: vmovapd %zmm1, (%rdx)
159 ; CHECK-NEXT: vzeroupper
160 ; CHECK-NEXT: retq
161 %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
162 store <8 x double> %x, <8 x double>* %stbuf
163 ret void
164 }
165
166 define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
167 ; CHECK-LABEL: gather_mask_qpd_execdomain:
168 ; CHECK: ## %bb.0:
169 ; CHECK-NEXT: kmovd %edi, %k1
170 ; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
171 ; CHECK-NEXT: vmovapd %zmm1, (%rdx)
172 ; CHECK-NEXT: vzeroupper
173 ; CHECK-NEXT: retq
174 %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
175 store <8 x double> %x, <8 x double>* %stbuf
176 ret void
177 }
178
179 define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
180 ; CHECK-LABEL: gather_mask_dps_execdomain:
181 ; CHECK: ## %bb.0:
182 ; CHECK-NEXT: kmovd %edi, %k1
183 ; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
184 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
185 ; CHECK-NEXT: retq
186 %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
187 ret <16 x float> %res;
188 }
189
190 define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
191 ; CHECK-LABEL: gather_mask_qps_execdomain:
192 ; CHECK: ## %bb.0:
193 ; CHECK-NEXT: kmovd %edi, %k1
194 ; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
195 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
196 ; CHECK-NEXT: retq
197 %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
198 ret <8 x float> %res;
199 }
200
201 define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
202 ; CHECK-LABEL: scatter_mask_dpd_execdomain:
203 ; CHECK: ## %bb.0:
204 ; CHECK-NEXT: kmovd %esi, %k1
205 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
206 ; CHECK-NEXT: vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
207 ; CHECK-NEXT: vzeroupper
208 ; CHECK-NEXT: retq
209 %x = load <8 x double>, <8 x double>* %src, align 64
210 call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
211 ret void
212 }
213
214 define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
215 ; CHECK-LABEL: scatter_mask_qpd_execdomain:
216 ; CHECK: ## %bb.0:
217 ; CHECK-NEXT: kmovd %esi, %k1
218 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
219 ; CHECK-NEXT: vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
220 ; CHECK-NEXT: vzeroupper
221 ; CHECK-NEXT: retq
222 %x = load <8 x double>, <8 x double>* %src, align 64
223 call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
224 ret void
225 }
226
227 define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
228 ; CHECK-LABEL: scatter_mask_dps_execdomain:
229 ; CHECK: ## %bb.0:
230 ; CHECK-NEXT: kmovd %esi, %k1
231 ; CHECK-NEXT: vmovaps (%rdi), %zmm1
232 ; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
233 ; CHECK-NEXT: vzeroupper
234 ; CHECK-NEXT: retq
235 %x = load <16 x float>, <16 x float>* %src, align 64
236 call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
237 ret void
238 }
239
240 define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
241 ; CHECK-LABEL: scatter_mask_qps_execdomain:
242 ; CHECK: ## %bb.0:
243 ; CHECK-NEXT: kmovd %esi, %k1
244 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
245 ; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
246 ; CHECK-NEXT: vzeroupper
247 ; CHECK-NEXT: retq
248 %x = load <8 x float>, <8 x float>* %src, align 32
249 call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
250 ret void
251 }
252
253 define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
254 ; CHECK-LABEL: gather_qps:
255 ; CHECK: ## %bb.0:
256 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
257 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
258 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
259 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
260 ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
261 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
262 ; CHECK-NEXT: vzeroupper
263 ; CHECK-NEXT: retq
264 %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
265 %ind2 = add <8 x i64> %ind,
266 call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
267 ret void
268 }
269
270 declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
271 declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
272 define void @prefetch(<8 x i64> %ind, i8* %base) {
273 ; CHECK-LABEL: prefetch:
274 ; CHECK: ## %bb.0:
275 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
276 ; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
277 ; CHECK-NEXT: kxorw %k0, %k0, %k1
278 ; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1}
279 ; CHECK-NEXT: movb $1, %al
280 ; CHECK-NEXT: kmovd %eax, %k1
281 ; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1}
282 ; CHECK-NEXT: movb $120, %al
283 ; CHECK-NEXT: kmovd %eax, %k1
284 ; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
285 ; CHECK-NEXT: vzeroupper
286 ; CHECK-NEXT: retq
287 call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 3)
288 call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 2)
289 call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 3)
290 call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 2)
291 ret void
292 }
293
294 declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32)
295
296 define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
297 ; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
298 ; CHECK: ## %bb.0:
299 ; CHECK-NEXT: kmovd %esi, %k1
300 ; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
301 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
302 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
303 ; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
304 ; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0
305 ; CHECK-NEXT: retq
306 %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
307 %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
308 %res2 = fadd <2 x double> %res, %res1
309 ret <2 x double> %res2
310 }
311
312 declare <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8, i32)
313
314 define <2 x i64>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
315 ; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
316 ; CHECK: ## %bb.0:
317 ; CHECK-NEXT: kmovd %esi, %k1
318 ; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
319 ; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0
320 ; CHECK-NEXT: retq
321 %res = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
322 %res1 = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
323 %res2 = add <2 x i64> %res, %res1
324 ret <2 x i64> %res2
325 }
326
327 declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64>, i8, i32)
328
329 define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
330 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
331 ; CHECK: ## %bb.0:
332 ; CHECK-NEXT: kmovd %esi, %k1
333 ; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
334 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
335 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
336 ; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
337 ; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0
338 ; CHECK-NEXT: retq
339 %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
340 %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
341 %res2 = fadd <4 x double> %res, %res1
342 ret <4 x double> %res2
343 }
344
345 declare <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8, i32)
346
347 define <4 x i64>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
348 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
349 ; CHECK: ## %bb.0:
350 ; CHECK-NEXT: kmovd %esi, %k1
351 ; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
352 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
353 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
354 ; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
355 ; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0
356 ; CHECK-NEXT: retq
357 %res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
358 %res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
359 %res2 = add <4 x i64> %res, %res1
360 ret <4 x i64> %res2
361 }
362
363 declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>, i8, i32)
364
365 define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
366 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
367 ; CHECK: ## %bb.0:
368 ; CHECK-NEXT: kmovd %esi, %k1
369 ; CHECK-NEXT: vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
370 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
371 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
372 ; CHECK-NEXT: vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
373 ; CHECK-NEXT: vaddps %xmm2, %xmm0, %xmm0
374 ; CHECK-NEXT: retq
375 %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
376 %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
377 %res2 = fadd <4 x float> %res, %res1
378 ret <4 x float> %res2
379 }
380
381 declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8, i32)
382
383 define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
384 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
385 ; CHECK: ## %bb.0:
386 ; CHECK-NEXT: kmovd %esi, %k1
387 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
388 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
389 ; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
390 ; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
391 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
392 ; CHECK-NEXT: retq
393 %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 4)
394 %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
395 %res2 = add <4 x i32> %res, %res1
396 ret <4 x i32> %res2
397 }
398
399 declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>, i8, i32)
400
401 define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
402 ; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
403 ; CHECK: ## %bb.0:
404 ; CHECK-NEXT: kmovd %esi, %k1
405 ; CHECK-NEXT: vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
406 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
407 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
408 ; CHECK-NEXT: vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
409 ; CHECK-NEXT: vaddps %xmm2, %xmm0, %xmm0
410 ; CHECK-NEXT: vzeroupper
411 ; CHECK-NEXT: retq
412 %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
413 %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
414 %res2 = fadd <4 x float> %res, %res1
415 ret <4 x float> %res2
416 }
417
418 declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8, i32)
419
420 define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
421 ; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
422 ; CHECK: ## %bb.0:
423 ; CHECK-NEXT: kmovd %esi, %k1
424 ; CHECK-NEXT: vmovdqa %xmm0, %xmm2
425 ; CHECK-NEXT: kmovq %k1, %k2
426 ; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
427 ; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
428 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
429 ; CHECK-NEXT: vzeroupper
430 ; CHECK-NEXT: retq
431 %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
432 %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2)
433 %res2 = add <4 x i32> %res, %res1
434 ret <4 x i32> %res2
435 }
436
437 declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32>, i8, i32)
438
439 define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
440 ; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
441 ; CHECK: ## %bb.0:
442 ; CHECK-NEXT: kmovd %esi, %k1
443 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
444 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
445 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
446 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
447 ; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0
448 ; CHECK-NEXT: retq
449 %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
450 %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
451 %res2 = fadd <2 x double> %res, %res1
452 ret <2 x double> %res2
453 }
454
455 declare <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8, i32)
456
457 define <2 x i64>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
458 ; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
459 ; CHECK: ## %bb.0:
460 ; CHECK-NEXT: kmovd %esi, %k1
461 ; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
462 ; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0
463 ; CHECK-NEXT: retq
464 %res = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
465 %res1 = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
466 %res2 = add <2 x i64> %res, %res1
467 ret <2 x i64> %res2
468 }
469
470 declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32>, i8, i32)
471
472 define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
473 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
474 ; CHECK: ## %bb.0:
475 ; CHECK-NEXT: kmovd %esi, %k1
476 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
477 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
478 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
479 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
480 ; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0
481 ; CHECK-NEXT: retq
482 %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
483 %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
484 %res2 = fadd <4 x double> %res, %res1
485 ret <4 x double> %res2
486 }
487
488 declare <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8, i32)
489
490 define <4 x i64>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
491 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
492 ; CHECK: ## %bb.0:
493 ; CHECK-NEXT: kmovd %esi, %k1
494 ; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
495 ; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0
496 ; CHECK-NEXT: retq
497 %res = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
498 %res1 = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
499 %res2 = add <4 x i64> %res, %res1
500 ret <4 x i64> %res2
501 }
502
503 declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, i8, i32)
504
505 define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
506 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
507 ; CHECK: ## %bb.0:
508 ; CHECK-NEXT: kmovd %esi, %k1
509 ; CHECK-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
510 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
511 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
512 ; CHECK-NEXT: vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
513 ; CHECK-NEXT: vaddps %xmm2, %xmm0, %xmm0
514 ; CHECK-NEXT: retq
515 %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
516 %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
517 %res2 = fadd <4 x float> %res, %res1
518 ret <4 x float> %res2
519 }
520
521 declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8, i32)
522
523 define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
524 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
525 ; CHECK: ## %bb.0:
526 ; CHECK-NEXT: kmovd %esi, %k1
527 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
528 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
529 ; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
530 ; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
531 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
532 ; CHECK-NEXT: retq
533 %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 4)
534 %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 2)
535 %res2 = add <4 x i32> %res, %res1
536 ret <4 x i32> %res2
537 }
538
539 declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, i8, i32)
540
541 define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
542 ; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
543 ; CHECK: ## %bb.0:
544 ; CHECK-NEXT: kmovd %esi, %k1
545 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
546 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
547 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
548 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
549 ; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0
550 ; CHECK-NEXT: retq
551 %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
552 %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 2)
553 %res2 = fadd <8 x float> %res, %res1
554 ret <8 x float> %res2
555 }
556
557 declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8, i32)
558
559 define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
560 ; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
561 ; CHECK: ## %bb.0:
562 ; CHECK-NEXT: kmovd %esi, %k1
563 ; CHECK-NEXT: vmovdqa %ymm0, %ymm2
564 ; CHECK-NEXT: kmovq %k1, %k2
565 ; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
566 ; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
567 ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
568 ; CHECK-NEXT: retq
569 %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
570 %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 2)
571 %res2 = add <8 x i32> %res, %res1
572 ret <8 x i32> %res2
573 }
574
575 declare void @llvm.x86.avx512.scatterdiv2.df(i8*, i8, <2 x i64>, <2 x double>, i32)
576
577 define void@test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
578 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
579 ; CHECK: ## %bb.0:
580 ; CHECK-NEXT: kmovd %esi, %k1
581 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
582 ; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
583 ; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
584 ; CHECK-NEXT: retq
585 call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 -1, <2 x i64> %x2, <2 x double> %x3, i32 2)
586 call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3, i32 4)
587 ret void
588 }
589
590 declare void @llvm.x86.avx512.scatterdiv2.di(i8*, i8, <2 x i64>, <2 x i64>, i32)
591
592 define void@test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
593 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
594 ; CHECK: ## %bb.0:
595 ; CHECK-NEXT: kmovd %esi, %k1
596 ; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
597 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
598 ; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
599 ; CHECK-NEXT: retq
600 call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3, i32 2)
601 call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 -1, <2 x i64> %x2, <2 x i64> %x3, i32 4)
602 ret void
603 }
604
605 declare void @llvm.x86.avx512.scatterdiv4.df(i8*, i8, <4 x i64>, <4 x double>, i32)
606
607 define void@test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
608 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
609 ; CHECK: ## %bb.0:
610 ; CHECK-NEXT: kmovd %esi, %k1
611 ; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
612 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
613 ; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
614 ; CHECK-NEXT: vzeroupper
615 ; CHECK-NEXT: retq
616 call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2)
617 call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4)
618 ret void
619 }
620
621 declare void @llvm.x86.avx512.scatterdiv4.di(i8*, i8, <4 x i64>, <4 x i64>, i32)
622
623 define void@test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
624 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
625 ; CHECK: ## %bb.0:
626 ; CHECK-NEXT: kmovd %esi, %k1
627 ; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
628 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
629 ; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
630 ; CHECK-NEXT: vzeroupper
631 ; CHECK-NEXT: retq
632 call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2)
633 call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4)
634 ret void
635 }
636
637 declare void @llvm.x86.avx512.scatterdiv4.sf(i8*, i8, <2 x i64>, <4 x float>, i32)
638
639 define void@test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
640 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
641 ; CHECK: ## %bb.0:
642 ; CHECK-NEXT: kmovd %esi, %k1
643 ; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
644 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
645 ; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
646 ; CHECK-NEXT: retq
647 call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3, i32 2)
648 call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 -1, <2 x i64> %x2, <4 x float> %x3, i32 4)
649 ret void
650 }
651
652 declare void @llvm.x86.avx512.scatterdiv4.si(i8*, i8, <2 x i64>, <4 x i32>, i32)
653
654 define void@test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
655 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
656 ; CHECK: ## %bb.0:
657 ; CHECK-NEXT: kmovd %esi, %k1
658 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
659 ; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
660 ; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
661 ; CHECK-NEXT: retq
662 call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 -1, <2 x i64> %x2, <4 x i32> %x3, i32 2)
663 call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3, i32 4)
664 ret void
665 }
666
667 declare void @llvm.x86.avx512.scatterdiv8.sf(i8*, i8, <4 x i64>, <4 x float>, i32)
668
669 define void@test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
670 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
671 ; CHECK: ## %bb.0:
672 ; CHECK-NEXT: kmovd %esi, %k1
673 ; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
674 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
675 ; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
676 ; CHECK-NEXT: vzeroupper
677 ; CHECK-NEXT: retq
678 call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2)
679 call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4)
680 ret void
681 }
682
683 declare void @llvm.x86.avx512.scatterdiv8.si(i8*, i8, <4 x i64>, <4 x i32>, i32)
684
685 define void@test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
686 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
687 ; CHECK: ## %bb.0:
688 ; CHECK-NEXT: kmovd %esi, %k1
689 ; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
690 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
691 ; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
692 ; CHECK-NEXT: vzeroupper
693 ; CHECK-NEXT: retq
694 call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2)
695 call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4)
696 ret void
697 }
698
699 declare void @llvm.x86.avx512.scattersiv2.df(i8*, i8, <4 x i32>, <2 x double>, i32)
700
701 define void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
702 ; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
703 ; CHECK: ## %bb.0:
704 ; CHECK-NEXT: kmovd %esi, %k1
705 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
706 ; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
707 ; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
708 ; CHECK-NEXT: retq
709 call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 -1, <4 x i32> %x2, <2 x double> %x3, i32 2)
710 call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3, i32 4)
711 ret void
712 }
713
714 declare void @llvm.x86.avx512.scattersiv2.di(i8*, i8, <4 x i32>, <2 x i64>, i32)
715
716 define void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
717 ; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
718 ; CHECK: ## %bb.0:
719 ; CHECK-NEXT: kmovd %esi, %k1
720 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
721 ; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
722 ; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
723 ; CHECK-NEXT: retq
724 call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 -1, <4 x i32> %x2, <2 x i64> %x3, i32 2)
725 call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3, i32 4)
726 ret void
727 }
728
729 declare void @llvm.x86.avx512.scattersiv4.df(i8*, i8, <4 x i32>, <4 x double>, i32)
730
731 define void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
732 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
733 ; CHECK: ## %bb.0:
734 ; CHECK-NEXT: kmovd %esi, %k1
735 ; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
736 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
737 ; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
738 ; CHECK-NEXT: vzeroupper
739 ; CHECK-NEXT: retq
740 call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2)
741 call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4)
742 ret void
743 }
744
745 declare void @llvm.x86.avx512.scattersiv4.di(i8*, i8, <4 x i32>, <4 x i64>, i32)
746
747 define void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
748 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
749 ; CHECK: ## %bb.0:
750 ; CHECK-NEXT: kmovd %esi, %k1
751 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
752 ; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
753 ; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
754 ; CHECK-NEXT: vzeroupper
755 ; CHECK-NEXT: retq
756 call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2)
757 call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4)
758 ret void
759 }
760
761 declare void @llvm.x86.avx512.scattersiv4.sf(i8*, i8, <4 x i32>, <4 x float>, i32)
762
763 define void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
764 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
765 ; CHECK: ## %bb.0:
766 ; CHECK-NEXT: kmovd %esi, %k1
767 ; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
768 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
769 ; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
770 ; CHECK-NEXT: retq
771 call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3, i32 2)
772 call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 -1, <4 x i32> %x2, <4 x float> %x3, i32 4)
773 ret void
774 }
775
776 declare void @llvm.x86.avx512.scattersiv4.si(i8*, i8, <4 x i32>, <4 x i32>, i32)
777
778 define void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
779 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
780 ; CHECK: ## %bb.0:
781 ; CHECK-NEXT: kmovd %esi, %k1
782 ; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
783 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
784 ; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
785 ; CHECK-NEXT: retq
786 call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3, i32 2)
787 call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i32> %x3, i32 4)
788 ret void
789 }
790
791 declare void @llvm.x86.avx512.scattersiv8.sf(i8*, i8, <8 x i32>, <8 x float>, i32)
792
793 define void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
794 ; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
795 ; CHECK: ## %bb.0:
796 ; CHECK-NEXT: kmovd %esi, %k1
797 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
798 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
799 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
800 ; CHECK-NEXT: vzeroupper
801 ; CHECK-NEXT: retq
802 call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2)
803 call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4)
804 ret void
805 }
806
807 declare void @llvm.x86.avx512.scattersiv8.si(i8*, i8, <8 x i32>, <8 x i32>, i32)
808
809 define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
810 ; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
811 ; CHECK: ## %bb.0:
812 ; CHECK-NEXT: kmovd %esi, %k1
813 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
814 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
815 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
816 ; CHECK-NEXT: vzeroupper
817 ; CHECK-NEXT: retq
818 call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
819 call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4)
820 ret void
821 }
822
823 define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
824 ; CHECK-LABEL: scatter_mask_test:
825 ; CHECK: ## %bb.0:
826 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
827 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
828 ; CHECK-NEXT: kxorw %k0, %k0, %k1
829 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
830 ; CHECK-NEXT: movb $1, %al
831 ; CHECK-NEXT: kmovd %eax, %k1
832 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
833 ; CHECK-NEXT: movb $96, %al
834 ; CHECK-NEXT: kmovd %eax, %k1
835 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
836 ; CHECK-NEXT: vzeroupper
837 ; CHECK-NEXT: retq
838 call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
839 call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
840 call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
841 call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4)
842 ret void
843 }
844
845 define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) {
846 ; CHECK-LABEL: gather_mask_test:
847 ; CHECK: ## %bb.0:
848 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
849 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
850 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
851 ; CHECK-NEXT: kxorw %k0, %k0, %k1
852 ; CHECK-NEXT: vmovaps %zmm1, %zmm3
853 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
854 ; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2
855 ; CHECK-NEXT: movw $1, %ax
856 ; CHECK-NEXT: kmovd %eax, %k1
857 ; CHECK-NEXT: vmovaps %zmm1, %zmm3
858 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
859 ; CHECK-NEXT: movw $220, %ax
860 ; CHECK-NEXT: kmovd %eax, %k1
861 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
862 ; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
863 ; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
864 ; CHECK-NEXT: retq
865 %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
866 %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
867 %res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 1, i32 4)
868 %res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 220, i32 4)
869
870 %res4 = fadd <16 x float> %res, %res1
871 %res5 = fadd <16 x float> %res3, %res2
872 %res6 = fadd <16 x float> %res5, %res4
873 ret <16 x float> %res6
874 }
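The hunks below update a second test file against the new vXi1-mask gather intrinsics. The rewrite they apply is mechanical; a minimal sketch of the pattern, with illustrative value names (%m, %v, %m8, %m2) rather than names taken from the patch:

  %m = bitcast i16 %mask to <16 x i1>
  %v = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> %m, i32 4)
  ; the 128/256-bit forms keep only the low lanes of the bitcast mask:
  %m8 = bitcast i8 %mask to <8 x i1>
  %m2 = shufflevector <8 x i1> %m8, <8 x i1> %m8, <2 x i32> <i32 0, i32 1>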
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s
22
3 declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
43 declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
5 declare <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double>, i8*, <8 x i32>, i8, i32)
64 declare void @llvm.x86.avx512.scatter.dpd.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
75
8 declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, i8*, <8 x i64>, i8, i32)
96 declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
10 declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
117 declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
128
13 define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
9 define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
1410 ; CHECK-LABEL: gather_mask_dps:
1511 ; CHECK: ## %bb.0:
1612 ; CHECK-NEXT: kmovd %edi, %k1
2016 ; CHECK-NEXT: vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
2117 ; CHECK-NEXT: vzeroupper
2218 ; CHECK-NEXT: retq
23 %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
19 %1 = bitcast i16 %mask to <16 x i1>
20 %x = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
2421 %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
2522 call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
2623 ret void
2724 }
2825
29 define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
26 define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
3027 ; CHECK-LABEL: gather_mask_dpd:
3128 ; CHECK: ## %bb.0:
3229 ; CHECK-NEXT: kmovd %edi, %k1
3633 ; CHECK-NEXT: vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
3734 ; CHECK-NEXT: vzeroupper
3835 ; CHECK-NEXT: retq
39 %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
36 %1 = bitcast i8 %mask to <8 x i1>
37 %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
4038 %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
4139 call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
4240 ret void
4341 }
4442
45 define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
43 define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
4644 ; CHECK-LABEL: gather_mask_qps:
4745 ; CHECK: ## %bb.0:
4846 ; CHECK-NEXT: kmovd %edi, %k1
5250 ; CHECK-NEXT: vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
5351 ; CHECK-NEXT: vzeroupper
5452 ; CHECK-NEXT: retq
55 %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
53 %1 = bitcast i8 %mask to <8 x i1>
54 %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
5655 %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
5756 call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
5857 ret void
5958 }
6059
61 define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
60 define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
6261 ; CHECK-LABEL: gather_mask_qpd:
6362 ; CHECK: ## %bb.0:
6463 ; CHECK-NEXT: kmovd %edi, %k1
6867 ; CHECK-NEXT: vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
6968 ; CHECK-NEXT: vzeroupper
7069 ; CHECK-NEXT: retq
71 %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
70 %1 = bitcast i8 %mask to <8 x i1>
71 %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
7272 %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
7373 call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
7474 ret void
7575 }
7676 ;;
7777 ;; Integer Gather/Scatter
7878 ;;
79 declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, <16 x i32>, i16, i32)
8079 declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
81 declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64>, i8*, <8 x i32>, i8, i32)
8280 declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
8381
84 declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, i8*, <8 x i64>, i8, i32)
8582 declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
86 declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32)
8783 declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
8884
89 define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
85 define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
9086 ; CHECK-LABEL: gather_mask_dd:
9187 ; CHECK: ## %bb.0:
9288 ; CHECK-NEXT: kmovd %edi, %k1
9692 ; CHECK-NEXT: vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
9793 ; CHECK-NEXT: vzeroupper
9894 ; CHECK-NEXT: retq
99 %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
95 %1 = bitcast i16 %mask to <16 x i1>
96 %x = call <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
10097 %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
10198 call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
10299 ret void
103100 }
104101
105 define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
102 define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
106103 ; CHECK-LABEL: gather_mask_qd:
107104 ; CHECK: ## %bb.0:
108105 ; CHECK-NEXT: kmovd %edi, %k1
112109 ; CHECK-NEXT: vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
113110 ; CHECK-NEXT: vzeroupper
114111 ; CHECK-NEXT: retq
115 %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
112 %1 = bitcast i8 %mask to <8 x i1>
113 %x = call <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
116114 %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
117115 call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
118116 ret void
119117 }
120118
121 define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
119 define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
122120 ; CHECK-LABEL: gather_mask_qq:
123121 ; CHECK: ## %bb.0:
124122 ; CHECK-NEXT: kmovd %edi, %k1
128126 ; CHECK-NEXT: vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
129127 ; CHECK-NEXT: vzeroupper
130128 ; CHECK-NEXT: retq
131 %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
129 %1 = bitcast i8 %mask to <8 x i1>
130 %x = call <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
132131 %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
133132 call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
134133 ret void
135134 }
136135
137 define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
136 define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
138137 ; CHECK-LABEL: gather_mask_dq:
139138 ; CHECK: ## %bb.0:
140139 ; CHECK-NEXT: kmovd %edi, %k1
144143 ; CHECK-NEXT: vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
145144 ; CHECK-NEXT: vzeroupper
146145 ; CHECK-NEXT: retq
147 %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
146 %1 = bitcast i8 %mask to <8 x i1>
147 %x = call <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
148148 %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
149149 call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
150150 ret void
151151 }
152152
153 define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
153 define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
154154 ; CHECK-LABEL: gather_mask_dpd_execdomain:
155155 ; CHECK: ## %bb.0:
156156 ; CHECK-NEXT: kmovd %edi, %k1
158158 ; CHECK-NEXT: vmovapd %zmm1, (%rdx)
159159 ; CHECK-NEXT: vzeroupper
160160 ; CHECK-NEXT: retq
161 %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
161 %1 = bitcast i8 %mask to <8 x i1>
162 %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
162163 store <8 x double> %x, <8 x double>* %stbuf
163164 ret void
164165 }
165166
166 define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
167 define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
167168 ; CHECK-LABEL: gather_mask_qpd_execdomain:
168169 ; CHECK: ## %bb.0:
169170 ; CHECK-NEXT: kmovd %edi, %k1
171172 ; CHECK-NEXT: vmovapd %zmm1, (%rdx)
172173 ; CHECK-NEXT: vzeroupper
173174 ; CHECK-NEXT: retq
174 %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
175 %1 = bitcast i8 %mask to <8 x i1>
176 %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
175177 store <8 x double> %x, <8 x double>* %stbuf
176178 ret void
177179 }
178180
179 define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
181 define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
180182 ; CHECK-LABEL: gather_mask_dps_execdomain:
181183 ; CHECK: ## %bb.0:
182184 ; CHECK-NEXT: kmovd %edi, %k1
183185 ; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
184186 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
185187 ; CHECK-NEXT: retq
186 %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
187 ret <16 x float> %res;
188 }
189
190 define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
188 %1 = bitcast i16 %mask to <16 x i1>
189 %res = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
190 ret <16 x float> %res
191 }
192
193 define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
191194 ; CHECK-LABEL: gather_mask_qps_execdomain:
192195 ; CHECK: ## %bb.0:
193196 ; CHECK-NEXT: kmovd %edi, %k1
194197 ; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
195198 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
196199 ; CHECK-NEXT: retq
197 %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
198 ret <8 x float> %res;
199 }
200
201 define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
200 %1 = bitcast i8 %mask to <8 x i1>
201 %res = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
202 ret <8 x float> %res
203 }
204
205 define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
202206 ; CHECK-LABEL: scatter_mask_dpd_execdomain:
203207 ; CHECK: ## %bb.0:
204208 ; CHECK-NEXT: kmovd %esi, %k1
250254 ret void
251255 }
252256
253 define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
257 define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
254258 ; CHECK-LABEL: gather_qps:
255259 ; CHECK: ## %bb.0:
260 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
256261 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
257 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
258262 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
259263 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
260264 ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
261265 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
262266 ; CHECK-NEXT: vzeroupper
263267 ; CHECK-NEXT: retq
264 %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
268 %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
265269 %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
266270 call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
267271 ret void
291295 ret void
292296 }
293297
294 declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32)
295
296 define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
297 ; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
298 define <2 x double> @test_int_x86_avx512_mask_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
299 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_df:
298300 ; CHECK: ## %bb.0:
299301 ; CHECK-NEXT: kmovd %esi, %k1
300302 ; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
303305 ; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
304306 ; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0
305307 ; CHECK-NEXT: retq
306 %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
307 %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
308 %1 = bitcast i8 %x3 to <8 x i1>
309 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
310 %res = call <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
311 %res1 = call <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
308312 %res2 = fadd <2 x double> %res, %res1
309313 ret <2 x double> %res2
310314 }
311315
312 declare <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8, i32)
313
314 define <2 x i64>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
315 ; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
316 define <2 x i64> @test_int_x86_avx512_mask_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
317 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_di:
316318 ; CHECK: ## %bb.0:
317319 ; CHECK-NEXT: kmovd %esi, %k1
318320 ; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
319321 ; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0
320322 ; CHECK-NEXT: retq
321 %res = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
322 %res1 = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
323 %1 = bitcast i8 %x3 to <8 x i1>
324 %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
325 %res = call <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract1, i32 8)
326 %2 = bitcast i8 %x3 to <8 x i1>
327 %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
328 %res1 = call <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 8)
323329 %res2 = add <2 x i64> %res, %res1
324330 ret <2 x i64> %res2
325331 }
326332
327 declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64>, i8, i32)
328
329 define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
330 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
333 define <4 x double> @test_int_x86_avx512_mask_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
334 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_df:
331335 ; CHECK: ## %bb.0:
332336 ; CHECK-NEXT: kmovd %esi, %k1
333337 ; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
336340 ; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
337341 ; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0
338342 ; CHECK-NEXT: retq
339 %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
340 %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
343 %1 = bitcast i8 %x3 to <8 x i1>
344 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
345 %res = call <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 4)
346 %res1 = call <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
341347 %res2 = fadd <4 x double> %res, %res1
342348 ret <4 x double> %res2
343349 }
344350
345 declare <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8, i32)
346
347 define <4 x i64>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
348 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
351 define <4 x i64> @test_int_x86_avx512_mask_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
352 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_di:
349353 ; CHECK: ## %bb.0:
350354 ; CHECK-NEXT: kmovd %esi, %k1
351355 ; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
354358 ; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
355359 ; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0
356360 ; CHECK-NEXT: retq
357 %res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
358 %res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
361 %1 = bitcast i8 %x3 to <8 x i1>
362 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
363 %res = call <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 8)
364 %res1 = call <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 8)
359365 %res2 = add <4 x i64> %res, %res1
360366 ret <4 x i64> %res2
361367 }
362368
363 declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>, i8, i32)
364
365 define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
366 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
369 define <4 x float> @test_int_x86_avx512_mask_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
370 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_sf:
367371 ; CHECK: ## %bb.0:
368372 ; CHECK-NEXT: kmovd %esi, %k1
369373 ; CHECK-NEXT: vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
372376 ; CHECK-NEXT: vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
373377 ; CHECK-NEXT: vaddps %xmm2, %xmm0, %xmm0
374378 ; CHECK-NEXT: retq
375 %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
376 %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
379 %1 = bitcast i8 %x3 to <8 x i1>
380 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
381 %res = call <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
382 %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
377383 %res2 = fadd <4 x float> %res, %res1
378384 ret <4 x float> %res2
379385 }
380386
381 declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8, i32)
382
383 define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
384 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
385 ; CHECK: ## %bb.0:
386 ; CHECK-NEXT: kmovd %esi, %k1
387 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
387 define <4 x i32> @test_int_x86_avx512_mask_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
388 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_si:
389 ; CHECK: ## %bb.0:
390 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
388391 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
389 ; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
392 ; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k1}
393 ; CHECK-NEXT: kmovd %esi, %k1
390394 ; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
391395 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
392396 ; CHECK-NEXT: retq
393 %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 4)
394 %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
397 %res = call <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 4)
398 %1 = bitcast i8 %x3 to <8 x i1>
399 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
400 %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
395401 %res2 = add <4 x i32> %res, %res1
396402 ret <4 x i32> %res2
397403 }
398404
399 declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>, i8, i32)
400
401 define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
402 ; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
405 define <4 x float> @test_int_x86_avx512_mask_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
406 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div8_sf:
403407 ; CHECK: ## %bb.0:
404408 ; CHECK-NEXT: kmovd %esi, %k1
405409 ; CHECK-NEXT: vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
409413 ; CHECK-NEXT: vaddps %xmm2, %xmm0, %xmm0
410414 ; CHECK-NEXT: vzeroupper
411415 ; CHECK-NEXT: retq
412 %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
413 %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
416 %1 = bitcast i8 %x3 to <8 x i1>
417 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
418 %res = call <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 4)
419 %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
414420 %res2 = fadd <4 x float> %res, %res1
415421 ret <4 x float> %res2
416422 }
417423
418 declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8, i32)
419
420 define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
421 ; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
424 define <4 x i32> @test_int_x86_avx512_mask_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
425 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div8_si:
422426 ; CHECK: ## %bb.0:
423427 ; CHECK-NEXT: kmovd %esi, %k1
424428 ; CHECK-NEXT: vmovdqa %xmm0, %xmm2
428432 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
429433 ; CHECK-NEXT: vzeroupper
430434 ; CHECK-NEXT: retq
431 %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
432 %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2)
435 %1 = bitcast i8 %x3 to <8 x i1>
436 %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
437 %res = call <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract1, i32 4)
438 %2 = bitcast i8 %x3 to <8 x i1>
439 %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
440 %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 2)
433441 %res2 = add <4 x i32> %res, %res1
434442 ret <4 x i32> %res2
435443 }
436444
437 declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32>, i8, i32)
438
439 define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
440 ; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
445 define <2 x double> @test_int_x86_avx512_mask_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
446 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv2_df:
441447 ; CHECK: ## %bb.0:
442448 ; CHECK-NEXT: kmovd %esi, %k1
443449 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
446452 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
447453 ; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0
448454 ; CHECK-NEXT: retq
449 %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
450 %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
455 %1 = bitcast i8 %x3 to <8 x i1>
456 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
457 %res = call <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> %extract, i32 4)
458 %res1 = call <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
451459 %res2 = fadd <2 x double> %res, %res1
452460 ret <2 x double> %res2
453461 }
454462
455 declare <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8, i32)
456
457 define <2 x i64>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
458 ; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
463 define <2 x i64> @test_int_x86_avx512_mask_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
464 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv2_di:
459465 ; CHECK: ## %bb.0:
460466 ; CHECK-NEXT: kmovd %esi, %k1
461467 ; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
462468 ; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0
463469 ; CHECK-NEXT: retq
464 %res = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
465 %res1 = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
470 %1 = bitcast i8 %x3 to <8 x i1>
471 %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
472 %res = call <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> %extract1, i32 8)
473 %2 = bitcast i8 %x3 to <8 x i1>
474 %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
475 %res1 = call <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> %extract, i32 8)
466476 %res2 = add <2 x i64> %res, %res1
467477 ret <2 x i64> %res2
468478 }
469479
470 declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32>, i8, i32)
471
472 define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
473 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
480 define <4 x double> @test_int_x86_avx512_mask_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
481 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_df:
474482 ; CHECK: ## %bb.0:
475483 ; CHECK-NEXT: kmovd %esi, %k1
476484 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
479487 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
480488 ; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0
481489 ; CHECK-NEXT: retq
482 %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
483 %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
490 %1 = bitcast i8 %x3 to <8 x i1>
491 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
492 %res = call <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 4)
493 %res1 = call <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
484494 %res2 = fadd <4 x double> %res, %res1
485495 ret <4 x double> %res2
486496 }
487497
488 declare <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8, i32)
489
490 define <4 x i64>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
491 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
498 define <4 x i64> @test_int_x86_avx512_mask_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
499 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_di:
492500 ; CHECK: ## %bb.0:
493501 ; CHECK-NEXT: kmovd %esi, %k1
494502 ; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
495503 ; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0
496504 ; CHECK-NEXT: retq
497 %res = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
498 %res1 = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
505 %1 = bitcast i8 %x3 to <8 x i1>
506 %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
507 %res = call <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract1, i32 8)
508 %2 = bitcast i8 %x3 to <8 x i1>
509 %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
510 %res1 = call <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 8)
499511 %res2 = add <4 x i64> %res, %res1
500512 ret <4 x i64> %res2
501513 }
502514
503 declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, i8, i32)
504
505 define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
506 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
515 define <4 x float> @test_int_x86_avx512_mask_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
516 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_sf:
507517 ; CHECK: ## %bb.0:
508518 ; CHECK-NEXT: kmovd %esi, %k1
509519 ; CHECK-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
512522 ; CHECK-NEXT: vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
513523 ; CHECK-NEXT: vaddps %xmm2, %xmm0, %xmm0
514524 ; CHECK-NEXT: retq
515 %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
516 %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
525 %1 = bitcast i8 %x3 to <8 x i1>
526 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
527 %res = call <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 4)
528 %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
517529 %res2 = fadd <4 x float> %res, %res1
518530 ret <4 x float> %res2
519531 }
520532
521 declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8, i32)
522
523 define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
524 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
525 ; CHECK: ## %bb.0:
526 ; CHECK-NEXT: kmovd %esi, %k1
527 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
533 define <4 x i32> @test_int_x86_avx512_mask_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
534 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_si:
535 ; CHECK: ## %bb.0:
536 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
528537 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
529 ; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
538 ; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k1}
539 ; CHECK-NEXT: kmovd %esi, %k1
530540 ; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
531541 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
532542 ; CHECK-NEXT: retq
533 %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 4)
534 %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 2)
543 %res = call <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
544 %1 = bitcast i8 %x3 to <8 x i1>
545 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
546 %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 2)
535547 %res2 = add <4 x i32> %res, %res1
536548 ret <4 x i32> %res2
537549 }
538550
539 declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, i8, i32)
540
541 define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
542 ; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
551 define <8 x float> @test_int_x86_avx512_mask_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
552 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv8_sf:
543553 ; CHECK: ## %bb.0:
544554 ; CHECK-NEXT: kmovd %esi, %k1
545555 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
548558 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
549559 ; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0
550560 ; CHECK-NEXT: retq
551 %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
552 %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 2)
561 %1 = bitcast i8 %x3 to <8 x i1>
562 %res = call <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> %1, i32 4)
563 %res1 = call <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 2)
553564 %res2 = fadd <8 x float> %res, %res1
554565 ret <8 x float> %res2
555566 }
556567
557 declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8, i32)
558
559 define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
560 ; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
568 define <8 x i32> @test_int_x86_avx512_mask_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
569 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv8_si:
561570 ; CHECK: ## %bb.0:
562571 ; CHECK-NEXT: kmovd %esi, %k1
563572 ; CHECK-NEXT: vmovdqa %ymm0, %ymm2
566575 ; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
567576 ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
568577 ; CHECK-NEXT: retq
569 %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
570 %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 2)
578 %1 = bitcast i8 %x3 to <8 x i1>
579 %res = call <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> %1, i32 4)
580 %2 = bitcast i8 %x3 to <8 x i1>
581 %res1 = call <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> %2, i32 2)
571582 %res2 = add <8 x i32> %res, %res1
572583 ret <8 x i32> %res2
573584 }
842853 ret void
843854 }
844855
845 define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) {
856 define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) {
846857 ; CHECK-LABEL: gather_mask_test:
847858 ; CHECK: ## %bb.0:
848859 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
862873 ; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
863874 ; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
864875 ; CHECK-NEXT: retq
865 %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
866 %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
867 %res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 1, i32 4)
868 %res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 220, i32 4)
869
876 %res = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
877 %res1 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> zeroinitializer, i32 4)
878 %res2 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> bitcast (<1 x i16> <i16 1> to <16 x i1>), i32 4)
879 %res3 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> bitcast (<1 x i16> <i16 220> to <16 x i1>), i32 4)
870880 %res4 = fadd <16 x float> %res, %res1
871881 %res5 = fadd <16 x float> %res3, %res2
872882 %res6 = fadd <16 x float> %res5, %res4
873883 ret <16 x float> %res6
874884 }
885
886 declare <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float>, i8*, <16 x i32>, <16 x i1>, i32)
887 declare <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double>, i8*, <8 x i32>, <8 x i1>, i32)
888 declare <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float>, i8*, <8 x i64>, <8 x i1>, i32)
889 declare <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double>, i8*, <8 x i64>, <8 x i1>, i32)
890 declare <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32>, i8*, <16 x i32>, <16 x i1>, i32)
891 declare <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64>, i8*, <8 x i32>, <8 x i1>, i32)
892 declare <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32>, i8*, <8 x i64>, <8 x i1>, i32)
893 declare <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64>, i8*, <8 x i64>, <8 x i1>, i32)
894 declare <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double>, i8*, <2 x i64>, <2 x i1>, i32)
895 declare <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64>, i8*, <2 x i64>, <2 x i1>, i32)
896 declare <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double>, i8*, <4 x i64>, <4 x i1>, i32)
897 declare <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64>, i8*, <4 x i64>, <4 x i1>, i32)
898 declare <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float>, i8*, <2 x i64>, <2 x i1>, i32)
899 declare <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32>, i8*, <2 x i64>, <2 x i1>, i32)
900 declare <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float>, i8*, <4 x i64>, <4 x i1>, i32)
901 declare <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32>, i8*, <4 x i64>, <4 x i1>, i32)
902 declare <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double>, i8*, <4 x i32>, <2 x i1>, i32)
903 declare <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, <2 x i1>, i32)
904 declare <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double>, i8*, <4 x i32>, <4 x i1>, i32)
905 declare <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, <4 x i1>, i32)
906 declare <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, <4 x i1>, i32)
907 declare <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, <4 x i1>, i32)
908 declare <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, <8 x i1>, i32)
909 declare <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, <8 x i1>, i32)
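A minimal usage sketch against these declarations (the function name @gather_one and its values are illustrative, not part of the patch); an unmasked gather passes an all-true constant mask, which lets codegen form the k-register with kxnorw instead of routing the mask through a GPR:

define <2 x double> @gather_one(<2 x double> %src, i8* %base, <2 x i64> %idx) {
  %r = call <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double> %src, i8* %base, <2 x i64> %idx, <2 x i1> <i1 true, i1 true>, i32 8)
  ret <2 x double> %r
}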