llvm / 0e3399b (llvm.org GIT mirror)

[X86] Add avx512 scatter intrinsics that use a vXi1 mask instead of a scalar integer.

We're trying to have the vXi1 types in IR as much as possible. This prevents the need for bitcasts when the producer of the mask was already a vXi1 value, like an icmp. The bitcasts can be subject to code motion and interfere with basic-block-at-a-time isel in bad ways.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@351275 91177308-0d34-0410-b5e6-96231b3b80d8

Craig Topper, 1 year, 10 months ago
4 changed files with 287 additions and 125 deletions.
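To illustrate the motivation (an editorial sketch, not part of the commit; the function and value names below are made up, but the intrinsic signature is the one this patch declares): when the mask comes from an icmp, the new form consumes the <16 x i1> value directly, while the deprecated form would require a bitcast to i16 that code motion can separate from the scatter.

declare void @llvm.x86.avx512.mask.scatter.dps.512(i8*, <16 x i1>, <16 x i32>, <16 x float>, i32)

define void @scatter_from_icmp(i8* %base, <16 x i32> %ind, <16 x float> %val, <16 x i32> %a, <16 x i32> %b) {
  ; icmp already produces <16 x i1>; no bitcast to a scalar integer is needed.
  %mask = icmp sgt <16 x i32> %a, %b
  ; The deprecated form would instead need: %m = bitcast <16 x i1> %mask to i16
  ; followed by a call to @llvm.x86.avx512.scatter.dps.512 with i16 %m.
  call void @llvm.x86.avx512.mask.scatter.dps.512(i8* %base, <16 x i1> %mask, <16 x i32> %ind, <16 x float> %val, i32 4)
  ret void
}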
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
@@ -3568 +3568 @@
 
 // Gather and Scatter ops
 let TargetPrefix = "x86" in {
+  // NOTE: These are deprecated in favor of the versions that take a vXi1 mask.
   def int_x86_avx512_gather_dpd_512 : GCCBuiltin<"__builtin_ia32_gathersiv8df">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
                      llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
@@ -3700 +3701 @@
                     [IntrReadMem, IntrArgMemOnly]>;
 
   // scatter
+  // NOTE: These are deprecated in favor of the versions that take a vXi1 mask.
   def int_x86_avx512_scatter_dpd_512 : GCCBuiltin<"__builtin_ia32_scattersiv8df">,
           Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
                      llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty],
@@ -3860 +3862 @@
                      llvm_i32_ty, llvm_i32_ty], [IntrArgMemOnly]>;
 }
 
-// AVX512 gather intrinsics that use vXi1 masks.
+// AVX512 gather/scatter intrinsics that use vXi1 masks.
 let TargetPrefix = "x86" in {
   def int_x86_avx512_mask_gather_dpd_512 :
       Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
@@ -3976 +3978 @@
       Intrinsic<[llvm_v8i32_ty],
                 [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty],
                 [IntrReadMem, IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scatter_dpd_512 :
+      Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
+                     llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+  def int_x86_avx512_mask_scatter_dps_512 :
+      Intrinsic<[], [llvm_ptr_ty, llvm_v16i1_ty,
+                     llvm_v16i32_ty, llvm_v16f32_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+  def int_x86_avx512_mask_scatter_qpd_512 :
+      Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
+                     llvm_v8i64_ty, llvm_v8f64_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+  def int_x86_avx512_mask_scatter_qps_512 :
+      Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
+                     llvm_v8i64_ty, llvm_v8f32_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scatter_dpq_512 :
+      Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
+                     llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+  def int_x86_avx512_mask_scatter_dpi_512 :
+      Intrinsic<[], [llvm_ptr_ty, llvm_v16i1_ty,
+                     llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+  def int_x86_avx512_mask_scatter_qpq_512 :
+      Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i64_ty, llvm_v8i64_ty,
+                     llvm_i32_ty],
+                [IntrArgMemOnly]>;
+  def int_x86_avx512_mask_scatter_qpi_512 :
+      Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i64_ty, llvm_v8i32_ty,
+                     llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scatterdiv2_df :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scatterdiv2_di :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scatterdiv4_df :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scatterdiv4_di :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scatterdiv4_sf :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v4f32_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scatterdiv4_si :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v4i32_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scatterdiv8_sf :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4f32_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scatterdiv8_si :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4i32_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scattersiv2_df :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v2i1_ty, llvm_v4i32_ty, llvm_v2f64_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scattersiv2_di :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v2i1_ty, llvm_v4i32_ty, llvm_v2i64_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scattersiv4_df :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4f64_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scattersiv4_di :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4i64_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scattersiv4_sf :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scattersiv4_si :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scattersiv8_sf :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
+
+  def int_x86_avx512_mask_scattersiv8_si :
+      Intrinsic<[],
+                [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty],
+                [IntrArgMemOnly]>;
 }
 
 // AVX-512 conflict detection instruction
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
@@ -22360 +22360 @@
                         Src.getSimpleValueType().getVectorNumElements());
   MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
 
-  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+  // We support two versions of the scatter intrinsics. One with scalar mask and
+  // one with vXi1 mask. Convert scalar to vXi1 if necessary.
+  if (Mask.getValueType() != MaskVT)
+    Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
-  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
+  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Src, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   return SDValue(Res, 1);
 }
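As a usage sketch of the dual support described in the comment above (the function name is hypothetical; both intrinsic signatures are taken from this patch): the two calls below should select to the same scatter instruction, the first passing a scalar mask that the lowering converts through getMaskNode, the second passing a v8i1 mask through unchanged.

declare void @llvm.x86.avx512.scatter.dpd.512(i8*, i8, <8 x i32>, <8 x double>, i32)
declare void @llvm.x86.avx512.mask.scatter.dpd.512(i8*, <8 x i1>, <8 x i32>, <8 x double>, i32)

define void @both_mask_forms(i8* %base, <8 x i32> %ind, <8 x double> %val, i8 %m) {
  ; Deprecated form: scalar i8 mask, converted to v8i1 during lowering.
  call void @llvm.x86.avx512.scatter.dpd.512(i8* %base, i8 %m, <8 x i32> %ind, <8 x double> %val, i32 4)
  ; New form: the v8i1 mask feeds the scatter node directly.
  %mv = bitcast i8 %m to <8 x i1>
  call void @llvm.x86.avx512.mask.scatter.dpd.512(i8* %base, <8 x i1> %mv, <8 x i32> %ind, <8 x double> %val, i32 4)
  ret void
}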
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -247 +247 @@
                      X86ISD::VTRUNCUS, 0),
   X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8,
                      X86ISD::VTRUNCUS, 0),
+
+  X86_INTRINSIC_DATA(avx512_mask_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0),
 
   X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
   X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -1 +1 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s
-
-declare void @llvm.x86.avx512.scatter.dps.512(i8*, i16, <16 x i32>, <16 x float>, i32)
-declare void @llvm.x86.avx512.scatter.dpd.512(i8*, i8, <8 x i32>, <8 x double>, i32)
-
-declare void @llvm.x86.avx512.scatter.qps.512(i8*, i8, <8 x i64>, <8 x float>, i32)
-declare void @llvm.x86.avx512.scatter.qpd.512(i8*, i8, <8 x i64>, <8 x double>, i32)
 
 define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
 ; CHECK-LABEL: gather_mask_dps:
@@ -19 +13 @@
   %1 = bitcast i16 %mask to <16 x i1>
   %x = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
   %ind2 = add <16 x i32> %ind, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  call void @llvm.x86.avx512.scatter.dps.512(i8* %stbuf, i16 %mask, <16 x i32> %ind2, <16 x float> %x, i32 4)
+  call void @llvm.x86.avx512.mask.scatter.dps.512(i8* %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x float> %x, i32 4)
   ret void
 }
 
@@ -36 +30 @@
   %1 = bitcast i8 %mask to <8 x i1>
   %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
   %ind2 = add <8 x i32> %ind, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  call void @llvm.x86.avx512.scatter.dpd.512(i8* %stbuf, i8 %mask, <8 x i32> %ind2, <8 x double> %x, i32 4)
+  call void @llvm.x86.avx512.mask.scatter.dpd.512(i8* %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x double> %x, i32 4)
   ret void
 }
 
@@ -53 +47 @@
   %1 = bitcast i8 %mask to <8 x i1>
   %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
   %ind2 = add <8 x i64> %ind, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-  call void @llvm.x86.avx512.scatter.qps.512(i8* %stbuf, i8 %mask, <8 x i64> %ind2, <8 x float> %x, i32 4)
+  call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x float> %x, i32 4)
   ret void
 }
 
@@ -70 +64 @@
   %1 = bitcast i8 %mask to <8 x i1>
   %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
   %ind2 = add <8 x i64> %ind, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-  call void @llvm.x86.avx512.scatter.qpd.512(i8* %stbuf, i8 %mask, <8 x i64> %ind2, <8 x double> %x, i32 4)
+  call void @llvm.x86.avx512.mask.scatter.qpd.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x double> %x, i32 4)
   ret void
 }
 ;;
 ;; Integer Gather/Scatter
 ;;
-declare void @llvm.x86.avx512.scatter.dpi.512(i8*, i16, <16 x i32>, <16 x i32>, i32)
-declare void @llvm.x86.avx512.scatter.dpq.512(i8*, i8, <8 x i32>, <8 x i64>, i32)
-
-declare void @llvm.x86.avx512.scatter.qpi.512(i8*, i8, <8 x i64>, <8 x i32>, i32)
-declare void @llvm.x86.avx512.scatter.qpq.512(i8*, i8, <8 x i64>, <8 x i64>, i32)
 
 define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
 ; CHECK-LABEL: gather_mask_dd:
@@ -95 +84 @@
   %1 = bitcast i16 %mask to <16 x i1>
   %x = call <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
   %ind2 = add <16 x i32> %ind, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  call void @llvm.x86.avx512.scatter.dpi.512(i8* %stbuf, i16 %mask, <16 x i32> %ind2, <16 x i32> %x, i32 4)
+  call void @llvm.x86.avx512.mask.scatter.dpi.512(i8* %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x i32> %x, i32 4)
   ret void
 }
 
@@ -112 +101 @@
   %1 = bitcast i8 %mask to <8 x i1>
   %x = call <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
   %ind2 = add <8 x i64> %ind, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-  call void @llvm.x86.avx512.scatter.qpi.512(i8* %stbuf, i8 %mask, <8 x i64> %ind2, <8 x i32> %x, i32 4)
+  call void @llvm.x86.avx512.mask.scatter.qpi.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i32> %x, i32 4)
   ret void
 }
 
@@ -129 +118 @@
   %1 = bitcast i8 %mask to <8 x i1>
   %x = call <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
   %ind2 = add <8 x i64> %ind, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-  call void @llvm.x86.avx512.scatter.qpq.512(i8* %stbuf, i8 %mask, <8 x i64> %ind2, <8 x i64> %x, i32 4)
+  call void @llvm.x86.avx512.mask.scatter.qpq.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i64> %x, i32 4)
   ret void
 }
 
@@ -146 +135 @@
   %1 = bitcast i8 %mask to <8 x i1>
   %x = call <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
   %ind2 = add <8 x i32> %ind, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  call void @llvm.x86.avx512.scatter.dpq.512(i8* %stbuf, i8 %mask, <8 x i32> %ind2, <8 x i64> %x, i32 4)
+  call void @llvm.x86.avx512.mask.scatter.dpq.512(i8* %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x i64> %x, i32 4)
   ret void
 }
 
@@ -210 +199 @@
 ; CHECK-NEXT:    vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
+  %1 = bitcast i8 %mask to <8 x i1>
   %x = load <8 x double>, <8 x double>* %src, align 64
-  call void @llvm.x86.avx512.scatter.dpd.512(i8* %stbuf, i8 %mask, <8 x i32> %ind, <8 x double> %x, i32 4)
+  call void @llvm.x86.avx512.mask.scatter.dpd.512(i8* %stbuf, <8 x i1> %1, <8 x i32> %ind, <8 x double> %x, i32 4)
   ret void
 }
 
@@ -223 +213 @@
 ; CHECK-NEXT:    vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
+  %1 = bitcast i8 %mask to <8 x i1>
   %x = load <8 x double>, <8 x double>* %src, align 64
-  call void @llvm.x86.avx512.scatter.qpd.512(i8* %stbuf, i8 %mask, <8 x i64> %ind, <8 x double> %x, i32 4)
+  call void @llvm.x86.avx512.mask.scatter.qpd.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind, <8 x double> %x, i32 4)
   ret void
 }
 
@@ -236 +227 @@
 ; CHECK-NEXT:    vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
+  %1 = bitcast i16 %mask to <16 x i1>
   %x = load <16 x float>, <16 x float>* %src, align 64
-  call void @llvm.x86.avx512.scatter.dps.512(i8* %stbuf, i16 %mask, <16 x i32> %ind, <16 x float> %x, i32 4)
+  call void @llvm.x86.avx512.mask.scatter.dps.512(i8* %stbuf, <16 x i1> %1, <16 x i32> %ind, <16 x float> %x, i32 4)
   ret void
 }
 
@@ -249 +241 @@
 ; CHECK-NEXT:    vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
+  %1 = bitcast i8 %mask to <8 x i1>
   %x = load <8 x float>, <8 x float>* %src, align 32
-  call void @llvm.x86.avx512.scatter.qps.512(i8* %stbuf, i8 %mask, <8 x i64> %ind, <8 x float> %x, i32 4)
+  call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind, <8 x float> %x, i32 4)
   ret void
 }
 
@@ -267 +260 @@
 ; CHECK-NEXT:    retq
   %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %ind2 = add <8 x i64> %ind, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-  call void @llvm.x86.avx512.scatter.qps.512(i8* %stbuf, i8 -1, <8 x i64> %ind2, <8 x float> %x, i32 4)
+  call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> %ind2, <8 x float> %x, i32 4)
   ret void
 }
 
@@ -583 +576 @@
   ret <8 x i32> %res2
 }
 
-declare void @llvm.x86.avx512.scatterdiv2.df(i8*, i8, <2 x i64>, <2 x double>, i32)
-
 define void @test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
 ; CHECK:       ## %bb.0:
@@ -593 +584 @@
 ; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
 ; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 -1, <2 x i64> %x2, <2 x double> %x3, i32 2)
-  call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scatterdiv2.di(i8*, i8, <2 x i64>, <2 x i64>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  call void @llvm.x86.avx512.mask.scatterdiv2.df(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x double> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scatterdiv2.df(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <2 x double> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
@@ -608 +599 @@
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3, i32 2)
-  call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 -1, <2 x i64> %x2, <2 x i64> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scatterdiv4.df(i8*, i8, <4 x i64>, <4 x double>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  call void @llvm.x86.avx512.mask.scatterdiv2.di(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <2 x i64> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scatterdiv2.di(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x i64> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
@@ -624 +615 @@
 ; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2)
-  call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scatterdiv4.di(i8*, i8, <4 x i64>, <4 x i64>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.mask.scatterdiv4.df(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x double> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scatterdiv4.df(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x double> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
@@ -640 +631 @@
 ; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2)
-  call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scatterdiv4.sf(i8*, i8, <2 x i64>, <4 x float>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.mask.scatterdiv4.di(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i64> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scatterdiv4.di(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x i64> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
@@ -655 +646 @@
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3, i32 2)
-  call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 -1, <2 x i64> %x2, <4 x float> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scatterdiv4.si(i8*, i8, <2 x i64>, <4 x i32>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  call void @llvm.x86.avx512.mask.scatterdiv4.sf(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <4 x float> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scatterdiv4.sf(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x float> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
@@ -670 +661 @@
 ; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
 ; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 -1, <2 x i64> %x2, <4 x i32> %x3, i32 2)
-  call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scatterdiv8.sf(i8*, i8, <4 x i64>, <4 x float>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  call void @llvm.x86.avx512.mask.scatterdiv4.si(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x i32> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scatterdiv4.si(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <4 x i32> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
@@ -686 +677 @@
 ; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2)
-  call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scatterdiv8.si(i8*, i8, <4 x i64>, <4 x i32>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.mask.scatterdiv8.sf(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x float> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scatterdiv8.sf(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x float> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
@@ -702 +693 @@
 ; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2)
-  call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scattersiv2.df(i8*, i8, <4 x i32>, <2 x double>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.mask.scatterdiv8.si(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i32> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scatterdiv8.si(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x i32> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
@@ -717 +708 @@
 ; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
 ; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 -1, <4 x i32> %x2, <2 x double> %x3, i32 2)
-  call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scattersiv2.di(i8*, i8, <4 x i32>, <2 x i64>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  call void @llvm.x86.avx512.mask.scattersiv2.df(i8* %x0, <2 x i1> <i1 true, i1 true>, <4 x i32> %x2, <2 x double> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scattersiv2.df(i8* %x0, <2 x i1> %2, <4 x i32> %x2, <2 x double> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
@@ -732 +723 @@
 ; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
 ; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 -1, <4 x i32> %x2, <2 x i64> %x3, i32 2)
-  call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scattersiv4.df(i8*, i8, <4 x i32>, <4 x double>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  call void @llvm.x86.avx512.mask.scattersiv2.di(i8* %x0, <2 x i1> <i1 true, i1 true>, <4 x i32> %x2, <2 x i64> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scattersiv2.di(i8* %x0, <2 x i1> %2, <4 x i32> %x2, <2 x i64> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
@@ -748 +739 @@
 ; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2)
-  call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scattersiv4.di(i8*, i8, <4 x i32>, <4 x i64>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.mask.scattersiv4.df(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x double> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scattersiv4.df(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x double> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
@@ -764 +755 @@
 ; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2)
-  call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scattersiv4.sf(i8*, i8, <4 x i32>, <4 x float>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.mask.scattersiv4.di(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i64> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scattersiv4.di(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i64> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
@@ -779 +770 @@
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3, i32 2)
-  call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 -1, <4 x i32> %x2, <4 x float> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scattersiv4.si(i8*, i8, <4 x i32>, <4 x i32>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.mask.scattersiv4.sf(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x float> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scattersiv4.sf(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x float> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
@@ -794 +785 @@
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3, i32 2)
-  call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i32> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scattersiv8.sf(i8*, i8, <8 x i32>, <8 x float>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.mask.scattersiv4.si(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i32> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scattersiv4.si(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i32> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
@@ -810 +801 @@
 ; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2)
-  call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4)
-  ret void
-}
-
-declare void @llvm.x86.avx512.scattersiv8.si(i8*, i8, <8 x i32>, <8 x i32>, i32)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  call void @llvm.x86.avx512.mask.scattersiv8.sf(i8* %x0, <8 x i1> %1, <8 x i32> %x2, <8 x float> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scattersiv8.sf(i8* %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x float> %x3, i32 4)
+  ret void
+}
 
 define void @test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
@@ -826 +816 @@
 ; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
-  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4)
+  %1 = bitcast i8 %x1 to <8 x i1>
+  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> %1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x i32> %x3, i32 4)
   ret void
 }
 
@@ -846 +837 @@
 ; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
-  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
-  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
-  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4)
+  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x i32> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> zeroinitializer, <8 x i32> %x2, <8 x i32> %x3, i32 4)
+  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> bitcast (<1 x i8> <i8 1> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 2)
+  call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> bitcast (<1 x i8> <i8 96> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 4)
   ret void
 }
 
@@ -907 +898 @@
 declare <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, <4 x i1>, i32)
 declare <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, <8 x i1>, i32)
 declare <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, <8 x i1>, i32)
+declare void @llvm.x86.avx512.mask.scatter.dps.512(i8*, <16 x i1>, <16 x i32>, <16 x float>, i32)
+declare void @llvm.x86.avx512.mask.scatter.dpd.512(i8*, <8 x i1>, <8 x i32>, <8 x double>, i32)
+declare void @llvm.x86.avx512.mask.scatter.qps.512(i8*, <8 x i1>, <8 x i64>, <8 x float>, i32)
+declare void @llvm.x86.avx512.mask.scatter.qpd.512(i8*, <8 x i1>, <8 x i64>, <8 x double>, i32)
+declare void @llvm.x86.avx512.mask.scatter.dpi.512(i8*, <16 x i1>, <16 x i32>, <16 x i32>, i32)
+declare void @llvm.x86.avx512.mask.scatter.dpq.512(i8*, <8 x i1>, <8 x i32>, <8 x i64>, i32)
+declare void @llvm.x86.avx512.mask.scatter.qpi.512(i8*, <8 x i1>, <8 x i64>, <8 x i32>, i32)
+declare void @llvm.x86.avx512.mask.scatter.qpq.512(i8*, <8 x i1>, <8 x i64>, <8 x i64>, i32)
+declare void @llvm.x86.avx512.mask.scatterdiv2.df(i8*, <2 x i1>, <2 x i64>, <2 x double>, i32)
+declare void @llvm.x86.avx512.mask.scatterdiv2.di(i8*, <2 x i1>, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.x86.avx512.mask.scatterdiv4.df(i8*, <4 x i1>, <4 x i64>, <4 x double>, i32)
+declare void @llvm.x86.avx512.mask.scatterdiv4.di(i8*, <4 x i1>, <4 x i64>, <4 x i64>, i32)
+declare void @llvm.x86.avx512.mask.scatterdiv4.sf(i8*, <2 x i1>, <2 x i64>, <4 x float>, i32)
+declare void @llvm.x86.avx512.mask.scatterdiv4.si(i8*, <2 x i1>, <2 x i64>, <4 x i32>, i32)
+declare void @llvm.x86.avx512.mask.scatterdiv8.sf(i8*, <4 x i1>, <4 x i64>, <4 x float>, i32)
+declare void @llvm.x86.avx512.mask.scatterdiv8.si(i8*, <4 x i1>, <4 x i64>, <4 x i32>, i32)
+declare void @llvm.x86.avx512.mask.scattersiv2.df(i8*, <2 x i1>, <4 x i32>, <2 x double>, i32)
+declare void @llvm.x86.avx512.mask.scattersiv2.di(i8*, <2 x i1>, <4 x i32>, <2 x i64>, i32)
+declare void @llvm.x86.avx512.mask.scattersiv4.df(i8*, <4 x i1>, <4 x i32>, <4 x double>, i32)
+declare void @llvm.x86.avx512.mask.scattersiv4.di(i8*, <4 x i1>, <4 x i32>, <4 x i64>, i32)
+declare void @llvm.x86.avx512.mask.scattersiv4.sf(i8*, <4 x i1>, <4 x i32>, <4 x float>, i32)
+declare void @llvm.x86.avx512.mask.scattersiv4.si(i8*, <4 x i1>, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.x86.avx512.mask.scattersiv8.sf(i8*, <8 x i1>, <8 x i32>, <8 x float>, i32)
+declare void @llvm.x86.avx512.mask.scattersiv8.si(i8*, <8 x i1>, <8 x i32>, <8 x i32>, i32)
+