llvm.org GIT mirror llvm / ad0ddd8
[X86] Replace avx2 broadcast intrinsics with native IR. Since r245605, the clang headers don't use these anymore. r245165 updated some of the tests already; update the others, add an autoupgrade, remove the intrinsics, and clean up the definitions. Differential Revision: http://reviews.llvm.org/D10555 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@245606 91177308-0d34-0410-b5e6-96231b3b80d8 Ahmed Bougacha 4 years ago
6 changed file(s) with 163 addition(s) and 210 deletion(s). Raw diff Collapse all Expand all
21662166
21672167 // Vector load with broadcast
21682168 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
2169 def int_x86_avx2_vbroadcast_ss_ps :
2170 GCCBuiltin<"__builtin_ia32_vbroadcastss_ps">,
2171 Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
2172 def int_x86_avx2_vbroadcast_sd_pd_256 :
2173 GCCBuiltin<"__builtin_ia32_vbroadcastsd_pd256">,
2174 Intrinsic<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
2175 def int_x86_avx2_vbroadcast_ss_ps_256 :
2176 GCCBuiltin<"__builtin_ia32_vbroadcastss_ps256">,
2177 Intrinsic<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
2178 def int_x86_avx2_pbroadcastb_128 :
2179 GCCBuiltin<"__builtin_ia32_pbroadcastb128">,
2180 Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
2181 def int_x86_avx2_pbroadcastb_256 :
2182 GCCBuiltin<"__builtin_ia32_pbroadcastb256">,
2183 Intrinsic<[llvm_v32i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
2184 def int_x86_avx2_pbroadcastw_128 :
2185 GCCBuiltin<"__builtin_ia32_pbroadcastw128">,
2186 Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>;
2187 def int_x86_avx2_pbroadcastw_256 :
2188 GCCBuiltin<"__builtin_ia32_pbroadcastw256">,
2189 Intrinsic<[llvm_v16i16_ty], [llvm_v8i16_ty], [IntrNoMem]>;
2190 def int_x86_avx2_pbroadcastd_128 :
2191 GCCBuiltin<"__builtin_ia32_pbroadcastd128">,
2192 Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
2193 def int_x86_avx2_pbroadcastd_256 :
2194 GCCBuiltin<"__builtin_ia32_pbroadcastd256">,
2195 Intrinsic<[llvm_v8i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
2196 def int_x86_avx2_pbroadcastq_128 :
2197 GCCBuiltin<"__builtin_ia32_pbroadcastq128">,
2198 Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
2199 def int_x86_avx2_pbroadcastq_256 :
2200 GCCBuiltin<"__builtin_ia32_pbroadcastq256">,
2201 Intrinsic<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
22022169 def int_x86_avx512_mask_pbroadcast_d_gpr_512 :
22032170 GCCBuiltin<"__builtin_ia32_pbroadcastd512_gpr_mask">,
22042171 Intrinsic<[llvm_v16i32_ty], [llvm_i32_ty, llvm_v16i32_ty,
128128 Name.startswith("x86.sse2.pcmpgt.") ||
129129 Name.startswith("x86.avx2.pcmpeq.") ||
130130 Name.startswith("x86.avx2.pcmpgt.") ||
131 Name.startswith("x86.avx2.vbroadcast") ||
132 Name.startswith("x86.avx2.pbroadcast") ||
131133 Name.startswith("x86.avx.vpermil.") ||
132134 Name == "x86.avx.vinsertf128.pd.256" ||
133135 Name == "x86.avx.vinsertf128.ps.256" ||
446448 const int Idxs[4] = { 0, 1, 0, 1 };
447449 Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
448450 Idxs);
451 } else if (Name.startswith("llvm.x86.avx2.pbroadcast") ||
452 Name.startswith("llvm.x86.avx2.vbroadcast")) {
453 // Replace vp?broadcasts with a vector shuffle.
454 Value *Op = CI->getArgOperand(0);
455 unsigned NumElts = CI->getType()->getVectorNumElements();
456 Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts);
457 Rep = Builder.CreateShuffleVector(Op, UndefValue::get(Op->getType()),
458 Constant::getNullValue(MaskTy));
449459 } else if (Name == "llvm.x86.sse2.psll.dq") {
450460 // 128-bit shift left specified in bits.
451461 unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
78227822 // VBROADCAST - Load from memory and broadcast to all elements of the
78237823 // destination operand
78247824 //
7825 class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
7826 X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> :
7827 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7828 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7829 [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;
7830
7831 class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
7825 class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
78327826 X86MemOperand x86memop, ValueType VT,
78337827 PatFrag ld_frag, SchedWrite Sched> :
78347828 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
78397833 }
78407834
78417835 // AVX2 adds register forms
7842 class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
7843 Intrinsic Int, SchedWrite Sched> :
7836 class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
7837 ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
78447838 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
78457839 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7846 [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX;
7840 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
7841 Sched<[Sched]>, VEX;
78477842
78487843 let ExeDomain = SSEPackedSingle in {
7849 def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128,
7844 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
78507845 f32mem, v4f32, loadf32, WriteLoad>;
7851 def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256,
7846 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
78527847 f32mem, v8f32, loadf32,
78537848 WriteFShuffleLd>, VEX_L;
78547849 }
78557850 let ExeDomain = SSEPackedDouble in
7856 def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem,
7851 def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
78577852 v4f64, loadf64, WriteFShuffleLd>, VEX_L;
7858 def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
7859 int_x86_avx_vbroadcastf128_pd_256,
7860 WriteFShuffleLd>, VEX_L;
78617853
78627854 let ExeDomain = SSEPackedSingle in {
7863 def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
7864 int_x86_avx2_vbroadcast_ss_ps,
7865 WriteFShuffle>;
7866 def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
7867 int_x86_avx2_vbroadcast_ss_ps_256,
7868 WriteFShuffle256>, VEX_L;
7855 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
7856 v4f32, v4f32, WriteFShuffle>;
7857 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
7858 v8f32, v4f32, WriteFShuffle256>, VEX_L;
78697859 }
78707860 let ExeDomain = SSEPackedDouble in
7871 def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
7872 int_x86_avx2_vbroadcast_sd_pd_256,
7873 WriteFShuffle256>, VEX_L;
7861 def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
7862 v4f64, v2f64, WriteFShuffle256>, VEX_L;
78747863
78757864 let mayLoad = 1, Predicates = [HasAVX2] in
78767865 def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
78777866 (ins i128mem:$src),
78787867 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
78797868 Sched<[WriteLoad]>, VEX, VEX_L;
7869
7870 def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
7871 (ins f128mem:$src),
7872 "vbroadcastf128\t{$src, $dst|$dst, $src}",
7873 [(set VR256:$dst,
7874 (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>,
7875 Sched<[WriteFShuffleLd]>, VEX, VEX_L;
78807876
78817877 let Predicates = [HasAVX] in
78827878 def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
83168312 //
83178313 multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
83188314 X86MemOperand x86memop, PatFrag ld_frag,
8319 Intrinsic Int128, Intrinsic Int256> {
8315 ValueType OpVT128, ValueType OpVT256> {
83208316 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
83218317 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8322 [(set VR128:$dst, (Int128 VR128:$src))]>,
8318 [(set VR128:$dst, (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
83238319 Sched<[WriteShuffle]>, VEX;
83248320 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
83258321 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8326 [(set VR128:$dst,
8327 (Int128 (scalar_to_vector (ld_frag addr:$src))))]>,
8322 [(set VR128:$dst, (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
83288323 Sched<[WriteLoad]>, VEX;
83298324 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
83308325 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8331 [(set VR256:$dst, (Int256 VR128:$src))]>,
8326 [(set VR256:$dst, (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
83328327 Sched<[WriteShuffle256]>, VEX, VEX_L;
83338328 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
83348329 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8335 [(set VR256:$dst,
8336 (Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
8330 [(set VR256:$dst, (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
83378331 Sched<[WriteLoad]>, VEX, VEX_L;
83388332 }
83398333
8340 defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
8341 int_x86_avx2_pbroadcastb_128,
8342 int_x86_avx2_pbroadcastb_256>;
8343 defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
8344 int_x86_avx2_pbroadcastw_128,
8345 int_x86_avx2_pbroadcastw_256>;
8346 defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
8347 int_x86_avx2_pbroadcastd_128,
8348 int_x86_avx2_pbroadcastd_256>;
8349 defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
8350 int_x86_avx2_pbroadcastq_128,
8351 int_x86_avx2_pbroadcastq_256>;
8334 defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, v16i8, v32i8>;
8335 defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, v8i16, v16i16>;
8336 defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, v4i32, v8i32>;
8337 defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, v2i64, v4i64>;
83528338
83538339 let Predicates = [HasAVX2] in {
8354 def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))),
8355 (VPBROADCASTBrm addr:$src)>;
8356 def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))),
8357 (VPBROADCASTBYrm addr:$src)>;
8358 def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
8359 (VPBROADCASTWrm addr:$src)>;
8360 def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
8361 (VPBROADCASTWYrm addr:$src)>;
8362 def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
8363 (VPBROADCASTDrm addr:$src)>;
8364 def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
8365 (VPBROADCASTDYrm addr:$src)>;
8366 def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
8367 (VPBROADCASTQrm addr:$src)>;
8368 def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
8369 (VPBROADCASTQYrm addr:$src)>;
8370
8371 def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
8372 (VPBROADCASTBrr VR128:$src)>;
8373 def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
8374 (VPBROADCASTBYrr VR128:$src)>;
8375 def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
8376 (VPBROADCASTWrr VR128:$src)>;
8377 def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
8378 (VPBROADCASTWYrr VR128:$src)>;
8379 def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
8380 (VPBROADCASTDrr VR128:$src)>;
8381 def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
8382 (VPBROADCASTDYrr VR128:$src)>;
8383 def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
8384 (VPBROADCASTQrr VR128:$src)>;
8385 def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
8386 (VPBROADCASTQYrr VR128:$src)>;
8387 def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
8388 (VBROADCASTSSrr VR128:$src)>;
8389 def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
8390 (VBROADCASTSSYrr VR128:$src)>;
8391 def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
8392 (VPBROADCASTQrr VR128:$src)>;
8393 def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
8394 (VBROADCASTSDYrr VR128:$src)>;
8395
83968340 // Provide aliases for broadcast from the same register class that
83978341 // automatically does the extract.
83988342 def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))),
8282 }
8383 declare <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64>, <2 x i64>, i8) nounwind readnone
8484
85
86 define <4 x double> @test_x86_avx2_vbroadcast_sd_pd_256(<2 x double> %a0) {
87 ; CHECK-LABEL: test_x86_avx2_vbroadcast_sd_pd_256:
88 ; CHECK: ## BB#0:
89 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
90 ; CHECK-NEXT: retl
91 %res = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0)
92 ret <4 x double> %res
93 }
94 declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind readonly
95
96
97 define <4 x float> @test_x86_avx2_vbroadcast_ss_ps(<4 x float> %a0) {
98 ; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps:
99 ; CHECK: ## BB#0:
100 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
101 ; CHECK-NEXT: retl
102 %res = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0)
103 ret <4 x float> %res
104 }
105 declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readonly
106
107
108 define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) {
109 ; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps_256:
110 ; CHECK: ## BB#0:
111 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
112 ; CHECK-NEXT: retl
113 %res = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0)
114 ret <8 x float> %res
115 }
116 declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind readonly
117
118
119 define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) {
120 ; CHECK-LABEL: test_x86_avx2_pbroadcastb_128:
121 ; CHECK: ## BB#0:
122 ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
123 ; CHECK-NEXT: retl
124 %res = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0)
125 ret <16 x i8> %res
126 }
127 declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly
128
129
130 define <32 x i8> @test_x86_avx2_pbroadcastb_256(<16 x i8> %a0) {
131 ; CHECK-LABEL: test_x86_avx2_pbroadcastb_256:
132 ; CHECK: ## BB#0:
133 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
134 ; CHECK-NEXT: retl
135 %res = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0)
136 ret <32 x i8> %res
137 }
138 declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly
139
140
141 define <8 x i16> @test_x86_avx2_pbroadcastw_128(<8 x i16> %a0) {
142 ; CHECK-LABEL: test_x86_avx2_pbroadcastw_128:
143 ; CHECK: ## BB#0:
144 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
145 ; CHECK-NEXT: retl
146 %res = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0)
147 ret <8 x i16> %res
148 }
149 declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly
150
151
152 define <16 x i16> @test_x86_avx2_pbroadcastw_256(<8 x i16> %a0) {
153 ; CHECK-LABEL: test_x86_avx2_pbroadcastw_256:
154 ; CHECK: ## BB#0:
155 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
156 ; CHECK-NEXT: retl
157 %res = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0)
158 ret <16 x i16> %res
159 }
160 declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly
161
162
163 define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) {
164 ; CHECK-LABEL: test_x86_avx2_pbroadcastd_128:
165 ; CHECK: ## BB#0:
166 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
167 ; CHECK-NEXT: retl
168 %res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0)
169 ret <4 x i32> %res
170 }
171 declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly
172
173
174 define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) {
175 ; CHECK-LABEL: test_x86_avx2_pbroadcastd_256:
176 ; CHECK: ## BB#0:
177 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
178 ; CHECK-NEXT: retl
179 %res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0)
180 ret <8 x i32> %res
181 }
182 declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly
183
184
185 define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> %a0) {
186 ; CHECK-LABEL: test_x86_avx2_pbroadcastq_128:
187 ; CHECK: ## BB#0:
188 ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0
189 ; CHECK-NEXT: retl
190 %res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0)
191 ret <2 x i64> %res
192 }
193 declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly
194
195
196 define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) {
197 ; CHECK-LABEL: test_x86_avx2_pbroadcastq_256:
198 ; CHECK: ## BB#0:
199 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
200 ; CHECK-NEXT: retl
201 %res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0)
202 ret <4 x i64> %res
203 }
204 declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly
640640 declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
641641
642642
643 define <4 x double> @test_x86_avx2_vbroadcast_sd_pd_256(<2 x double> %a0) {
644 ; CHECK: vbroadcastsd
645 %res = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0) ; <<4 x double>> [#uses=1]
646 ret <4 x double> %res
647 }
648 declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind readonly
649
650
651 define <4 x float> @test_x86_avx2_vbroadcast_ss_ps(<4 x float> %a0) {
652 ; CHECK: vbroadcastss
653 %res = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
654 ret <4 x float> %res
655 }
656 declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readonly
657
658
659 define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) {
660 ; CHECK: vbroadcastss
661 %res = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0) ; <<8 x float>> [#uses=1]
662 ret <8 x float> %res
663 }
664 declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind readonly
665
666
667643 define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
668644 ; CHECK: vpblendd
669645 %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1]
678654 ret <8 x i32> %res
679655 }
680656 declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
681
682
683 define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) {
684 ; CHECK: vpbroadcastb
685 %res = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0) ; <<16 x i8>> [#uses=1]
686 ret <16 x i8> %res
687 }
688 declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly
689
690
691 define <32 x i8> @test_x86_avx2_pbroadcastb_256(<16 x i8> %a0) {
692 ; CHECK: vpbroadcastb
693 %res = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0) ; <<32 x i8>> [#uses=1]
694 ret <32 x i8> %res
695 }
696 declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly
697
698
699 define <8 x i16> @test_x86_avx2_pbroadcastw_128(<8 x i16> %a0) {
700 ; CHECK: vpbroadcastw
701 %res = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
702 ret <8 x i16> %res
703 }
704 declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly
705
706
707 define <16 x i16> @test_x86_avx2_pbroadcastw_256(<8 x i16> %a0) {
708 ; CHECK: vpbroadcastw
709 %res = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0) ; <<16 x i16>> [#uses=1]
710 ret <16 x i16> %res
711 }
712 declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly
713
714
715 define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) {
716 ; CHECK: vbroadcastss
717 %res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1]
718 ret <4 x i32> %res
719 }
720 declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly
721
722
723 define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) {
724 ; CHECK: vbroadcastss {{[^,]+}}, %ymm{{[0-9]+}}
725 %res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0) ; <<8 x i32>> [#uses=1]
726 ret <8 x i32> %res
727 }
728 declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly
729
730
731 define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> %a0) {
732 ; CHECK: vpbroadcastq
733 %res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0) ; <<2 x i64>> [#uses=1]
734 ret <2 x i64> %res
735 }
736 declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly
737
738
739 define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) {
740 ; CHECK: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}}
741 %res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0) ; <<4 x i64>> [#uses=1]
742 ret <4 x i64> %res
743 }
744 declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly
745657
746658
747659 define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) {
1111 ;CHECK-LABEL: stack_fold_broadcastsd_ymm
1212 ;CHECK: vbroadcastsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1313 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
14 %2 = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0)
14 %2 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
1515 ; fadd forces execution domain
1616 %3 = fadd <4 x double> %2,
1717 ret <4 x double> %3
2222 ;CHECK-LABEL: stack_fold_broadcastss
2323 ;CHECK: vbroadcastss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
2424 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
25 %2 = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0)
25 %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
2626 ; fadd forces execution domain
2727 %3 = fadd <4 x float> %2,
2828 ret <4 x float> %3
3333 ;CHECK-LABEL: stack_fold_broadcastss_ymm
3434 ;CHECK: vbroadcastss {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
3535 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
36 %2 = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0)
36 %2 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
3737 ; fadd forces execution domain
3838 %3 = fadd <8 x float> %2,
3939 ret <8 x float> %3