llvm / c2cf105 (llvm.org GIT mirror)
Author: Craig Topper

[AVX512] Don't mark EXTLOAD as legal with AVX512. Continue using custom lowering.

Summary:
This was impeding our ability to combine the extending shuffles with other
shuffles, as you can see from the test changes. There's one special case that
needed to be added to use VZEXT directly for v8i8->v8i64, since the custom
lowering requires v64i8.

Reviewers: RKSimon, zvi, delena

Reviewed By: delena

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D38714

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@315860 91177308-0d34-0410-b5e6-96231b3b80d8

6 changed files with 83 additions and 240 deletions.
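For context, here is a minimal IR sketch of the kind of pattern this change helps (illustrative only, not one of the tests touched by this commit; the function name is made up). A <2 x i32> load is legalized to an extending vector load, and once that EXTLOAD is no longer marked Legal, the extension is lowered as a shuffle that can combine with the adjacent splat shuffle, e.g. into a single vpbroadcastq on an AVX512VL target:

; Sketch: load of an illegal <2 x i32> type followed by a splat shuffle.
define <4 x i32> @splat_2xi32_mem(<2 x i32>* %p) {
  %v = load <2 x i32>, <2 x i32>* %p
  %s = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i32> %s
}

This mirrors the test_2xi32_to_4xi32_mem case below, which now selects vpbroadcastq instead of vpmovzxdq followed by vpshufd.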
lib/Target/X86/X86ISelLowering.cpp

@@ -1139 +1139 @@
     for (MVT VT : MVT::fp_vector_valuetypes())
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
 
-    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
       setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
@@ -1243 +1243 @@
       setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
       setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
       setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
-
-      // FIXME. This commands are available on SSE/AVX2, add relevant patterns.
-      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
-      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
-      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
-      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
-      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
-      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
-      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
-      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
-      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
-      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
     }
 
     setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
@@ -1514 +1502 @@
       setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
     }
 
-    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
-      if (Subtarget.hasVLX()) {
-        // FIXME. This commands are available on SSE/AVX2, add relevant patterns.
-        setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
-        setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
-      }
     }
   }
 
@@ -18438 +18421 @@
   if (Ext == ISD::SEXTLOAD && RegSz >= 256)
     loadRegZize = 128;
 
+  // If we don't have BWI we won't be able to create the shuffle needed for
+  // v8i8->v8i64.
+  if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
+      MemVT == MVT::v8i8)
+    loadRegZize = 128;
+
   // Represent our vector as a sequence of elements which are the
   // largest scalar that we can load.
   EVT LoadUnitVecVT = EVT::getVectorVT(
@@ -18502 +18491 @@
     SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
     return Shuff;
+  }
+
+  if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
+      MemVT == MVT::v8i8) {
+    SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+    return Sext;
   }
 
   // Redistribute the loaded elements into the different locations.
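A hedged IR sketch of the shape the new special case covers (illustrative; assumes an AVX512F target without AVX512BW). A direct zext of a load normally selects the still-legal ZEXTLOAD; the any-extending EXTLOAD form handled above arises when the DAG combiner can prove the extended bits are don't-care:

; Sketch only: extending v8i8 to v8i64. Without BWI the generic custom
; lowering would need a v64i8 shuffle, so the code above emits
; X86ISD::VZEXT (i.e. vpmovzxbq) directly.
define <8 x i64> @ext_v8i8_v8i64(<8 x i8>* %p) {
  %v = load <8 x i8>, <8 x i8>* %p
  %e = zext <8 x i8> %v to <8 x i64>
  ret <8 x i64> %e
}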
lib/Target/X86/X86InstrAVX512.td

@@ -8005 +8005 @@
 defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s">;
 defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s">;
 
+// EXTLOAD patterns, implemented using vpmovz
+multiclass avx512_ext_lowering<string InstrStr, X86VectorVTInfo To,
+                               X86VectorVTInfo From, PatFrag LdFrag> {
+  def : Pat<(To.VT (LdFrag addr:$src)),
+            (!cast<Instruction>("VPMOVZX"#InstrStr#"rm") addr:$src)>;
+  def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src), To.RC:$src0)),
+            (!cast<Instruction>("VPMOVZX"#InstrStr#"rmk") To.RC:$src0,
+             To.KRC:$mask, addr:$src)>;
+  def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src),
+                    To.ImmAllZerosV)),
+            (!cast<Instruction>("VPMOVZX"#InstrStr#"rmkz") To.KRC:$mask,
+             addr:$src)>;
+}
+
+let Predicates = [HasVLX, HasBWI] in {
+  defm : avx512_ext_lowering<"BWZ128", v8i16x_info, v16i8x_info, extloadvi8>;
+  defm : avx512_ext_lowering<"BWZ256", v16i16x_info, v16i8x_info, extloadvi8>;
+}
+let Predicates = [HasBWI] in {
+  defm : avx512_ext_lowering<"BWZ", v32i16_info, v32i8x_info, extloadvi8>;
+}
+let Predicates = [HasVLX, HasAVX512] in {
+  defm : avx512_ext_lowering<"BDZ128", v4i32x_info, v16i8x_info, extloadvi8>;
+  defm : avx512_ext_lowering<"BDZ256", v8i32x_info, v16i8x_info, extloadvi8>;
+  defm : avx512_ext_lowering<"BQZ128", v2i64x_info, v16i8x_info, extloadvi8>;
+  defm : avx512_ext_lowering<"BQZ256", v4i64x_info, v16i8x_info, extloadvi8>;
+  defm : avx512_ext_lowering<"WDZ128", v4i32x_info, v8i16x_info, extloadvi16>;
+  defm : avx512_ext_lowering<"WDZ256", v8i32x_info, v8i16x_info, extloadvi16>;
+  defm : avx512_ext_lowering<"WQZ128", v2i64x_info, v8i16x_info, extloadvi16>;
+  defm : avx512_ext_lowering<"WQZ256", v4i64x_info, v8i16x_info, extloadvi16>;
+  defm : avx512_ext_lowering<"DQZ128", v2i64x_info, v4i32x_info, extloadvi32>;
+  defm : avx512_ext_lowering<"DQZ256", v4i64x_info, v4i32x_info, extloadvi32>;
+}
+let Predicates = [HasAVX512] in {
+  defm : avx512_ext_lowering<"BDZ", v16i32_info, v16i8x_info, extloadvi8>;
+  defm : avx512_ext_lowering<"BQZ", v8i64_info, v16i8x_info, extloadvi8>;
+  defm : avx512_ext_lowering<"WDZ", v16i32_info, v16i16x_info, extloadvi16>;
+  defm : avx512_ext_lowering<"WQZ", v8i64_info, v8i16x_info, extloadvi16>;
+  defm : avx512_ext_lowering<"DQZ", v8i64_info, v8i32x_info, extloadvi32>;
+}
 
 multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
                                  SDNode InVecOp, PatFrag ExtLoad16> {
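As a usage illustration of the patterns above (hedged; the function is made up, and the instruction mapping follows the BWZ128 defm line), an extending i8-to-i16 vector load on a HasVLX+HasBWI target can match extloadvi8 and select the memory form of vpmovzxbw:

; Sketch only: with +avx512bw,+avx512vl an any-extending load of this shape
; can select VPMOVZXBWZ128rm, i.e. vpmovzxbw (%rdi), %xmm0.
define <8 x i16> @ext_v8i8_v8i16(<8 x i8>* %p) {
  %v = load <8 x i8>, <8 x i8>* %p
  %e = zext <8 x i8> %v to <8 x i16>
  ret <8 x i16> %e
}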
test/CodeGen/X86/avx2-vbroadcast.ll

@@ -234 +234 @@
 }
 
 define <8 x i16> @broadcast_mem_v4i16_v8i16(<4 x i16>* %ptr) {
-; X32-AVX2-LABEL: broadcast_mem_v4i16_v8i16:
-; X32-AVX2: ## BB#0:
-; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; X32-AVX2-NEXT: retl
-;
-; X64-AVX2-LABEL: broadcast_mem_v4i16_v8i16:
-; X64-AVX2: ## BB#0:
-; X64-AVX2-NEXT: vpbroadcastq (%rdi), %xmm0
-; X64-AVX2-NEXT: retq
-;
-; X32-AVX512VL-LABEL: broadcast_mem_v4i16_v8i16:
-; X32-AVX512VL: ## BB#0:
-; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X32-AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13]
-; X32-AVX512VL-NEXT: retl
-;
-; X64-AVX512VL-LABEL: broadcast_mem_v4i16_v8i16:
-; X64-AVX512VL: ## BB#0:
-; X64-AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13]
-; X64-AVX512VL-NEXT: retq
+; X32-LABEL: broadcast_mem_v4i16_v8i16:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: broadcast_mem_v4i16_v8i16:
+; X64: ## BB#0:
+; X64-NEXT: vpbroadcastq (%rdi), %xmm0
+; X64-NEXT: retq
   %load = load <4 x i16>, <4 x i16>* %ptr
   %shuf = shufflevector <4 x i16> %load, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   ret <8 x i16> %shuf
 }
 
 define <16 x i16> @broadcast_mem_v4i16_v16i16(<4 x i16>* %ptr) {
-; X32-AVX2-LABEL: broadcast_mem_v4i16_v16i16:
-; X32-AVX2: ## BB#0:
-; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
-; X32-AVX2-NEXT: retl
-;
-; X64-AVX2-LABEL: broadcast_mem_v4i16_v16i16:
-; X64-AVX2: ## BB#0:
-; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
-; X64-AVX2-NEXT: retq
-;
-; X32-AVX512VL-LABEL: broadcast_mem_v4i16_v16i16:
-; X32-AVX512VL: ## BB#0:
-; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X32-AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X32-AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X32-AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-AVX512VL-NEXT: vpbroadcastq %xmm0, %ymm0
-; X32-AVX512VL-NEXT: retl
-;
-; X64-AVX512VL-LABEL: broadcast_mem_v4i16_v16i16:
-; X64-AVX512VL: ## BB#0:
-; X64-AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X64-AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-AVX512VL-NEXT: vpbroadcastq %xmm0, %ymm0
-; X64-AVX512VL-NEXT: retq
+; X32-LABEL: broadcast_mem_v4i16_v16i16:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: broadcast_mem_v4i16_v16i16:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-NEXT: retq
   %load = load <4 x i16>, <4 x i16>* %ptr
   %shuf = shufflevector <4 x i16> %load, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   ret <16 x i16> %shuf
test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll

@@ -341 +341 @@
 define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) {
 ; CHECK-LABEL: test_2xi32_to_4xi32_mem:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -351 +350 @@
 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask0:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,2,0,2]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -366 +364 @@
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask0:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,2,0,2]
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -380 +377 @@
 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask1:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,2,0,2]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -395 +391 @@
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask1:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,2,0,2]
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -409 +404 @@
 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask2:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,2,0,2]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -424 +418 @@
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask2:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,2,0,2]
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -438 +431 @@
 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask3:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,2,0,2]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -453 +445 @@
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask3:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,2,0,2]
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
test/CodeGen/X86/vec_int_to_fp.ll

@@ -2632 +2632 @@
 ; SSE-NEXT: cvtdq2pd (%rdi), %xmm0
 ; SSE-NEXT: retq
 ;
-; VEX-LABEL: sitofp_load_2i32_to_2f64:
-; VEX: # BB#0:
-; VEX-NEXT: vcvtdq2pd (%rdi), %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: sitofp_load_2i32_to_2f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vcvtdq2pd (%rdi), %xmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: sitofp_load_2i32_to_2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: sitofp_load_2i32_to_2f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vcvtdq2pd (%rdi), %xmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: sitofp_load_2i32_to_2f64:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX512VLDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: sitofp_load_2i32_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0
+; AVX-NEXT: retq
   %ld = load <2 x i32>, <2 x i32> *%a
   %cvt = sitofp <2 x i32> %ld to <2 x double>
   ret <2 x double> %cvt
@@ -2981 +2957 @@
 ;
 ; AVX512VL-LABEL: uitofp_load_2i32_to_2f64:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0
+; AVX512VL-NEXT: vcvtudq2pd (%rdi), %xmm0
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512DQ-LABEL: uitofp_load_2i32_to_2f64:
@@ -2996 +2970 @@
 ;
 ; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64:
 ; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX512VLDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0
 ; AVX512VLDQ-NEXT: retq
   %ld = load <2 x i32>, <2 x i32> *%a
   %cvt = uitofp <2 x i32> %ld to <2 x double>
@@ -3014 +2986 @@
 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
-; VEX-LABEL: uitofp_load_2i16_to_2f64:
-; VEX: # BB#0:
-; VEX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; VEX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_load_2i16_to_2f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_load_2i16_to_2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VL-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_load_2i16_to_2f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512DQ-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_2i16_to_2f64:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; AVX512VLDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VLDQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: uitofp_load_2i16_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
   %ld = load <2 x i16>, <2 x i16> *%a
   %cvt = uitofp <2 x i16> %ld to <2 x double>
   ret <2 x double> %cvt
test/CodeGen/X86/vector-shuffle-128-v4.ll

@@ -2167 +2167 @@
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: retq
 ;
-; AVX2-LABEL: insert_mem_lo_v4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_lo_v4i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[2,3]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: insert_mem_lo_v4i32:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2OR512VL-NEXT: retq
   %a = load <2 x i32>, <2 x i32>* %ptr
   %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -2209 +2203 @@
 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT: retq
 ;
-; AVX1OR2-LABEL: insert_mem_hi_v4i32:
-; AVX1OR2: # BB#0:
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1OR2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_hi_v4i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_mem_hi_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
   %a = load <2 x i32>, <2 x i32>* %ptr
   %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>