llvm.org Git mirror — llvm, commit a0c74d2
X86 Tests: Update more isel tests with FastVariableShuffle feature

Summary: Added the FastVariableShuffle feature to cases that resemble processors on which this feature is enabled. For AVX2 there are processors both with and without this feature enabled. For AVX512, only KNL lacks this feature, so cases that have only +avx512f were left without FastVariableShuffle enabled.

Reviewers: RKSimon, craig.topper

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D41851

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@322090 91177308-0d34-0410-b5e6-96231b3b80d8

Zvi Rackover, 1 year, 10 months ago
29 changed files with 2676 additions and 1599 deletions.
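The pattern applied throughout this diff is the same in every file: each RUN line that targets a CPU class where the feature may or may not be present is duplicated with +fast-variable-shuffle added to -mattr, and the FileCheck prefixes are split into -SLOW and -FAST variants. A minimal sketch of such a test (hypothetical standalone file; the RUN lines and function are taken from the first hunk below):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST

; Without the feature (X64-SLOW), isel lowers the truncate through two
; immediate-controlled shuffles (vpermilps + vpermpd); with it (X64-FAST),
; it emits a single variable cross-lane shuffle (vpermps) whose index
; vector is a loaded constant ([0,2,4,6,4,6,6,7]), as the updated
; assertions in the first hunk show.
define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
  %B = trunc <4 x i64> %A to <4 x i32>
  ret <4 x i32> %B
}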
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
1 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST
35
46 define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
5 ; X32-LABEL: trunc4:
6 ; X32: # %bb.0:
7 ; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
8 ; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
9 ; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
10 ; X32-NEXT: vzeroupper
11 ; X32-NEXT: retl
12 ;
13 ; X64-LABEL: trunc4:
14 ; X64: # %bb.0:
15 ; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
16 ; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
17 ; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
18 ; X64-NEXT: vzeroupper
19 ; X64-NEXT: retq
7 ; X32-SLOW-LABEL: trunc4:
8 ; X32-SLOW: # %bb.0:
9 ; X32-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
10 ; X32-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
11 ; X32-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
12 ; X32-SLOW-NEXT: vzeroupper
13 ; X32-SLOW-NEXT: retl
14 ;
15 ; X32-FAST-LABEL: trunc4:
16 ; X32-FAST: # %bb.0:
17 ; X32-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
18 ; X32-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
19 ; X32-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
20 ; X32-FAST-NEXT: vzeroupper
21 ; X32-FAST-NEXT: retl
22 ;
23 ; X64-SLOW-LABEL: trunc4:
24 ; X64-SLOW: # %bb.0:
25 ; X64-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
26 ; X64-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
27 ; X64-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
28 ; X64-SLOW-NEXT: vzeroupper
29 ; X64-SLOW-NEXT: retq
30 ;
31 ; X64-FAST-LABEL: trunc4:
32 ; X64-FAST: # %bb.0:
33 ; X64-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
34 ; X64-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
35 ; X64-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
36 ; X64-FAST-NEXT: vzeroupper
37 ; X64-FAST-NEXT: retq
2038 %B = trunc <4 x i64> %A to <4 x i32>
2139 ret <4 x i32>%B
2240 }
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
1 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST
35
46 ; AVX2 Logical Shift Left
57
371373 }
372374
373375 define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
374 ; X32-LABEL: srl_trunc_and_v4i64:
375 ; X32: # %bb.0:
376 ; X32-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
377 ; X32-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
378 ; X32-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
379 ; X32-NEXT: vpand %xmm2, %xmm1, %xmm1
380 ; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
381 ; X32-NEXT: vzeroupper
382 ; X32-NEXT: retl
383 ;
384 ; X64-LABEL: srl_trunc_and_v4i64:
385 ; X64: # %bb.0:
386 ; X64-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
387 ; X64-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
388 ; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
389 ; X64-NEXT: vpand %xmm2, %xmm1, %xmm1
390 ; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
391 ; X64-NEXT: vzeroupper
392 ; X64-NEXT: retq
376 ; X32-SLOW-LABEL: srl_trunc_and_v4i64:
377 ; X32-SLOW: # %bb.0:
378 ; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
379 ; X32-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
380 ; X32-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
381 ; X32-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1
382 ; X32-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
383 ; X32-SLOW-NEXT: vzeroupper
384 ; X32-SLOW-NEXT: retl
385 ;
386 ; X32-FAST-LABEL: srl_trunc_and_v4i64:
387 ; X32-FAST: # %bb.0:
388 ; X32-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
389 ; X32-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
390 ; X32-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
391 ; X32-FAST-NEXT: vpand %xmm2, %xmm1, %xmm1
392 ; X32-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
393 ; X32-FAST-NEXT: vzeroupper
394 ; X32-FAST-NEXT: retl
395 ;
396 ; X64-SLOW-LABEL: srl_trunc_and_v4i64:
397 ; X64-SLOW: # %bb.0:
398 ; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
399 ; X64-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
400 ; X64-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
401 ; X64-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1
402 ; X64-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
403 ; X64-SLOW-NEXT: vzeroupper
404 ; X64-SLOW-NEXT: retq
405 ;
406 ; X64-FAST-LABEL: srl_trunc_and_v4i64:
407 ; X64-FAST: # %bb.0:
408 ; X64-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
409 ; X64-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
410 ; X64-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
411 ; X64-FAST-NEXT: vpand %xmm2, %xmm1, %xmm1
412 ; X64-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
413 ; X64-FAST-NEXT: vzeroupper
414 ; X64-FAST-NEXT: retq
393415 %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
394416 %trunc = trunc <4 x i64> %and to <4 x i32>
395417 %sra = lshr <4 x i32> %x, %trunc
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -O2 | FileCheck %s --check-prefix=AVX512
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl -O2 | FileCheck %s --check-prefix=AVX512NOTDQ
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512NOTDQ
33
44 define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
55 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
330330 ; AVX512-NEXT: kmovd (%rdi), %k0
331331 ; AVX512-NEXT: kshiftrd $24, %k0, %k0
332332 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
333 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
334 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
333 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
334 ; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
335335 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
336336 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
337337 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
344344 ; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
345345 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
346346 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
347 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
348 ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
347 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
348 ; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2
349349 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
350350 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
351351 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
540540 ; AVX512-NEXT: kmovq (%rdi), %k0
541541 ; AVX512-NEXT: kshiftrq $56, %k0, %k0
542542 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
543 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
544 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
543 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
544 ; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
545545 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
546546 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
547547 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
554554 ; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
555555 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
556556 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
557 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
558 ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
557 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
558 ; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2
559559 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
560560 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
561561 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
11331133 ; AVX512-NEXT: kmovd (%rdi), %k0
11341134 ; AVX512-NEXT: kshiftrd $24, %k0, %k0
11351135 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
1136 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
1137 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
1136 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1137 ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
11381138 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
11391139 ; AVX512-NEXT: kmovb %k0, (%rsi)
11401140 ; AVX512-NEXT: vzeroupper
11461146 ; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
11471147 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
11481148 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1149 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
1150 ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
1149 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1150 ; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
11511151 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
11521152 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
11531153 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
13681368 ; AVX512-NEXT: kmovq (%rdi), %k0
13691369 ; AVX512-NEXT: kshiftrq $56, %k0, %k0
13701370 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
1371 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
1372 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
1371 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1372 ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
13731373 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
13741374 ; AVX512-NEXT: kmovb %k0, (%rsi)
13751375 ; AVX512-NEXT: vzeroupper
13811381 ; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
13821382 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
13831383 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1384 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
1385 ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
1384 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1385 ; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
13861386 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
13871387 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
13881388 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq %s -o - | FileCheck %s
1 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq,+fast-variable-shuffle %s -o - | FileCheck %s
22
33 ; FIXME: fixing PR34394 should fix the i32x2 memory cases resulting in a simple vbroadcasti32x2 instruction.
44
458458 define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
459459 ; CHECK-LABEL: test_2xi32_to_8xi32_mem:
460460 ; CHECK: # %bb.0:
461 ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
462 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
461 ; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
462 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
463463 ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
464464 ; CHECK-NEXT: retq
465465 %vec = load <2 x i32>, <2 x i32>* %vp
469469 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
470470 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask0:
471471 ; CHECK: # %bb.0:
472 ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
473 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
472 ; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
473 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
474474 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
475475 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
476476 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
485485 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %mask) {
486486 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask0:
487487 ; CHECK: # %bb.0:
488 ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
489 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
488 ; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
489 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
490490 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
491491 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
492492 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
500500 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
501501 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask1:
502502 ; CHECK: # %bb.0:
503 ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
504 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
503 ; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
504 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
505505 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
506506 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
507507 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
516516 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %mask) {
517517 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask1:
518518 ; CHECK: # %bb.0:
519 ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
520 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
519 ; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
520 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
521521 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
522522 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
523523 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
531531 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
532532 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask2:
533533 ; CHECK: # %bb.0:
534 ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
535 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
534 ; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
535 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
536536 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
537537 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
538538 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
547547 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %mask) {
548548 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask2:
549549 ; CHECK: # %bb.0:
550 ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
551 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
550 ; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
551 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
552552 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
553553 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
554554 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
562562 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
563563 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask3:
564564 ; CHECK: # %bb.0:
565 ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
566 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
565 ; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
566 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
567567 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
568568 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
569569 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
578578 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %mask) {
579579 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask3:
580580 ; CHECK: # %bb.0:
581 ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
582 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
581 ; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
582 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
583583 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
584584 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
585585 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s
1 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle %s -o - | FileCheck %s
22
33 ; FIXME: All cases here should be fixed by PR34380
44
55 define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
66 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
77 ; CHECK: # %bb.0:
8 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,6,4]
9 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
109 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
11 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
12 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
10 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
1311 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4],xmm0[5,6,7]
1412 ; CHECK-NEXT: vzeroupper
1513 ; CHECK-NEXT: retq
1917 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
2018 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
2119 ; CHECK: # %bb.0:
22 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,6,6,4]
23 ; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
20 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
2421 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
25 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
26 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
22 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
2723 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7]
2824 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
2925 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
3935 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
4036 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
4137 ; CHECK: # %bb.0:
42 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,7,6,6,4]
43 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
38 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
4439 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
45 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
46 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
40 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
4741 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7]
4842 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4943 ; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
5852 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
5953 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
6054 ; CHECK: # %bb.0:
61 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
62 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
63 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
64 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
65 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
55 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,12,13,14,15]
56 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
57 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
58 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
6659 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
6760 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
6861 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
7770 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
7871 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
7972 ; CHECK: # %bb.0:
80 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
81 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
82 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
83 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
84 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
73 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,12,13,14,15]
74 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
75 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
76 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
8577 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
8678 ; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
8779 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
9587 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
9688 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
9789 ; CHECK: # %bb.0:
98 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
99 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
100 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
101 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
102 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5,6],xmm3[7]
90 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,10,11,4,5,6,7,14,15,2,3,12,13,14,15]
91 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
92 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
93 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3],xmm3[4,5,6],xmm0[7]
10394 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
10495 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
10596 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
114105 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
115106 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
116107 ; CHECK: # %bb.0:
117 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
118 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
119 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
120 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
121 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4,5,6],xmm2[7]
108 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,10,11,4,5,6,7,14,15,2,3,12,13,14,15]
109 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
110 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
111 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4,5,6],xmm0[7]
122112 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
123113 ; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
124114 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
180170 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
181171 ; CHECK: # %bb.0:
182172 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
183 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
184 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
185 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,0]
186 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
187 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6],xmm0[7]
173 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
174 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
175 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
176 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
188177 ; CHECK-NEXT: vzeroupper
189178 ; CHECK-NEXT: retq
190179 %vec = load <16 x i16>, <16 x i16>* %vp
195184 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
196185 ; CHECK: # %bb.0:
197186 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
198 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
199 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
200 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,0]
201 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
202 ; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6],xmm2[7]
187 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
188 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
189 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
190 ; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
203191 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
204192 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
205193 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
216204 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
217205 ; CHECK: # %bb.0:
218206 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
219 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
220 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
221 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,0]
222 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
223 ; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6],xmm1[7]
207 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
208 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
209 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
210 ; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
224211 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
225212 ; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
226213 ; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
21912178 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
21922179 ; CHECK: # %bb.0:
21932180 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2194 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
2195 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
2196 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
2197 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1]
2198 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2181 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,2,5]
2182 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
2183 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
2184 ; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
2185 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
21992186 ; CHECK-NEXT: retq
22002187 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
22012188 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
22062193 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
22072194 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
22082195 ; CHECK: # %bb.0:
2209 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2210 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
2211 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2212 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
2213 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1]
2196 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2197 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,2,5]
2198 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
2199 ; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
2200 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
2201 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
22142202 ; CHECK-NEXT: retq
22152203 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
22162204 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
22212209 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
22222210 ; CHECK: # %bb.0:
22232211 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2224 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
2225 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
2226 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
2227 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
2228 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2212 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,2,7]
2213 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
2214 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
2215 ; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
2216 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
22292217 ; CHECK-NEXT: retq
22302218 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
22312219 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
22362224 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
22372225 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
22382226 ; CHECK: # %bb.0:
2239 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2240 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
2241 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2242 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
2243 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
2227 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2228 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,2,7]
2229 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
2230 ; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
2231 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
2232 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
22442233 ; CHECK-NEXT: retq
22452234 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
22462235 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
22502239 define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
22512240 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask3:
22522241 ; CHECK: # %bb.0:
2253 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
2254 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
2255 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3]
2242 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2243 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [2,4,4,3]
2244 ; CHECK-NEXT: vpermi2q %ymm0, %ymm2, %ymm1
2245 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
22562246 ; CHECK-NEXT: retq
22572247 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
22582248 ret <4 x i64> %res
22612251 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
22622252 ; CHECK: # %bb.0:
22632253 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2264 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
2265 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
2266 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
2267 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3]
2268 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2254 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,4,4,3]
2255 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
2256 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
2257 ; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
2258 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
22692259 ; CHECK-NEXT: retq
22702260 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
22712261 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
22762266 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
22772267 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
22782268 ; CHECK: # %bb.0:
2279 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2280 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
2281 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2282 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
2283 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3]
2269 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2270 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,4,4,3]
2271 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
2272 ; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
2273 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
2274 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
22842275 ; CHECK-NEXT: retq
22852276 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
22862277 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
22912282 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
22922283 ; CHECK: # %bb.0:
22932284 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2294 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
2295 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
2296 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
2297 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,3,1]
2298 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2285 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,3,1]
2286 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
2287 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
2288 ; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
2289 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
22992290 ; CHECK-NEXT: retq
23002291 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
23012292 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
23062297 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
23072298 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
23082299 ; CHECK: # %bb.0:
2309 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2310 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
2311 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2312 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
2313 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,3,1]
2300 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2301 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,3,3,1]
2302 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
2303 ; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
2304 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
2305 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
23142306 ; CHECK-NEXT: retq
23152307 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
23162308 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
23512343 define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
23522344 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6:
23532345 ; CHECK: # %bb.0:
2354 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
2355 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3]
2356 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2346 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2347 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,7]
2348 ; CHECK-NEXT: vpermi2q %ymm0, %ymm2, %ymm1
2349 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
23572350 ; CHECK-NEXT: retq
23582351 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
23592352 ret <4 x i64> %res
23622355 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
23632356 ; CHECK: # %bb.0:
23642357 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2365 ; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3]
2366 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
2367 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
2368 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
2369 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2358 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,2,1,7]
2359 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
2360 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
2361 ; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
2362 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
23702363 ; CHECK-NEXT: retq
23712364 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
23722365 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
23772370 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
23782371 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
23792372 ; CHECK: # %bb.0:
2380 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2381 ; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3]
2382 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2383 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2384 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
2385 ; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
2373 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2374 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,7]
2375 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
2376 ; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
2377 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
2378 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
23862379 ; CHECK-NEXT: retq
23872380 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
23882381 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
25362529 ; CHECK: # %bb.0:
25372530 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
25382531 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
2539 ; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
2540 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
2541 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
2542 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0]
2532 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,4]
2533 ; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4
2534 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2535 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
2536 ; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
25432537 ; CHECK-NEXT: retq
25442538 %vec = load <8 x i64>, <8 x i64>* %vp
25452539 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
25512545 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) {
25522546 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
25532547 ; CHECK: # %bb.0:
2554 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
2555 ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
2556 ; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
2557 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2558 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
2559 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0]
2548 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2549 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
2550 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4]
2551 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
2552 ; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
2553 ; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
2554 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
25602555 ; CHECK-NEXT: retq
25612556 %vec = load <8 x i64>, <8 x i64>* %vp
25622557 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
25702565 ; CHECK: # %bb.0:
25712566 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
25722567 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
2573 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
2574 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
2575 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
2576 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,0]
2568 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,5,5,1]
2569 ; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4
2570 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2571 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
2572 ; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
25772573 ; CHECK-NEXT: retq
25782574 %vec = load <8 x i64>, <8 x i64>* %vp
25792575 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
25852581 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) {
25862582 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
25872583 ; CHECK: # %bb.0:
2588 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
2589 ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
2590 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
2591 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2592 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
2593 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,0]
2584 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2585 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
2586 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1]
2587 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
2588 ; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
2589 ; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
2590 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
25942591 ; CHECK-NEXT: retq
25952592 %vec = load <8 x i64>, <8 x i64>* %vp
25962593 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
26022599 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) {
26032600 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
26042601 ; CHECK: # %bb.0:
2605 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
2606 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2607 ; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
2608 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,1,3]
2602 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
2603 ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
2604 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,0,2]
2605 ; CHECK-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
26092606 ; CHECK-NEXT: retq
26102607 %vec = load <8 x i64>, <8 x i64>* %vp
26112608 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
26162613 ; CHECK: # %bb.0:
26172614 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
26182615 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
2619 ; CHECK-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
2620 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
2621 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
2622 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3]
2616 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,0,0,2]
2617 ; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
2618 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2619 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
2620 ; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
26232621 ; CHECK-NEXT: retq
26242622 %vec = load <8 x i64>, <8 x i64>* %vp
26252623 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
26312629 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) {
26322630 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
26332631 ; CHECK: # %bb.0:
2634 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
2635 ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
2636 ; CHECK-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
2637 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2638 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
2639 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3]
2632 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2633 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
2634 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2]
2635 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
2636 ; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
2637 ; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
2638 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
26402639 ; CHECK-NEXT: retq
26412640 %vec = load <8 x i64>, <8 x i64>* %vp
26422641 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
26862685 ; CHECK: # %bb.0:
26872686 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
26882687 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
2689 ; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2690 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
2691 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
2692 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1]
2688 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,7,1]
2689 ; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
2690 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2691 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
2692 ; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
26932693 ; CHECK-NEXT: retq
26942694 %vec = load <8 x i64>, <8 x i64>* %vp
26952695 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
27012701 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) {
27022702 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
27032703 ; CHECK: # %bb.0:
2704 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
2705 ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
2706 ; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
2707 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2708 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
2709 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1]
2704 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2705 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
2706 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1]
2707 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
2708 ; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
2709 ; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
2710 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
27102711 ; CHECK-NEXT: retq
27112712 %vec = load <8 x i64>, <8 x i64>* %vp
27122713 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
27682769 ; CHECK: # %bb.0:
27692770 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
27702771 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
2771 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
2772 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
2773 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
2774 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,2,0,1]
2772 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,1,5]
2773 ; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4
2774 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2775 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
2776 ; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
27752777 ; CHECK-NEXT: retq
27762778 %vec = load <8 x i64>, <8 x i64>* %vp
27772779 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
27832785 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) {
27842786 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
27852787 ; CHECK: # %bb.0:
2786 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
2787 ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
2788 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
2789 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
2790 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
2791 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,2,0,1]
2788 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2789 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
2790 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5]
2791 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
2792 ; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
2793 ; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
2794 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
27922795 ; CHECK-NEXT: retq
27932796 %vec = load <8 x i64>, <8 x i64>* %vp
27942797 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
34643467 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
34653468 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3:
34663469 ; CHECK: # %bb.0:
3467 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
3468 ; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,1,3,3]
3469 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3470 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
3471 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
3470 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
3471 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm1
3472 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3473 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
3474 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
34723475 ; CHECK-NEXT: vzeroupper
34733476 ; CHECK-NEXT: retq
34743477 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
34773480 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
34783481 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
34793482 ; CHECK: # %bb.0:
3480 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
3481 ; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
3482 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3483 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3484 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
3483 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7]
3484 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm3
3485 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
3486 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
3487 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
34853488 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
34863489 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
34873490 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
34963499 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
34973500 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
34983501 ; CHECK: # %bb.0:
3499 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
3500 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
3501 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3502 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3503 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
3502 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
3503 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm2
3504 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
3505 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
3506 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
35043507 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
35053508 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
35063509 ; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
40214024 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
40224025 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
40234026 ; CHECK: # %bb.0:
4024 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
4025 ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
4026 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
4027 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4028 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3,7,3,7]
4029 ; CHECK-NEXT: vpermi2pd %ymm0, %ymm2, %ymm1
4030 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
40274031 ; CHECK-NEXT: retq
40284032 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
40294033 ret <4 x double> %res
40314035 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
40324036 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
40334037 ; CHECK: # %bb.0:
4034 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
4035 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
4036 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
4037 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4038 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
4039 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4038 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4039 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,7,3,7]
4040 ; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm4
4041 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
4042 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
4043 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
40404044 ; CHECK-NEXT: retq
40414045 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
40424046 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
40474051 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
40484052 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
40494053 ; CHECK: # %bb.0:
4050 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
4051 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
4052 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4053 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4054 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
4054 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4055 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [3,7,3,7]
4056 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
4057 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
4058 ; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm2 {%k1} {z}
4059 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
40554060 ; CHECK-NEXT: retq
40564061 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
40574062 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
41904195 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
41914196 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
41924197 ; CHECK: # %bb.0:
4193 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
4194 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
4195 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
4196 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4197 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,2]
4198 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4198 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4199 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [2,6,2,2]
4200 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
4201 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
4202 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
4203 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
41994204 ; CHECK-NEXT: retq
42004205 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
42014206 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
42064211 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
42074212 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
42084213 ; CHECK: # %bb.0:
4209 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
4210 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
4211 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4212 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4213 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,2]
4214 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4215 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,6,2,2]
4216 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
4217 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
4218 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
4219 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
42144220 ; CHECK-NEXT: retq
42154221 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
42164222 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
42204226 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
42214227 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
42224228 ; CHECK: # %bb.0:
4223 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
4224 ; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[2]
4225 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
4229 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4230 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,4,3,4]
4231 ; CHECK-NEXT: vpermi2pd %ymm0, %ymm2, %ymm1
4232 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
42264233 ; CHECK-NEXT: retq
42274234 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
42284235 ret <4 x double> %res
42314238 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
42324239 ; CHECK: # %bb.0:
42334240 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4234 ; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm0[0],ymm3[3],ymm0[2]
4235 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
4236 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4237 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,1]
4238 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4241 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,4,3,4]
4242 ; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm4
4243 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
4244 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
4245 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
42394246 ; CHECK-NEXT: retq
42404247 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
42414248 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
42464253 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
42474254 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
42484255 ; CHECK: # %bb.0:
4249 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4250 ; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm0[0],ymm2[3],ymm0[2]
4251 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4252 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4253 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,1]
4256 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4257 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,4,3,4]
4258 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
4259 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
4260 ; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm2 {%k1} {z}
4261 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
42544262 ; CHECK-NEXT: retq
42554263 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
42564264 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
42614269 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
42624270 ; CHECK: # %bb.0:
42634271 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4264 ; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3]
4265 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
4266 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4267 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,1,0,2]
4268 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4272 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,5,0,6]
4273 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
4274 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
4275 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
4276 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
42694277 ; CHECK-NEXT: retq
42704278 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
42714279 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
42764284 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
42774285 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
42784286 ; CHECK: # %bb.0:
4279 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4280 ; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3]
4281 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4282 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4283 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,1,0,2]
4287 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4288 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [3,5,0,6]
4289 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
4290 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
4291 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
4292 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
42844293 ; CHECK-NEXT: retq
42854294 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
42864295 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
42904299 define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
42914300 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
42924301 ; CHECK: # %bb.0:
4293 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
4294 ; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
4295 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
4296 ; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
4302 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4303 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,6,2,6]
4304 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
4305 ; CHECK-NEXT: vmovapd %xmm1, %xmm0
42974306 ; CHECK-NEXT: vzeroupper
42984307 ; CHECK-NEXT: retq
42994308 %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
43024311 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
43034312 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
43044313 ; CHECK: # %bb.0:
4305 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
4306 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
4307 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4308 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
4309 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
4310 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
4314 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4315 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,6,2,6]
4316 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
4317 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
4318 ; CHECK-NEXT: vcmpeqpd %xmm0, %xmm2, %k1
4319 ; CHECK-NEXT: vblendmpd %xmm4, %xmm1, %xmm0 {%k1}
43114320 ; CHECK-NEXT: vzeroupper
43124321 ; CHECK-NEXT: retq
43134322 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
43194328 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
43204329 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
43214330 ; CHECK: # %bb.0:
4322 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
4323 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
4324 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4325 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4326 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
4327 ; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
4331 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4332 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,6,2,6]
4333 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm3
4334 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
4335 ; CHECK-NEXT: vcmpeqpd %xmm0, %xmm1, %k1
4336 ; CHECK-NEXT: vmovapd %xmm3, %xmm0 {%k1} {z}
43284337 ; CHECK-NEXT: vzeroupper
43294338 ; CHECK-NEXT: retq
43304339 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
44184427 ; CHECK: # %bb.0:
44194428 ; CHECK-NEXT: vmovapd (%rdi), %zmm2
44204429 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
4421 ; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
4422 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
4423 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4424 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[3,0,2,0]
4430 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,4,2,4]
4431 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
4432 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4433 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4434 ; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
44254435 ; CHECK-NEXT: retq
44264436 %vec = load <8 x double>, <8 x double>* %vp
44274437 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
44334443 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {
44344444 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
44354445 ; CHECK: # %bb.0:
4436 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
4437 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
4438 ; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
4439 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4440 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4441 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,2,0]
4446 ; CHECK-NEXT: vmovapd (%rdi), %zmm2
4447 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
4448 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,4]
4449 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
4450 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
4451 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
4452 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
44424453 ; CHECK-NEXT: retq
44434454 %vec = load <8 x double>, <8 x double>* %vp
44444455 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
44524463 ; CHECK: # %bb.0:
44534464 ; CHECK-NEXT: vmovapd (%rdi), %zmm2
44544465 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
4455 ; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
4456 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
4457 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4458 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[1,2,3,0]
4466 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,2,3,4]
4467 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
4468 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4469 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4470 ; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
44594471 ; CHECK-NEXT: retq
44604472 %vec = load <8 x double>, <8 x double>* %vp
44614473 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
44674479 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) {
44684480 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
44694481 ; CHECK: # %bb.0:
4470 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
4471 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
4472 ; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
4473 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4474 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4475 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1,2,3,0]
4482 ; CHECK-NEXT: vmovapd (%rdi), %zmm2
4483 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
4484 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4]
4485 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
4486 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
4487 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
4488 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
44764489 ; CHECK-NEXT: retq
44774490 %vec = load <8 x double>, <8 x double>* %vp
44784491 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
44844497 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp) {
44854498 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
44864499 ; CHECK: # %bb.0:
4487 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
4488 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
4489 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,0]
4490 ; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
4500 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
4501 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
4502 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [4,2,1,0]
4503 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm0
44914504 ; CHECK-NEXT: retq
44924505 %vec = load <8 x double>, <8 x double>* %vp
44934506 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
44984511 ; CHECK: # %bb.0:
44994512 ; CHECK-NEXT: vmovapd (%rdi), %zmm2
45004513 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
4501 ; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,0]
4502 ; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
4503 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
4504 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4505 ; CHECK-NEXT: vmovapd %ymm2, %ymm0 {%k1}
4514 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [4,2,1,0]
4515 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
4516 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4517 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4518 ; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
45064519 ; CHECK-NEXT: retq
45074520 %vec = load <8 x double>, <8 x double>* %vp
45084521 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
45144527 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %mask) {
45154528 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
45164529 ; CHECK: # %bb.0:
4517 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
4518 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
4519 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,0]
4520 ; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
4521 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4522 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4523 ; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z}
4530 ; CHECK-NEXT: vmovapd (%rdi), %zmm2
4531 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
4532 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4,2,1,0]
4533 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
4534 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
4535 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
4536 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
45244537 ; CHECK-NEXT: retq
45254538 %vec = load <8 x double>, <8 x double>* %vp
45264539 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
45704583 ; CHECK: # %bb.0:
45714584 ; CHECK-NEXT: vmovapd (%rdi), %zmm2
45724585 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
4573 ; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3]
4574 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
4575 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4576 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,1]
4586 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [6,1,1,1]
4587 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm4
4588 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4589 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4590 ; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
45774591 ; CHECK-NEXT: retq
45784592 %vec = load <8 x double>, <8 x double>* %vp
45794593 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
45854599 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
45864600 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
45874601 ; CHECK: # %bb.0:
4588 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
4589 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
4590 ; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3]
4591 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4592 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4593 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,1]
4602 ; CHECK-NEXT: vmovapd (%rdi), %zmm2
4603 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
4604 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [6,1,1,1]
4605 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
4606 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
4607 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z}
4608 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
45944609 ; CHECK-NEXT: retq
45954610 %vec = load <8 x double>, <8 x double>* %vp
45964611 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
46044619 ; CHECK: # %bb.0:
46054620 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
46064621 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
4607 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4608 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
4609 ; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
4622 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4623 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [0,2,6,1]
4624 ; CHECK-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
46104625 ; CHECK-NEXT: retq
46114626 %vec = load <8 x double>, <8 x double>* %vp
46124627 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
46184633 ; CHECK-NEXT: vmovapd (%rdi), %zmm2
46194634 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3
46204635 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm2
4621 ; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,1]
4622 ; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3]
4623 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
4624 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4625 ; CHECK-NEXT: vmovapd %ymm2, %ymm0 {%k1}
4636 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,6,1]
4637 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
4638 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4639 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4640 ; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
46264641 ; CHECK-NEXT: retq
46274642 %vec = load <8 x double>, <8 x double>* %vp
46284643 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
46364651 ; CHECK: # %bb.0:
46374652 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
46384653 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
4639 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm1
4640 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1]
4641 ; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3]
4642 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4643 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4644 ; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z}
4654 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm3
4655 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,6,1]
4656 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
4657 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
4658 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z}
4659 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
46454660 ; CHECK-NEXT: retq
46464661 %vec = load <8 x double>, <8 x double>* %vp
46474662 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
46554670 ; CHECK: # %bb.0:
46564671 ; CHECK-NEXT: vmovapd (%rdi), %zmm2
46574672 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
4658 ; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3]
4659 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
4660 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4661 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[0,1,2,1]
4673 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,5,2,5]
4674 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
4675 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4676 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4677 ; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
46624678 ; CHECK-NEXT: retq
46634679 %vec = load <8 x double>, <8 x double>* %vp
46644680 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
46704686 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %mask) {
46714687 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
46724688 ; CHECK: # %bb.0:
4673 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
4674 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
4675 ; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3]
4676 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4677 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4678 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[0,1,2,1]
4689 ; CHECK-NEXT: vmovapd (%rdi), %zmm2
4690 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
4691 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,5,2,5]
4692 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
4693 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
4694 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
4695 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
46794696 ; CHECK-NEXT: retq
46804697 %vec = load <8 x double>, <8 x double>* %vp
46814698 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
46874704 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
46884705 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
46894706 ; CHECK: # %bb.0:
4690 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
4691 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
4692 ; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
4693 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
4707 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
4708 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
4709 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,3,6]
4710 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm0
46944711 ; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
46954712 ; CHECK-NEXT: vzeroupper
46964713 ; CHECK-NEXT: retq
47034720 ; CHECK: # %bb.0:
47044721 ; CHECK-NEXT: vmovapd (%rdi), %zmm2
47054722 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
4706 ; CHECK-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[3],ymm3[2]
4707 ; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
4708 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
4709 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4710 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
4723 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,6,3,6]
4724 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
4725 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4726 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
4727 ; CHECK-NEXT: vmovapd %xmm4, %xmm0 {%k1}
47114728 ; CHECK-NEXT: vzeroupper
47124729 ; CHECK-NEXT: retq
47134730 %vec = load <8 x double>, <8 x double>* %vp
47224739 ; CHECK: # %bb.0:
47234740 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
47244741 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
4725 ; CHECK-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm2[0],ymm1[3],ymm2[2]
4726 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
4727 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
4728 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
4729 ; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
4742 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,3,6]
4743 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm3
4744 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
4745 ; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
4746 ; CHECK-NEXT: vmovapd %xmm3, %xmm0 {%k1} {z}
47304747 ; CHECK-NEXT: vzeroupper
47314748 ; CHECK-NEXT: retq
47324749 %vec = load <8 x double>, <8 x double>* %vp
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=SKX
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,SKX
33
44 attributes #0 = { nounwind }
55
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
22 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
33 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-FAST
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512
67
78 ;
89 ; 128-bit vectors
378379 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
379380 ; AVX1-NEXT: retq
380381 ;
381 ; AVX2-LABEL: ext_i32_32i8:
382 ; AVX2: # %bb.0:
383 ; AVX2-NEXT: vmovd %edi, %xmm0
384 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
385 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
386 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
387 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
388 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
389 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
390 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
391 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
392 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
393 ; AVX2-NEXT: retq
382 ; AVX2-SLOW-LABEL: ext_i32_32i8:
383 ; AVX2-SLOW: # %bb.0:
384 ; AVX2-SLOW-NEXT: vmovd %edi, %xmm0
385 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
386 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
387 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
388 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
389 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
390 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
391 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
392 ; AVX2-SLOW-NEXT: vpand %ymm1, %ymm0, %ymm0
393 ; AVX2-SLOW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
394 ; AVX2-SLOW-NEXT: retq
395 ;
396 ; AVX2-FAST-LABEL: ext_i32_32i8:
397 ; AVX2-FAST: # %bb.0:
398 ; AVX2-FAST-NEXT: vmovd %edi, %xmm0
399 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
400 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
401 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
402 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
403 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
404 ; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0
405 ; AVX2-FAST-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
406 ; AVX2-FAST-NEXT: retq
394407 ;
395408 ; AVX512-LABEL: ext_i32_32i8:
396409 ; AVX512: # %bb.0:
682695 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
683696 ; AVX1-NEXT: retq
684697 ;
685 ; AVX2-LABEL: ext_i64_64i8:
686 ; AVX2: # %bb.0:
687 ; AVX2-NEXT: vmovq %rdi, %xmm0
688 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
689 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
690 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
691 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
692 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
693 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
694 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
695 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
696 ; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
697 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
698 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
699 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
700 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
701 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
702 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
703 ; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
704 ; AVX2-NEXT: retq
698 ; AVX2-SLOW-LABEL: ext_i64_64i8:
699 ; AVX2-SLOW: # %bb.0:
700 ; AVX2-SLOW-NEXT: vmovq %rdi, %xmm0
701 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
702 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
703 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
704 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
705 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
706 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
707 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
708 ; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
709 ; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
710 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
711 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
712 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
713 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
714 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
715 ; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1
716 ; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
717 ; AVX2-SLOW-NEXT: retq
718 ;
719 ; AVX2-FAST-LABEL: ext_i64_64i8:
720 ; AVX2-FAST: # %bb.0:
721 ; AVX2-FAST-NEXT: vmovq %rdi, %xmm0
722 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
723 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
724 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
725 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
726 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
727 ; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0
728 ; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
729 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11]
730 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15]
731 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
732 ; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1
733 ; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
734 ; AVX2-FAST-NEXT: retq
705735 ;
706736 ; AVX512-LABEL: ext_i64_64i8:
707737 ; AVX512: # %bb.0:
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
22 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
33 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-FAST
56 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
78
89 ;
910 ; 128-bit vectors
471472 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
472473 ; AVX1-NEXT: retq
473474 ;
474 ; AVX2-LABEL: ext_i32_32i8:
475 ; AVX2: # %bb.0:
476 ; AVX2-NEXT: vmovd %edi, %xmm0
477 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
478 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
479 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
480 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
481 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
482 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
483 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
484 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
485 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
486 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
487 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
488 ; AVX2-NEXT: retq
475 ; AVX2-SLOW-LABEL: ext_i32_32i8:
476 ; AVX2-SLOW: # %bb.0:
477 ; AVX2-SLOW-NEXT: vmovd %edi, %xmm0
478 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
479 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
480 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
481 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
482 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
483 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
484 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
485 ; AVX2-SLOW-NEXT: vpand %ymm1, %ymm0, %ymm0
486 ; AVX2-SLOW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
487 ; AVX2-SLOW-NEXT: vpsrlw $7, %ymm0, %ymm0
488 ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
489 ; AVX2-SLOW-NEXT: retq
490 ;
491 ; AVX2-FAST-LABEL: ext_i32_32i8:
492 ; AVX2-FAST: # %bb.0:
493 ; AVX2-FAST-NEXT: vmovd %edi, %xmm0
494 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
495 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
496 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
497 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
498 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
499 ; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0
500 ; AVX2-FAST-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
501 ; AVX2-FAST-NEXT: vpsrlw $7, %ymm0, %ymm0
502 ; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
503 ; AVX2-FAST-NEXT: retq
489504 ;
490505 ; AVX512F-LABEL: ext_i32_32i8:
491506 ; AVX512F: # %bb.0:
884899 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
885900 ; AVX1-NEXT: retq
886901 ;
887 ; AVX2-LABEL: ext_i64_64i8:
888 ; AVX2: # %bb.0:
889 ; AVX2-NEXT: vmovq %rdi, %xmm0
890 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
891 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
892 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
893 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
894 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
895 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
896 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
897 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
898 ; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
899 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
900 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
901 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
902 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5]
903 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
904 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
905 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
906 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
907 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
908 ; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
909 ; AVX2-NEXT: vpsrlw $7, %ymm1, %ymm1
910 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
911 ; AVX2-NEXT: retq
902 ; AVX2-SLOW-LABEL: ext_i64_64i8:
903 ; AVX2-SLOW: # %bb.0:
904 ; AVX2-SLOW-NEXT: vmovq %rdi, %xmm0
905 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
906 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
907 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
908 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
909 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
910 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
911 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
912 ; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
913 ; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
914 ; AVX2-SLOW-NEXT: vpsrlw $7, %ymm0, %ymm0
915 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
916 ; AVX2-SLOW-NEXT: vpand %ymm3, %ymm0, %ymm0
917 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5]
918 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
919 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
920 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
921 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
922 ; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1
923 ; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
924 ; AVX2-SLOW-NEXT: vpsrlw $7, %ymm1, %ymm1
925 ; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1
926 ; AVX2-SLOW-NEXT: retq
927 ;
928 ; AVX2-FAST-LABEL: ext_i64_64i8:
929 ; AVX2-FAST: # %bb.0:
930 ; AVX2-FAST-NEXT: vmovq %rdi, %xmm0
931 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
932 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
933 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
934 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
935 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
936 ; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0
937 ; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
938 ; AVX2-FAST-NEXT: vpsrlw $7, %ymm0, %ymm0
939 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
940 ; AVX2-FAST-NEXT: vpand %ymm3, %ymm0, %ymm0
941 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11]
942 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15]
943 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
944 ; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1
945 ; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
946 ; AVX2-FAST-NEXT: vpsrlw $7, %ymm1, %ymm1
947 ; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1
948 ; AVX2-FAST-NEXT: retq
912949 ;
913950 ; AVX512F-LABEL: ext_i64_64i8:
914951 ; AVX512F: # %bb.0:
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw| FileCheck %s --check-prefix=ALL --check-prefix=AVX512VLCDBW
3 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw| FileCheck %s --check-prefix=ALL --check-prefix=X86-AVX512VLCDBW
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CD
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512VLCDBW
3 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,X86-AVX512VLCDBW
44
55 define <2 x i64> @test_mm_epi64(<8 x i16> %a, <8 x i16> %b) {
66 ; AVX512CD-LABEL: test_mm_epi64:
2727 ; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax
2828 ; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax
2929 ; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0
30 ; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0
30 ; X86-AVX512VLCDBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
3131 ; X86-AVX512VLCDBW-NEXT: retl
3232 entry:
3333 %0 = icmp eq <8 x i16> %a, %b
121121 ; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax
122122 ; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax
123123 ; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0
124 ; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0
124 ; X86-AVX512VLCDBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
125125 ; X86-AVX512VLCDBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
126126 ; X86-AVX512VLCDBW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
127127 ; X86-AVX512VLCDBW-NEXT: retl
159159 ; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax
160160 ; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax
161161 ; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0
162 ; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0
162 ; X86-AVX512VLCDBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
163163 ; X86-AVX512VLCDBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
164164 ; X86-AVX512VLCDBW-NEXT: retl
165165 entry:
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX-FAST
34
45 ; fold (shl 0, x) -> 0
56 define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
112113 ; SSE-NEXT: pmulld %xmm1, %xmm0
113114 ; SSE-NEXT: retq
114115 ;
115 ; AVX-LABEL: combine_vec_shl_trunc_and:
116 ; AVX: # %bb.0:
117 ; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
118 ; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
119 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
120 ; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
121 ; AVX-NEXT: vzeroupper
122 ; AVX-NEXT: retq
116 ; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
117 ; AVX-SLOW: # %bb.0:
118 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
119 ; AVX-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
120 ; AVX-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
121 ; AVX-SLOW-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
122 ; AVX-SLOW-NEXT: vzeroupper
123 ; AVX-SLOW-NEXT: retq
124 ;
125 ; AVX-FAST-LABEL: combine_vec_shl_trunc_and:
126 ; AVX-FAST: # %bb.0:
127 ; AVX-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
128 ; AVX-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
129 ; AVX-FAST-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
130 ; AVX-FAST-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
131 ; AVX-FAST-NEXT: vzeroupper
132 ; AVX-FAST-NEXT: retq
123133 %1 = and <4 x i64> %y,
124134 %2 = trunc <4 x i64> %1 to <4 x i32>
125135 %3 = shl <4 x i32> %x, %2
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
34
45 ; fold (sra 0, x) -> 0
56 define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) {
179180 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
180181 ; SSE-NEXT: retq
181182 ;
182 ; AVX-LABEL: combine_vec_ashr_trunc_and:
183 ; AVX: # %bb.0:
184 ; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
185 ; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
186 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
187 ; AVX-NEXT: vpsravd %xmm1, %xmm0, %xmm0
188 ; AVX-NEXT: vzeroupper
189 ; AVX-NEXT: retq
183 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and:
184 ; AVX2-SLOW: # %bb.0:
185 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
186 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
187 ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
188 ; AVX2-SLOW-NEXT: vpsravd %xmm1, %xmm0, %xmm0
189 ; AVX2-SLOW-NEXT: vzeroupper
190 ; AVX2-SLOW-NEXT: retq
191 ;
192 ; AVX2-FAST-LABEL: combine_vec_ashr_trunc_and:
193 ; AVX2-FAST: # %bb.0:
194 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
195 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
196 ; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
197 ; AVX2-FAST-NEXT: vpsravd %xmm1, %xmm0, %xmm0
198 ; AVX2-FAST-NEXT: vzeroupper
199 ; AVX2-FAST-NEXT: retq
190200 %1 = and <4 x i64> %y,
191201 %2 = trunc <4 x i64> %1 to <4 x i32>
192202 %3 = ashr <4 x i32> %x, %2
212222 ; SSE-NEXT: movdqa %xmm1, %xmm0
213223 ; SSE-NEXT: retq
214224 ;
215 ; AVX-LABEL: combine_vec_ashr_trunc_lshr:
216 ; AVX: # %bb.0:
217 ; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
218 ; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
219 ; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
220 ; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
221 ; AVX-NEXT: vzeroupper
222 ; AVX-NEXT: retq
225 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
226 ; AVX2-SLOW: # %bb.0:
227 ; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
228 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
229 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
230 ; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
231 ; AVX2-SLOW-NEXT: vzeroupper
232 ; AVX2-SLOW-NEXT: retq
233 ;
234 ; AVX2-FAST-LABEL: combine_vec_ashr_trunc_lshr:
235 ; AVX2-FAST: # %bb.0:
236 ; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0
237 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
238 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
239 ; AVX2-FAST-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
240 ; AVX2-FAST-NEXT: vzeroupper
241 ; AVX2-FAST-NEXT: retq
223242 %1 = lshr <4 x i64> %x,
224243 %2 = trunc <4 x i64> %1 to <4 x i32>
225244 %3 = ashr <4 x i32> %2,
246265 ; SSE-NEXT: movdqa %xmm1, %xmm0
247266 ; SSE-NEXT: retq
248267 ;
249 ; AVX-LABEL: combine_vec_ashr_trunc_ashr:
250 ; AVX: # %bb.0:
251 ; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
252 ; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
253 ; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
254 ; AVX-NEXT: vzeroupper
255 ; AVX-NEXT: retq
268 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
269 ; AVX2-SLOW: # %bb.0:
270 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
271 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
272 ; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
273 ; AVX2-SLOW-NEXT: vzeroupper
274 ; AVX2-SLOW-NEXT: retq
275 ;
276 ; AVX2-FAST-LABEL: combine_vec_ashr_trunc_ashr:
277 ; AVX2-FAST: # %bb.0:
278 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,5,7,6,7]
279 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
280 ; AVX2-FAST-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
281 ; AVX2-FAST-NEXT: vzeroupper
282 ; AVX2-FAST-NEXT: retq
256283 %1 = ashr <4 x i64> %x,
257284 %2 = trunc <4 x i64> %1 to <4 x i32>
258285 %3 = ashr <4 x i32> %2,
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
34
45 ; fold (srl 0, x) -> 0
56 define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
214215 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
215216 ; SSE-NEXT: retq
216217 ;
217 ; AVX-LABEL: combine_vec_lshr_trunc_lshr1:
218 ; AVX: # %bb.0:
219 ; AVX-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
220 ; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
221 ; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
222 ; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
223 ; AVX-NEXT: vzeroupper
224 ; AVX-NEXT: retq
218 ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
219 ; AVX2-SLOW: # %bb.0:
220 ; AVX2-SLOW-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
221 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
222 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
223 ; AVX2-SLOW-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
224 ; AVX2-SLOW-NEXT: vzeroupper
225 ; AVX2-SLOW-NEXT: retq
226 ;
227 ; AVX2-FAST-LABEL: combine_vec_lshr_trunc_lshr1:
228 ; AVX2-FAST: # %bb.0:
229 ; AVX2-FAST-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
230 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
231 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
232 ; AVX2-FAST-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
233 ; AVX2-FAST-NEXT: vzeroupper
234 ; AVX2-FAST-NEXT: retq
225235 %1 = lshr <4 x i64> %x,
226236 %2 = trunc <4 x i64> %1 to <4 x i32>
227237 %3 = lshr <4 x i32> %2,
445455 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
446456 ; SSE-NEXT: retq
447457 ;
448 ; AVX-LABEL: combine_vec_lshr_trunc_and:
449 ; AVX: # %bb.0:
450 ; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
451 ; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
452 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
453 ; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
454 ; AVX-NEXT: vzeroupper
455 ; AVX-NEXT: retq
458 ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and:
459 ; AVX2-SLOW: # %bb.0:
460 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
461 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
462 ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
463 ; AVX2-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
464 ; AVX2-SLOW-NEXT: vzeroupper
465 ; AVX2-SLOW-NEXT: retq
466 ;
467 ; AVX2-FAST-LABEL: combine_vec_lshr_trunc_and:
468 ; AVX2-FAST: # %bb.0:
469 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
470 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
471 ; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
472 ; AVX2-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
473 ; AVX2-FAST-NEXT: vzeroupper
474 ; AVX2-FAST-NEXT: retq
456475 %1 = and <4 x i64> %y,
457476 %2 = trunc <4 x i64> %1 to <4 x i32>
458477 %3 = lshr <4 x i32> %x, %2
33 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
44 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
55 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
78
89 define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
910 ; SSE2-LABEL: insert_v2f64_z1:
428429 ; SSE41-NEXT: pinsrb $15, %eax, %xmm0
429430 ; SSE41-NEXT: retq
430431 ;
431 ; AVX-LABEL: insert_v16i8_z123456789ABCDEz:
432 ; AVX: # %bb.0:
433 ; AVX-NEXT: xorl %eax, %eax
434 ; AVX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
435 ; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
436 ; AVX-NEXT: retq
432 ; AVX1-LABEL: insert_v16i8_z123456789ABCDEz:
433 ; AVX1: # %bb.0:
434 ; AVX1-NEXT: xorl %eax, %eax
435 ; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
436 ; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
437 ; AVX1-NEXT: retq
438 ;
439 ; AVX2-SLOW-LABEL: insert_v16i8_z123456789ABCDEz:
440 ; AVX2-SLOW: # %bb.0:
441 ; AVX2-SLOW-NEXT: xorl %eax, %eax
442 ; AVX2-SLOW-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
443 ; AVX2-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
444 ; AVX2-SLOW-NEXT: retq
445 ;
446 ; AVX2-FAST-LABEL: insert_v16i8_z123456789ABCDEz:
447 ; AVX2-FAST: # %bb.0:
448 ; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
449 ; AVX2-FAST-NEXT: retq
437450 %1 = insertelement <16 x i8> %a, i8 0, i32 0
438451 %2 = insertelement <16 x i8> %1, i8 0, i32 15
439452 ret <16 x i8> %2
478491 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
479492 ; AVX1-NEXT: retq
480493 ;
481 ; AVX2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
482 ; AVX2: # %bb.0:
483 ; AVX2-NEXT: xorl %eax, %eax
484 ; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
485 ; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
486 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
487 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
488 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
489 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
490 ; AVX2-NEXT: retq
494 ; AVX2-SLOW-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
495 ; AVX2-SLOW: # %bb.0:
496 ; AVX2-SLOW-NEXT: xorl %eax, %eax
497 ; AVX2-SLOW-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
498 ; AVX2-SLOW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
499 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
500 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
501 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
502 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
503 ; AVX2-SLOW-NEXT: retq
504 ;
505 ; AVX2-FAST-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
506 ; AVX2-FAST: # %bb.0:
507 ; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
508 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
509 ; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
510 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
511 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
512 ; AVX2-FAST-NEXT: retq
491513 %1 = insertelement <32 x i8> %a, i8 0, i32 0
492514 %2 = insertelement <32 x i8> %1, i8 0, i32 15
493515 %3 = insertelement <32 x i8> %2, i8 0, i32 30
11 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
22 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
33 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
4 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
4 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
56 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefix=XOP
67
78 define void @v3i64(<2 x i64> %a, <2 x i64> %b, <3 x i64>* %p) nounwind {
153154 ; SSE42-NEXT: movq %xmm2, (%rdi)
154155 ; SSE42-NEXT: retq
155156 ;
156 ; AVX-LABEL: v5i16:
157 ; AVX: # %bb.0:
158 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
159 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
160 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
161 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
162 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
163 ; AVX-NEXT: vpextrw $6, %xmm0, 8(%rdi)
164 ; AVX-NEXT: vmovq %xmm1, (%rdi)
165 ; AVX-NEXT: retq
157 ; AVX1-LABEL: v5i16:
158 ; AVX1: # %bb.0:
159 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
160 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
161 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
162 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
163 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
164 ; AVX1-NEXT: vpextrw $6, %xmm0, 8(%rdi)
165 ; AVX1-NEXT: vmovq %xmm1, (%rdi)
166 ; AVX1-NEXT: retq
167 ;
168 ; AVX2-SLOW-LABEL: v5i16:
169 ; AVX2-SLOW: # %bb.0:
170 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
171 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
172 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
173 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
174 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
175 ; AVX2-SLOW-NEXT: vpextrw $6, %xmm0, 8(%rdi)
176 ; AVX2-SLOW-NEXT: vmovq %xmm1, (%rdi)
177 ; AVX2-SLOW-NEXT: retq
178 ;
179 ; AVX2-FAST-LABEL: v5i16:
180 ; AVX2-FAST: # %bb.0:
181 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
182 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,4,5,12,13,14,15,8,9,10,11,12,13,14,15]
183 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
184 ; AVX2-FAST-NEXT: vpextrw $6, %xmm0, 8(%rdi)
185 ; AVX2-FAST-NEXT: vmovq %xmm1, (%rdi)
186 ; AVX2-FAST-NEXT: retq
166187 ;
167188 ; XOP-LABEL: v5i16:
168189 ; XOP: # %bb.0:
549570 ; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
550571 ; AVX1-NEXT: retq
551572 ;
552 ; AVX2-LABEL: v12i16:
553 ; AVX2: # %bb.0:
554 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
555 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
556 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
557 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
558 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
559 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
560 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
561 ; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
562 ; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
563 ; AVX2-NEXT: retq
573 ; AVX2-SLOW-LABEL: v12i16:
574 ; AVX2-SLOW: # %bb.0:
575 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
576 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
577 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
578 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
579 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
580 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
581 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
582 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rdi)
583 ; AVX2-SLOW-NEXT: vmovq %xmm2, 16(%rdi)
584 ; AVX2-SLOW-NEXT: retq
585 ;
586 ; AVX2-FAST-LABEL: v12i16:
587 ; AVX2-FAST: # %bb.0:
588 ; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm2
589 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
590 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7]
591 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
592 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,14,15,6,7,8,9,10,11,12,13,14,15]
593 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
594 ; AVX2-FAST-NEXT: vmovq %xmm0, 16(%rdi)
595 ; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rdi)
596 ; AVX2-FAST-NEXT: retq
564597 ;
565598 ; XOP-LABEL: v12i16:
566599 ; XOP: # %bb.0:
636669 ; AVX1-NEXT: vzeroupper
637670 ; AVX1-NEXT: retq
638671 ;
639 ; AVX2-LABEL: v12i32:
640 ; AVX2: # %bb.0:
641 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
642 ; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
643 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
644 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
645 ; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
646 ; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm0
647 ; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
648 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
649 ; AVX2-NEXT: vmovaps %ymm0, (%rdi)
650 ; AVX2-NEXT: vmovaps %xmm2, 32(%rdi)
651 ; AVX2-NEXT: vzeroupper
652 ; AVX2-NEXT: retq
672 ; AVX2-SLOW-LABEL: v12i32:
673 ; AVX2-SLOW: # %bb.0:
674 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
675 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
676 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
677 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
678 ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
679 ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm0
680 ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
681 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
682 ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi)
683 ; AVX2-SLOW-NEXT: vmovaps %xmm2, 32(%rdi)
684 ; AVX2-SLOW-NEXT: vzeroupper
685 ; AVX2-SLOW-NEXT: retq
686 ;
687 ; AVX2-FAST-LABEL: v12i32:
688 ; AVX2-FAST: # %bb.0:
689 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6>
690 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm2
691 ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm3
692 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
693 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [3,3,7,7,7,7,6,7]
694 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm0
695 ; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
696 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
697 ; AVX2-FAST-NEXT: vmovaps %xmm0, 32(%rdi)
698 ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdi)
699 ; AVX2-FAST-NEXT: vzeroupper
700 ; AVX2-FAST-NEXT: retq
653701 ;
654702 ; XOP-LABEL: v12i32:
655703 ; XOP: # %bb.0:
13991447 ; AVX1-NEXT: vzeroupper
14001448 ; AVX1-NEXT: retq
14011449 ;
1402 ; AVX2-LABEL: interleave_24i32_out:
1403 ; AVX2: # %bb.0:
1404 ; AVX2-NEXT: vmovups (%rdi), %ymm0
1405 ; AVX2-NEXT: vmovups 32(%rdi), %ymm1
1406 ; AVX2-NEXT: vmovups 64(%rdi), %ymm2
1407 ; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
1408 ; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm3
1409 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
1410 ; AVX2-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
1411 ; AVX2-NEXT: vpermps %ymm4, %ymm5, %ymm4
1412 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
1413 ; AVX2-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
1414 ; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm4
1415 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1416 ; AVX2-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
1417 ; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm5
1418 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1419 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
1420 ; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
1421 ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
1422 ; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
1423 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
1424 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
1425 ; AVX2-NEXT: vmovups %ymm3, (%rsi)
1426 ; AVX2-NEXT: vmovups %ymm4, (%rdx)
1427 ; AVX2-NEXT: vmovups %ymm0, (%rcx)
1428 ; AVX2-NEXT: vzeroupper
1429 ; AVX2-NEXT: retq
1450 ; AVX2-SLOW-LABEL: interleave_24i32_out:
1451 ; AVX2-SLOW: # %bb.0:
1452 ; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0
1453 ; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1
1454 ; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2
1455 ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
1456 ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3
1457 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
1458 ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
1459 ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4
1460 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
1461 ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
1462 ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4
1463 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1464 ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
1465 ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
1466 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1467 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
1468 ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
1469 ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0
1470 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
1471 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
1472 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
1473 ; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsi)
1474 ; AVX2-SLOW-NEXT: vmovups %ymm4, (%rdx)
1475 ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rcx)
1476 ; AVX2-SLOW-NEXT: vzeroupper
1477 ; AVX2-SLOW-NEXT: retq
1478 ;
1479 ; AVX2-FAST-LABEL: interleave_24i32_out:
1480 ; AVX2-FAST: # %bb.0:
1481 ; AVX2-FAST-NEXT: vmovups (%rdi), %ymm0
1482 ; AVX2-FAST-NEXT: vmovups 32(%rdi), %ymm1
1483 ; AVX2-FAST-NEXT: vmovups 64(%rdi), %ymm2
1484 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
1485 ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm3
1486 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
1487 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
1488 ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4
1489 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
1490 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
1491 ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm4
1492 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1493 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
1494 ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5
1495 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1496 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = [0,1,0,3,0,1,4,7]
1497 ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2
1498 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
1499 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
1500 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
1501 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
1502 ; AVX2-FAST-NEXT: vmovups %ymm3, (%rsi)
1503 ; AVX2-FAST-NEXT: vmovups %ymm4, (%rdx)
1504 ; AVX2-FAST-NEXT: vmovups %ymm0, (%rcx)
1505 ; AVX2-FAST-NEXT: vzeroupper
1506 ; AVX2-FAST-NEXT: retq
14301507 ;
14311508 ; XOP-LABEL: interleave_24i32_out:
14321509 ; XOP: # %bb.0:
16021679 ; AVX1-NEXT: vzeroupper
16031680 ; AVX1-NEXT: retq
16041681 ;
1605 ; AVX2-LABEL: interleave_24i32_in:
1606 ; AVX2: # %bb.0:
1607 ; AVX2-NEXT: vmovups (%rsi), %ymm0
1608 ; AVX2-NEXT: vmovups (%rdx), %ymm1
1609 ; AVX2-NEXT: vmovups (%rcx), %ymm2
1610 ; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
1611 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
1612 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
1613 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
1614 ; AVX2-NEXT: vbroadcastsd %xmm2, %ymm4
1615 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
1616 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
1617 ; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
1618 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
1619 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
1620 ; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm5
1621 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
1622 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
1623 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
1624 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
1625 ; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
1626 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1627 ; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
1628 ; AVX2-NEXT: vmovups %ymm4, 64(%rdi)
1629 ; AVX2-NEXT: vmovups %ymm3, (%rdi)
1630 ; AVX2-NEXT: vzeroupper
1631 ; AVX2-NEXT: retq
1682 ; AVX2-SLOW-LABEL: interleave_24i32_in:
1683 ; AVX2-SLOW: # %bb.0:
1684 ; AVX2-SLOW-NEXT: vmovups (%rsi), %ymm0
1685 ; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1
1686 ; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2
1687 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
1688 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
1689 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
1690 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
1691 ; AVX2-SLOW-NEXT: vbroadcastsd %xmm2, %ymm4
1692 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
1693 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
1694 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
1695 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
1696 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
1697 ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm5
1698 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
1699 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
1700 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
1701 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
1702 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
1703 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1704 ; AVX2-SLOW-NEXT: vmovups %ymm0, 32(%rdi)
1705 ; AVX2-SLOW-NEXT: vmovups %ymm4, 64(%rdi)
1706 ; AVX2-SLOW-NEXT: vmovups %ymm3, (%rdi)
1707 ; AVX2-SLOW-NEXT: vzeroupper
1708 ; AVX2-SLOW-NEXT: retq
1709 ;
1710 ; AVX2-FAST-LABEL: interleave_24i32_in:
1711 ; AVX2-FAST: # %bb.0:
1712 ; AVX2-FAST-NEXT: vmovups (%rsi), %ymm0
1713 ; AVX2-FAST-NEXT: vmovups (%rdx), %ymm1
1714 ; AVX2-FAST-NEXT: vmovups (%rcx), %ymm2
1715 ; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
1716 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
1717 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
1718 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
1719 ; AVX2-FAST-NEXT: vbroadcastsd %xmm2, %ymm4
1720 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
1721 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
1722 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2]
1723 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7]
1724 ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
1725 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
1726 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
1727 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1
1728 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
1729 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
1730 ; AVX2-FAST-NEXT: vbroadcastsd 24(%rsi), %ymm2
1731 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
1732 ; AVX2-FAST-NEXT: vmovups %ymm1, 64(%rdi)
1733 ; AVX2-FAST-NEXT: vmovups %ymm0, 32(%rdi)
1734 ; AVX2-FAST-NEXT: vmovups %ymm3, (%rdi)
1735 ; AVX2-FAST-NEXT: vzeroupper
1736 ; AVX2-FAST-NEXT: retq
16321737 ;
16331738 ; XOP-LABEL: interleave_24i32_in:
16341739 ; XOP: # %bb.0:
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
22 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
33 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
78
89 define <8 x i16> @test1(<8 x i16> %x) nounwind {
910 ; SSE-LABEL: test1:
18711872 ; AVX1-NEXT: vzeroupper
18721873 ; AVX1-NEXT: retq
18731874 ;
1874 ; AVX2-LABEL: psubus_8i64_max:
1875 ; AVX2: # %bb.0: # %vector.ph
1876 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1877 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1878 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1879 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
1880 ; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5
1881 ; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm6
1882 ; AVX2-NEXT: vpcmpgtq %ymm6, %ymm5, %ymm5
1883 ; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm6
1884 ; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm4
1885 ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4
1886 ; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm3
1887 ; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
1888 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
1889 ; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1
1890 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1891 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1892 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1893 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1894 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1895 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1896 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1897 ; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
1898 ; AVX2-NEXT: vzeroupper
1899 ; AVX2-NEXT: retq
1875 ; AVX2-SLOW-LABEL: psubus_8i64_max:
1876 ; AVX2-SLOW: # %bb.0: # %vector.ph
1877 ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1878 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1879 ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1880 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
1881 ; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm2, %ymm5
1882 ; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm6
1883 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm6, %ymm5, %ymm5
1884 ; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm1, %ymm6
1885 ; AVX2-SLOW-NEXT: vpor %ymm4, %ymm3, %ymm4
1886 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4
1887 ; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm3
1888 ; AVX2-SLOW-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
1889 ; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
1890 ; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm3, %ymm1
1891 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1892 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1893 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1894 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1895 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1896 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1897 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1898 ; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
1899 ; AVX2-SLOW-NEXT: vzeroupper
1900 ; AVX2-SLOW-NEXT: retq
1901 ;
1902 ; AVX2-FAST-LABEL: psubus_8i64_max:
1903 ; AVX2-FAST: # %bb.0: # %vector.ph
1904 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1905 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1906 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1907 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
1908 ; AVX2-FAST-NEXT: vpxor %ymm4, %ymm2, %ymm5
1909 ; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm6
1910 ; AVX2-FAST-NEXT: vpcmpgtq %ymm6, %ymm5, %ymm5
1911 ; AVX2-FAST-NEXT: vpxor %ymm4, %ymm1, %ymm6
1912 ; AVX2-FAST-NEXT: vpor %ymm4, %ymm3, %ymm4
1913 ; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4
1914 ; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm3
1915 ; AVX2-FAST-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
1916 ; AVX2-FAST-NEXT: vpsubq %ymm2, %ymm0, %ymm0
1917 ; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm3, %ymm1
1918 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1919 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
1920 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
1921 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1922 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1923 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1924 ; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
1925 ; AVX2-FAST-NEXT: vzeroupper
1926 ; AVX2-FAST-NEXT: retq
19001927 ;
19011928 ; AVX512-LABEL: psubus_8i64_max:
19021929 ; AVX512: # %bb.0: # %vector.ph
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
1 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW
2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST
23 ; PR32449
34
45 define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind {
2627 }
2728
2829 define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind {
29 ; AVX2-LABEL: foo8:
30 ; AVX2: # %bb.0:
31 ; AVX2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
32 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
33 ; AVX2-NEXT: vmovaps %ymm0, (%rdi)
34 ; AVX2-NEXT: retq
30 ; AVX2-SLOW-LABEL: foo8:
31 ; AVX2-SLOW: # %bb.0:
32 ; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
33 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
34 ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi)
35 ; AVX2-SLOW-NEXT: retq
36 ;
37 ; AVX2-FAST-LABEL: foo8:
38 ; AVX2-FAST: # %bb.0:
39 ; AVX2-FAST-NEXT: vbroadcastss {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5]
40 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
41 ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdi)
42 ; AVX2-FAST-NEXT: retq
3543 %res = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32>
3644 %res1 = shufflevector <8 x float> %res, <8 x float> undef, <8 x i32>
3745 store <8 x float> %res, <8 x float>* %p
44 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
55 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
66 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
1010
1111 define void @shuffle_v16i8_to_v8i8_1(<16 x i8>* %L, <8 x i8>* %S) nounwind {
1212 ; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
399399 ;
400400 ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
401401 ; AVX512BW: # %bb.0:
402 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
403 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
402 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
403 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
404404 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
405405 ; AVX512BW-NEXT: retq
406406 ;
459459 ;
460460 ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
461461 ; AVX512BW: # %bb.0:
462 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
463 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
462 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
463 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
464464 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
465465 ; AVX512BW-NEXT: retq
466466 ;
519519 ;
520520 ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
521521 ; AVX512BW: # %bb.0:
522 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
523 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
522 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
523 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
524524 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
525525 ; AVX512BW-NEXT: retq
526526 ;
22 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
33 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
44 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
88
99 define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
1010 ; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
404404 ; AVX512VL: # %bb.0:
405405 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
406406 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
407 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
408 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
409 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
410 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
407 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
408 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
409 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
411410 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
412411 ; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
413412 ; AVX512VL-NEXT: vzeroupper
417416 ; AVX512BW: # %bb.0:
418417 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
419418 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
420 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
421 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
422 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
423 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
419 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
420 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
421 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
424422 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
425423 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
426424 ; AVX512BW-NEXT: vzeroupper
430428 ; AVX512BWVL: # %bb.0:
431429 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
432430 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
433 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
434 ; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
435 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
436 ; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
431 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
432 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
433 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
437434 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
438435 ; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
439436 ; AVX512BWVL-NEXT: vzeroupper
509506 ; AVX512BW: # %bb.0:
510507 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
511508 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
512 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
513 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
514 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
515 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
509 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
510 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
511 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
516512 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
517513 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
518514 ; AVX512BW-NEXT: vzeroupper
588584 ; AVX512VL: # %bb.0:
589585 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
590586 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
591 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
592 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
593 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
594 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
587 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
588 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
589 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
595590 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
596591 ; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
597592 ; AVX512VL-NEXT: vzeroupper
601596 ; AVX512BW: # %bb.0:
602597 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
603598 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
604 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
605 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
606 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
607 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
599 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
600 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
601 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
608602 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
609603 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
610604 ; AVX512BW-NEXT: vzeroupper
614608 ; AVX512BWVL: # %bb.0:
615609 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
616610 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
617 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
618 ; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
619 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
620 ; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
611 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
612 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
613 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
621614 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
622615 ; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
623616 ; AVX512BWVL-NEXT: vzeroupper
747740 ; AVX512VL: # %bb.0:
748741 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
749742 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
750 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
751 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
752 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
753 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
743 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
744 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
745 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
754746 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
755747 ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
756748 ; AVX512VL-NEXT: vzeroupper
772764 ; AVX512BWVL: # %bb.0:
773765 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
774766 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
775 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
776 ; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
777 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
778 ; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
767 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
768 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
769 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
779770 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
780771 ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
781772 ; AVX512BWVL-NEXT: vzeroupper
10551046 ; AVX512VL: # %bb.0:
10561047 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
10571048 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
1058 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1059 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
1060 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
1061 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
1049 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
1050 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1051 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
10621052 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
10631053 ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
10641054 ; AVX512VL-NEXT: vzeroupper
10801070 ; AVX512BWVL: # %bb.0:
10811071 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
10821072 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
1083 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1084 ; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
1085 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
1086 ; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
1073 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
1074 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1075 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
10871076 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
10881077 ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
10891078 ; AVX512BWVL-NEXT: vzeroupper
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
55
66 define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
77 ; AVX512F-LABEL: shuffle_v64i8_to_v32i8_1:
2222 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
2323 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
2424 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
25 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
26 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
27 ; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
25 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
26 ; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
27 ; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
2828 ; AVX512VL-NEXT: vzeroupper
2929 ; AVX512VL-NEXT: retq
3030 ;
4646 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4747 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
4848 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
49 ; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
50 ; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
51 ; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
49 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
50 ; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
51 ; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
5252 ; AVX512BWVL-NEXT: vzeroupper
5353 ; AVX512BWVL-NEXT: retq
5454 %vec = load <64 x i8>, <64 x i8>* %L
7676 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
7777 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
7878 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
79 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
80 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
81 ; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
79 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
80 ; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
81 ; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
8282 ; AVX512VL-NEXT: vzeroupper
8383 ; AVX512VL-NEXT: retq
8484 ;
110110 }
111111
112112 define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
113 ; AVX512-LABEL: shuffle_v16i32_to_v8i32_1:
114 ; AVX512: # %bb.0:
115 ; AVX512-NEXT: vmovaps (%rdi), %zmm0
116 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
117 ; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
118 ; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
119 ; AVX512-NEXT: vmovaps %ymm0, (%rsi)
120 ; AVX512-NEXT: vzeroupper
121 ; AVX512-NEXT: retq
113 ; AVX512F-LABEL: shuffle_v16i32_to_v8i32_1:
114 ; AVX512F: # %bb.0:
115 ; AVX512F-NEXT: vmovaps (%rdi), %zmm0
116 ; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
117 ; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
118 ; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
119 ; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
120 ; AVX512F-NEXT: vzeroupper
121 ; AVX512F-NEXT: retq
122 ;
123 ; AVX512VL-LABEL: shuffle_v16i32_to_v8i32_1:
124 ; AVX512VL: # %bb.0:
125 ; AVX512VL-NEXT: vmovdqa32 (%rdi), %zmm0
126 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
127 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
128 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
129 ; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
130 ; AVX512VL-NEXT: vzeroupper
131 ; AVX512VL-NEXT: retq
132 ;
133 ; AVX512BW-LABEL: shuffle_v16i32_to_v8i32_1:
134 ; AVX512BW: # %bb.0:
135 ; AVX512BW-NEXT: vmovaps (%rdi), %zmm0
136 ; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
137 ; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
138 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
139 ; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
140 ; AVX512BW-NEXT: vzeroupper
141 ; AVX512BW-NEXT: retq
142 ;
143 ; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32_1:
144 ; AVX512BWVL: # %bb.0:
145 ; AVX512BWVL-NEXT: vmovdqa32 (%rdi), %zmm0
146 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
147 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
148 ; AVX512BWVL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
149 ; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
150 ; AVX512BWVL-NEXT: vzeroupper
151 ; AVX512BWVL-NEXT: retq
122152 %vec = load <16 x i32>, <16 x i32>* %L
123153 %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
124154 store <8 x i32> %strided.vec, <8 x i32>* %S
398428 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
399429 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
400430 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
401 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
402 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
403 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
404 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
431 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
432 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
433 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
405434 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
406435 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
407 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
408 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
409 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
410 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
436 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
437 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
438 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
411439 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
412440 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
413441 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
419447 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
420448 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
421449 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
422 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
423 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
424 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
425 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
450 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
451 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
452 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
426453 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
427454 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
428 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
429 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
430 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
431 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
455 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
456 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
457 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
432458 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
433459 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
434460 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
477503 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
478504 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
479505 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
480 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
481 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
482 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
483 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
506 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
507 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
508 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
484509 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
485510 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
486 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
487 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
488 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
489 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
511 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
512 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
513 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
490514 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
491515 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
492516 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
498522 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
499523 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
500524 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
501 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
502 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
503 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
504 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
525 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
526 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
527 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
505528 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
506529 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
507 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
508 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
509 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
510 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
530 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
531 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
532 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
511533 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
512534 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
513535 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
556578 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
557579 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
558580 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
559 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
560 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
561 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
562 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
581 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
582 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
583 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
563584 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
564585 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
565 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
566 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
567 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
568 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
586 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
587 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
588 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
569589 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
570590 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
571591 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
577597 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
578598 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
579599 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
580 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
581 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
582 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
583 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
600 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
601 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
602 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
584603 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
585604 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
586 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
587 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
588 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
589 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
605 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
606 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
607 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
590608 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
591609 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
592610 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
44 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
55 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
66 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
1010
1111 ; PR31551
1212 ; Pairs of shufflevector:trunc functions with functional equivalence.
472472 ;
473473 ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
474474 ; AVX512BW: # %bb.0:
475 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
476 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
475 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
476 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
477477 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
478478 ; AVX512BW-NEXT: retq
479479 ;
532532 ;
533533 ; AVX512BW-LABEL: trunc_v2i64_to_v2i16:
534534 ; AVX512BW: # %bb.0:
535 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
536 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
535 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
536 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
537537 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
538538 ; AVX512BW-NEXT: retq
539539 ;
22 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
33 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
44 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
88
99 ; PR31551
1010 ; Pairs of shufflevector:trunc functions with functional equivalence.
504504 ; AVX512BW: # %bb.0:
505505 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
506506 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
507 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
508 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
509 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
510 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
507 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
508 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
509 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
511510 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
512511 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
513512 ; AVX512BW-NEXT: vzeroupper
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
55
66 ; PR31551
77 ; Pairs of shufflevector:trunc functions with functional equivalence.
2626 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
2727 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
2828 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
29 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
30 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
31 ; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
29 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
30 ; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
31 ; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
3232 ; AVX512VL-NEXT: vzeroupper
3333 ; AVX512VL-NEXT: retq
3434 ;
5050 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
5151 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
5252 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
53 ; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
54 ; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
55 ; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
53 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
54 ; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
55 ; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
5656 ; AVX512BWVL-NEXT: vzeroupper
5757 ; AVX512BWVL-NEXT: retq
5858 %vec = load <64 x i8>, <64 x i8>* %L
119119 ;
120120 ; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
121121 ; AVX512VL: # %bb.0:
122 ; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
123 ; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
124 ; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
125 ; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
126 ; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
127 ; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
128 ; AVX512VL-NEXT: vmovaps %ymm0, (%rsi)
122 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
123 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
124 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
125 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
126 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0
127 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
128 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
129 ; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
129130 ; AVX512VL-NEXT: vzeroupper
130131 ; AVX512VL-NEXT: retq
131132 ;
133134 ; AVX512BW: # %bb.0:
134135 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
135136 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
136 ; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
137 ; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
138 ; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
139 ; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
137 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
138 ; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
139 ; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
140140 ; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
141141 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
142142 ; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
173173 }
174174
175175 define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
176 ; AVX512-LABEL: shuffle_v16i32_to_v8i32:
177 ; AVX512: # %bb.0:
178 ; AVX512-NEXT: vmovaps (%rdi), %zmm0
179 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
180 ; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
181 ; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
182 ; AVX512-NEXT: vmovaps %ymm0, (%rsi)
183 ; AVX512-NEXT: vzeroupper
184 ; AVX512-NEXT: retq
176 ; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
177 ; AVX512F: # %bb.0:
178 ; AVX512F-NEXT: vmovaps (%rdi), %zmm0
179 ; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
180 ; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
181 ; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
182 ; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
183 ; AVX512F-NEXT: vzeroupper
184 ; AVX512F-NEXT: retq
185 ;
186 ; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
187 ; AVX512VL: # %bb.0:
188 ; AVX512VL-NEXT: vmovdqa32 (%rdi), %zmm0
189 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
190 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
191 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
192 ; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
193 ; AVX512VL-NEXT: vzeroupper
194 ; AVX512VL-NEXT: retq
195 ;
196 ; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
197 ; AVX512BW: # %bb.0:
198 ; AVX512BW-NEXT: vmovaps (%rdi), %zmm0
199 ; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
200 ; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
201 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
202 ; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
203 ; AVX512BW-NEXT: vzeroupper
204 ; AVX512BW-NEXT: retq
205 ;
206 ; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
207 ; AVX512BWVL: # %bb.0:
208 ; AVX512BWVL-NEXT: vmovdqa32 (%rdi), %zmm0
209 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
210 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
211 ; AVX512BWVL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
212 ; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
213 ; AVX512BWVL-NEXT: vzeroupper
214 ; AVX512BWVL-NEXT: retq
185215 %vec = load <16 x i32>, <16 x i32>* %L
186216 %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
187217 store <8 x i32> %strided.vec, <8 x i32>* %S
325355 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
326356 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
327357 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
328 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
329 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
330 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
331 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
358 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
359 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
360 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
332361 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
333362 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
334 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
335 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
336 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
337 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
363 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
364 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
365 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
338366 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
339367 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
340368 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
346374 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
347375 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
348376 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
349 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
350 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
351 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
352 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
377 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
378 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
379 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
353380 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
354381 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
355 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
356 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
357 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
358 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
382 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
383 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
384 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
359385 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
360386 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
361387 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW