llvm.org GIT mirror — llvm / commit 474a418

[AVX-512] Remove lzcnt intrinsics and autoupgrade them to generic ctlz intrinsics with select. Clang has been emitting ctlz intrinsics for a while now.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296091 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Craig Topper

8 changed files with 185 additions and 96 deletions.
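In IR terms, the upgrade replaces each masked x86 lzcnt intrinsic call with a generic ctlz call followed by a select on the mask. A minimal sketch of the 512-bit dword case, matching the updated tests further down:

  ; before: x86-specific masked intrinsic
  %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)

  ; after: generic ctlz plus mask select, as emitted by the autoupgrade
  %1 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %b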
include/llvm/IR/IntrinsicsX86.td:

        Intrinsic<[llvm_v8i64_ty],
                  [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
                  [IntrNoMem]>;
-
- def int_x86_avx512_mask_lzcnt_d_128 :
-       Intrinsic<[llvm_v4i32_ty],
-                 [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
-                 [IntrNoMem]>;
- def int_x86_avx512_mask_lzcnt_d_256 :
-       Intrinsic<[llvm_v8i32_ty],
-                 [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
-                 [IntrNoMem]>;
- def int_x86_avx512_mask_lzcnt_d_512 :
-       Intrinsic<[llvm_v16i32_ty],
-                 [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
-                 [IntrNoMem]>;
-
- def int_x86_avx512_mask_lzcnt_q_128 :
-       Intrinsic<[llvm_v2i64_ty],
-                 [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
-                 [IntrNoMem]>;
- def int_x86_avx512_mask_lzcnt_q_256 :
-       Intrinsic<[llvm_v4i64_ty],
-                 [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
-                 [IntrNoMem]>;
- def int_x86_avx512_mask_lzcnt_q_512 :
-       Intrinsic<[llvm_v8i64_ty],
-                 [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
-                 [IntrNoMem]>;
  }

  // Compares
lib/IR/AutoUpgrade.cpp (upgrade-eligible name list):

  Name.startswith("avx2.pmovzx") || // Added in 3.9
  Name.startswith("avx512.mask.pmovsx") || // Added in 4.0
  Name.startswith("avx512.mask.pmovzx") || // Added in 4.0
+ Name.startswith("avx512.mask.lzcnt.") || // Added in 5.0
  Name == "sse2.cvtdq2pd" || // Added in 3.9
  Name == "sse2.cvtps2pd" || // Added in 3.9
  Name == "avx.cvtdq2.pd.256" || // Added in 3.9
lib/IR/AutoUpgrade.cpp (UpgradeIntrinsicCall):

    Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
    Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                        CI->getArgOperand(2));
+ } else if (IsX86 && Name.startswith("avx512.mask.lzcnt.")) {
+   Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+                                                      Intrinsic::ctlz,
+                                                      CI->getType()),
+                            { CI->getArgOperand(0), Builder.getInt1(false) });
+   Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+                       CI->getArgOperand(1));
  } else if (IsX86 && (Name.startswith("avx512.mask.max.p") ||
                       Name.startswith("avx512.mask.min.p"))) {
    bool IsMin = Name[13] == 'i';
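The i1 false passed as the second ctlz operand is the intrinsic's is_zero_undef flag; false requests the fully defined result (the element bit width for a zero element), which is what vplzcnt produces. For the 128-bit and 256-bit forms the i8 mask has more bits than the vector has lanes, so EmitX86Select extracts the low lanes of the bitcast mask with a shufflevector. A sketch of the upgraded IR for the v4i32 case, as the avx512cdvl tests below verify:

  %1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
  %2 = bitcast i8 %x2 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x1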
lib/Target/X86/X86IntrinsicsInfo.h:

                     X86ISD::VGETMANTS, 0),
  X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM,
                     X86ISD::VGETMANTS, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK,
-                    ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK,
-                    ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK,
-                    ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK,
-                    ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK,
-                    ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK,
-                    ISD::CTLZ, 0),
  X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
                     X86ISD::FMAX_RND),
  X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
    ret <8 x double> %res
  }
  declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly
-
- define <16 x i32> @test_conflict_d(<16 x i32> %a) {
- ; CHECK-LABEL: test_conflict_d:
- ; CHECK: ## BB#0:
- ; CHECK-NEXT: vpconflictd %zmm0, %zmm0
- ; CHECK-NEXT: retq
-   %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
-   ret <16 x i32> %res
- }

  define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
  ; CHECK-LABEL: test_cmpps:
New test file (keeps the old avx512cd intrinsic calls to exercise the autoupgrade):

+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s
+
+ define <16 x i32> @test_lzcnt_d(<16 x i32> %a) {
+ ; CHECK-LABEL: test_lzcnt_d:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vplzcntd %zmm0, %zmm0
+ ; CHECK-NEXT: retq
+   %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
+   ret <16 x i32> %res
+ }
+
+ declare <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
+
+ define <8 x i64> @test_lzcnt_q(<8 x i64> %a) {
+ ; CHECK-LABEL: test_lzcnt_q:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vplzcntq %zmm0, %zmm0
+ ; CHECK-NEXT: retq
+   %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
+   ret <8 x i64> %res
+ }
+
+ declare <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
+
+
+ define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+ ; CHECK-LABEL: test_mask_lzcnt_d:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: kmovw %edi, %k1
+ ; CHECK-NEXT: vplzcntd %zmm0, %zmm1 {%k1}
+ ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+ ; CHECK-NEXT: retq
+   %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
+   ret <16 x i32> %res
+ }
+
+ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+ ; CHECK-LABEL: test_mask_lzcnt_q:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: kmovw %edi, %k1
+ ; CHECK-NEXT: vplzcntq %zmm0, %zmm1 {%k1}
+ ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+ ; CHECK-NEXT: retq
+   %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+   ret <8 x i64> %res
+ }
Existing avx512cd test, rewritten to use the generic intrinsics:

  ; CHECK: ## BB#0:
  ; CHECK-NEXT: vplzcntd %zmm0, %zmm0
  ; CHECK-NEXT: retq
-   %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
-   ret <16 x i32> %res
+   %1 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
+   ret <16 x i32> %1
  }
-
- declare <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
+ declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1) #0

  define <8 x i64> @test_lzcnt_q(<8 x i64> %a) {
  ; CHECK-LABEL: test_lzcnt_q:
  ; CHECK: ## BB#0:
  ; CHECK-NEXT: vplzcntq %zmm0, %zmm0
  ; CHECK-NEXT: retq
-   %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
-   ret <8 x i64> %res
+   %1 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
+   ret <8 x i64> %1
  }
-
- declare <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
+ declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1) #0

  define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
  ; CHECK-LABEL: test_mask_lzcnt_d:
  ; CHECK-NEXT: vplzcntd %zmm0, %zmm1 {%k1}
  ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
  ; CHECK-NEXT: retq
-   %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
-   ret <16 x i32> %res
+   %1 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
+   %2 = bitcast i16 %mask to <16 x i1>
+   %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %b
+   ret <16 x i32> %3
  }

  define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
  ; CHECK-NEXT: vplzcntq %zmm0, %zmm1 {%k1}
  ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
  ; CHECK-NEXT: retq
-   %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
-   ret <8 x i64> %res
+   %1 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
+   %2 = bitcast i8 %mask to <8 x i1>
+   %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %b
+   ret <8 x i64> %3
  }
New test file (keeps the old avx512cd+avx512vl intrinsic calls to exercise the autoupgrade):

+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl| FileCheck %s
+
+ declare <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32>, <4 x i32>, i8)
+
+ define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+ ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vplzcntd %xmm0, %xmm2
+ ; CHECK-NEXT: kmovw %edi, %k1
+ ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
+ ; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
+ ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+ ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+ ; CHECK-NEXT: retq
+   %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
+   %res1 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
+   %res3 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x2)
+   %res2 = add <4 x i32> %res, %res1
+   %res4 = add <4 x i32> %res2, %res3
+   ret <4 x i32> %res4
+ }
+
+ declare <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32>, <8 x i32>, i8)
+
+ define <8 x i32>@test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+ ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vplzcntd %ymm0, %ymm2
+ ; CHECK-NEXT: kmovw %edi, %k1
+ ; CHECK-NEXT: vplzcntd %ymm0, %ymm1 {%k1}
+ ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm0
+ ; CHECK-NEXT: retq
+   %res = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
+   %res1 = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
+   %res2 = add <8 x i32> %res, %res1
+   ret <8 x i32> %res2
+ }
+
+ declare <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64>, <2 x i64>, i8)
+
+ define <2 x i64>@test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+ ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vplzcntq %xmm0, %xmm2
+ ; CHECK-NEXT: kmovw %edi, %k1
+ ; CHECK-NEXT: vplzcntq %xmm0, %xmm1 {%k1}
+ ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm0
+ ; CHECK-NEXT: retq
+   %res = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
+   %res1 = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
+   %res2 = add <2 x i64> %res, %res1
+   ret <2 x i64> %res2
+ }
+
+ declare <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64>, <4 x i64>, i8)
+
+ define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+ ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vplzcntq %ymm0, %ymm2
+ ; CHECK-NEXT: kmovw %edi, %k1
+ ; CHECK-NEXT: vplzcntq %ymm0, %ymm1 {%k1}
+ ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm0
+ ; CHECK-NEXT: retq
+   %res = call <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
+   %res1 = call <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
+   %res2 = add <4 x i64> %res, %res1
+   ret <4 x i64> %res2
+ }
+
Existing avx512cd+avx512vl test, rewritten to use the generic intrinsics:

+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
  ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl| FileCheck %s

- declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readonly
-
- declare <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32>, <4 x i32>, i8)
-
- define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+ define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
  ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128:
  ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vplzcntd %xmm0, %xmm2
  ; CHECK-NEXT: kmovw %edi, %k1
  ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
- ; CHECK-NEXT: vplzcntd %xmm0, %xmm2 {%k1} {z}
- ; CHECK-NEXT: vplzcntd %xmm0, %xmm0
+ ; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
+ ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
  ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
- ; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0
  ; CHECK-NEXT: retq
-   %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
-   %res1 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
-   %res3 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x2)
-   %res2 = add <4 x i32> %res, %res1
-   %res4 = add <4 x i32> %res2, %res3
+   %1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
+   %2 = bitcast i8 %x2 to <8 x i1>
+   %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+   %3 = select <4 x i1> %extract1, <4 x i32> %1, <4 x i32> %x1
+   %4 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
+   %5 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
+   %6 = bitcast i8 %x2 to <8 x i1>
+   %extract = shufflevector <8 x i1> %6, <8 x i1> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+   %7 = select <4 x i1> %extract, <4 x i32> %5, <4 x i32> zeroinitializer
+   %res2 = add <4 x i32> %3, %4
+   %res4 = add <4 x i32> %res2, %7
    ret <4 x i32> %res4
  }
+ declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) #0

- declare <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32>, <8 x i32>, i8)
-
- define <8 x i32>@test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+ define <8 x i32> @test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
  ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256:
  ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vplzcntd %ymm0, %ymm2
  ; CHECK-NEXT: kmovw %edi, %k1
  ; CHECK-NEXT: vplzcntd %ymm0, %ymm1 {%k1}
- ; CHECK-NEXT: vplzcntd %ymm0, %ymm0
- ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+ ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm0
  ; CHECK-NEXT: retq
-   %res = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
-   %res1 = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
-   %res2 = add <8 x i32> %res, %res1
+   %1 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %x0, i1 false)
+   %2 = bitcast i8 %x2 to <8 x i1>
+   %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1
+   %4 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %x0, i1 false)
+   %res2 = add <8 x i32> %3, %4
    ret <8 x i32> %res2
  }
+ declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1) #0

- declare <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64>, <2 x i64>, i8)
-
- define <2 x i64>@test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+ define <2 x i64> @test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
  ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128:
  ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vplzcntq %xmm0, %xmm2
  ; CHECK-NEXT: kmovw %edi, %k1
  ; CHECK-NEXT: vplzcntq %xmm0, %xmm1 {%k1}
- ; CHECK-NEXT: vplzcntq %xmm0, %xmm0
- ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+ ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm0
  ; CHECK-NEXT: retq
-   %res = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
-   %res1 = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
-   %res2 = add <2 x i64> %res, %res1
+   %1 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %x0, i1 false)
+   %2 = bitcast i8 %x2 to <8 x i1>
+   %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+   %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1
+   %4 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %x0, i1 false)
+   %res2 = add <2 x i64> %3, %4
    ret <2 x i64> %res2
  }
+ declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) #0

- declare <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64>, <4 x i64>, i8)
-
- define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+ define <4 x i64> @test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
  ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256:
  ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vplzcntq %ymm0, %ymm2
  ; CHECK-NEXT: kmovw %edi, %k1
  ; CHECK-NEXT: vplzcntq %ymm0, %ymm1 {%k1}
- ; CHECK-NEXT: vplzcntq %ymm0, %ymm0
- ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+ ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm0
  ; CHECK-NEXT: retq
-   %res = call <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
-   %res1 = call <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
-   %res2 = add <4 x i64> %res, %res1
+   %1 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %x0, i1 false)
+   %2 = bitcast i8 %x2 to <8 x i1>
+   %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+   %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1
+   %4 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %x0, i1 false)
+   %res2 = add <4 x i64> %3, %4
    ret <4 x i64> %res2
  }
+ declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) #0

  declare <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32>, <4 x i32>, i8)
