llvm.org GIT mirror llvm / f3cb5d6
[X86] Adding vpopcntd and vpopcntq instructions AVX512_VPOPCNTDQ is a new feature set that was published by Intel. The patch represents the LLVM side of the addition of two new intrinsic based instructions (vpopcntd and vpopcntq). Differential Revision: https://reviews.llvm.org/D33169 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303858 91177308-0d34-0410-b5e6-96231b3b80d8 Oren Ben Simhon 2 years ago
17 changed file(s) with 997 addition(s) and 30 deletion(s). Raw diff Collapse all Expand all
13891389 Features["sha"] = HasLeaf7 && ((EBX >> 29) & 1);
13901390
13911391 // AVX512 is only supported if the OS supports the context save for it.
1392 Features["avx512vpopcntdq"] = HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save; // CPUID.(EAX=7,ECX=0):ECX[14]; EBX[14] is MPX, not VPOPCNTDQ
13921393 Features["avx512f"] = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save;
13931394 Features["avx512dq"] = HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save;
13941395 Features["avx512ifma"] = HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save;
125125 [FeatureAVX512]>;
126126 def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
127127 "Enable AVX-512 Conflict Detection Instructions",
128 [FeatureAVX512]>;
129 def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ",
130 "true", "Enable AVX-512 Population Count Instructions",
128131 [FeatureAVX512]>;
129132 def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
130133 "Enable AVX-512 PreFetch Instructions",
13611361 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
13621362 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
13631363 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1364 }
1365
1366 if (Subtarget.hasVPOPCNTDQ()) {
1367 // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
1368 // version of popcntd/q.
1369 for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
1370 MVT::v4i32, MVT::v2i64})
1371 setOperationAction(ISD::CTPOP, VT, Legal);
13641372 }
13651373
13661374 // Custom lower several nodes.
86468646 (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
86478647 sub_xmm)>;
86488648 }
8649
8650 //===---------------------------------------------------------------------===//
8651 // Counts number of ones - VPOPCNTD and VPOPCNTQ
8652 //===---------------------------------------------------------------------===//
8653
8654 multiclass avx512_unary_rmb_popcnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo VTInfo> {
8655 let Predicates = [HasVPOPCNTDQ] in // only the 512-bit (Z) form is defined here
8656 defm Z : avx512_unary_rmb<opc, OpcodeStr, ctpop, VTInfo>, EVEX_V512;
8657 }
8658
8659 // Implement the 128/256-bit OpNode with the 512-bit instruction: insert the source into an undefined 512-bit register, run NAME#"Zrr", then extract the original subregister.
8660 multiclass avx512_unary_lowering<SDNode OpNode, AVX512VLVectorVTInfo _, Predicate prd> {
8661 let Predicates = [prd] in {
8662 def Z256_Alt : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)),
8663 (EXTRACT_SUBREG
8664 (!cast<Instruction>(NAME # "Zrr")
8665 (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
8666 _.info256.RC:$src1,
8667 _.info256.SubRegIdx)),
8668 _.info256.SubRegIdx)>;
8669
8670 def Z128_Alt : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)),
8671 (EXTRACT_SUBREG
8672 (!cast<Instruction>(NAME # "Zrr")
8673 (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
8674 _.info128.RC:$src1,
8675 _.info128.SubRegIdx)),
8676 _.info128.SubRegIdx)>;
8677 }
8678 }
8679
8680 defm VPOPCNTD : avx512_unary_rmb_popcnt<0x55, "vpopcntd", v16i32_info>,
8681 avx512_unary_lowering<ctpop, avx512vl_i32_info, HasVPOPCNTDQ>;
8682 defm VPOPCNTQ : avx512_unary_rmb_popcnt<0x55, "vpopcntq", v8i64_info>,
8683 avx512_unary_lowering<ctpop, avx512vl_i64_info, HasVPOPCNTDQ>, VEX_W; // same opcode 0x55 as VPOPCNTD, with VEX_W added
86498684
86508685 //===---------------------------------------------------------------------===//
86518686 // Replicate Single FP - MOVSHDUP and MOVSLDUP
913913 { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
914914 { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
915915 { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
916 { X86::VPOPCNTDZrr, X86::VPOPCNTDZrm, 0 },
917 { X86::VPOPCNTQZrr, X86::VPOPCNTQZrm, 0 },
916918 { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
917919 { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
918920 { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
23252327 { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
23262328 { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
23272329 { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
2330 { X86::VPOPCNTDZrrkz, X86::VPOPCNTDZrmkz, 0 },
2331 { X86::VPOPCNTQZrrkz, X86::VPOPCNTQZrmkz, 0 },
23282332 { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
23292333 { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
23302334 { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
29462950 { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
29472951 { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
29482952 { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
2953 { X86::VPOPCNTDZrrk, X86::VPOPCNTDZrmk, 0 },
2954 { X86::VPOPCNTQZrrk, X86::VPOPCNTQZrmk, 0 },
29492955 { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
29502956 { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
29512957 { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
812812 def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
813813 def HasCDI : Predicate<"Subtarget->hasCDI()">,
814814 AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">;
815 def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">,
816 AssemblerPredicate<"FeatureVPOPCNTDQ", "AVX-512 VPOPCNTDQ ISA">;
815817 def HasPFI : Predicate<"Subtarget->hasPFI()">,
816818 AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">;
817819 def HasERI : Predicate<"Subtarget->hasERI()">,
285285 HasCDI = false;
286286 HasPFI = false;
287287 HasDQI = false;
288 HasVPOPCNTDQ = false;
288289 HasBWI = false;
289290 HasVLX = false;
290291 HasADX = false;
268268
269269 /// Processor has AVX-512 Conflict Detection Instructions
270270 bool HasCDI;
271
272 /// Processor has AVX-512 population count Instructions
273 bool HasVPOPCNTDQ;
271274
272275 /// Processor has AVX-512 Doubleword and Quadword instructions
273276 bool HasDQI;
493496 bool slow3OpsLEA() const { return Slow3OpsLEA; }
494497 bool slowIncDec() const { return SlowIncDec; }
495498 bool hasCDI() const { return HasCDI; }
499 bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; }
496500 bool hasPFI() const { return HasPFI; }
497501 bool hasERI() const { return HasERI; }
498502 bool hasDQI() const { return HasDQI; }
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq --show-mc-encoding | FileCheck %s --check-prefix=X86_64
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vpopcntdq --show-mc-encoding | FileCheck %s --check-prefix=X86
3
4 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5 ;; The following tests check that patterns that include ;;
6 ;; ctpop intrinsic + select are translated to the vpopcntd/q ;;
7 ;; instruction in a correct way. ;;
8 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
9
10 define <16 x i32> @test_mask_vpopcnt_d(<16 x i32> %a, i16 %mask, <16 x i32> %b) {
11 ; X86_64-LABEL: test_mask_vpopcnt_d:
12 ; X86_64: # BB#0:
13 ; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
14 ; X86_64-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x55,0xc1]
15 ; X86_64-NEXT: retq # encoding: [0xc3]
16 ;
17 ; X86-LABEL: test_mask_vpopcnt_d:
18 ; X86: # BB#0:
19 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
20 ; X86-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x55,0xc1]
21 ; X86-NEXT: retl # encoding: [0xc3]
22 %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %b)
23 %2 = bitcast i16 %mask to <16 x i1>
24 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %a
25 ret <16 x i32> %3
26 }
27
28 define <16 x i32> @test_maskz_vpopcnt_d(i16 %mask, <16 x i32> %a) {
29 ; X86_64-LABEL: test_maskz_vpopcnt_d:
30 ; X86_64: # BB#0:
31 ; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
32 ; X86_64-NEXT: vpopcntd %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x55,0xc0]
33 ; X86_64-NEXT: retq # encoding: [0xc3]
34 ;
35 ; X86-LABEL: test_maskz_vpopcnt_d:
36 ; X86: # BB#0:
37 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
38 ; X86-NEXT: vpopcntd %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x55,0xc0]
39 ; X86-NEXT: retl # encoding: [0xc3]
40 %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a)
41 %2 = bitcast i16 %mask to <16 x i1>
42 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
43 ret <16 x i32> %3
44 }
45
46 define <8 x i64> @test_mask_vpopcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
47 ; X86_64-LABEL: test_mask_vpopcnt_q:
48 ; X86_64: # BB#0:
49 ; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
50 ; X86_64-NEXT: vpopcntq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x55,0xc8]
51 ; X86_64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
52 ; X86_64-NEXT: retq # encoding: [0xc3]
53 ;
54 ; X86-LABEL: test_mask_vpopcnt_q:
55 ; X86: # BB#0:
56 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
57 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
58 ; X86-NEXT: vpopcntq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x55,0xc8]
59 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
60 ; X86-NEXT: retl # encoding: [0xc3]
61 %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a)
62 %2 = bitcast i8 %mask to <8 x i1>
63 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %b
64 ret <8 x i64> %3
65 }
66
67 define <8 x i64> @test_maskz_vpopcnt_q(<8 x i64> %a, i8 %mask) {
68 ; X86_64-LABEL: test_maskz_vpopcnt_q:
69 ; X86_64: # BB#0:
70 ; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
71 ; X86_64-NEXT: vpopcntq %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x55,0xc0]
72 ; X86_64-NEXT: retq # encoding: [0xc3]
73 ;
74 ; X86-LABEL: test_maskz_vpopcnt_q:
75 ; X86: # BB#0:
76 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
77 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
78 ; X86-NEXT: vpopcntq %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x55,0xc0]
79 ; X86-NEXT: retl # encoding: [0xc3]
80 %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a)
81 %2 = bitcast i8 %mask to <8 x i1>
82 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
83 ret <8 x i64> %3
84 }
85
86 declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>)
87 declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>)
44 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
55 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
66 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
78
89 define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
910 ; SSE2-LABEL: testv2i64:
8081 ; SSE41-NEXT: psadbw %xmm3, %xmm0
8182 ; SSE41-NEXT: retq
8283 ;
83 ; AVX-LABEL: testv2i64:
84 ; AVX: # BB#0:
85 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
86 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
87 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
88 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
89 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
90 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
91 ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
92 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
93 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
94 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
95 ; AVX-NEXT: retq
84 ; AVX1-LABEL: testv2i64:
85 ; AVX1: # BB#0:
86 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
87 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
88 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
89 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
90 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
91 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
92 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
93 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
94 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
95 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
96 ; AVX1-NEXT: retq
97 ;
98 ; AVX2-LABEL: testv2i64:
99 ; AVX2: # BB#0:
100 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
101 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
102 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
103 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
104 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
105 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
106 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
107 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
108 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
109 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
110 ; AVX2-NEXT: retq
111 ;
112 ; AVX512VPOPCNTDQ-LABEL: testv2i64:
113 ; AVX512VPOPCNTDQ: # BB#0:
114 ; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
115 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
116 ; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
117 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
118 ; AVX512VPOPCNTDQ-NEXT: retq
96119 %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
97120 ret <2 x i64> %out
98121 }
192215 ; SSE41-NEXT: packuswb %xmm3, %xmm0
193216 ; SSE41-NEXT: retq
194217 ;
195 ; AVX-LABEL: testv4i32:
196 ; AVX: # BB#0:
197 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
198 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
199 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
200 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
201 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
202 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
203 ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
204 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
205 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
206 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
207 ; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
208 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
209 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
210 ; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
211 ; AVX-NEXT: retq
218 ; AVX1-LABEL: testv4i32:
219 ; AVX1: # BB#0:
220 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
221 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
222 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
223 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
224 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
225 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
226 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
227 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
228 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
229 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
230 ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
231 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
232 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
233 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
234 ; AVX1-NEXT: retq
235 ;
236 ; AVX2-LABEL: testv4i32:
237 ; AVX2: # BB#0:
238 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
239 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
240 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
241 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
242 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
243 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
244 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
245 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
246 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
247 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
248 ; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
249 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
250 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
251 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
252 ; AVX2-NEXT: retq
253 ;
254 ; AVX512VPOPCNTDQ-LABEL: testv4i32:
255 ; AVX512VPOPCNTDQ: # BB#0:
256 ; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
257 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
258 ; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
259 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
260 ; AVX512VPOPCNTDQ-NEXT: retq
212261 %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
213262 ret <4 x i32> %out
214263 }
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
22 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
34
45 define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
56 ; AVX1-LABEL: testv4i64:
3839 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
3940 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
4041 ; AVX2-NEXT: retq
42 ;
43 ; AVX512VPOPCNTDQ-LABEL: testv4i64:
44 ; AVX512VPOPCNTDQ: # BB#0:
45 ; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
46 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
47 ; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
48 ; AVX512VPOPCNTDQ-NEXT: retq
4149 %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in)
4250 ret <4 x i64> %out
4351 }
9199 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
92100 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
93101 ; AVX2-NEXT: retq
102 ;
103 ; AVX512VPOPCNTDQ-LABEL: testv8i32:
104 ; AVX512VPOPCNTDQ: # BB#0:
105 ; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
106 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
107 ; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
108 ; AVX512VPOPCNTDQ-NEXT: retq
94109 %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in)
95110 ret <8 x i32> %out
96111 }
136151 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
137152 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
138153 ; AVX2-NEXT: retq
154 ;
155 ; AVX512VPOPCNTDQ-LABEL: testv16i16:
156 ; AVX512VPOPCNTDQ: # BB#0:
157 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
158 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
159 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
160 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
161 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
162 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
163 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
164 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
165 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
166 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
167 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
168 ; AVX512VPOPCNTDQ-NEXT: retq
139169 %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in)
140170 ret <16 x i16> %out
141171 }
172202 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
173203 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
174204 ; AVX2-NEXT: retq
205 ;
206 ; AVX512VPOPCNTDQ-LABEL: testv32i8:
207 ; AVX512VPOPCNTDQ: # BB#0:
208 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
209 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
210 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
211 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
212 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
213 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
214 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
215 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
216 ; AVX512VPOPCNTDQ-NEXT: retq
175217 %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in)
176218 ret <32 x i8> %out
177219 }
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
22 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ
34
45 define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
56 ; AVX512F-LABEL: testv8i64:
3839 ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
3940 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
4041 ; AVX512BW-NEXT: retq
42 ;
43 ; AVX512VPOPCNTDQ-LABEL: testv8i64:
44 ; AVX512VPOPCNTDQ: ## BB#0:
45 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
46 ; AVX512VPOPCNTDQ-NEXT: retq
4147 %out = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %in)
4248 ret <8 x i64> %out
4349 }
9197 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
9298 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
9399 ; AVX512BW-NEXT: retq
100 ;
101 ; AVX512VPOPCNTDQ-LABEL: testv16i32:
102 ; AVX512VPOPCNTDQ: ## BB#0:
103 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
104 ; AVX512VPOPCNTDQ-NEXT: retq
94105 %out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in)
95106 ret <16 x i32> %out
96107 }
134145 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
135146 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
136147 ; AVX512BW-NEXT: retq
148 ;
149 ; AVX512VPOPCNTDQ-LABEL: testv32i16:
150 ; AVX512VPOPCNTDQ: ## BB#0:
151 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
152 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3
153 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
154 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
155 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
156 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0
157 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
158 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0
159 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm3
160 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0
161 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
162 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3
163 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
164 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
165 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
166 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
167 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1
168 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2
169 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
170 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
171 ; AVX512VPOPCNTDQ-NEXT: retq
137172 %out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in)
138173 ret <32 x i16> %out
139174 }
168203 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
169204 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
170205 ; AVX512BW-NEXT: retq
206 ;
207 ; AVX512VPOPCNTDQ-LABEL: testv64i8:
208 ; AVX512VPOPCNTDQ: ## BB#0:
209 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
210 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3
211 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
212 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
213 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
214 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0
215 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
216 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0
217 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3
218 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
219 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
220 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
221 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
222 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1
223 ; AVX512VPOPCNTDQ-NEXT: retq
171224 %out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in)
172225 ret <64 x i8> %out
173226 }
66 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
77 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
88 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ
910 ;
1011 ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
1112 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41
116117 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
117118 ; AVX-NEXT: retq
118119 ;
120 ; AVX512VPOPCNTDQ-LABEL: testv2i64:
121 ; AVX512VPOPCNTDQ: # BB#0:
122 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
123 ; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1
124 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
125 ; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
126 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
127 ; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
128 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
129 ; AVX512VPOPCNTDQ-NEXT: retq
130 ;
119131 ; X32-SSE-LABEL: testv2i64:
120132 ; X32-SSE: # BB#0:
121133 ; X32-SSE-NEXT: pxor %xmm1, %xmm1
282294 ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0
283295 ; AVX512CD-NEXT: vzeroupper
284296 ; AVX512CD-NEXT: retq
297 ;
298 ; AVX512VPOPCNTDQ-LABEL: testv2i64u:
299 ; AVX512VPOPCNTDQ: # BB#0:
300 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
301 ; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1
302 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
303 ; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
304 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
305 ; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
306 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
307 ; AVX512VPOPCNTDQ-NEXT: retq
285308 ;
286309 ; X32-SSE-LABEL: testv2i64u:
287310 ; X32-SSE: # BB#0:
500523 ; AVX512CD-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
501524 ; AVX512CD-NEXT: retq
502525 ;
526 ; AVX512VPOPCNTDQ-LABEL: testv4i32:
527 ; AVX512VPOPCNTDQ: # BB#0:
528 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
529 ; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1
530 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
531 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
532 ; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm1, %xmm0, %xmm0
533 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
534 ; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
535 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
536 ; AVX512VPOPCNTDQ-NEXT: retq
537 ;
503538 ; X32-SSE-LABEL: testv4i32:
504539 ; X32-SSE: # BB#0:
505540 ; X32-SSE-NEXT: pxor %xmm1, %xmm1
699734 ; AVX512CD-NEXT: vzeroupper
700735 ; AVX512CD-NEXT: retq
701736 ;
737 ; AVX512VPOPCNTDQ-LABEL: testv4i32u:
738 ; AVX512VPOPCNTDQ: # BB#0:
739 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
740 ; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1
741 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
742 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
743 ; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm1, %xmm0, %xmm0
744 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
745 ; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
746 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
747 ; AVX512VPOPCNTDQ-NEXT: retq
748 ;
702749 ; X32-SSE-LABEL: testv4i32u:
703750 ; X32-SSE: # BB#0:
704751 ; X32-SSE-NEXT: pxor %xmm1, %xmm1
842889 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
843890 ; AVX-NEXT: retq
844891 ;
892 ; AVX512VPOPCNTDQ-LABEL: testv8i16:
893 ; AVX512VPOPCNTDQ: # BB#0:
894 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
895 ; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1
896 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
897 ; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
898 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
899 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
900 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
901 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
902 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
903 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
904 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
905 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
906 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1
907 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
908 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0
909 ; AVX512VPOPCNTDQ-NEXT: retq
910 ;
845911 ; X32-SSE-LABEL: testv8i16:
846912 ; X32-SSE: # BB#0:
847913 ; X32-SSE-NEXT: pxor %xmm1, %xmm1
9831049 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
9841050 ; AVX-NEXT: retq
9851051 ;
1052 ; AVX512VPOPCNTDQ-LABEL: testv8i16u:
1053 ; AVX512VPOPCNTDQ: # BB#0:
1054 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
1055 ; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1
1056 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
1057 ; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1058 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1059 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
1060 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1061 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1062 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
1063 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
1064 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
1065 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
1066 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1
1067 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
1068 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0
1069 ; AVX512VPOPCNTDQ-NEXT: retq
1070 ;
9861071 ; X32-SSE-LABEL: testv8i16u:
9871072 ; X32-SSE: # BB#0:
9881073 ; X32-SSE-NEXT: pxor %xmm1, %xmm1
11051190 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
11061191 ; AVX-NEXT: retq
11071192 ;
1193 ; AVX512VPOPCNTDQ-LABEL: testv16i8:
1194 ; AVX512VPOPCNTDQ: # BB#0:
1195 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
1196 ; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, %xmm1
1197 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
1198 ; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1199 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1200 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
1201 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1202 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1203 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
1204 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
1205 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
1206 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
1207 ; AVX512VPOPCNTDQ-NEXT: retq
1208 ;
11081209 ; X32-SSE-LABEL: testv16i8:
11091210 ; X32-SSE: # BB#0:
11101211 ; X32-SSE-NEXT: pxor %xmm1, %xmm1
12231324 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
12241325 ; AVX-NEXT: retq
12251326 ;
1327 ; AVX512VPOPCNTDQ-LABEL: testv16i8u:
1328 ; AVX512VPOPCNTDQ: # BB#0:
1329 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
1330 ; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, %xmm1
1331 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
1332 ; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1333 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1334 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
1335 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1336 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1337 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
1338 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
1339 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
1340 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
1341 ; AVX512VPOPCNTDQ-NEXT: retq
1342 ;
12261343 ; X32-SSE-LABEL: testv16i8u:
12271344 ; X32-SSE: # BB#0:
12281345 ; X32-SSE-NEXT: pxor %xmm1, %xmm1
12571374 ; AVX-NEXT: vmovq %rax, %xmm0
12581375 ; AVX-NEXT: retq
12591376 ;
1377 ; AVX512VPOPCNTDQ-LABEL: foldv2i64:
1378 ; AVX512VPOPCNTDQ: # BB#0:
1379 ; AVX512VPOPCNTDQ-NEXT: movl $8, %eax
1380 ; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0
1381 ; AVX512VPOPCNTDQ-NEXT: retq
1382 ;
12601383 ; X32-SSE-LABEL: foldv2i64:
12611384 ; X32-SSE: # BB#0:
12621385 ; X32-SSE-NEXT: movl $8, %eax
12791402 ; AVX-NEXT: vmovq %rax, %xmm0
12801403 ; AVX-NEXT: retq
12811404 ;
1405 ; AVX512VPOPCNTDQ-LABEL: foldv2i64u:
1406 ; AVX512VPOPCNTDQ: # BB#0:
1407 ; AVX512VPOPCNTDQ-NEXT: movl $8, %eax
1408 ; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0
1409 ; AVX512VPOPCNTDQ-NEXT: retq
1410 ;
12821411 ; X32-SSE-LABEL: foldv2i64u:
12831412 ; X32-SSE: # BB#0:
12841413 ; X32-SSE-NEXT: movl $8, %eax
12991428 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
13001429 ; AVX-NEXT: retq
13011430 ;
1431 ; AVX512VPOPCNTDQ-LABEL: foldv4i32:
1432 ; AVX512VPOPCNTDQ: # BB#0:
1433 ; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1434 ; AVX512VPOPCNTDQ-NEXT: retq
1435 ;
13021436 ; X32-SSE-LABEL: foldv4i32:
13031437 ; X32-SSE: # BB#0:
13041438 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
13181452 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
13191453 ; AVX-NEXT: retq
13201454 ;
1455 ; AVX512VPOPCNTDQ-LABEL: foldv4i32u:
1456 ; AVX512VPOPCNTDQ: # BB#0:
1457 ; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1458 ; AVX512VPOPCNTDQ-NEXT: retq
1459 ;
13211460 ; X32-SSE-LABEL: foldv4i32u:
13221461 ; X32-SSE: # BB#0:
13231462 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
13371476 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
13381477 ; AVX-NEXT: retq
13391478 ;
1479 ; AVX512VPOPCNTDQ-LABEL: foldv8i16:
1480 ; AVX512VPOPCNTDQ: # BB#0:
1481 ; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1482 ; AVX512VPOPCNTDQ-NEXT: retq
1483 ;
13401484 ; X32-SSE-LABEL: foldv8i16:
13411485 ; X32-SSE: # BB#0:
13421486 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
13561500 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
13571501 ; AVX-NEXT: retq
13581502 ;
1503 ; AVX512VPOPCNTDQ-LABEL: foldv8i16u:
1504 ; AVX512VPOPCNTDQ: # BB#0:
1505 ; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1506 ; AVX512VPOPCNTDQ-NEXT: retq
1507 ;
13591508 ; X32-SSE-LABEL: foldv8i16u:
13601509 ; X32-SSE: # BB#0:
13611510 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
13751524 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
13761525 ; AVX-NEXT: retq
13771526 ;
1527 ; AVX512VPOPCNTDQ-LABEL: foldv16i8:
1528 ; AVX512VPOPCNTDQ: # BB#0:
1529 ; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1530 ; AVX512VPOPCNTDQ-NEXT: retq
1531 ;
13781532 ; X32-SSE-LABEL: foldv16i8:
13791533 ; X32-SSE: # BB#0:
13801534 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
13941548 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
13951549 ; AVX-NEXT: retq
13961550 ;
1551 ; AVX512VPOPCNTDQ-LABEL: foldv16i8u:
1552 ; AVX512VPOPCNTDQ: # BB#0:
1553 ; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1554 ; AVX512VPOPCNTDQ-NEXT: retq
1555 ;
13971556 ; X32-SSE-LABEL: foldv16i8u:
13981557 ; X32-SSE: # BB#0:
13991558 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
22 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
33 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
44 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
56 ;
67 ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
78 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2
9192 ; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
9293 ; AVX512CD-NEXT: retq
9394 ;
95 ; AVX512VPOPCNTDQ-LABEL: testv4i64:
96 ; AVX512VPOPCNTDQ: # BB#0:
97 ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
98 ; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1
99 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
100 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
101 ; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm1, %ymm0, %ymm0
102 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
103 ; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0
104 ; AVX512VPOPCNTDQ-NEXT: retq
105 ;
94106 ; X32-AVX-LABEL: testv4i64:
95107 ; X32-AVX: # BB#0:
96108 ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
180192 ; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
181193 ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0
182194 ; AVX512CD-NEXT: retq
195 ;
196 ; AVX512VPOPCNTDQ-LABEL: testv4i64u:
197 ; AVX512VPOPCNTDQ: # BB#0:
198 ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
199 ; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1
200 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
201 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
202 ; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm1, %ymm0, %ymm0
203 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
204 ; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0
205 ; AVX512VPOPCNTDQ-NEXT: retq
183206 ;
184207 ; X32-AVX-LABEL: testv4i64u:
185208 ; X32-AVX: # BB#0:
306329 ; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
307330 ; AVX512CD-NEXT: retq
308331 ;
332 ; AVX512VPOPCNTDQ-LABEL: testv8i32:
333 ; AVX512VPOPCNTDQ: # BB#0:
334 ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
335 ; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1
336 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
337 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
338 ; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm1, %ymm0, %ymm0
339 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
340 ; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0
341 ; AVX512VPOPCNTDQ-NEXT: retq
342 ;
309343 ; X32-AVX-LABEL: testv8i32:
310344 ; X32-AVX: # BB#0:
311345 ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
413447 ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0
414448 ; AVX512CD-NEXT: retq
415449 ;
450 ; AVX512VPOPCNTDQ-LABEL: testv8i32u:
451 ; AVX512VPOPCNTDQ: # BB#0:
452 ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
453 ; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1
454 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
455 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
456 ; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm1, %ymm0, %ymm0
457 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
458 ; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0
459 ; AVX512VPOPCNTDQ-NEXT: retq
460 ;
416461 ; X32-AVX-LABEL: testv8i32u:
417462 ; X32-AVX: # BB#0:
418463 ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
531576 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
532577 ; AVX512CD-NEXT: retq
533578 ;
579 ; AVX512VPOPCNTDQ-LABEL: testv16i16:
580 ; AVX512VPOPCNTDQ: # BB#0:
581 ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
582 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1
583 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
584 ; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
585 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
586 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
587 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
588 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
589 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
590 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
591 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
592 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
593 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
594 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
595 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
596 ; AVX512VPOPCNTDQ-NEXT: retq
597 ;
534598 ; X32-AVX-LABEL: testv16i16:
535599 ; X32-AVX: # BB#0:
536600 ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
646710 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
647711 ; AVX512CD-NEXT: retq
648712 ;
713 ; AVX512VPOPCNTDQ-LABEL: testv16i16u:
714 ; AVX512VPOPCNTDQ: # BB#0:
715 ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
716 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1
717 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
718 ; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
719 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
720 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
721 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
722 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
723 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
724 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
725 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
726 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
727 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
728 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
729 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
730 ; AVX512VPOPCNTDQ-NEXT: retq
731 ;
649732 ; X32-AVX-LABEL: testv16i16u:
650733 ; X32-AVX: # BB#0:
651734 ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
746829 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
747830 ; AVX512CD-NEXT: retq
748831 ;
832 ; AVX512VPOPCNTDQ-LABEL: testv32i8:
833 ; AVX512VPOPCNTDQ: # BB#0:
834 ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
835 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1
836 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
837 ; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
838 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
839 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
840 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
841 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
842 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
843 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
844 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
845 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
846 ; AVX512VPOPCNTDQ-NEXT: retq
847 ;
749848 ; X32-AVX-LABEL: testv32i8:
750849 ; X32-AVX: # BB#0:
751850 ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
843942 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
844943 ; AVX512CD-NEXT: retq
845944 ;
945 ; AVX512VPOPCNTDQ-LABEL: testv32i8u:
946 ; AVX512VPOPCNTDQ: # BB#0:
947 ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
948 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1
949 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
950 ; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
951 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
952 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
953 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
954 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
955 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
956 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
957 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
958 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
959 ; AVX512VPOPCNTDQ-NEXT: retq
960 ;
846961 ; X32-AVX-LABEL: testv32i8u:
847962 ; X32-AVX: # BB#0:
848963 ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
11 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
22 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW
33 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=-avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
4 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ
45
56 define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
67 ; AVX512CD-LABEL: testv8i64:
6364 ; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
6465 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
6566 ; AVX512BW-NEXT: retq
67 ;
68 ; AVX512VPOPCNTDQ-LABEL: testv8i64:
69 ; AVX512VPOPCNTDQ: ## BB#0:
70 ; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
71 ; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1
72 ; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
73 ; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
74 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
75 ; AVX512VPOPCNTDQ-NEXT: retq
6676 %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
6777 ret <8 x i64> %out
6878 }
104114 ; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
105115 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
106116 ; AVX512BW-NEXT: retq
117 ;
118 ; AVX512VPOPCNTDQ-LABEL: testv8i64u:
119 ; AVX512VPOPCNTDQ: ## BB#0:
120 ; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
121 ; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1
122 ; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
123 ; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
124 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
125 ; AVX512VPOPCNTDQ-NEXT: retq
107126 %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1)
108127 ret <8 x i64> %out
109128 }
185204 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
186205 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
187206 ; AVX512BW-NEXT: retq
207 ;
208 ; AVX512VPOPCNTDQ-LABEL: testv16i32:
209 ; AVX512VPOPCNTDQ: ## BB#0:
210 ; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
211 ; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1
212 ; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0
213 ; AVX512VPOPCNTDQ-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
214 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
215 ; AVX512VPOPCNTDQ-NEXT: retq
188216 %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 0)
189217 ret <16 x i32> %out
190218 }
230258 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
231259 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
232260 ; AVX512BW-NEXT: retq
261 ;
262 ; AVX512VPOPCNTDQ-LABEL: testv16i32u:
263 ; AVX512VPOPCNTDQ: ## BB#0:
264 ; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
265 ; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1
266 ; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0
267 ; AVX512VPOPCNTDQ-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
268 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
269 ; AVX512VPOPCNTDQ-NEXT: retq
233270 %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 -1)
234271 ret <16 x i32> %out
235272 }
304341 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
305342 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
306343 ; AVX512BW-NEXT: retq
344 ;
345 ; AVX512VPOPCNTDQ-LABEL: testv32i16:
346 ; AVX512VPOPCNTDQ: ## BB#0:
347 ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
348 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3
349 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
350 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
351 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm0, %ymm0
352 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
353 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
354 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
355 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5
356 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
357 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0
358 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0
359 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0
360 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5
361 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0
362 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
363 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2
364 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
365 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm1, %ymm1
366 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
367 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
368 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
369 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1
370 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1
371 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
372 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2
373 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
374 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
375 ; AVX512VPOPCNTDQ-NEXT: retq
307376 %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0)
308377 ret <32 x i16> %out
309378 }
378447 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
379448 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
380449 ; AVX512BW-NEXT: retq
450 ;
451 ; AVX512VPOPCNTDQ-LABEL: testv32i16u:
452 ; AVX512VPOPCNTDQ: ## BB#0:
453 ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
454 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3
455 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
456 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
457 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm0, %ymm0
458 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
459 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
460 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
461 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5
462 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
463 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0
464 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0
465 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0
466 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5
467 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0
468 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
469 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2
470 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
471 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm1, %ymm1
472 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
473 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
474 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
475 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1
476 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1
477 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
478 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2
479 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
480 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
481 ; AVX512VPOPCNTDQ-NEXT: retq
381482 %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1)
382483 ret <32 x i16> %out
383484 }
440541 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
441542 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
442543 ; AVX512BW-NEXT: retq
544 ;
545 ; AVX512VPOPCNTDQ-LABEL: testv64i8:
546 ; AVX512VPOPCNTDQ: ## BB#0:
547 ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
548 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3
549 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
550 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
551 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0
552 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
553 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
554 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
555 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5
556 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
557 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0
558 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0
559 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0
560 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2
561 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
562 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1
563 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
564 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
565 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
566 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1
567 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1
568 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
569 ; AVX512VPOPCNTDQ-NEXT: retq
443570 %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0)
444571 ret <64 x i8> %out
445572 }
502629 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
503630 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
504631 ; AVX512BW-NEXT: retq
632 ;
633 ; AVX512VPOPCNTDQ-LABEL: testv64i8u:
634 ; AVX512VPOPCNTDQ: ## BB#0:
635 ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
636 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3
637 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
638 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
639 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0
640 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
641 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
642 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
643 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5
644 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
645 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0
646 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0
647 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0
648 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2
649 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
650 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1
651 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
652 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
653 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
654 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1
655 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1
656 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
657 ; AVX512VPOPCNTDQ-NEXT: retq
505658 %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1)
506659 ret <64 x i8> %out
507660 }
0 # RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s
11 # RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 -mcpu=skx | FileCheck --check-prefix=CHECK-SKX %s
2 # RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=AVX512VPOPCNTDQ
23
34 # CHECK: vpbroadcastd %xmm18, %zmm28 {%k7} {z}
45 0x62 0x22 0x7d 0xcf 0x58 0xe2
264265
265266 # CHECK: vscatterqpd %ymm19, 256(%r9,%ymm31) {%k1}
266267 0x62 0x82 0xfd 0x21 0xa3 0x5c 0x39 0x20
268
269 #####################################################
270 # POPULATION COUNT #
271 #####################################################
272
# Disassembly inputs for vpopcntd / vpopcntq on zmm registers. For each of the
# two mnemonics there is a register-source form under a write-mask, the same
# form with {z}, and an unmasked (%rcx) memory-source form.
# Every byte sequence below carries the byte 0x55 in fifth position; the {z}
# variants differ from their plain masked counterparts only in the fourth byte
# (0x4c -> 0xcc for %k4, 0x4e -> 0xce for %k6).
273 # AVX512VPOPCNTDQ: vpopcntd %zmm21, %zmm26 {%k4}
274 0x62 0x22 0x7d 0x4c 0x55 0xd5
275
276 # AVX512VPOPCNTDQ: vpopcntd %zmm21, %zmm26 {%k4} {z}
277 0x62 0x22 0x7d 0xcc 0x55 0xd5
278
279 # AVX512VPOPCNTDQ: vpopcntd (%rcx), %zmm26
280 0x62 0x62 0x7d 0x48 0x55 0x11
281
282 # AVX512VPOPCNTDQ: vpopcntq %zmm21, %zmm17 {%k6}
283 0x62 0xa2 0xfd 0x4e 0x55 0xcd
284
285 # AVX512VPOPCNTDQ: vpopcntq %zmm21, %zmm17 {%k6} {z}
286 0x62 0xa2 0xfd 0xce 0x55 0xcd
287
288 # AVX512VPOPCNTDQ: vpopcntq (%rcx), %zmm17
289 0x62 0xe2 0xfd 0x48 0x55 0x09
0 // RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avx512vpopcntdq --show-encoding %s | FileCheck %s
1
// vpopcntq with destination %zmm20.
// Covers: register source (%zmm25) unmasked, with {%k6}, and with {%k6} {z};
// full-width memory source; base+index*scale+displacement addressing; and the
// {1to8} broadcast memory source.
// The displacement cases bracket the point where the expected encoding switches
// between a one-byte and a four-byte displacement: 4096 / -4096 (full-width)
// and 1016 / -1024 ({1to8}) are expected as a single displacement byte, while
// 4064 / -4128 and 1024 / -1032 are expected as four bytes.
2 // CHECK: vpopcntq %zmm25, %zmm20
3 // CHECK: encoding: [0x62,0x82,0xfd,0x48,0x55,0xe1]
4 vpopcntq %zmm25, %zmm20
5
6 // CHECK: vpopcntq %zmm25, %zmm20 {%k6}
7 // CHECK: encoding: [0x62,0x82,0xfd,0x4e,0x55,0xe1]
8 vpopcntq %zmm25, %zmm20 {%k6}
9
10 // CHECK: vpopcntq %zmm25, %zmm20 {%k6} {z}
11 // CHECK: encoding: [0x62,0x82,0xfd,0xce,0x55,0xe1]
12 vpopcntq %zmm25, %zmm20 {%k6} {z}
13
14 // CHECK: vpopcntq (%rcx), %zmm20
15 // CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x21]
16 vpopcntq (%rcx), %zmm20
17
18 // CHECK: vpopcntq 291(%rax,%r14,8), %zmm20
19 // CHECK: encoding: [0x62,0xa2,0xfd,0x48,0x55,0xa4,0xf0,0x23,0x01,0x00,0x00]
20 vpopcntq 291(%rax,%r14,8), %zmm20
21
22 // CHECK: vpopcntq (%rcx){1to8}, %zmm20
23 // CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x21]
24 vpopcntq (%rcx){1to8}, %zmm20
25
26 // CHECK: vpopcntq 4064(%rdx), %zmm20
27 // CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0xa2,0xe0,0x0f,0x00,0x00]
28 vpopcntq 4064(%rdx), %zmm20
29
30 // CHECK: vpopcntq 4096(%rdx), %zmm20
31 // CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x62,0x40]
32 vpopcntq 4096(%rdx), %zmm20
33
34 // CHECK: vpopcntq -4096(%rdx), %zmm20
35 // CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x62,0xc0]
36 vpopcntq -4096(%rdx), %zmm20
37
38 // CHECK: vpopcntq -4128(%rdx), %zmm20
39 // CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0xa2,0xe0,0xef,0xff,0xff]
40 vpopcntq -4128(%rdx), %zmm20
41
42 // CHECK: vpopcntq 1016(%rdx){1to8}, %zmm20
43 // CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x62,0x7f]
44 vpopcntq 1016(%rdx){1to8}, %zmm20
45
46 // CHECK: vpopcntq 1024(%rdx){1to8}, %zmm20
47 // CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0xa2,0x00,0x04,0x00,0x00]
48 vpopcntq 1024(%rdx){1to8}, %zmm20
49
50 // CHECK: vpopcntq -1024(%rdx){1to8}, %zmm20
51 // CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x62,0x80]
52 vpopcntq -1024(%rdx){1to8}, %zmm20
53
54 // CHECK: vpopcntq -1032(%rdx){1to8}, %zmm20
55 // CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0xa2,0xf8,0xfb,0xff,0xff]
56 vpopcntq -1032(%rdx){1to8}, %zmm20
57
// vpopcntq with destination %zmm17.
// Same set of operand forms as a second register pairing: register source
// (%zmm21) unmasked, with {%k6}, and with {%k6} {z}; full-width memory source;
// base+index*scale+displacement addressing (displacement 4660); and the {1to8}
// broadcast memory source.
// As the expected bytes show, 4096 / -4096 (full-width) and 1016 / -1024
// ({1to8}) use a one-byte displacement, while 4064 / -4128 and 1024 / -1032
// use a four-byte displacement.
58 // CHECK: vpopcntq %zmm21, %zmm17
59 // CHECK: encoding: [0x62,0xa2,0xfd,0x48,0x55,0xcd]
60 vpopcntq %zmm21, %zmm17
61
62 // CHECK: vpopcntq %zmm21, %zmm17 {%k6}
63 // CHECK: encoding: [0x62,0xa2,0xfd,0x4e,0x55,0xcd]
64 vpopcntq %zmm21, %zmm17 {%k6}
65
66 // CHECK: vpopcntq %zmm21, %zmm17 {%k6} {z}
67 // CHECK: encoding: [0x62,0xa2,0xfd,0xce,0x55,0xcd]
68 vpopcntq %zmm21, %zmm17 {%k6} {z}
69
70 // CHECK: vpopcntq (%rcx), %zmm17
71 // CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x09]
72 vpopcntq (%rcx), %zmm17
73
74 // CHECK: vpopcntq 4660(%rax,%r14,8), %zmm17
75 // CHECK: encoding: [0x62,0xa2,0xfd,0x48,0x55,0x8c,0xf0,0x34,0x12,0x00,0x00]
76 vpopcntq 4660(%rax,%r14,8), %zmm17
77
78 // CHECK: vpopcntq (%rcx){1to8}, %zmm17
79 // CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x09]
80 vpopcntq (%rcx){1to8}, %zmm17
81
82 // CHECK: vpopcntq 4064(%rdx), %zmm17
83 // CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x8a,0xe0,0x0f,0x00,0x00]
84 vpopcntq 4064(%rdx), %zmm17
85
86 // CHECK: vpopcntq 4096(%rdx), %zmm17
87 // CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x4a,0x40]
88 vpopcntq 4096(%rdx), %zmm17
89
90 // CHECK: vpopcntq -4096(%rdx), %zmm17
91 // CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x4a,0xc0]
92 vpopcntq -4096(%rdx), %zmm17
93
94 // CHECK: vpopcntq -4128(%rdx), %zmm17
95 // CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x8a,0xe0,0xef,0xff,0xff]
96 vpopcntq -4128(%rdx), %zmm17
97
98 // CHECK: vpopcntq 1016(%rdx){1to8}, %zmm17
99 // CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x4a,0x7f]
100 vpopcntq 1016(%rdx){1to8}, %zmm17
101
102 // CHECK: vpopcntq 1024(%rdx){1to8}, %zmm17
103 // CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x8a,0x00,0x04,0x00,0x00]
104 vpopcntq 1024(%rdx){1to8}, %zmm17
105
106 // CHECK: vpopcntq -1024(%rdx){1to8}, %zmm17
107 // CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x4a,0x80]
108 vpopcntq -1024(%rdx){1to8}, %zmm17
109
110 // CHECK: vpopcntq -1032(%rdx){1to8}, %zmm17
111 // CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x8a,0xf8,0xfb,0xff,0xff]
112 vpopcntq -1032(%rdx){1to8}, %zmm17
113
// vpopcntd with destination %zmm25.
// Covers: register source (%zmm19) unmasked, with {%k4}, and with {%k4} {z};
// full-width memory source; base+index*scale+displacement addressing; and the
// {1to16} broadcast memory source.
// The full-width displacement cases match the vpopcntq ones (4096 / -4096 as
// one byte, 4064 / -4128 as four). For {1to16} the one-byte cases are
// 508 / -512 and the four-byte cases are 512 / -516, i.e. half the {1to8}
// values used for vpopcntq.
114 // CHECK: vpopcntd %zmm19, %zmm25
115 // CHECK: encoding: [0x62,0x22,0x7d,0x48,0x55,0xcb]
116 vpopcntd %zmm19, %zmm25
117
118 // CHECK: vpopcntd %zmm19, %zmm25 {%k4}
119 // CHECK: encoding: [0x62,0x22,0x7d,0x4c,0x55,0xcb]
120 vpopcntd %zmm19, %zmm25 {%k4}
121
122 // CHECK: vpopcntd %zmm19, %zmm25 {%k4} {z}
123 // CHECK: encoding: [0x62,0x22,0x7d,0xcc,0x55,0xcb]
124 vpopcntd %zmm19, %zmm25 {%k4} {z}
125
126 // CHECK: vpopcntd (%rcx), %zmm25
127 // CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x09]
128 vpopcntd (%rcx), %zmm25
129
130 // CHECK: vpopcntd 291(%rax,%r14,8), %zmm25
131 // CHECK: encoding: [0x62,0x22,0x7d,0x48,0x55,0x8c,0xf0,0x23,0x01,0x00,0x00]
132 vpopcntd 291(%rax,%r14,8), %zmm25
133
134 // CHECK: vpopcntd (%rcx){1to16}, %zmm25
135 // CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x09]
136 vpopcntd (%rcx){1to16}, %zmm25
137
138 // CHECK: vpopcntd 4064(%rdx), %zmm25
139 // CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x8a,0xe0,0x0f,0x00,0x00]
140 vpopcntd 4064(%rdx), %zmm25
141
142 // CHECK: vpopcntd 4096(%rdx), %zmm25
143 // CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x4a,0x40]
144 vpopcntd 4096(%rdx), %zmm25
145
146 // CHECK: vpopcntd -4096(%rdx), %zmm25
147 // CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x4a,0xc0]
148 vpopcntd -4096(%rdx), %zmm25
149
150 // CHECK: vpopcntd -4128(%rdx), %zmm25
151 // CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x8a,0xe0,0xef,0xff,0xff]
152 vpopcntd -4128(%rdx), %zmm25
153
154 // CHECK: vpopcntd 508(%rdx){1to16}, %zmm25
155 // CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x4a,0x7f]
156 vpopcntd 508(%rdx){1to16}, %zmm25
157
158 // CHECK: vpopcntd 512(%rdx){1to16}, %zmm25
159 // CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x8a,0x00,0x02,0x00,0x00]
160 vpopcntd 512(%rdx){1to16}, %zmm25
161
162 // CHECK: vpopcntd -512(%rdx){1to16}, %zmm25
163 // CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x4a,0x80]
164 vpopcntd -512(%rdx){1to16}, %zmm25
165
166 // CHECK: vpopcntd -516(%rdx){1to16}, %zmm25
167 // CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x8a,0xfc,0xfd,0xff,0xff]
168 vpopcntd -516(%rdx){1to16}, %zmm25
169
// vpopcntd with destination %zmm26.
// Same set of operand forms as a second register pairing: register source
// (%zmm21) unmasked, with {%k4}, and with {%k4} {z}; full-width memory source;
// base+index*scale+displacement addressing (displacement 4660); and the
// {1to16} broadcast memory source.
// As the expected bytes show, 4096 / -4096 (full-width) and 508 / -512
// ({1to16}) use a one-byte displacement, while 4064 / -4128 and 512 / -516
// use a four-byte displacement.
170 // CHECK: vpopcntd %zmm21, %zmm26
171 // CHECK: encoding: [0x62,0x22,0x7d,0x48,0x55,0xd5]
172 vpopcntd %zmm21, %zmm26
173
174 // CHECK: vpopcntd %zmm21, %zmm26 {%k4}
175 // CHECK: encoding: [0x62,0x22,0x7d,0x4c,0x55,0xd5]
176 vpopcntd %zmm21, %zmm26 {%k4}
177
178 // CHECK: vpopcntd %zmm21, %zmm26 {%k4} {z}
179 // CHECK: encoding: [0x62,0x22,0x7d,0xcc,0x55,0xd5]
180 vpopcntd %zmm21, %zmm26 {%k4} {z}
181
182 // CHECK: vpopcntd (%rcx), %zmm26
183 // CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x11]
184 vpopcntd (%rcx), %zmm26
185
186 // CHECK: vpopcntd 4660(%rax,%r14,8), %zmm26
187 // CHECK: encoding: [0x62,0x22,0x7d,0x48,0x55,0x94,0xf0,0x34,0x12,0x00,0x00]
188 vpopcntd 4660(%rax,%r14,8), %zmm26
189
190 // CHECK: vpopcntd (%rcx){1to16}, %zmm26
191 // CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x11]
192 vpopcntd (%rcx){1to16}, %zmm26
193
194 // CHECK: vpopcntd 4064(%rdx), %zmm26
195 // CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x92,0xe0,0x0f,0x00,0x00]
196 vpopcntd 4064(%rdx), %zmm26
197
198 // CHECK: vpopcntd 4096(%rdx), %zmm26
199 // CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x52,0x40]
200 vpopcntd 4096(%rdx), %zmm26
201
202 // CHECK: vpopcntd -4096(%rdx), %zmm26
203 // CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x52,0xc0]
204 vpopcntd -4096(%rdx), %zmm26
205
206 // CHECK: vpopcntd -4128(%rdx), %zmm26
207 // CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x92,0xe0,0xef,0xff,0xff]
208 vpopcntd -4128(%rdx), %zmm26
209
210 // CHECK: vpopcntd 508(%rdx){1to16}, %zmm26
211 // CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x52,0x7f]
212 vpopcntd 508(%rdx){1to16}, %zmm26
213
214 // CHECK: vpopcntd 512(%rdx){1to16}, %zmm26
215 // CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x92,0x00,0x02,0x00,0x00]
216 vpopcntd 512(%rdx){1to16}, %zmm26
217
218 // CHECK: vpopcntd -512(%rdx){1to16}, %zmm26
219 // CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x52,0x80]
220 vpopcntd -512(%rdx){1to16}, %zmm26
221
222 // CHECK: vpopcntd -516(%rdx){1to16}, %zmm26
223 // CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x92,0xfc,0xfd,0xff,0xff]
224 vpopcntd -516(%rdx){1to16}, %zmm26