llvm.org GIT mirror llvm / afebd3f
[X86][AVX512VPOPCNTDQ] Improve support for v16i8/v8i16/v16i16 CTPOP. Zero extend to v16i32/v8i64, use VPOPCNTDQ instructions and truncate back. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306990 91177308-0d34-0410-b5e6-96231b3b80d8 Simon Pilgrim 2 years ago
7 changed file(s) with 165 addition(s) and 196 deletion(s). Raw diff Collapse all Expand all
2320123201 "Unknown CTPOP type to handle");
2320223202 SDLoc DL(Op.getNode());
2320323203 SDValue Op0 = Op.getOperand(0);
23204
23205 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
23206 if (Subtarget.hasVPOPCNTDQ()) {
23207 if (VT == MVT::v8i16) {
23208 Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
23209 Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
23210 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
23211 }
23212 if (VT == MVT::v16i8 || VT == MVT::v16i16) {
23213 Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
23214 Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
23215 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
23216 }
23217 }
2320423218
2320523219 if (!Subtarget.hasSSSE3()) {
2320623220 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
343343 ; SSE41-NEXT: psrlw $8, %xmm0
344344 ; SSE41-NEXT: retq
345345 ;
346 ; AVX-LABEL: testv8i16:
347 ; AVX: # BB#0:
348 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
349 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
350 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
351 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
352 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
353 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
354 ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
355 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
356 ; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
357 ; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
358 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
359 ; AVX-NEXT: retq
346 ; AVX1-LABEL: testv8i16:
347 ; AVX1: # BB#0:
348 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
349 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
350 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
351 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
352 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
353 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
354 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
355 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
356 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
357 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
358 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
359 ; AVX1-NEXT: retq
360 ;
361 ; AVX2-LABEL: testv8i16:
362 ; AVX2: # BB#0:
363 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
364 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
365 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
366 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
367 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
368 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
369 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
370 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
371 ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
372 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
373 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
374 ; AVX2-NEXT: retq
375 ;
376 ; AVX512VPOPCNTDQ-LABEL: testv8i16:
377 ; AVX512VPOPCNTDQ: # BB#0:
378 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
379 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
380 ; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0
381 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
382 ; AVX512VPOPCNTDQ-NEXT: retq
360383 %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
361384 ret <8 x i16> %out
362385 }
430453 ; SSE41-NEXT: movdqa %xmm1, %xmm0
431454 ; SSE41-NEXT: retq
432455 ;
433 ; AVX-LABEL: testv16i8:
434 ; AVX: # BB#0:
435 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
436 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
437 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
438 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
439 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
440 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
441 ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
442 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
443 ; AVX-NEXT: retq
456 ; AVX1-LABEL: testv16i8:
457 ; AVX1: # BB#0:
458 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
459 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
460 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
461 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
462 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
463 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
464 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
465 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
466 ; AVX1-NEXT: retq
467 ;
468 ; AVX2-LABEL: testv16i8:
469 ; AVX2: # BB#0:
470 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
471 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
472 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
473 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
474 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
475 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
476 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
477 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
478 ; AVX2-NEXT: retq
479 ;
480 ; AVX512VPOPCNTDQ-LABEL: testv16i8:
481 ; AVX512VPOPCNTDQ: # BB#0:
482 ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
483 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
484 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
485 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
486 ; AVX512VPOPCNTDQ-NEXT: retq
444487 %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
445488 ret <16 x i8> %out
446489 }
154154 ;
155155 ; AVX512VPOPCNTDQ-LABEL: testv16i16:
156156 ; AVX512VPOPCNTDQ: # BB#0:
157 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
158 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
159 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
160 ; AVX512VPOPCNTDQ-NEXT: retq
161 %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in)
162 ret <16 x i16> %out
163 }
164
165 define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
166 ; AVX1-LABEL: testv32i8:
167 ; AVX1: # BB#0:
168 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
169 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
170 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
171 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
172 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
173 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
174 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
175 ; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
176 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
177 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3
178 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
179 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
180 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
181 ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
182 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
183 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
184 ; AVX1-NEXT: retq
185 ;
186 ; AVX2-LABEL: testv32i8:
187 ; AVX2: # BB#0:
188 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
189 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
190 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
191 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
192 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
193 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
194 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
195 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
196 ; AVX2-NEXT: retq
197 ;
198 ; AVX512VPOPCNTDQ-LABEL: testv32i8:
199 ; AVX512VPOPCNTDQ: # BB#0:
157200 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
158201 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
159202 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
162205 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
163206 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
164207 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
165 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
166 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
167 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
168 ; AVX512VPOPCNTDQ-NEXT: retq
169 %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in)
170 ret <16 x i16> %out
171 }
172
173 define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
174 ; AVX1-LABEL: testv32i8:
175 ; AVX1: # BB#0:
176 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
177 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
178 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
179 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
180 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
181 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
182 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
183 ; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
184 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
185 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3
186 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
187 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
188 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
189 ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
190 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
191 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
192 ; AVX1-NEXT: retq
193 ;
194 ; AVX2-LABEL: testv32i8:
195 ; AVX2: # BB#0:
196 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
197 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
198 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
199 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
200 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
201 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
202 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
203 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
204 ; AVX2-NEXT: retq
205 ;
206 ; AVX512VPOPCNTDQ-LABEL: testv32i8:
207 ; AVX512VPOPCNTDQ: # BB#0:
208 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
209 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
210 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
211 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
212 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
213 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
214 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
215 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
216208 ; AVX512VPOPCNTDQ-NEXT: retq
217209 %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in)
218210 ret <32 x i8> %out
148148 ;
149149 ; AVX512VPOPCNTDQ-LABEL: testv32i16:
150150 ; AVX512VPOPCNTDQ: # BB#0:
151 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
152 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3
153 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
154 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
155 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
156 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0
157 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
158 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0
159 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm3
160 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0
161 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
162 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3
163 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
164 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
165 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
166 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
167 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1
168 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2
169 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
170 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
151 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
152 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
153 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
154 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
155 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
156 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
171157 ; AVX512VPOPCNTDQ-NEXT: retq
172158 %out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in)
173159 ret <32 x i16> %out
927927 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
928928 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
929929 ; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0
930 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
931 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
932 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
933 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
934 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
935 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
936 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
937 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
938 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1
939 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
940 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0
930 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
931 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
932 ; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0
933 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
941934 ; AVX512VPOPCNTDQ-NEXT: retq
942935 ;
943936 ; X32-SSE-LABEL: testv8i16:
10941087 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
10951088 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
10961089 ; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0
1097 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1098 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
1099 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1100 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1101 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
1102 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
1103 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
1104 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
1105 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1
1106 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
1107 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0
1090 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1091 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
1092 ; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0
1093 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
11081094 ; AVX512VPOPCNTDQ-NEXT: retq
11091095 ;
11101096 ; X32-SSE-LABEL: testv8i16u:
12421228 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
12431229 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
12441230 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1245 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1246 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
1247 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1248 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1249 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
1250 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
1251 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
1252 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
1231 ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1232 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
1233 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
1234 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
12531235 ; AVX512VPOPCNTDQ-NEXT: retq
12541236 ;
12551237 ; X32-SSE-LABEL: testv16i8:
13831365 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
13841366 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
13851367 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1386 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1387 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
1388 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1389 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1390 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
1391 ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
1392 ; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
1393 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
1368 ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1369 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
1370 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
1371 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
13941372 ; AVX512VPOPCNTDQ-NEXT: retq
13951373 ;
13961374 ; X32-SSE-LABEL: testv16i8u:
583583 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
584584 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
585585 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
586 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
587 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
588 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
589 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
590 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
591 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
592 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
593 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
594 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
595 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
596 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
586 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
587 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
588 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
597589 ; AVX512VPOPCNTDQ-NEXT: retq
598590 ;
599591 ; X32-AVX-LABEL: testv16i16:
721713 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
722714 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
723715 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
724 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
725 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
726 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
727 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
728 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
729 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
730 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
731 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
732 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
733 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
734 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
716 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
717 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
718 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
735719 ; AVX512VPOPCNTDQ-NEXT: retq
736720 ;
737721 ; X32-AVX-LABEL: testv16i16u:
363363 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
364364 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
365365 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0
366 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
367 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
368 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
369 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5
370 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
371 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0
372 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0
373 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0
374 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5
375 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0
376 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
366 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
367 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
368 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
377369 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2
378370 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
379371 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1
380 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
381 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
382 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
383 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1
384 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1
385 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
386 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2
387 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
388 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
372 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
373 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
374 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
389375 ; AVX512VPOPCNTDQ-NEXT: retq
390376 %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0)
391377 ret <32 x i16> %out
471457 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
472458 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
473459 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0
474 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
475 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
476 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
477 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5
478 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
479 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0
480 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0
481 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0
482 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5
483 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0
484 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
460 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
461 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
462 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
485463 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2
486464 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
487465 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1
488 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
489 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
490 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
491 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1
492 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1
493 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
494 ; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2
495 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
496 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
466 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
467 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
468 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
497469 ; AVX512VPOPCNTDQ-NEXT: retq
498470 %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1)
499471 ret <32 x i16> %out