llvm.org GIT mirror: llvm / abb4a55
[DAGCombiner] Extending pattern detection for vector shuffle (REAPPLIED)

If all the operands of a BUILD_VECTOR extract elements from the same vector, then split that vector efficiently based on the maximum vector access index.

Reapplied with a fix to only work with simple value types.

Committed on behalf of @jbhateja (Jatin Bhateja)

Differential Revision: https://reviews.llvm.org/D35788

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310782 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim, 2 years ago
5 changed files with 109 additions and 118 deletions.
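To make the change easier to follow, here is a minimal standalone sketch (not the DAGCombiner code itself) of the splitting heuristic the patch adds, applied to the trunc_shuffle_v64i8 test further down, where a v16i8 BUILD_VECTOR extracts lanes 1, 5, ..., 57, 62 of a single v64i8 source. The local powerOf2Ceil is a stand-in for llvm::PowerOf2Ceil, and the real code additionally requires the source to have a simple value type.

#include <cstdio>

// Stand-in for llvm::PowerOf2Ceil: smallest power of two >= V (for V >= 1).
static unsigned powerOf2Ceil(unsigned V) {
  unsigned P = 1;
  while (P < V)
    P <<= 1;
  return P;
}

int main() {
  unsigned NumElems = 16;                        // result elements (v16i8)
  unsigned MaxIndex = 62;                        // highest source lane extracted
  unsigned NearestPow2 = powerOf2Ceil(MaxIndex); // 64
  if (NearestPow2 > 2 && NumElems * 2 < NearestPow2) {
    // Split the v64i8 source into two v32i8 halves; the BUILD_VECTOR can then
    // be matched as a two-input shuffle of those halves instead of being
    // scalarized into sixteen extract/insert pairs.
    unsigned SplitSize = NearestPow2 / 2;
    std::printf("split the source into two halves of %u lanes\n", SplitSize);
  }
  return 0;
}

Because the rounded-up width (64) is more than twice the number of result elements (16), the combine prefers splitting the source, which is what turns the long vpextrb/vpinsrb chains in the tests below into a few shuffles.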
@@ -14185,9 +14185,17 @@
   EVT InVT1 = VecIn1.getValueType();
   EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
 
-  unsigned Vec2Offset = InVT1.getVectorNumElements();
+  unsigned Vec2Offset = 0;
   unsigned NumElems = VT.getVectorNumElements();
   unsigned ShuffleNumElems = NumElems;
+
+  // In case both the input vectors are extracted from the same base
+  // vector we do not need an extra addend (Vec2Offset) while
+  // computing the shuffle mask.
+  if (!VecIn2 || !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
+      !(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
+      !(VecIn1.getOperand(0) == VecIn2.getOperand(0)))
+    Vec2Offset = InVT1.getVectorNumElements();
 
   // We can't generate a shuffle node with mismatched input and output types.
   // Try to make the types match the type of the output.
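The reason the addend can be dropped: later in this function (not shown in the hunk) the shuffle mask maps each BUILD_VECTOR element to a lane of the concatenated inputs, biasing lanes taken from the second input by Vec2Offset. The sketch below uses assumed concrete numbers (a 64-lane base vector split into 32-lane halves, one element extracted from lane 40) to show that when both inputs are halves of the same base vector, the recorded extract index already names a lane of the concatenation, so the correct addend is zero.

#include <cstdio>

int main() {
  unsigned SplitSize = 32; // lanes per half of the split base vector
  unsigned ExtIndex = 40;  // extract index recorded against the 64-lane base
  // Old behaviour: always add the first input's width for second-input lanes.
  unsigned OldMask = ExtIndex + SplitSize; // 72: past the 64-lane shuffle source
  // New behaviour when both inputs come from the same base: addend of zero.
  unsigned NewMask = ExtIndex + 0;         // 40: correct lane of input1 ++ input2
  std::printf("old addend -> %u, zero addend -> %u (valid lanes 0..63)\n",
              OldMask, NewMask);
  return 0;
}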
@@ -14335,7 +14343,6 @@
     if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
         !isa<ConstantSDNode>(Op.getOperand(1)))
       return SDValue();
-
     SDValue ExtractedFromVec = Op.getOperand(0);
 
     // All inputs must have the same element type as the output.
@@ -14357,6 +14364,44 @@
   // If we didn't find at least one input vector, bail out.
   if (VecIn.size() < 2)
     return SDValue();
+
+  // If all the operands of the BUILD_VECTOR extract from the same
+  // vector, then split the vector efficiently based on the maximum
+  // vector access index and adjust the VectorMask and
+  // VecIn accordingly.
+  if (VecIn.size() == 2) {
+    unsigned MaxIndex = 0;
+    unsigned NearestPow2 = 0;
+    SDValue Vec = VecIn.back();
+    EVT InVT = Vec.getValueType();
+    MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+    SmallVector<unsigned, 8> IndexVec(NumElems, 0);
+
+    for (unsigned i = 0; i < NumElems; i++) {
+      if (VectorMask[i] <= 0)
+        continue;
+      unsigned Index = N->getOperand(i).getConstantOperandVal(1);
+      IndexVec[i] = Index;
+      MaxIndex = std::max(MaxIndex, Index);
+    }
+
+    NearestPow2 = PowerOf2Ceil(MaxIndex);
+    if (InVT.isSimple() && (NearestPow2 > 2) && ((NumElems * 2) < NearestPow2)) {
+      unsigned SplitSize = NearestPow2 / 2;
+      EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
+                                     InVT.getVectorElementType(), SplitSize);
+      SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
+                                   DAG.getConstant(SplitSize, DL, IdxTy));
+      SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
+                                   DAG.getConstant(0, DL, IdxTy));
+      VecIn.pop_back();
+      VecIn.push_back(VecIn1);
+      VecIn.push_back(VecIn2);
+
+      for (unsigned i = 0; i < NumElems; i++)
+        VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
+    }
+  }
 
   // TODO: We want to sort the vectors by descending length, so that adjacent
   // pairs have similar length, and the longer vector is always first in the
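As a second worked example, the following standalone sketch (illustrative values only, not the code above) traces the remapping performed by the new block for the test_v16i32_0_1_2_12 case further down, where a <4 x i32> BUILD_VECTOR extracts lanes 0, 1, 2 and 12 of one v16i32 source.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned> IndexVec = {0, 1, 2, 12}; // extract index per element
  unsigned NumElems = IndexVec.size();
  unsigned MaxIndex = *std::max_element(IndexVec.begin(), IndexVec.end());
  unsigned NearestPow2 = 1;        // stand-in for llvm::PowerOf2Ceil(MaxIndex)
  while (NearestPow2 < MaxIndex)
    NearestPow2 <<= 1;
  if (NearestPow2 > 2 && NumElems * 2 < NearestPow2) {
    unsigned SplitSize = NearestPow2 / 2; // 8: split the v16i32 into two v8i32
    std::vector<int> VectorMask(NumElems);
    for (unsigned i = 0; i < NumElems; i++)
      VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
    std::printf("SplitSize = %u, VectorMask =", SplitSize);
    for (int M : VectorMask)
      std::printf(" %d", M);
    std::printf("\n"); // prints: SplitSize = 8, VectorMask = 1 1 1 2
  }
  return 0;
}

With the source split into two v8i32 halves, the existing shuffle lowering can blend the halves instead of emitting a vpextrd/vpinsrd chain, which is exactly the improvement visible in the test diffs below.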
@@ -260,81 +260,33 @@
 ;
 ; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm1
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BWVL-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX512BWVL-NEXT: vmovd %ecx, %xmm1
-; AVX512BWVL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BWVL-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
@@ -285,13 +285,10 @@
 define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
 ; ALL-LABEL: test_v16i32_0_1_2_12:
 ; ALL: # BB#0:
-; ALL-NEXT: vpextrd $1, %xmm0, %eax
-; ALL-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1
-; ALL-NEXT: vpextrd $2, %xmm0, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; ALL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; ALL-NEXT: vmovd %xmm0, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; ALL-NEXT: vextracti32x8 $1, %zmm0, %ymm1
+; ALL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; ALL-NEXT: vpbroadcastd %xmm1, %xmm1
+; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; ALL-NEXT: vzeroupper
 ; ALL-NEXT: retq
 %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
@@ -2725,20 +2725,17 @@
 define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) {
 ; AVX512F-LABEL: test_v8i64_2_5:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_v8i64_2_5:
 ; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vextracti32x4 $1, %zmm0, %xmm1
-; AVX512F-32-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512F-32-NEXT: vpextrd $2, %xmm0, %eax
-; AVX512F-32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX512F-32-NEXT: vpextrd $3, %xmm0, %eax
-; AVX512F-32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-32-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512F-32-NEXT: vzeroupper
 ; AVX512F-32-NEXT: retl
 %res = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> <i32 2, i32 5>
@@ -566,37 +566,37 @@
 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 =
-; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
-; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7
-; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX2-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 =
-; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm6
-; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7
-; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 =
-; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 =
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm5
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
+; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm7
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX2-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 =
+; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm5
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm7
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 =
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-NEXT: vpcmpeqb %xmm0, %xmm4, %xmm0
+; AVX2-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm0
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX2-NEXT: vpand %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -835,15 +835,15 @@
 ; AVX512-NEXT: vpmovdw %zmm1, %ymm3
 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; AVX512-NEXT: vpmovwb %zmm2, %ymm8
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 =
 ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm14
 ; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm9
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 =
 ; AVX512-NEXT: vpshufb %xmm7, %xmm9, %xmm4
 ; AVX512-NEXT: vpshufb %xmm7, %xmm14, %xmm5
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm10
 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm10
 ; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm6
 ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm4
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]