[DAGCombiner][x86] add transform/hook to decompose integer multiply into shift/add

This is an alternative to D37896. I don't see a way to decompose multiplies generically without a target hook to tell us when it's profitable.

ARM and AArch64 may be able to remove some duplicate code that overlaps with this transform.

As a first step, we're only getting the most clear wins on the vector examples requested in PR34474:
https://bugs.llvm.org/show_bug.cgi?id=34474

As noted in the code comment, it's likely that the x86 constraints are tighter than necessary, but it may not always be a win to replace a pmullw/pmulld.

Differential Revision: https://reviews.llvm.org/D52195

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@342554 91177308-0d34-0410-b5e6-96231b3b80d8

Sanjay Patel, 1 year, 9 months ago
13 changed file(s) with 354 addition(s) and 500 deletion(s).
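Before the diff itself, a worked illustration of the idea: the new combine relies on the identity that multiplying by a constant C where C - 1 or C + 1 is a power of two needs only one shift plus one add/sub, exactly the x * 33 --> (x << 5) + x and x * 15 --> (x << 4) - x cases spelled out in the new DAGCombiner comment below. The following is a minimal standalone C++ sketch of that identity (plain C++ for illustration only, not part of the patch; the helper name is made up, and __builtin_ctz assumes a GCC/Clang-style compiler):

// Standalone sketch of the arithmetic identity behind the combine (not LLVM code).
//   x * 33 == (x << 5) + x   because 33 - 1 == 32 == 1 << 5
//   x * 15 == (x << 4) - x   because 15 + 1 == 16 == 1 << 4
#include <cassert>
#include <cstdint>

static uint32_t mulByConstDecomposed(uint32_t X, uint32_t C) {
  if (C > 1 && ((C - 1) & (C - 2)) == 0) {   // C - 1 is a power of two
    unsigned Sh = __builtin_ctz(C - 1);      // GCC/Clang builtin
    return (X << Sh) + X;
  }
  if (((C + 1) & C) == 0) {                  // C + 1 is a power of two
    unsigned Sh = __builtin_ctz(C + 1);
    return (X << Sh) - X;
  }
  return X * C;                              // otherwise keep the real multiply
}

int main() {
  for (uint32_t X : {0u, 1u, 7u, 123456u})
    for (uint32_t C : {7u, 15u, 17u, 31u, 33u})
      assert(mulByConstDecomposed(X, C) == X * C);
}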
17201720 return false;
17211721 }
17221722
1723 /// Return true if it is profitable to transform an integer
1724 /// multiplication-by-constant into simpler operations like shifts and adds.
1725 /// This may be true if the target does not directly support the
1726 /// multiplication operation for the specified type or the sequence of simpler
1727 /// ops is faster than the multiply.
1728 virtual bool decomposeMulByConstant(EVT VT, SDValue C) const {
1729 return false;
1730 }
1731
17231732 //===--------------------------------------------------------------------===//
17241733 // TargetLowering Configuration Methods - These methods should be invoked by
17251734 // the derived class constructor to configure this object for the target.
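The x86 implementation of this hook appears further down in the diff. Purely to illustrate how a target is expected to consume the new TargetLowering hook, a hypothetical out-of-tree override might look like the sketch below (illustration only, not part of the patch; the class name is invented and the shown profitability policy is just one plausible choice that happens to mirror the x86 logic later in this commit):

// Hypothetical target override (illustration only, not in this patch).
bool MyTargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
  // If the plain multiply is already legal for this type, keep it.
  if (isOperationLegal(ISD::MUL, VT))
    return false;

  // Accept a scalar constant or a splat-vector constant of the form 2^N +/- 1.
  APInt MulC;
  if (auto *CN = dyn_cast<ConstantSDNode>(C))
    MulC = CN->getAPIntValue();
  else if (!ISD::isConstantSplatVector(C.getNode(), MulC))
    return false;

  return (MulC - 1).isPowerOf2() || (MulC + 1).isPowerOf2();
}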
29302930 getShiftAmountTy(N0.getValueType()))));
29312931 }
29322932
2933 // Try to transform multiply-by-(power-of-2 +/- 1) into shift and add/sub.
2934 // Examples: x * 33 --> (x << 5) + x
2935 // x * 15 --> (x << 4) - x
2936 if (N1IsConst && TLI.decomposeMulByConstant(VT, N1)) {
2937 // TODO: Negative constants can be handled by negating the result.
2938 // TODO: We could handle more general decomposition of any constant by
2939 // having the target set a limit on number of ops and making a
2940 // callback to determine that sequence (similar to sqrt expansion).
2941 unsigned MathOp = ISD::DELETED_NODE;
2942 if ((ConstValue1 - 1).isPowerOf2())
2943 MathOp = ISD::ADD;
2944 else if ((ConstValue1 + 1).isPowerOf2())
2945 MathOp = ISD::SUB;
2946
2947 if (MathOp != ISD::DELETED_NODE) {
2948 unsigned ShAmt = MathOp == ISD::ADD ? (ConstValue1 - 1).logBase2()
2949 : (ConstValue1 + 1).logBase2();
2950 assert(ShAmt > 0 && ShAmt < VT.getScalarSizeInBits() &&
2951 "Not expecting multiply-by-constant that could have simplified");
2952 SDLoc DL(N);
2953 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, N0,
2954 DAG.getConstant(ShAmt, DL, VT));
2955 return DAG.getNode(MathOp, DL, VT, Shl, N0);
2956 }
2957 }
2958
29332959 // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
29342960 if (N0.getOpcode() == ISD::SHL &&
29352961 isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
47194719 return false;
47204720
47214721 return true;
4722 }
4723
4724 bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
4725 // TODO: We handle scalars using custom code, but generic combining could make
4726 // that unnecessary.
4727 APInt MulC;
4728 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
4729 return false;
4730
4731 // If vector multiply is legal, assume that's faster than shl + add/sub.
4728 ; // TODO: Multiply is a complex op with higher latency and lower throughput in
4733 // most implementations, so this check could be loosened based on type
4734 // and/or a CPU attribute.
4735 if (isOperationLegal(ISD::MUL, VT))
4736 return false;
4737
4738 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2();
47224739 }
47234740
47244741 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
10331033
10341034 bool convertSelectOfConstantsToMath(EVT VT) const override;
10351035
1036 bool decomposeMulByConstant(EVT VT, SDValue C) const override;
1037
10361038 /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
10371039 /// with this index.
10381040 bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
523523 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
524524 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
525525 ; CHECK-SSE2-NEXT: psrld $2, %xmm2
526 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,5,5,5]
527 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
528 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
529 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
530 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
531 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
532 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
533 ; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
526 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
527 ; CHECK-SSE2-NEXT: pslld $2, %xmm1
528 ; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
529 ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
534530 ; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
535531 ; CHECK-SSE2-NEXT: psrld $31, %xmm0
536532 ; CHECK-SSE2-NEXT: retq
727723 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
728724 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
729725 ; CHECK-SSE2-NEXT: psrld $2, %xmm2
730 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,5,5,5]
731 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
732 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
733 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
734 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
735 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
736 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
737 ; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
726 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
727 ; CHECK-SSE2-NEXT: pslld $2, %xmm1
728 ; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
729 ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
738730 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
739731 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
740732 ; CHECK-SSE2-NEXT: psrld $31, %xmm0
1818 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
1919 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2020 ; CHECK-SSE2-NEXT: psrld $2, %xmm2
21 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,5,5,5]
22 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
23 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
24 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
25 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
26 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
27 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
28 ; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
21 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
22 ; CHECK-SSE2-NEXT: pslld $2, %xmm1
23 ; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
24 ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
2925 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
3026 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
3127 ; CHECK-SSE2-NEXT: psrld $31, %xmm0
115111 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
116112 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
117113 ; CHECK-SSE2-NEXT: psrld $2, %xmm2
118 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,5,5,5]
119 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
120 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
121 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
122 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
123 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
124 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
125 ; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
114 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
115 ; CHECK-SSE2-NEXT: pslld $2, %xmm1
116 ; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
117 ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
126118 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
127119 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
128120 ; CHECK-SSE2-NEXT: psrld $31, %xmm0
395395 ; SSE2-NEXT: psrld $31, %xmm1
396396 ; SSE2-NEXT: psrad $2, %xmm2
397397 ; SSE2-NEXT: paddd %xmm1, %xmm2
398 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [7,7,7,7]
399 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
400 ; SSE2-NEXT: pmuludq %xmm1, %xmm2
401 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
402 ; SSE2-NEXT: pmuludq %xmm1, %xmm3
403 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
404 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
405 ; SSE2-NEXT: psubd %xmm2, %xmm0
398 ; SSE2-NEXT: movdqa %xmm2, %xmm1
399 ; SSE2-NEXT: pslld $3, %xmm1
400 ; SSE2-NEXT: psubd %xmm1, %xmm2
401 ; SSE2-NEXT: paddd %xmm2, %xmm0
406402 ; SSE2-NEXT: retq
407403 ;
408404 ; SSE41-LABEL: test_rem7_4i32:
510506 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
511507 ; SSE2-NEXT: paddb %xmm2, %xmm1
512508 ; SSE2-NEXT: movdqa %xmm1, %xmm2
513 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
514 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
515 ; SSE2-NEXT: pmullw %xmm3, %xmm2
516 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
517 ; SSE2-NEXT: pand %xmm4, %xmm2
518 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
519 ; SSE2-NEXT: pmullw %xmm3, %xmm1
520 ; SSE2-NEXT: pand %xmm4, %xmm1
521 ; SSE2-NEXT: packuswb %xmm2, %xmm1
522 ; SSE2-NEXT: psubb %xmm1, %xmm0
509 ; SSE2-NEXT: psllw $3, %xmm2
510 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
511 ; SSE2-NEXT: psubb %xmm2, %xmm1
512 ; SSE2-NEXT: paddb %xmm0, %xmm1
513 ; SSE2-NEXT: movdqa %xmm1, %xmm0
523514 ; SSE2-NEXT: retq
524515 ;
525516 ; SSE41-LABEL: test_rem7_16i8:
543534 ; SSE41-NEXT: psrlw $7, %xmm1
544535 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
545536 ; SSE41-NEXT: paddb %xmm2, %xmm1
546 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
547 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
548 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
549 ; SSE41-NEXT: pmullw %xmm3, %xmm1
550 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
551 ; SSE41-NEXT: pand %xmm4, %xmm1
552 ; SSE41-NEXT: pmullw %xmm3, %xmm2
553 ; SSE41-NEXT: pand %xmm4, %xmm2
554 ; SSE41-NEXT: packuswb %xmm1, %xmm2
555 ; SSE41-NEXT: psubb %xmm2, %xmm0
537 ; SSE41-NEXT: movdqa %xmm1, %xmm2
538 ; SSE41-NEXT: psllw $3, %xmm2
539 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
540 ; SSE41-NEXT: psubb %xmm2, %xmm1
541 ; SSE41-NEXT: paddb %xmm1, %xmm0
556542 ; SSE41-NEXT: retq
557543 ;
558544 ; AVX1-LABEL: test_rem7_16i8:
575561 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
576562 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
577563 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
578 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
579 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
580 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
581 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
582 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
583 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
584 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
585 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
586 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
587 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
564 ; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
565 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
566 ; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
567 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
588568 ; AVX1-NEXT: retq
589569 ;
590570 ; AVX2NOBW-LABEL: test_rem7_16i8:
603583 ; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
604584 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
605585 ; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
606 ; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
607 ; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
608 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
609 ; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
610 ; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
611 ; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
612 ; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
613 ; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
586 ; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2
587 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
588 ; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
589 ; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
614590 ; AVX2NOBW-NEXT: vzeroupper
615591 ; AVX2NOBW-NEXT: retq
616592 ;
629605 ; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
630606 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
631607 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
632 ; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
633 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
634 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
635 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
608 ; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2
609 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
610 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
611 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
636612 ; AVX512BW-NEXT: vzeroupper
637613 ; AVX512BW-NEXT: retq
638614 %res = srem <16 x i8> %a,
368368 ; AVX1-NEXT: vpsrld $31, %xmm2, %xmm4
369369 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
370370 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
371 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7]
372 ; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
373 ; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
371 ; AVX1-NEXT: vpslld $3, %xmm2, %xmm4
372 ; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
373 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
374374 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
375375 ; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
376376 ; AVX1-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
380380 ; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
381381 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
382382 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
383 ; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
384 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
383 ; AVX1-NEXT: vpslld $3, %xmm2, %xmm3
384 ; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2
385 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
385386 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
386387 ; AVX1-NEXT: retq
387388 ;
414415 ; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm4
415416 ; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3
416417 ; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
417 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7]
418 ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
419 ; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1
418 ; AVX1-NEXT: vpsllw $3, %xmm3, %xmm4
419 ; AVX1-NEXT: vpsubw %xmm4, %xmm3, %xmm3
420 ; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
420421 ; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm2
421422 ; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm3
422423 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
423424 ; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
424 ; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
425 ; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
425 ; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3
426 ; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2
427 ; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
426428 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
427429 ; AVX1-NEXT: retq
428430 ;
442444 define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
443445 ; AVX1-LABEL: test_rem7_32i8:
444446 ; AVX1: # %bb.0:
445 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
446 ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3
447 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65427,65427,65427,65427,65427,65427,65427,65427]
448 ; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
449 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
450 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
447 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
448 ; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2
449 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
450 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
451 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
452 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
451453 ; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
452 ; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm4
454 ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4
453455 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
454 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
455 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm3
456 ; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm4
456 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
457 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm2
458 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm4
457459 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
458460 ; AVX1-NEXT: vpand %xmm8, %xmm4, %xmm4
459 ; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
460 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
461 ; AVX1-NEXT: vpand %xmm9, %xmm3, %xmm3
461 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
462 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
463 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
462464 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
463 ; AVX1-NEXT: vpxor %xmm7, %xmm3, %xmm3
464 ; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3
465 ; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3
466 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
467 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
468 ; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
469 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
470 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
471 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
472 ; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
473 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
474 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
475 ; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
476 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3
477 ; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
478 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
465 ; AVX1-NEXT: vpxor %xmm7, %xmm2, %xmm2
466 ; AVX1-NEXT: vpsubb %xmm7, %xmm2, %xmm2
467 ; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
468 ; AVX1-NEXT: vpsllw $3, %xmm2, %xmm4
469 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
470 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
471 ; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
472 ; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
473 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2
474 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
475 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
479476 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
480477 ; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
481 ; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm1
482 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
483 ; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
484 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
485 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm3
478 ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
479 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
480 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
481 ; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm2
482 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
486483 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
487 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
488 ; AVX1-NEXT: vpand %xmm9, %xmm1, %xmm1
489 ; AVX1-NEXT: vpxor %xmm7, %xmm1, %xmm1
490 ; AVX1-NEXT: vpsubb %xmm7, %xmm1, %xmm1
491 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
492 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
493 ; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
494 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
495 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
496 ; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
497 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
498 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
499 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
500 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
484 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
485 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
486 ; AVX1-NEXT: vpxor %xmm7, %xmm2, %xmm2
487 ; AVX1-NEXT: vpsubb %xmm7, %xmm2, %xmm2
488 ; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
489 ; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3
490 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
491 ; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
492 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
493 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
501494 ; AVX1-NEXT: retq
502495 ;
503496 ; AVX2NOBW-LABEL: test_rem7_32i8:
521514 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1
522515 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
523516 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
524 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
525 ; AVX2NOBW-NEXT: vpmovsxbw %xmm2, %ymm2
526 ; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
527 ; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
528 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm2, %xmm4
529 ; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
530 ; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm4, %xmm4
531 ; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
532 ; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
533 ; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
534 ; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm1, %ymm1
535 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm3
536 ; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm3, %xmm3
537 ; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm1, %xmm1
538 ; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
539 ; AVX2NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
540 ; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
517 ; AVX2NOBW-NEXT: vpsllw $3, %ymm1, %ymm2
518 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
519 ; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
520 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
541521 ; AVX2NOBW-NEXT: retq
542522 ;
543523 ; AVX512BW-LABEL: test_rem7_32i8:
555535 ; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm1
556536 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
557537 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
558 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
559 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
560 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
561 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
538 ; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2
539 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
540 ; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
541 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
562542 ; AVX512BW-NEXT: retq
563543 %res = srem <32 x i8> %a,
564544 ret <32 x i8> %res
360360 ; AVX512F-LABEL: test_rem7_64i8:
361361 ; AVX512F: # %bb.0:
362362 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
363 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm3
364 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
365 ; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm3
363 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
364 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
365 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
366 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
367 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm4
368 ; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
369 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
370 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2
371 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
372 ; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm2
373 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4
374 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
375 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
376 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
377 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
378 ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
379 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
380 ; AVX512F-NEXT: vpxor %ymm7, %ymm2, %ymm2
381 ; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2
382 ; AVX512F-NEXT: vpaddb %ymm4, %ymm2, %ymm2
383 ; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm4
384 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
385 ; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4
386 ; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
387 ; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
388 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
389 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
390 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
391 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
392 ; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4
393 ; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3
366394 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
367 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm4
368 ; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm4
369 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
370 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
371 ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3]
372 ; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm3
373 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm5
374 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
375 ; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm7
376 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
377 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
378 ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
379 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
380 ; AVX512F-NEXT: vpxor %ymm6, %ymm3, %ymm3
381 ; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
382 ; AVX512F-NEXT: vpaddb %ymm7, %ymm3, %ymm7
383 ; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm8
384 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
385 ; AVX512F-NEXT: vpmullw %ymm3, %ymm8, %ymm8
386 ; AVX512F-NEXT: vpmovsxwd %ymm8, %zmm8
387 ; AVX512F-NEXT: vpmovdb %zmm8, %xmm8
388 ; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm7
389 ; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm7
390 ; AVX512F-NEXT: vpmullw %ymm3, %ymm7, %ymm7
391 ; AVX512F-NEXT: vpmovsxwd %ymm7, %zmm7
392 ; AVX512F-NEXT: vpmovdb %zmm7, %xmm7
393 ; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
394 ; AVX512F-NEXT: vpsubb %ymm7, %ymm0, %ymm0
395 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm7
396 ; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm7
397 ; AVX512F-NEXT: vpmullw %ymm2, %ymm7, %ymm7
398 ; AVX512F-NEXT: vpsrlw $8, %ymm7, %ymm7
399 ; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm8
400 ; AVX512F-NEXT: vpmullw %ymm2, %ymm8, %ymm2
401 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
402 ; AVX512F-NEXT: vpackuswb %ymm7, %ymm2, %ymm2
395 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
403396 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
404397 ; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm2
405 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm7
406 ; AVX512F-NEXT: vpand %ymm4, %ymm7, %ymm4
398 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3
399 ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
407400 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
408 ; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
409 ; AVX512F-NEXT: vpxor %ymm6, %ymm2, %ymm2
410 ; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2
411 ; AVX512F-NEXT: vpaddb %ymm4, %ymm2, %ymm2
412 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm4
413 ; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
414 ; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
415 ; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
416 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
417 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
418 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
419 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
420 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
421 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
422 ; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
401 ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
402 ; AVX512F-NEXT: vpxor %ymm7, %ymm2, %ymm2
403 ; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2
404 ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
405 ; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
406 ; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3
407 ; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
408 ; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
423409 ; AVX512F-NEXT: retq
424410 ;
425411 ; AVX512BW-LABEL: test_rem7_64i8:
444430 ; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1
445431 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
446432 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
447 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm2
448 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
449 ; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
450 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
451 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
452 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
453 ; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1
454 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
455 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
456 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
433 ; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2
434 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
435 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
436 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
457437 ; AVX512BW-NEXT: retq
458438 %res = srem <64 x i8> %a,
459439 ret <64 x i8> %res
375375 ; SSE2-NEXT: psrld $1, %xmm1
376376 ; SSE2-NEXT: paddd %xmm2, %xmm1
377377 ; SSE2-NEXT: psrld $2, %xmm1
378 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
379 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
380 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
381 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
382 ; SSE2-NEXT: pmuludq %xmm2, %xmm3
383 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
384 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
385 ; SSE2-NEXT: psubd %xmm1, %xmm0
378 ; SSE2-NEXT: movdqa %xmm1, %xmm2
379 ; SSE2-NEXT: pslld $3, %xmm2
380 ; SSE2-NEXT: psubd %xmm2, %xmm1
381 ; SSE2-NEXT: paddd %xmm0, %xmm1
382 ; SSE2-NEXT: movdqa %xmm1, %xmm0
386383 ; SSE2-NEXT: retq
387384 ;
388385 ; SSE41-LABEL: test_rem7_4i32:
488485 ; SSE2-NEXT: psrlw $2, %xmm1
489486 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
490487 ; SSE2-NEXT: movdqa %xmm1, %xmm2
491 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
492 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
493 ; SSE2-NEXT: pmullw %xmm3, %xmm2
494 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
495 ; SSE2-NEXT: pand %xmm4, %xmm2
496 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
497 ; SSE2-NEXT: pmullw %xmm3, %xmm1
498 ; SSE2-NEXT: pand %xmm4, %xmm1
499 ; SSE2-NEXT: packuswb %xmm2, %xmm1
500 ; SSE2-NEXT: psubb %xmm1, %xmm0
488 ; SSE2-NEXT: psllw $3, %xmm2
489 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
490 ; SSE2-NEXT: psubb %xmm2, %xmm1
491 ; SSE2-NEXT: paddb %xmm0, %xmm1
492 ; SSE2-NEXT: movdqa %xmm1, %xmm0
501493 ; SSE2-NEXT: retq
502494 ;
503495 ; SSE41-LABEL: test_rem7_16i8:
504496 ; SSE41: # %bb.0:
505 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
506 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
507 ; SSE41-NEXT: pmullw %xmm2, %xmm1
508 ; SSE41-NEXT: psrlw $8, %xmm1
497 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
498 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37]
499 ; SSE41-NEXT: pmullw %xmm1, %xmm2
500 ; SSE41-NEXT: psrlw $8, %xmm2
509501 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
510502 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
511 ; SSE41-NEXT: pmullw %xmm2, %xmm3
503 ; SSE41-NEXT: pmullw %xmm1, %xmm3
512504 ; SSE41-NEXT: psrlw $8, %xmm3
513 ; SSE41-NEXT: packuswb %xmm3, %xmm1
514 ; SSE41-NEXT: movdqa %xmm0, %xmm2
515 ; SSE41-NEXT: psubb %xmm1, %xmm2
516 ; SSE41-NEXT: psrlw $1, %xmm2
505 ; SSE41-NEXT: packuswb %xmm3, %xmm2
506 ; SSE41-NEXT: movdqa %xmm0, %xmm1
507 ; SSE41-NEXT: psubb %xmm2, %xmm1
508 ; SSE41-NEXT: psrlw $1, %xmm1
509 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
510 ; SSE41-NEXT: paddb %xmm2, %xmm1
511 ; SSE41-NEXT: psrlw $2, %xmm1
512 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
513 ; SSE41-NEXT: movdqa %xmm1, %xmm2
514 ; SSE41-NEXT: psllw $3, %xmm2
517515 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
518 ; SSE41-NEXT: paddb %xmm1, %xmm2
519 ; SSE41-NEXT: psrlw $2, %xmm2
520 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
521 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
522 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
523 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
524 ; SSE41-NEXT: pmullw %xmm3, %xmm2
525 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
526 ; SSE41-NEXT: pand %xmm4, %xmm2
527 ; SSE41-NEXT: pmullw %xmm3, %xmm1
528 ; SSE41-NEXT: pand %xmm4, %xmm1
529 ; SSE41-NEXT: packuswb %xmm2, %xmm1
530 ; SSE41-NEXT: psubb %xmm1, %xmm0
516 ; SSE41-NEXT: psubb %xmm2, %xmm1
517 ; SSE41-NEXT: paddb %xmm0, %xmm1
518 ; SSE41-NEXT: movdqa %xmm1, %xmm0
531519 ; SSE41-NEXT: retq
532520 ;
533521 ; AVX1-LABEL: test_rem7_16i8:
547535 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
548536 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
549537 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
550 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
551 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
552 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
553 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
554 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
555 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
556 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
557 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
558 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
559 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
538 ; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
539 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
540 ; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
541 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
560542 ; AVX1-NEXT: retq
561543 ;
562544 ; AVX2NOBW-LABEL: test_rem7_16i8:
572554 ; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
573555 ; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1
574556 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
575 ; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
576 ; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
577 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
578 ; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
579 ; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
580 ; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
581 ; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
582 ; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
557 ; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2
558 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
559 ; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
560 ; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
583561 ; AVX2NOBW-NEXT: vzeroupper
584562 ; AVX2NOBW-NEXT: retq
585563 ;
595573 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
596574 ; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1
597575 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
598 ; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
599 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
600 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
601 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
576 ; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2
577 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
578 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
579 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
602580 ; AVX512BW-NEXT: vzeroupper
603581 ; AVX512BW-NEXT: retq
604582 %res = urem <16 x i8> %a,
376376 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
377377 ; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
378378 ; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
379 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7]
380 ; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
381 ; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
379 ; AVX1-NEXT: vpslld $3, %xmm2, %xmm4
380 ; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
381 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
382382 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
383383 ; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
384384 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
388388 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
389389 ; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
390390 ; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
391 ; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
392 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
391 ; AVX1-NEXT: vpslld $3, %xmm2, %xmm3
392 ; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2
393 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
393394 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
394395 ; AVX1-NEXT: retq
395396 ;
423424 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
424425 ; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm3
425426 ; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
426 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7]
427 ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
428 ; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1
427 ; AVX1-NEXT: vpsllw $3, %xmm3, %xmm4
428 ; AVX1-NEXT: vpsubw %xmm4, %xmm3, %xmm3
429 ; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
429430 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm2
430431 ; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3
431432 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
432433 ; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
433434 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
434 ; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
435 ; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
435 ; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3
436 ; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2
437 ; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
436438 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
437439 ; AVX1-NEXT: retq
438440 ;
465467 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
466468 ; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm4
467469 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
468 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
469 ; AVX1-NEXT: vpand %xmm8, %xmm4, %xmm4
470 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
471 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
470472 ; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
471473 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
472474 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
473475 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
474 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
475 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
476 ; AVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
477 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
478 ; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
479 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
480 ; AVX1-NEXT: vpmullw %xmm7, %xmm2, %xmm2
481 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
482 ; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
483 ; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
476 ; AVX1-NEXT: vpsllw $3, %xmm2, %xmm6
477 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
478 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
479 ; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2
480 ; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
484481 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
485482 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
486483 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
491488 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
492489 ; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm3
493490 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
494 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
491 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
495492 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
496493 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
497494 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
498 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
499 ; AVX1-NEXT: vpmullw %xmm7, %xmm3, %xmm3
500 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
501 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
502 ; AVX1-NEXT: vpmullw %xmm7, %xmm2, %xmm2
503 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
504 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
505 ; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
495 ; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3
496 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
497 ; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
498 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
506499 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
507500 ; AVX1-NEXT: retq
508501 ;
524517 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
525518 ; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1
526519 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
527 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
528 ; AVX2NOBW-NEXT: vpmovsxbw %xmm2, %ymm2
529 ; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
530 ; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
531 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm2, %xmm4
532 ; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
533 ; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm4, %xmm4
534 ; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
535 ; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
536 ; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
537 ; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm1, %ymm1
538 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm3
539 ; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm3, %xmm3
540 ; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm1, %xmm1
541 ; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
542 ; AVX2NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
543 ; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
520 ; AVX2NOBW-NEXT: vpsllw $3, %ymm1, %ymm2
521 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
522 ; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
523 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
544524 ; AVX2NOBW-NEXT: retq
545525 ;
546526 ; AVX512BW-LABEL: test_rem7_32i8:
555535 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
556536 ; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1
557537 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
558 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
559 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
560 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
561 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
538 ; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2
539 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
540 ; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
541 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
562542 ; AVX512BW-NEXT: retq
563543 %res = urem <32 x i8> %a,
564544 ret <32 x i8> %res
374374 ; AVX512F-LABEL: test_rem7_64i8:
375375 ; AVX512F: # %bb.0:
376376 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
377 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
378 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
379 ; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm3
380 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
377 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
378 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
379 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
380 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
381381 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
382 ; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm4
382 ; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
383383 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
384 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
385 ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3]
386 ; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm4
384 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2
385 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
386 ; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm4
387387 ; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
388388 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
389389 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
390 ; AVX512F-NEXT: vpaddb %ymm3, %ymm4, %ymm3
391 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
390 ; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2
391 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
392392 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
393 ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm6
394 ; AVX512F-NEXT: vpmovsxbw %xmm6, %ymm7
395 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
396 ; AVX512F-NEXT: vpmullw %ymm3, %ymm7, %ymm7
397 ; AVX512F-NEXT: vpmovsxwd %ymm7, %zmm7
398 ; AVX512F-NEXT: vpmovdb %zmm7, %xmm7
399 ; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm6
400 ; AVX512F-NEXT: vpmovsxbw %xmm6, %ymm6
401 ; AVX512F-NEXT: vpmullw %ymm3, %ymm6, %ymm6
402 ; AVX512F-NEXT: vpmovsxwd %ymm6, %zmm6
403 ; AVX512F-NEXT: vpmovdb %zmm6, %xmm6
404 ; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6
405 ; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0
406 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm6
407 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
408 ; AVX512F-NEXT: vpmullw %ymm2, %ymm6, %ymm6
409 ; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
410 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
411 ; AVX512F-NEXT: vpmullw %ymm2, %ymm7, %ymm2
393 ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
394 ; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm6
395 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
396 ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
397 ; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2
398 ; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
399 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
400 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
401 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
412402 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
413 ; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
403 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
404 ; AVX512F-NEXT: vpmullw %ymm3, %ymm6, %ymm3
405 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
406 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
414407 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
415 ; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm6
416 ; AVX512F-NEXT: vpsrlw $1, %ymm6, %ymm6
417 ; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5
418 ; AVX512F-NEXT: vpaddb %ymm2, %ymm5, %ymm2
408 ; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm3
409 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
410 ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
411 ; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
419412 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
420413 ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
421 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm4
422 ; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
423 ; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
424 ; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
425 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
426 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
427 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
428 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
429 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
430 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
431 ; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
414 ; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
415 ; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
416 ; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
417 ; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
432418 ; AVX512F-NEXT: retq
433419 ;
434420 ; AVX512BW-LABEL: test_rem7_64i8:
450436 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
451437 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1
452438 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
453 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm2
454 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
455 ; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
456 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
457 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
458 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
459 ; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1
460 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
461 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
462 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
439 ; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2
440 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
441 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
442 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
463443 ; AVX512BW-NEXT: retq
464444 %res = urem <64 x i8> %a,
465445 ret <64 x i8> %res
231231 define <2 x i64> @mul_v2i64_17(<2 x i64> %a0) nounwind {
232232 ; X86-LABEL: mul_v2i64_17:
233233 ; X86: # %bb.0:
234 ; X86-NEXT: movdqa {{.*#+}} xmm1 = [17,0,17,0]
235 ; X86-NEXT: movdqa %xmm0, %xmm2
236 ; X86-NEXT: pmuludq %xmm1, %xmm2
237 ; X86-NEXT: psrlq $32, %xmm0
238 ; X86-NEXT: pmuludq %xmm1, %xmm0
239 ; X86-NEXT: psllq $32, %xmm0
240 ; X86-NEXT: paddq %xmm2, %xmm0
234 ; X86-NEXT: movdqa %xmm0, %xmm1
235 ; X86-NEXT: psllq $4, %xmm1
236 ; X86-NEXT: paddq %xmm0, %xmm1
237 ; X86-NEXT: movdqa %xmm1, %xmm0
241238 ; X86-NEXT: retl
242239 ;
243240 ; X64-LABEL: mul_v2i64_17:
244241 ; X64: # %bb.0:
245 ; X64-NEXT: movdqa {{.*#+}} xmm1 = [17,17]
246 ; X64-NEXT: movdqa %xmm0, %xmm2
247 ; X64-NEXT: pmuludq %xmm1, %xmm2
248 ; X64-NEXT: psrlq $32, %xmm0
249 ; X64-NEXT: pmuludq %xmm1, %xmm0
250 ; X64-NEXT: psllq $32, %xmm0
251 ; X64-NEXT: paddq %xmm2, %xmm0
242 ; X64-NEXT: movdqa %xmm0, %xmm1
243 ; X64-NEXT: psllq $4, %xmm1
244 ; X64-NEXT: paddq %xmm0, %xmm1
245 ; X64-NEXT: movdqa %xmm1, %xmm0
252246 ; X64-NEXT: retq
253247 ;
254248 ; X64-AVX-LABEL: mul_v2i64_17:
255249 ; X64-AVX: # %bb.0:
256 ; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17]
257 ; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
258 ; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
259 ; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
260 ; X64-AVX-NEXT: vpsllq $32, %xmm0, %xmm0
261 ; X64-AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
250 ; X64-AVX-NEXT: vpsllq $4, %xmm0, %xmm1
251 ; X64-AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
262252 ; X64-AVX-NEXT: retq
263253 %1 = mul <2 x i64> %a0,
264254 ret <2 x i64> %1
311301 define <16 x i8> @mul_v16i8_17(<16 x i8> %a0) nounwind {
312302 ; X86-LABEL: mul_v16i8_17:
313303 ; X86: # %bb.0:
314 ; X86-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
315 ; X86-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
316 ; X86-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17]
317 ; X86-NEXT: pmullw %xmm2, %xmm0
318 ; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
319 ; X86-NEXT: pand %xmm3, %xmm0
320 ; X86-NEXT: pmullw %xmm2, %xmm1
321 ; X86-NEXT: pand %xmm3, %xmm1
322 ; X86-NEXT: packuswb %xmm0, %xmm1
304 ; X86-NEXT: movdqa %xmm0, %xmm1
305 ; X86-NEXT: psllw $4, %xmm1
306 ; X86-NEXT: pand {{\.LCPI.*}}, %xmm1
307 ; X86-NEXT: paddb %xmm0, %xmm1
323308 ; X86-NEXT: movdqa %xmm1, %xmm0
324309 ; X86-NEXT: retl
325310 ;
326311 ; X64-LABEL: mul_v16i8_17:
327312 ; X64: # %bb.0:
328 ; X64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
329 ; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
330 ; X64-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17]
331 ; X64-NEXT: pmullw %xmm2, %xmm0
332 ; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
333 ; X64-NEXT: pand %xmm3, %xmm0
334 ; X64-NEXT: pmullw %xmm2, %xmm1
335 ; X64-NEXT: pand %xmm3, %xmm1
336 ; X64-NEXT: packuswb %xmm0, %xmm1
313 ; X64-NEXT: movdqa %xmm0, %xmm1
314 ; X64-NEXT: psllw $4, %xmm1
315 ; X64-NEXT: pand {{.*}}(%rip), %xmm1
316 ; X64-NEXT: paddb %xmm0, %xmm1
337317 ; X64-NEXT: movdqa %xmm1, %xmm0
338318 ; X64-NEXT: retq
339319 ;
340320 ; X64-XOP-LABEL: mul_v16i8_17:
341321 ; X64-XOP: # %bb.0:
342 ; X64-XOP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
343 ; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17]
344 ; X64-XOP-NEXT: vpmullw %xmm2, %xmm1, %xmm1
345 ; X64-XOP-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
346 ; X64-XOP-NEXT: vpmullw %xmm2, %xmm0, %xmm0
347 ; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14]
322 ; X64-XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm1
323 ; X64-XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
348324 ; X64-XOP-NEXT: retq
349325 ;
350326 ; X64-AVX2-LABEL: mul_v16i8_17:
351327 ; X64-AVX2: # %bb.0:
352 ; X64-AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
353 ; X64-AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
354 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
355 ; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
356 ; X64-AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
357 ; X64-AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
358 ; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
359 ; X64-AVX2-NEXT: vzeroupper
328 ; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm1
329 ; X64-AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
330 ; X64-AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
360331 ; X64-AVX2-NEXT: retq
361332 %1 = mul <16 x i8> %a0,
362333 ret <16 x i8> %1
504475 define <2 x i64> @mul_v2i64_7(<2 x i64> %a0) nounwind {
505476 ; X86-LABEL: mul_v2i64_7:
506477 ; X86: # %bb.0:
507 ; X86-NEXT: movdqa {{.*#+}} xmm1 = [7,0,7,0]
508 ; X86-NEXT: movdqa %xmm0, %xmm2
509 ; X86-NEXT: pmuludq %xmm1, %xmm2
510 ; X86-NEXT: psrlq $32, %xmm0
511 ; X86-NEXT: pmuludq %xmm1, %xmm0
512 ; X86-NEXT: psllq $32, %xmm0
513 ; X86-NEXT: paddq %xmm2, %xmm0
478 ; X86-NEXT: movdqa %xmm0, %xmm1
479 ; X86-NEXT: psllq $3, %xmm1
480 ; X86-NEXT: psubq %xmm0, %xmm1
481 ; X86-NEXT: movdqa %xmm1, %xmm0
514482 ; X86-NEXT: retl
515483 ;
516484 ; X64-LABEL: mul_v2i64_7:
517485 ; X64: # %bb.0:
518 ; X64-NEXT: movdqa {{.*#+}} xmm1 = [7,7]
519 ; X64-NEXT: movdqa %xmm0, %xmm2
520 ; X64-NEXT: pmuludq %xmm1, %xmm2
521 ; X64-NEXT: psrlq $32, %xmm0
522 ; X64-NEXT: pmuludq %xmm1, %xmm0
523 ; X64-NEXT: psllq $32, %xmm0
524 ; X64-NEXT: paddq %xmm2, %xmm0
486 ; X64-NEXT: movdqa %xmm0, %xmm1
487 ; X64-NEXT: psllq $3, %xmm1
488 ; X64-NEXT: psubq %xmm0, %xmm1
489 ; X64-NEXT: movdqa %xmm1, %xmm0
525490 ; X64-NEXT: retq
526491 ;
527492 ; X64-AVX-LABEL: mul_v2i64_7:
528493 ; X64-AVX: # %bb.0:
529 ; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7]
530 ; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
531 ; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
532 ; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
533 ; X64-AVX-NEXT: vpsllq $32, %xmm0, %xmm0
534 ; X64-AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
494 ; X64-AVX-NEXT: vpsllq $3, %xmm0, %xmm1
495 ; X64-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
535496 ; X64-AVX-NEXT: retq
536497 %1 = mul <2 x i64> %a0,
537498 ret <2 x i64> %1
584545 define <16 x i8> @mul_v16i8_31(<16 x i8> %a0) nounwind {
585546 ; X86-LABEL: mul_v16i8_31:
586547 ; X86: # %bb.0:
587 ; X86-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
588 ; X86-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
589 ; X86-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
590 ; X86-NEXT: pmullw %xmm2, %xmm0
591 ; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
592 ; X86-NEXT: pand %xmm3, %xmm0
593 ; X86-NEXT: pmullw %xmm2, %xmm1
594 ; X86-NEXT: pand %xmm3, %xmm1
595 ; X86-NEXT: packuswb %xmm0, %xmm1
548 ; X86-NEXT: movdqa %xmm0, %xmm1
549 ; X86-NEXT: psllw $5, %xmm1
550 ; X86-NEXT: pand {{\.LCPI.*}}, %xmm1
551 ; X86-NEXT: psubb %xmm0, %xmm1
596552 ; X86-NEXT: movdqa %xmm1, %xmm0
597553 ; X86-NEXT: retl
598554 ;
599555 ; X64-LABEL: mul_v16i8_31:
600556 ; X64: # %bb.0:
601 ; X64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
602 ; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
603 ; X64-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
604 ; X64-NEXT: pmullw %xmm2, %xmm0
605 ; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
606 ; X64-NEXT: pand %xmm3, %xmm0
607 ; X64-NEXT: pmullw %xmm2, %xmm1
608 ; X64-NEXT: pand %xmm3, %xmm1
609 ; X64-NEXT: packuswb %xmm0, %xmm1
557 ; X64-NEXT: movdqa %xmm0, %xmm1
558 ; X64-NEXT: psllw $5, %xmm1
559 ; X64-NEXT: pand {{.*}}(%rip), %xmm1
560 ; X64-NEXT: psubb %xmm0, %xmm1
610561 ; X64-NEXT: movdqa %xmm1, %xmm0
611562 ; X64-NEXT: retq
612563 ;
613564 ; X64-XOP-LABEL: mul_v16i8_31:
614565 ; X64-XOP: # %bb.0:
615 ; X64-XOP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
616 ; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
617 ; X64-XOP-NEXT: vpmullw %xmm2, %xmm1, %xmm1
618 ; X64-XOP-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
619 ; X64-XOP-NEXT: vpmullw %xmm2, %xmm0, %xmm0
620 ; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14]
566 ; X64-XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm1
567 ; X64-XOP-NEXT: vpsubb %xmm0, %xmm1, %xmm0
621568 ; X64-XOP-NEXT: retq
622569 ;
623570 ; X64-AVX2-LABEL: mul_v16i8_31:
624571 ; X64-AVX2: # %bb.0:
625 ; X64-AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
626 ; X64-AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
627 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
628 ; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
629 ; X64-AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
630 ; X64-AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
631 ; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
632 ; X64-AVX2-NEXT: vzeroupper
572 ; X64-AVX2-NEXT: vpsllw $5, %xmm0, %xmm1
573 ; X64-AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
574 ; X64-AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0
633575 ; X64-AVX2-NEXT: retq
634576 %1 = mul <16 x i8> %a0,
635577 ret <16 x i8> %1