llvm.org GIT mirror: llvm, commit d0ca754
[X86][XOP] Add support for lowering vector rotations

This patch adds support for lowering to the XOP VPROT / VPROTI vector bit rotation instructions.

This has required changes to the DAGCombiner rotation pattern matching to support vector types - so far I've only changed it to support splat vectors, but generalising this further is feasible in the future.

Differential Revision: http://reviews.llvm.org/D13851

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@251188 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Simon Pilgrim
4 changed files with 230 additions and 359 deletions.
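To make the change concrete, here is a minimal IR sketch (illustrative only, not taken from the commit) of the splat-vector shift/or pattern that the updated DAGCombiner can now recognise as a rotate and that the new X86 lowering collapses into a single XOP instruction; it mirrors the var_rotate_v4i32 test below, which now checks for a lone vprotd:

; Illustrative only: rotate each i32 lane left by a per-lane variable amount.
define <4 x i32> @rotl_var_v4i32(<4 x i32> %x, <4 x i32> %amt) {
  %inv = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %amt
  %shl = shl <4 x i32> %x, %amt
  %lshr = lshr <4 x i32> %x, %inv
  %rot = or <4 x i32> %shl, %lshr
  ret <4 x i32> %rot
}

Compiled for an XOP target (e.g. llc -mattr=+xop), this is expected to become a single vprotd %xmm1, %xmm0, %xmm0, as the updated checks show.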
37953795 /// Match "(X shl/srl V1) & V2" where V2 may not be present.
37963796 static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
37973797 if (Op.getOpcode() == ISD::AND) {
3798 if (isa<ConstantSDNode>(Op.getOperand(1))) {
3798 if (isConstOrConstSplat(Op.getOperand(1))) {
37993799 Mask = Op.getOperand(1);
38003800 Op = Op.getOperand(0);
38013801 } else {
38123812 }
38133813
38143814 // Return true if we can prove that, whenever Neg and Pos are both in the
3815 // range [0, OpSize), Neg == (Pos == 0 ? 0 : OpSize - Pos). This means that
3815 // range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that
38163816 // for two opposing shifts shift1 and shift2 and a value X with OpBits bits:
38173817 //
38183818 // (or (shift1 X, Neg), (shift2 X, Pos))
38193819 //
38203820 // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
3821 // in direction shift1 by Neg. The range [0, OpSize) means that we only need
3821 // in direction shift1 by Neg. The range [0, EltSize) means that we only need
38223822 // to consider shift amounts with defined behavior.
3823 static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned OpSize) {
3824 // If OpSize is a power of 2 then:
3823 static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
3824 // If EltSize is a power of 2 then:
38253825 //
3826 // (a) (Pos == 0 ? 0 : OpSize - Pos) == (OpSize - Pos) & (OpSize - 1)
3827 // (b) Neg == Neg & (OpSize - 1) whenever Neg is in [0, OpSize).
3826 // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
3827 // (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize).
38283828 //
3829 // So if OpSize is a power of 2 and Neg is (and Neg', OpSize-1), we check
3829 // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check
38303830 // for the stronger condition:
38313831 //
3832 // Neg & (OpSize - 1) == (OpSize - Pos) & (OpSize - 1) [A]
3832 // Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A]
38333833 //
3834 // for all Neg and Pos. Since Neg & (OpSize - 1) == Neg' & (OpSize - 1)
3834 // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1)
38353835 // we can just replace Neg with Neg' for the rest of the function.
38363836 //
38373837 // In other cases we check for the even stronger condition:
38383838 //
3839 // Neg == OpSize - Pos [B]
3839 // Neg == EltSize - Pos [B]
38403840 //
38413841 // for all Neg and Pos. Note that the (or ...) then invokes undefined
3842 // behavior if Pos == 0 (and consequently Neg == OpSize).
3842 // behavior if Pos == 0 (and consequently Neg == EltSize).
38433843 //
3844 // We could actually use [A] whenever OpSize is a power of 2, but the
3844 // We could actually use [A] whenever EltSize is a power of 2, but the
38453845 // only extra cases that it would match are those uninteresting ones
38463846 // where Neg and Pos are never in range at the same time. E.g. for
3847 // OpSize == 32, using [A] would allow a Neg of the form (sub 64, Pos)
3847 // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos)
38483848 // as well as (sub 32, Pos), but:
38493849 //
38503850 // (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos))
38513851 //
38523852 // always invokes undefined behavior for 32-bit X.
38533853 //
3854 // Below, Mask == OpSize - 1 when using [A] and is all-ones otherwise.
3854 // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise.
38553855 unsigned MaskLoBits = 0;
3856 if (Neg.getOpcode() == ISD::AND &&
3857 isPowerOf2_64(OpSize) &&
3858 Neg.getOperand(1).getOpcode() == ISD::Constant &&
3859 cast<ConstantSDNode>(Neg.getOperand(1))->getAPIntValue() == OpSize - 1) {
3860 Neg = Neg.getOperand(0);
3861 MaskLoBits = Log2_64(OpSize);
3856 if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
3857 if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) {
3858 if (NegC->getAPIntValue() == EltSize - 1) {
3859 Neg = Neg.getOperand(0);
3860 MaskLoBits = Log2_64(EltSize);
3861 }
3862 }
38623863 }
38633864
38643865 // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1.
38653866 if (Neg.getOpcode() != ISD::SUB)
38663867 return 0;
3867 ConstantSDNode *NegC = dyn_cast<ConstantSDNode>(Neg.getOperand(0));
3868 ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0));
38683869 if (!NegC)
38693870 return 0;
38703871 SDValue NegOp1 = Neg.getOperand(1);
38713872
3872 // On the RHS of [A], if Pos is Pos' & (OpSize - 1), just replace Pos with
3873 // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with
38733874 // Pos'. The truncation is redundant for the purpose of the equality.
3874 if (MaskLoBits &&
3875 Pos.getOpcode() == ISD::AND &&
3876 Pos.getOperand(1).getOpcode() == ISD::Constant &&
3877 cast<ConstantSDNode>(Pos.getOperand(1))->getAPIntValue() == OpSize - 1)
3878 Pos = Pos.getOperand(0);
3875 if (MaskLoBits && Pos.getOpcode() == ISD::AND)
3876 if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
3877 if (PosC->getAPIntValue() == EltSize - 1)
3878 Pos = Pos.getOperand(0);
38793879
38803880 // The condition we need is now:
38813881 //
3882 // (NegC - NegOp1) & Mask == (OpSize - Pos) & Mask
3882 // (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask
38833883 //
38843884 // If NegOp1 == Pos then we need:
38853885 //
3886 // OpSize & Mask == NegC & Mask
3886 // EltSize & Mask == NegC & Mask
38873887 //
38883888 // (because "x & Mask" is a truncation and distributes through subtraction).
38893889 APInt Width;
38903890 if (Pos == NegOp1)
38913891 Width = NegC->getAPIntValue();
3892
38923893 // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC.
38933894 // Then the condition we want to prove becomes:
38943895 //
3895 // (NegC - NegOp1) & Mask == (OpSize - (NegOp1 + PosC)) & Mask
3896 // (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask
38963897 //
38973898 // which, again because "x & Mask" is a truncation, becomes:
38983899 //
3899 // NegC & Mask == (OpSize - PosC) & Mask
3900 // OpSize & Mask == (NegC + PosC) & Mask
3901 else if (Pos.getOpcode() == ISD::ADD &&
3902 Pos.getOperand(0) == NegOp1 &&
3903 Pos.getOperand(1).getOpcode() == ISD::Constant)
3904 Width = (cast<ConstantSDNode>(Pos.getOperand(1))->getAPIntValue() +
3905 NegC->getAPIntValue());
3906 else
3900 // NegC & Mask == (EltSize - PosC) & Mask
3901 // EltSize & Mask == (NegC + PosC) & Mask
3902 else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) {
3903 if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
3904 Width = PosC->getAPIntValue() + NegC->getAPIntValue();
3905 else
3906 return false;
3907 } else
39073908 return false;
39083909
3909 // Now we just need to check that OpSize & Mask == Width & Mask.
3910 // Now we just need to check that EltSize & Mask == Width & Mask.
39103911 if (MaskLoBits)
3911 // Opsize & Mask is 0 since Mask is Opsize - 1.
3912 // EltSize & Mask is 0 since Mask is EltSize - 1.
39123913 return Width.getLoBits(MaskLoBits) == 0;
3913 return Width == OpSize;
3914 return Width == EltSize;
39143915 }
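A quick worked instance of condition [A] above (numbers purely illustrative): with EltSize == 32 and Pos == 5, the right-hand side is (32 - 5) & 31 == 27, so a DAG of the form (or (shl X, 5), (srl X, (and (sub 32, 5), 31))) satisfies the check and is folded to a rotate left by 5, which is the same as a rotate right by 27.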
39153916
39163917 // A subroutine of MatchRotate used once we have found an OR of two opposite
39303931 // (srl x, (*ext y))) ->
39313932 // (rotr x, y) or (rotl x, (sub 32, y))
39323933 EVT VT = Shifted.getValueType();
3933 if (matchRotateSub(InnerPos, InnerNeg, VT.getSizeInBits())) {
3934 if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits())) {
39343935 bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
39353936 return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
39363937 HasPos ? Pos : Neg).getNode();
39733974 if (RHSShift.getOpcode() == ISD::SHL) {
39743975 std::swap(LHS, RHS);
39753976 std::swap(LHSShift, RHSShift);
3976 std::swap(LHSMask , RHSMask );
3977 }
3978
3979 unsigned OpSizeInBits = VT.getSizeInBits();
3977 std::swap(LHSMask, RHSMask);
3978 }
3979
3980 unsigned EltSizeInBits = VT.getScalarSizeInBits();
39803981 SDValue LHSShiftArg = LHSShift.getOperand(0);
39813982 SDValue LHSShiftAmt = LHSShift.getOperand(1);
39823983 SDValue RHSShiftArg = RHSShift.getOperand(0);
39843985
39853986 // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
39863987 // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
3987 if (LHSShiftAmt.getOpcode() == ISD::Constant &&
3988 RHSShiftAmt.getOpcode() == ISD::Constant) {
3989 uint64_t LShVal = cast<ConstantSDNode>(LHSShiftAmt)->getZExtValue();
3990 uint64_t RShVal = cast<ConstantSDNode>(RHSShiftAmt)->getZExtValue();
3991 if ((LShVal + RShVal) != OpSizeInBits)
3988 if (isConstOrConstSplat(LHSShiftAmt) && isConstOrConstSplat(RHSShiftAmt)) {
3989 uint64_t LShVal = isConstOrConstSplat(LHSShiftAmt)->getZExtValue();
3990 uint64_t RShVal = isConstOrConstSplat(RHSShiftAmt)->getZExtValue();
3991 if ((LShVal + RShVal) != EltSizeInBits)
39923992 return nullptr;
39933993
39943994 SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
39963996
39973997 // If there is an AND of either shifted operand, apply it to the result.
39983998 if (LHSMask.getNode() || RHSMask.getNode()) {
3999 APInt Mask = APInt::getAllOnesValue(OpSizeInBits);
3999 APInt Mask = APInt::getAllOnesValue(EltSizeInBits);
40004000
40014001 if (LHSMask.getNode()) {
4002 APInt RHSBits = APInt::getLowBitsSet(OpSizeInBits, LShVal);
4003 Mask &= cast<ConstantSDNode>(LHSMask)->getAPIntValue() | RHSBits;
4002 APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal);
4003 Mask &= isConstOrConstSplat(LHSMask)->getAPIntValue() | RHSBits;
40044004 }
40054005 if (RHSMask.getNode()) {
4006 APInt LHSBits = APInt::getHighBitsSet(OpSizeInBits, RShVal);
4007 Mask &= cast<ConstantSDNode>(RHSMask)->getAPIntValue() | LHSBits;
4006 APInt LHSBits = APInt::getHighBitsSet(EltSizeInBits, RShVal);
4007 Mask &= isConstOrConstSplat(RHSMask)->getAPIntValue() | LHSBits;
40084008 }
40094009
40104010 Rot = DAG.getNode(ISD::AND, DL, VT, Rot, DAG.getConstant(Mask, DL, VT));
10491049 setOperationAction(ISD::SRA, MVT::v4i32, Custom);
10501050 }
10511051
1052 if (Subtarget->hasXOP()) {
1053 setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
1054 setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
1055 setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
1056 setOperationAction(ISD::ROTL, MVT::v2i64, Custom);
1057 setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
1058 setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
1059 setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
1060 setOperationAction(ISD::ROTL, MVT::v4i64, Custom);
1061 }
1062
10521063 if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) {
10531064 addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
10541065 addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1881418825 }
1881518826
1881618827 return SDValue();
18828 }
18829
18830 static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget,
18831 SelectionDAG &DAG) {
18832 MVT VT = Op.getSimpleValueType();
18833 SDLoc DL(Op);
18834 SDValue R = Op.getOperand(0);
18835 SDValue Amt = Op.getOperand(1);
18836 unsigned Opc = Op.getOpcode();
18837
18838 assert(VT.isVector() && "Custom lowering only for vector rotates!");
18839 assert(Subtarget->hasXOP() && "XOP support required for vector rotates!");
18840 assert((Opc == ISD::ROTL) && "Only ROTL supported");
18841
18842 // XOP has 128-bit vector variable + immediate rotates.
18843 // +ve/-ve Amt = rotate left/right.
18844
18845 // Split 256-bit integers.
18846 if (VT.getSizeInBits() == 256)
18847 return Lower256IntArith(Op, DAG);
18848
18849 assert(VT.getSizeInBits() == 128 && "Only rotate 128-bit vectors!");
18850
18851 // Attempt to rotate by immediate.
18852 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18853 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
18854 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
18855 assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
18856 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
18857 DAG.getConstant(RotateAmt, DL, MVT::i8));
18858 }
18859 }
18860
18861 // Use general rotate by variable (per-element).
18862 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
1881718863 }
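As a sketch of the immediate path above (IR illustrative only, assuming an XOP target; it mirrors splatconstant_rotate_v4i32 in the tests, which now checks for vprotd $4):

; Illustrative only: rotate each i32 lane left by the constant 4.
define <4 x i32> @rotl_const_v4i32(<4 x i32> %x) {
  %shl = shl <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <4 x i32> %x, <i32 28, i32 28, i32 28, i32 28>
  %rot = or <4 x i32> %shl, %lshr
  ret <4 x i32> %rot
}

Once the DAGCombiner has folded this into an ISD::ROTL with a constant splat amount, LowerRotate emits X86ISD::VPROTI with the rotation as an i8 immediate; any other amount falls through to the per-element X86ISD::VPROT form.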
1881818864
1881918865 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
1967419720 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
1967519721 case ISD::UMUL_LOHI:
1967619722 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
19723 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
1967719724 case ISD::SRA:
1967819725 case ISD::SRL:
1967919726 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
7373 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
7474 ; AVX2-NEXT: retq
7575 ;
76 ; XOPAVX1-LABEL: var_rotate_v2i64:
77 ; XOPAVX1: # BB#0:
78 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
79 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
80 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm1
81 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
82 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
83 ; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
84 ; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
85 ; XOPAVX1-NEXT: retq
86 ;
87 ; XOPAVX2-LABEL: var_rotate_v2i64:
88 ; XOPAVX2: # BB#0:
89 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
90 ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
91 ; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1
92 ; XOPAVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0
93 ; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
94 ; XOPAVX2-NEXT: retq
76 ; XOP-LABEL: var_rotate_v2i64:
77 ; XOP: # BB#0:
78 ; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0
79 ; XOP-NEXT: retq
9580 ;
9681 ; X32-SSE-LABEL: var_rotate_v2i64:
9782 ; X32-SSE: # BB#0:
218203 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
219204 ; AVX2-NEXT: retq
220205 ;
221 ; XOPAVX1-LABEL: var_rotate_v4i32:
222 ; XOPAVX1: # BB#0:
223 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32]
224 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm2
225 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm1
226 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
227 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
228 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
229 ; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
230 ; XOPAVX1-NEXT: retq
231 ;
232 ; XOPAVX2-LABEL: var_rotate_v4i32:
233 ; XOPAVX2: # BB#0:
234 ; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
235 ; XOPAVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm2
236 ; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm1
237 ; XOPAVX2-NEXT: vpsrlvd %xmm2, %xmm0, %xmm0
238 ; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
239 ; XOPAVX2-NEXT: retq
206 ; XOP-LABEL: var_rotate_v4i32:
207 ; XOP: # BB#0:
208 ; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
209 ; XOP-NEXT: retq
240210 ;
241211 ; X32-SSE-LABEL: var_rotate_v4i32:
242212 ; X32-SSE: # BB#0:
464434 ;
465435 ; XOP-LABEL: var_rotate_v8i16:
466436 ; XOP: # BB#0:
467 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
468 ; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm2
469 ; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm1
470 ; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
471 ; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm2
472 ; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0
473 ; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
437 ; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0
474438 ; XOP-NEXT: retq
475439 ;
476440 ; X32-SSE-LABEL: var_rotate_v8i16:
688652 ;
689653 ; XOP-LABEL: var_rotate_v16i8:
690654 ; XOP: # BB#0:
691 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
692 ; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm2
693 ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm1
694 ; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
695 ; XOP-NEXT: vpsubb %xmm2, %xmm3, %xmm2
696 ; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0
697 ; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
655 ; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0
698656 ; XOP-NEXT: retq
699657 ;
700658 ; X32-SSE-LABEL: var_rotate_v16i8:
13201278 ;
13211279 ; XOP-LABEL: splatconstant_rotate_v2i64:
13221280 ; XOP: # BB#0:
1323 ; XOP-NEXT: vpsllq $14, %xmm0, %xmm1
1324 ; XOP-NEXT: vpsrlq $50, %xmm0, %xmm0
1325 ; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
1281 ; XOP-NEXT: vprotq $14, %xmm0, %xmm0
13261282 ; XOP-NEXT: retq
13271283 ;
13281284 ; X32-SSE-LABEL: splatconstant_rotate_v2i64:
13561312 ;
13571313 ; XOP-LABEL: splatconstant_rotate_v4i32:
13581314 ; XOP: # BB#0:
1359 ; XOP-NEXT: vpslld $4, %xmm0, %xmm1
1360 ; XOP-NEXT: vpsrld $28, %xmm0, %xmm0
1361 ; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
1315 ; XOP-NEXT: vprotd $4, %xmm0, %xmm0
13621316 ; XOP-NEXT: retq
13631317 ;
13641318 ; X32-SSE-LABEL: splatconstant_rotate_v4i32:
13921346 ;
13931347 ; XOP-LABEL: splatconstant_rotate_v8i16:
13941348 ; XOP: # BB#0:
1395 ; XOP-NEXT: vpsllw $7, %xmm0, %xmm1
1396 ; XOP-NEXT: vpsrlw $9, %xmm0, %xmm0
1397 ; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
1349 ; XOP-NEXT: vprotw $7, %xmm0, %xmm0
13981350 ; XOP-NEXT: retq
13991351 ;
14001352 ; X32-SSE-LABEL: splatconstant_rotate_v8i16:
14321384 ;
14331385 ; XOP-LABEL: splatconstant_rotate_v16i8:
14341386 ; XOP: # BB#0:
1435 ; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
1436 ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm2
1437 ; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
1438 ; XOP-NEXT: vpsubb %xmm1, %xmm3, %xmm1
1439 ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
1440 ; XOP-NEXT: vpor %xmm0, %xmm2, %xmm0
1387 ; XOP-NEXT: vprotb $4, %xmm0, %xmm0
14411388 ; XOP-NEXT: retq
14421389 ;
14431390 ; X32-SSE-LABEL: splatconstant_rotate_v16i8:
14821429 ;
14831430 ; XOP-LABEL: splatconstant_rotate_mask_v2i64:
14841431 ; XOP: # BB#0:
1485 ; XOP-NEXT: vpsllq $15, %xmm0, %xmm1
1486 ; XOP-NEXT: vpsrlq $49, %xmm0, %xmm0
1432 ; XOP-NEXT: vprotq $15, %xmm0, %xmm0
14871433 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1488 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
1489 ; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
14901434 ; XOP-NEXT: retq
14911435 ;
14921436 ; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64:
15411485 ;
15421486 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i32:
15431487 ; XOPAVX1: # BB#0:
1544 ; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm1
1545 ; XOPAVX1-NEXT: vpsrld $28, %xmm0, %xmm0
1488 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
15461489 ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1547 ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
1548 ; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
15491490 ; XOPAVX1-NEXT: retq
15501491 ;
15511492 ; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i32:
15521493 ; XOPAVX2: # BB#0:
1553 ; XOPAVX2-NEXT: vpslld $4, %xmm0, %xmm1
1554 ; XOPAVX2-NEXT: vpsrld $28, %xmm0, %xmm0
1555 ; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
1556 ; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
1557 ; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
1558 ; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
1559 ; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
1494 ; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
1495 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
1496 ; XOPAVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
15601497 ; XOPAVX2-NEXT: retq
15611498 ;
15621499 ; X32-SSE-LABEL: splatconstant_rotate_mask_v4i32:
16001537 ;
16011538 ; XOP-LABEL: splatconstant_rotate_mask_v8i16:
16021539 ; XOP: # BB#0:
1603 ; XOP-NEXT: vpsllw $5, %xmm0, %xmm1
1604 ; XOP-NEXT: vpsrlw $11, %xmm0, %xmm0
1540 ; XOP-NEXT: vprotw $5, %xmm0, %xmm0
16051541 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1606 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
1607 ; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
16081542 ; XOP-NEXT: retq
16091543 ;
16101544 ; X32-SSE-LABEL: splatconstant_rotate_mask_v8i16:
16521586 ;
16531587 ; XOP-LABEL: splatconstant_rotate_mask_v16i8:
16541588 ; XOP: # BB#0:
1655 ; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
1656 ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm2
1657 ; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
1658 ; XOP-NEXT: vpsubb %xmm1, %xmm3, %xmm1
1659 ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
1589 ; XOP-NEXT: vprotb $4, %xmm0, %xmm0
16601590 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1661 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm1
1662 ; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
16631591 ; XOP-NEXT: retq
16641592 ;
16651593 ; X32-SSE-LABEL: splatconstant_rotate_mask_v16i8:
4646 ;
4747 ; XOPAVX1-LABEL: var_rotate_v4i64:
4848 ; XOPAVX1: # BB#0:
49 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
50 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
51 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
52 ; XOPAVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
53 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
54 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm5, %xmm4
55 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm1
56 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
57 ; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
58 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
59 ; XOPAVX1-NEXT: vpshlq %xmm2, %xmm5, %xmm2
60 ; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3
61 ; XOPAVX1-NEXT: vpshlq %xmm3, %xmm0, %xmm0
49 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
50 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
51 ; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
52 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
6253 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
63 ; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
6454 ; XOPAVX1-NEXT: retq
6555 ;
6656 ; XOPAVX2-LABEL: var_rotate_v4i64:
6757 ; XOPAVX2: # BB#0:
68 ; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
69 ; XOPAVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2
70 ; XOPAVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1
71 ; XOPAVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0
72 ; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
58 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
59 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
60 ; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
61 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
62 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
7363 ; XOPAVX2-NEXT: retq
7464 %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b
7565 %shl = shl <4 x i64> %a, %b
134124 ;
135125 ; XOPAVX1-LABEL: var_rotate_v8i32:
136126 ; XOPAVX1: # BB#0:
137 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32]
138 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3
139 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
140 ; XOPAVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
141 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
142 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm5, %xmm4
143 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm1
144 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
145 ; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
146 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
147 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm5, %xmm2
148 ; XOPAVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3
149 ; XOPAVX1-NEXT: vpshld %xmm3, %xmm0, %xmm0
127 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
128 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
129 ; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
130 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
150131 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
151 ; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
152132 ; XOPAVX1-NEXT: retq
153133 ;
154134 ; XOPAVX2-LABEL: var_rotate_v8i32:
155135 ; XOPAVX2: # BB#0:
156 ; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
157 ; XOPAVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm2
158 ; XOPAVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1
159 ; XOPAVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
160 ; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
136 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
137 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
138 ; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
139 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
140 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
161141 ; XOPAVX2-NEXT: retq
162142 %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
163143 %shl = shl <8 x i32> %a, %b
261241 ;
262242 ; XOPAVX1-LABEL: var_rotate_v16i16:
263243 ; XOPAVX1: # BB#0:
264 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
265 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm3
266 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
267 ; XOPAVX1-NEXT: vpsubw %xmm4, %xmm2, %xmm2
268 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
269 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm5, %xmm4
270 ; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm1
271 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
272 ; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
273 ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm2
274 ; XOPAVX1-NEXT: vpshlw %xmm2, %xmm5, %xmm2
275 ; XOPAVX1-NEXT: vpsubw %xmm3, %xmm4, %xmm3
276 ; XOPAVX1-NEXT: vpshlw %xmm3, %xmm0, %xmm0
244 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
245 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
246 ; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
247 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
277248 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
278 ; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
279249 ; XOPAVX1-NEXT: retq
280250 ;
281251 ; XOPAVX2-LABEL: var_rotate_v16i16:
282252 ; XOPAVX2: # BB#0:
283 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
284 ; XOPAVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm2
285 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
286 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
287 ; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3
288 ; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm1
289 ; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
290 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
291 ; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
292 ; XOPAVX2-NEXT: vpsubw %xmm3, %xmm5, %xmm3
293 ; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3
294 ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm5, %xmm2
295 ; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
296 ; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
297 ; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
253 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
254 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
255 ; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
256 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
257 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
298258 ; XOPAVX2-NEXT: retq
299259 %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
300260 %shl = shl <16 x i16> %a, %b
399359 ;
400360 ; XOPAVX1-LABEL: var_rotate_v32i8:
401361 ; XOPAVX1: # BB#0:
402 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
403 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3
404 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
405 ; XOPAVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
406 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
407 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm5, %xmm4
408 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm1
409 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
410 ; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
411 ; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm2
412 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm5, %xmm2
413 ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm4, %xmm3
414 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
362 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
363 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
364 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
365 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
415366 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
416 ; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
417367 ; XOPAVX1-NEXT: retq
418368 ;
419369 ; XOPAVX2-LABEL: var_rotate_v32i8:
420370 ; XOPAVX2: # BB#0:
421 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
422 ; XOPAVX2-NEXT: vpsubb %ymm1, %ymm2, %ymm2
423 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
424 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
425 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm4, %xmm3
426 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm1
427 ; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
428 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
429 ; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
430 ; XOPAVX2-NEXT: vpsubb %xmm3, %xmm5, %xmm3
431 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm4, %xmm3
432 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm5, %xmm2
433 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
434 ; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
435 ; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
371 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
372 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
373 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
374 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
375 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
436376 ; XOPAVX2-NEXT: retq
437377 %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
438378 %shl = shl <32 x i8> %a, %b
788728 ;
789729 ; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
790730 ; XOPAVX1: # BB#0:
791 ; XOPAVX1-NEXT: vpsllq $14, %xmm0, %xmm1
792 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
793 ; XOPAVX1-NEXT: vpsllq $14, %xmm2, %xmm3
794 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
795 ; XOPAVX1-NEXT: vpsrlq $50, %xmm0, %xmm0
796 ; XOPAVX1-NEXT: vpsrlq $50, %xmm2, %xmm2
797 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
798 ; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
731 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
732 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
733 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
734 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
799735 ; XOPAVX1-NEXT: retq
800736 ;
801737 ; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
802738 ; XOPAVX2: # BB#0:
803 ; XOPAVX2-NEXT: vpsllq $14, %ymm0, %ymm1
804 ; XOPAVX2-NEXT: vpsrlq $50, %ymm0, %ymm0
805 ; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
739 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
740 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
741 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
742 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
806743 ; XOPAVX2-NEXT: retq
807744 %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
808745 %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
832769 ;
833770 ; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
834771 ; XOPAVX1: # BB#0:
835 ; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm1
836 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
837 ; XOPAVX1-NEXT: vpslld $4, %xmm2, %xmm3
838 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
839 ; XOPAVX1-NEXT: vpsrld $28, %xmm0, %xmm0
840 ; XOPAVX1-NEXT: vpsrld $28, %xmm2, %xmm2
841 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
842 ; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
772 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
773 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
774 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
775 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
843776 ; XOPAVX1-NEXT: retq
844777 ;
845778 ; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
846779 ; XOPAVX2: # BB#0:
847 ; XOPAVX2-NEXT: vpslld $4, %ymm0, %ymm1
848 ; XOPAVX2-NEXT: vpsrld $28, %ymm0, %ymm0
849 ; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
780 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
781 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
782 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
783 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
850784 ; XOPAVX2-NEXT: retq
851785 %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
852786 %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
876810 ;
877811 ; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
878812 ; XOPAVX1: # BB#0:
879 ; XOPAVX1-NEXT: vpsllw $7, %xmm0, %xmm1
880 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
881 ; XOPAVX1-NEXT: vpsllw $7, %xmm2, %xmm3
882 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
883 ; XOPAVX1-NEXT: vpsrlw $9, %xmm0, %xmm0
884 ; XOPAVX1-NEXT: vpsrlw $9, %xmm2, %xmm2
885 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
886 ; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
813 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
814 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
815 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
816 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
887817 ; XOPAVX1-NEXT: retq
888818 ;
889819 ; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
890820 ; XOPAVX2: # BB#0:
891 ; XOPAVX2-NEXT: vpsllw $7, %ymm0, %ymm1
892 ; XOPAVX2-NEXT: vpsrlw $9, %ymm0, %ymm0
893 ; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
821 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
822 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
823 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
824 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
894825 ; XOPAVX2-NEXT: retq
895826 %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
896827 %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
928859 ;
929860 ; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
930861 ; XOPAVX1: # BB#0:
931 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
932 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
933 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm3
934 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm4
935 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
936 ; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
937 ; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm2
938 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
939 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
940 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
941 ; XOPAVX1-NEXT: vorps %ymm0, %ymm3, %ymm0
862 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
863 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
864 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
865 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
942866 ; XOPAVX1-NEXT: retq
943867 ;
944868 ; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
945869 ; XOPAVX2: # BB#0:
946 ; XOPAVX2-NEXT: vpsllw $4, %ymm0, %ymm1
947 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
948 ; XOPAVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
949 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
950 ; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
870 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
871 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
872 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
873 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
951874 ; XOPAVX2-NEXT: retq
952875 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
953876 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
988911 ;
989912 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
990913 ; XOPAVX1: # BB#0:
991 ; XOPAVX1-NEXT: vpsllq $15, %xmm0, %xmm1
992 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
993 ; XOPAVX1-NEXT: vpsllq $15, %xmm2, %xmm3
994 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
995 ; XOPAVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
996 ; XOPAVX1-NEXT: vpsrlq $49, %xmm2, %xmm2
997 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
998 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
999 ; XOPAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1000 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1001 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
1002 ; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
914 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
915 ; XOPAVX1-NEXT: vprotq $15, %xmm1, %xmm1
916 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
917 ; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
918 ; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm0
919 ; XOPAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
920 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1003921 ; XOPAVX1-NEXT: retq
1004922 ;
1005923 ; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
1006924 ; XOPAVX2: # BB#0:
1007 ; XOPAVX2-NEXT: vpsllq $15, %ymm0, %ymm1
1008 ; XOPAVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
925 ; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm1
926 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
927 ; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm0
928 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1009929 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1010 ; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
1011 ; XOPAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1012 ; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
1013930 ; XOPAVX2-NEXT: retq
1014931 %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
1015932 %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
1047964 ;
1048965 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
1049966 ; XOPAVX1: # BB#0:
1050 ; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm1
1051 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1052 ; XOPAVX1-NEXT: vpslld $4, %xmm2, %xmm3
1053 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1054 ; XOPAVX1-NEXT: vpsrld $28, %xmm0, %xmm0
1055 ; XOPAVX1-NEXT: vpsrld $28, %xmm2, %xmm2
1056 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
967 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
968 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
969 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
970 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1057971 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1058 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
1059 ; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
1060972 ; XOPAVX1-NEXT: retq
1061973 ;
1062974 ; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
1063975 ; XOPAVX2: # BB#0:
1064 ; XOPAVX2-NEXT: vpslld $4, %ymm0, %ymm1
1065 ; XOPAVX2-NEXT: vpsrld $28, %ymm0, %ymm0
1066 ; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
1067 ; XOPAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
1068 ; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
1069 ; XOPAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1070 ; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
976 ; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
977 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm2
978 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
979 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
980 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
981 ; XOPAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1071982 ; XOPAVX2-NEXT: retq
1072983 %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
1073984 %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
11031014 ;
11041015 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
11051016 ; XOPAVX1: # BB#0:
1106 ; XOPAVX1-NEXT: vpsllw $5, %xmm0, %xmm1
1107 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1108 ; XOPAVX1-NEXT: vpsllw $5, %xmm2, %xmm3
1109 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1110 ; XOPAVX1-NEXT: vpsrlw $11, %xmm0, %xmm0
1111 ; XOPAVX1-NEXT: vpsrlw $11, %xmm2, %xmm2
1112 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1017 ; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
1018 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1019 ; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0
1020 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
11131021 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1114 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
1115 ; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
11161022 ; XOPAVX1-NEXT: retq
11171023 ;
11181024 ; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
11191025 ; XOPAVX2: # BB#0:
1120 ; XOPAVX2-NEXT: vpsllw $5, %ymm0, %ymm1
1121 ; XOPAVX2-NEXT: vpsrlw $11, %ymm0, %ymm0
1026 ; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1
1027 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1028 ; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
1029 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
11221030 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1123 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1124 ; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
11251031 ; XOPAVX2-NEXT: retq
11261032 %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
11271033 %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
11651071 ;
11661072 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
11671073 ; XOPAVX1: # BB#0:
1168 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1169 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
1170 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm3
1171 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm4
1172 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
1173 ; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
1174 ; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm2
1175 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
1176 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
1177 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1074 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
1075 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1076 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
1077 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
11781078 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1179 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm3, %ymm1
1180 ; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
11811079 ; XOPAVX1-NEXT: retq
11821080 ;
11831081 ; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
11841082 ; XOPAVX2: # BB#0:
1185 ; XOPAVX2-NEXT: vpsllw $4, %ymm0, %ymm1
1186 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1187 ; XOPAVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
1083 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
1084 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1085 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
1086 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
11881087 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1189 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1190 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1191 ; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
11921088 ; XOPAVX2-NEXT: retq
11931089 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
11941090 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>