llvm.org GIT mirror llvm / 05f671e
Revert r302678 "[AArch64] Enable use of reduction intrinsics." This caused PR33053. Original commit message: > The new experimental reduction intrinsics can now be used, so I'm enabling this > for AArch64. We will need this for SVE anyway, so it makes sense to do this for > NEON reductions as well. > > The existing code to match shufflevector patterns are replaced with a direct > lowering of the reductions to AArch64-specific nodes. Tests updated with the > new, simpler, representation. > > Differential Revision: https://reviews.llvm.org/D32247 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303115 91177308-0d34-0410-b5e6-96231b3b80d8 Hans Wennborg 3 years ago
9 changed file(s) with 773 addition(s) and 236 deletion(s). Raw diff Collapse all Expand all
552552 setTargetDAGCombine(ISD::INTRINSIC_VOID);
553553 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
554554 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
555 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
555556
556557 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
557558 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
656657 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
657658 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
658659 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
659
660 // Vector reductions
661 for (MVT VT : MVT::integer_valuetypes()) {
662 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
663 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
664 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
665 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
666 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
667 }
668 for (MVT VT : MVT::fp_valuetypes()) {
669 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
670 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
671 }
672660
673661 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
674662 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
26172605 return LowerMUL(Op, DAG);
26182606 case ISD::INTRINSIC_WO_CHAIN:
26192607 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
2620 case ISD::VECREDUCE_ADD:
2621 case ISD::VECREDUCE_SMAX:
2622 case ISD::VECREDUCE_SMIN:
2623 case ISD::VECREDUCE_UMAX:
2624 case ISD::VECREDUCE_UMIN:
2625 case ISD::VECREDUCE_FMAX:
2626 case ISD::VECREDUCE_FMIN:
2627 return LowerVECREDUCE(Op, DAG);
26282608 }
26292609 }
26302610
71477127 return Cmp;
71487128 }
71497129
/// Build the across-lane reduction node \p Op for the vector operand of
/// \p ScalarOp and read the scalar result back out of lane 0.
///
/// \param Op       Opcode of the reduction node to create.
/// \param DL       Debug location attached to the new nodes.
/// \param ScalarOp Node being lowered; its operand 0 is the vector to reduce
///                 and its value type is the type of the returned scalar.
/// \param DAG      SelectionDAG used to create the nodes.
/// \returns An EXTRACT_VECTOR_ELT of lane 0 (i64 index constant) of the
///          reduction node, which itself has the input vector's type.
7150 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
7151 SelectionDAG &DAG) {
7152 SDValue VecOp = ScalarOp.getOperand(0);
7153 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
7154 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
7155 DAG.getConstant(0, DL, MVT::i64));
7156 }
7157
/// Custom lowering for the ISD::VECREDUCE_* nodes.
///
/// Integer ADD/SMAX/SMIN/UMAX/UMIN reductions are rewritten through
/// getReductionSDNode into the matching AArch64ISD node (UADDV, SMAXV, SMINV,
/// UMAXV, UMINV). FMAX/FMIN reductions become INTRINSIC_WO_CHAIN nodes for
/// aarch64_neon_fmaxnmv / aarch64_neon_fminnmv applied to operand 0; both
/// assert that the node carries the no-NaNs flag. Any other opcode is
/// treated as unreachable.
7158 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
7159 SelectionDAG &DAG) const {
7160 SDLoc dl(Op);
7161 switch (Op.getOpcode()) {
7162 case ISD::VECREDUCE_ADD:
7163 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
7164 case ISD::VECREDUCE_SMAX:
7165 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
7166 case ISD::VECREDUCE_SMIN:
7167 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
7168 case ISD::VECREDUCE_UMAX:
7169 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
7170 case ISD::VECREDUCE_UMIN:
7171 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
7172 case ISD::VECREDUCE_FMAX: {
7173 assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
// The intrinsic node produces the scalar directly, so no lane extract here.
7174 return DAG.getNode(
7175 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
7176 DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
7177 Op.getOperand(0));
7178 }
7179 case ISD::VECREDUCE_FMIN: {
7180 assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
7181 return DAG.getNode(
7182 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
7183 DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
7184 Op.getOperand(0));
7185 }
7186 default:
7187 llvm_unreachable("Unhandled reduction");
7188 }
7189 }
7190
71917130 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
71927131 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
71937132 /// specified in the intrinsic calls.
95509489 return SDValue();
95519490 }
95529491
9492 /// This function handles the log2-shuffle pattern produced by the
9493 /// LoopVectorizer for the across vector reduction. It consists of
9494 /// log2(NumVectorElements) steps and, in each step, 2^(s) elements
9495 /// are reduced, where s is an induction variable from 0 to
9496 /// log2(NumVectorElements) - 1.
///
/// \param N   Node being combined; supplies the result type and debug
///            location of the replacement.
/// \param OpV Last operation of the candidate chain; the walk starts from its
///            two operands and moves towards the reduction input.
/// \param Op  Opcode every step of the chain must use (ADD, S/UMAX, S/UMIN,
///            FMAXNUM or FMINNUM).
/// \param DAG SelectionDAG used to create the replacement nodes.
/// \returns For the integer opcodes, lane 0 extracted from the matching
///          AArch64ISD across-lane node; for FMAXNUM/FMINNUM, an
///          INTRINSIC_WO_CHAIN node for fmaxnmv/fminnmv. An empty SDValue is
///          returned when the pattern does not match.
9497 static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
9498 unsigned Op,
9499 SelectionDAG &DAG) {
9500 EVT VTy = OpV->getOperand(0).getValueType();
9501 if (!VTy.isVector())
9502 return SDValue();
9503
// Only 4-element vectors are accepted for the FP min/max forms; the integer
// forms accept 4, 8 or 16 elements.
9504 int NumVecElts = VTy.getVectorNumElements();
9505 if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
9506 if (NumVecElts != 4)
9507 return SDValue();
9508 } else {
9509 if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
9510 return SDValue();
9511 }
9512
9513 int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
9514 SDValue PreOp = OpV;
9515 // Iterate over each step of the across vector reduction.
9516 for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
9517 SDValue CurOp = PreOp.getOperand(0);
9518 SDValue Shuffle = PreOp.getOperand(1);
9519 if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
9520 // Try to swap the 1st and 2nd operand as add and min/max instructions
9521 // are commutative.
9522 CurOp = PreOp.getOperand(1);
9523 Shuffle = PreOp.getOperand(0);
9524 if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
9525 return SDValue();
9526 }
9527
9528 // Check if the input vector is fed by the operator we want to handle,
9529 // except the last step; the very first input vector is not necessarily
9530 // the same operator we are handling.
9531 if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1)))
9532 return SDValue();
9533
9534 // Check if it forms one step of the across vector reduction.
9535 // E.g.,
9536 // %cur = add %1, %0
9537 // %shuffle = vector_shuffle %cur, <2, 3, u, u>
9538 // %pre = add %cur, %shuffle
9539 if (Shuffle.getOperand(0) != CurOp)
9540 return SDValue();
9541
9542 int NumMaskElts = 1 << CurStep;
9543 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
9544 // Check mask values in each step.
9545 // We expect the shuffle mask in each step follows a specific pattern
9546 // denoted here by the <M, U> form, where M is a sequence of integers
9547 // starting from NumMaskElts, increasing by 1, and the number integers
9548 // in M should be NumMaskElts. U is a sequence of UNDEFs and the number
9549 // of undef in U should be NumVecElts - NumMaskElts.
9550 // E.g., for <8 x i16>, mask values in each step should be :
9551 // step 0 : <1,u,u,u,u,u,u,u>
9552 // step 1 : <2,3,u,u,u,u,u,u>
9553 // step 2 : <4,5,6,7,u,u,u,u>
9554 for (int i = 0; i < NumVecElts; ++i)
9555 if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) ||
9556 (i >= NumMaskElts && !(Mask[i] < 0)))
9557 return SDValue();
9558
9559 PreOp = CurOp;
9560 }
// After the walk PreOp is the vector feeding the first reduction step, i.e.
// the value the across-lane node must consume.
9561 unsigned Opcode;
9562 bool IsIntrinsic = false;
9563
9564 switch (Op) {
9565 default:
9566 llvm_unreachable("Unexpected operator for across vector reduction");
9567 case ISD::ADD:
9568 Opcode = AArch64ISD::UADDV;
9569 break;
9570 case ISD::SMAX:
9571 Opcode = AArch64ISD::SMAXV;
9572 break;
9573 case ISD::UMAX:
9574 Opcode = AArch64ISD::UMAXV;
9575 break;
9576 case ISD::SMIN:
9577 Opcode = AArch64ISD::SMINV;
9578 break;
9579 case ISD::UMIN:
9580 Opcode = AArch64ISD::UMINV;
9581 break;
9582 case ISD::FMAXNUM:
9583 Opcode = Intrinsic::aarch64_neon_fmaxnmv;
9584 IsIntrinsic = true;
9585 break;
9586 case ISD::FMINNUM:
9587 Opcode = Intrinsic::aarch64_neon_fminnmv;
9588 IsIntrinsic = true;
9589 break;
9590 }
9591 SDLoc DL(N);
9592
9593 return IsIntrinsic
9594 ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
9595 DAG.getConstant(Opcode, DL, MVT::i32), PreOp)
9596 : DAG.getNode(
9597 ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
9598 DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
9599 DAG.getConstant(0, DL, MVT::i64));
9600 }
9601
9602 /// Target-specific DAG combine for the across vector min/max reductions.
9603 /// This function specifically handles the final clean-up step of the vector
9604 /// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle
9605 /// pattern, which narrows down and finds the final min/max value from all
9606 /// elements of the vector.
9607 /// For example, for a <16 x i8> vector :
9608 /// %svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
9609 /// %smax0 = smax %0, %svn0
9610 /// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
9611 /// %smax1 = smax %smax0, %svn1
9612 /// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
9613 /// %smax2 = smax %smax1, %svn2
9614 /// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
9615 /// %sc = setcc %smax2, %svn3, gt
9616 /// %n0 = extract_vector_elt %sc, #0
9617 /// %n1 = extract_vector_elt %smax2, #0
9618 /// %n2 = extract_vector_elt %smax2, #1
9619 /// %result = select %n0, %n1, %n2
9620 /// becomes :
9621 /// %1 = smaxv %0
9622 /// %result = extract_vector_elt %1, 0
///
/// \param N         The SELECT node being combined (operands: condition, true
///                  value, false value).
/// \param DAG       SelectionDAG used to create the replacement.
/// \param Subtarget Queried for NEON availability; without NEON nothing is
///                  matched.
/// \returns The result of tryMatchAcrossLaneShuffleForReduction when all
///          structural checks pass, otherwise an empty SDValue.
9623 static SDValue
9624 performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
9625 const AArch64Subtarget *Subtarget) {
9626 if (!Subtarget->hasNEON())
9627 return SDValue();
9628
9629 SDValue N0 = N->getOperand(0);
9630 SDValue IfTrue = N->getOperand(1);
9631 SDValue IfFalse = N->getOperand(2);
9632
9633 // Check if the SELECT merges up the final result of the min/max
9634 // from a vector.
9635 if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9636 IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9637 IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9638 return SDValue();
9639
9640 // Expect N0 is fed by SETCC.
9641 SDValue SetCC = N0.getOperand(0);
9642 EVT SetCCVT = SetCC.getValueType();
9643 if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
9644 SetCCVT.getVectorElementType() != MVT::i1)
9645 return SDValue();
9646
9647 SDValue VectorOp = SetCC.getOperand(0);
9648 unsigned Op = VectorOp->getOpcode();
9649 // Check if the input vector is fed by the operator we want to handle.
9650 if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN &&
9651 Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM)
9652 return SDValue();
9653
9654 EVT VTy = VectorOp.getValueType();
9655 if (!VTy.isVector())
9656 return SDValue();
9657
9658 if (VTy.getSizeInBits() < 64)
9659 return SDValue();
9660
// Element types handled: f32 for the FP forms, i8/i16/i32 for the integer
// forms.
9661 EVT EltTy = VTy.getVectorElementType();
9662 if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
9663 if (EltTy != MVT::f32)
9664 return SDValue();
9665 } else {
9666 if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
9667 return SDValue();
9668 }
9669
9670 // Check if extracting from the same vector.
9671 // For example,
9672 // %sc = setcc %vector, %svn1, gt
9673 // %n0 = extract_vector_elt %sc, #0
9674 // %n1 = extract_vector_elt %vector, #0
9675 // %n2 = extract_vector_elt %vector, #1
9676 if (!(VectorOp == IfTrue->getOperand(0) &&
9677 VectorOp == IfFalse->getOperand(0)))
9678 return SDValue();
9679
9680 // Check if the condition code is matched with the operator type.
9681 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
9682 if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
9683 (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
9684 (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
9685 (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) ||
9686 (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE &&
9687 CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT &&
9688 CC != ISD::SETGE) ||
9689 (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE &&
9690 CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT &&
9691 CC != ISD::SETLE))
9692 return SDValue();
9693
9694 // Expect to check only lane 0 from the vector SETCC.
9695 if (!isNullConstant(N0.getOperand(1)))
9696 return SDValue();
9697
9698 // Expect to extract the true value from lane 0.
9699 if (!isNullConstant(IfTrue.getOperand(1)))
9700 return SDValue();
9701
9702 // Expect to extract the false value from lane 1.
9703 if (!isOneConstant(IfFalse.getOperand(1)))
9704 return SDValue();
9705
// The SETCC stands in for the last min/max step, so it is passed as the
// starting node of the shuffle-chain match.
9706 return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
9707 }
9708
9709 /// Target-specific DAG combine for the across vector add reduction.
9710 /// This function specifically handles the final clean-up step of the vector
9711 /// add reduction produced by the LoopVectorizer. It is the log2-shuffle
9712 /// pattern, which adds all elements of a vector together.
9713 /// For example, for a <4 x i32> vector :
9714 /// %1 = vector_shuffle %0, <2,3,u,u>
9715 /// %2 = add %0, %1
9716 /// %3 = vector_shuffle %2, <1,u,u,u>
9717 /// %4 = add %2, %3
9718 /// %result = extract_vector_elt %4, 0
9719 /// becomes :
9720 /// %5 = uaddv %0
9721 /// %result = extract_vector_elt %5, 0
///
/// \param N         The EXTRACT_VECTOR_ELT node being combined (operand 0 is
///                  the vector, operand 1 the lane index).
/// \param DAG       SelectionDAG used to create the replacement.
/// \param Subtarget Queried for NEON availability; without NEON nothing is
///                  matched.
/// \returns The result of tryMatchAcrossLaneShuffleForReduction for ISD::ADD
///          when the checks below pass, otherwise an empty SDValue.
9722 static SDValue
9723 performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
9724 const AArch64Subtarget *Subtarget) {
9725 if (!Subtarget->hasNEON())
9726 return SDValue();
9727 SDValue N0 = N->getOperand(0);
9728 SDValue N1 = N->getOperand(1);
9729
9730 // Check if the input vector is fed by the ADD.
9731 if (N0->getOpcode() != ISD::ADD)
9732 return SDValue();
9733
9734 // The vector extract index must be constant zero because we only expect the
9735 // final result of the reduction to be placed in lane 0.
9736 if (!isNullConstant(N1))
9737 return SDValue();
9738
9739 EVT VTy = N0.getValueType();
9740 if (!VTy.isVector())
9741 return SDValue();
9742
// Only i8/i16/i32 element types are handled.
9743 EVT EltTy = VTy.getVectorElementType();
9744 if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
9745 return SDValue();
9746
// Vectors narrower than 64 bits are rejected.
9747 if (VTy.getSizeInBits() < 64)
9748 return SDValue();
9749
9750 return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
9751 }
95539752
95549753 /// Target-specific DAG combine function for NEON load/store intrinsics
95559754 /// to merge base address updates.
1022810427 return performBitcastCombine(N, DCI, DAG);
1022910428 case ISD::CONCAT_VECTORS:
1023010429 return performConcatVectorsCombine(N, DCI, DAG);
10231 case ISD::SELECT:
10232 return performSelectCombine(N, DCI);
10430 case ISD::SELECT: {
10431 SDValue RV = performSelectCombine(N, DCI);
10432 if (!RV.getNode())
10433 RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
10434 return RV;
10435 }
1023310436 case ISD::VSELECT:
1023410437 return performVSelectCombine(N, DCI.DAG);
1023510438 case ISD::LOAD:
1025110454 return performNVCASTCombine(N);
1025210455 case ISD::INSERT_VECTOR_ELT:
1025310456 return performPostLD1Combine(N, DCI, true);
10457 case ISD::EXTRACT_VECTOR_ELT:
10458 return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
1025410459 case ISD::INTRINSIC_VOID:
1025510460 case ISD::INTRINSIC_W_CHAIN:
1025610461 switch (cast(N->getOperand(1))->getZExtValue()) {
1047010675 case ISD::BITCAST:
1047110676 ReplaceBITCASTResults(N, Results, DAG);
1047210677 return;
10473 case ISD::VECREDUCE_ADD:
10474 case ISD::VECREDUCE_SMAX:
10475 case ISD::VECREDUCE_SMIN:
10476 case ISD::VECREDUCE_UMAX:
10477 case ISD::VECREDUCE_UMIN:
10478 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
10479 return;
10480
1048110678 case AArch64ISD::SADDV:
1048210679 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
1048310680 return;
567567 SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
568568 SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
569569 SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
570 SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
571570
572571 SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
573572 std::vector *Created) const override;
/// Returns the value reported by ST->getMaxPrefetchIterationsAhead().
768768 unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
769769 return ST->getMaxPrefetchIterationsAhead();
770770 }
771
/// Reports whether the reduction intrinsic form is used for a reduction with
/// the given opcode over vector type \p Ty.
///
/// \param Opcode IR instruction opcode of the reduction operation.
/// \param Ty     Type being reduced; asserted to be a vector type.
/// \param Flags  Reduction flags; only NoNaN is consulted (for FCmp).
/// \returns false for FAdd, FMul, And, Or, Xor and Mul; for Add, true when
///          the vector is at least 128 bits wide in total; for ICmp, true
///          when the scalar element is narrower than 64 bits; for FCmp, the
///          value of Flags.NoNaN. Any other opcode is unreachable.
772 bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
773 TTI::ReductionFlags Flags) const {
774 assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
775 switch (Opcode) {
776 case Instruction::FAdd:
777 case Instruction::FMul:
778 case Instruction::And:
779 case Instruction::Or:
780 case Instruction::Xor:
781 case Instruction::Mul:
782 return false;
783 case Instruction::Add:
784 return Ty->getScalarSizeInBits() * Ty->getVectorNumElements() >= 128;
785 case Instruction::ICmp:
786 return Ty->getScalarSizeInBits() < 64;
787 case Instruction::FCmp:
788 return Flags.NoNaN;
789 default:
790 llvm_unreachable("Unhandled reduction opcode");
791 }
792 return false;
793 }
/// Always returns false, regardless of \p II.
140140 bool shouldExpandReduction(const IntrinsicInst *II) const {
141141 return false;
142142 }
143
144 bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
145 TTI::ReductionFlags Flags) const;
146143 /// @}
147144 };
148145
0 ; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic | FileCheck %s
1
2 ; Function Attrs: nounwind readnone
3 declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
4 declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
5 declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
6 declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
71
82 define i8 @add_B(<16 x i8>* %arr) {
93 ; CHECK-LABEL: add_B
104 ; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
115 %bin.rdx = load <16 x i8>, <16 x i8>* %arr
12 %r = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %bin.rdx)
6 %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32>
7 %bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0
8 %rdx.shuf = shufflevector <16 x i8> %bin.rdx0, <16 x i8> undef, <16 x i32>
9 %bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf
10 %rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32>
11 %bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12
12 %rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32>
13 %bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13
14 %r = extractelement <16 x i8> %bin.rdx14, i32 0
1315 ret i8 %r
1416 }
1517
1719 ; CHECK-LABEL: add_H
1820 ; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
1921 %bin.rdx = load <8 x i16>, <8 x i16>* %arr
20 %r = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %bin.rdx)
22 %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32>
23 %bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf
24 %rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32>
25 %bin.rdx13 = add <8 x i16> %bin.rdx11, %rdx.shuf12
26 %rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32>
27 %bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13
28 %r = extractelement <8 x i16> %bin.rdx14, i32 0
2129 ret i16 %r
2230 }
2331
2533 ; CHECK-LABEL: add_S
2634 ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
2735 %bin.rdx = load <4 x i32>, <4 x i32>* %arr
28 %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %bin.rdx)
36 %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32>
37 %bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf
38 %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32>
39 %bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12
40 %r = extractelement <4 x i32> %bin.rdx13, i32 0
2941 ret i32 %r
3042 }
3143
3345 ; CHECK-LABEL: add_D
3446 ; CHECK-NOT: addv
3547 %bin.rdx = load <2 x i64>, <2 x i64>* %arr
36 %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %bin.rdx)
48 %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32>
49 %bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0
50 %r = extractelement <2 x i64> %bin.rdx0, i32 0
3751 ret i64 %r
3852 }
39
40 declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
4153
4254 define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias nocapture readonly %arg2) {
4355 ; CHECK-LABEL: oversized_ADDV_256
5365 %7 = icmp slt <8 x i32> %6, zeroinitializer
5466 %8 = sub nsw <8 x i32> zeroinitializer, %6
5567 %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
56 %r = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %9)
57 ret i32 %r
68 %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32>
69 %bin.rdx = add <8 x i32> %9, %rdx.shuf
70 %rdx.shuf1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32>
71 %bin.rdx2 = add <8 x i32> %bin.rdx, %rdx.shuf1
72 %rdx.shuf3 = shufflevector <8 x i32> %bin.rdx2, <8 x i32> undef, <8 x i32>
73 %bin.rdx4 = add <8 x i32> %bin.rdx2, %rdx.shuf3
74 %10 = extractelement <8 x i32> %bin.rdx4, i32 0
75 ret i32 %10
5876 }
59
60 declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
6177
6278 define i32 @oversized_ADDV_512(<16 x i32>* %arr) {
6379 ; CHECK-LABEL: oversized_ADDV_512
6480 ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
6581 %bin.rdx = load <16 x i32>, <16 x i32>* %arr
66 %r = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> %bin.rdx)
82
83 %rdx.shuf0 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32>
84 %bin.rdx0 = add <16 x i32> %bin.rdx, %rdx.shuf0
85
86 %rdx.shuf = shufflevector <16 x i32> %bin.rdx0, <16 x i32> undef, <16 x i32>
87 %bin.rdx11 = add <16 x i32> %bin.rdx0, %rdx.shuf
88
89 %rdx.shuf12 = shufflevector <16 x i32> %bin.rdx11, <16 x i32> undef, <16 x i32>
90 %bin.rdx13 = add <16 x i32> %bin.rdx11, %rdx.shuf12
91
92 %rdx.shuf13 = shufflevector <16 x i32> %bin.rdx13, <16 x i32> undef, <16 x i32>
93 %bin.rdx14 = add <16 x i32> %bin.rdx13, %rdx.shuf13
94
95 %r = extractelement <16 x i32> %bin.rdx14, i32 0
6796 ret i32 %r
6897 }
0 ; RUN: llc < %s -mtriple=aarch64-linux--gnu -aarch64-neon-syntax=generic | FileCheck %s
11
22 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
3
4 declare i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8>)
5 declare i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16>)
6 declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32>)
7 declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>)
8 declare i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16>)
9 declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32>)
10
11 declare i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8>)
12 declare i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16>)
13 declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32>)
14 declare i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8>)
15 declare i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16>)
16 declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32>)
17
18 declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float>)
19 declare float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float>)
203
214 ; CHECK-LABEL: smax_B
225 ; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
236 define i8 @smax_B(<16 x i8>* nocapture readonly %arr) {
247 %arr.load = load <16 x i8>, <16 x i8>* %arr
25 %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %arr.load)
8 %rdx.shuf = shufflevector <16 x i8> %arr.load, <16 x i8> undef, <16 x i32>
9 %rdx.minmax.cmp22 = icmp sgt <16 x i8> %arr.load, %rdx.shuf
10 %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %arr.load, <16 x i8> %rdx.shuf
11 %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32>
12 %rdx.minmax.cmp25 = icmp sgt <16 x i8> %rdx.minmax.select23, %rdx.shuf24
13 %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
14 %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32>
15 %rdx.minmax.cmp28 = icmp sgt <16 x i8> %rdx.minmax.select26, %rdx.shuf27
16 %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
17 %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32>
18 %rdx.minmax.cmp31 = icmp sgt <16 x i8> %rdx.minmax.select29, %rdx.shuf30
19 %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
20 %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
21 %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
22 %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
2623 ret i8 %r
2724 }
2825
2926 ; CHECK-LABEL: smax_H
3027 ; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
3128 define i16 @smax_H(<8 x i16>* nocapture readonly %arr) {
32 %arr.load = load <8 x i16>, <8 x i16>* %arr
33 %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %arr.load)
29 %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
30 %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32>
31 %rdx.minmax.cmp23 = icmp sgt <8 x i16> %rdx.minmax.select, %rdx.shuf
32 %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
33 %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32>
34 %rdx.minmax.cmp26 = icmp sgt <8 x i16> %rdx.minmax.select24, %rdx.shuf25
35 %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
36 %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32>
37 %rdx.minmax.cmp29 = icmp sgt <8 x i16> %rdx.minmax.select27, %rdx.shuf28
38 %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
39 %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
40 %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
41 %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
3442 ret i16 %r
3543 }
3644
3745 ; CHECK-LABEL: smax_S
3846 ; CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
3947 define i32 @smax_S(<4 x i32> * nocapture readonly %arr) {
40 %arr.load = load <4 x i32>, <4 x i32>* %arr
41 %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %arr.load)
42 ret i32 %r
43 }
48 %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
49 %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32>
50 %rdx.minmax.cmp18 = icmp sgt <4 x i32> %rdx.minmax.select, %rdx.shuf
51 %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
52 %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32>
53 %rdx.minmax.cmp21 = icmp sgt <4 x i32> %rdx.minmax.select19, %rdx.shuf20
54 %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
55 %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
56 %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
57 %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
58 ret i32 %r
59 }
60
61 ; CHECK-LABEL: smax_D
62 ; CHECK-NOT: smaxv
63 define i64 @smax_D(<2 x i64>* nocapture readonly %arr) {
64 %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
65 %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32>
66 %rdx.minmax.cmp18 = icmp sgt <2 x i64> %rdx.minmax.select, %rdx.shuf
67 %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
68 %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
69 %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
70 %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
71 ret i64 %r
72 }
73
4474
4575 ; CHECK-LABEL: umax_B
4676 ; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
4777 define i8 @umax_B(<16 x i8>* nocapture readonly %arr) {
48 %arr.load = load <16 x i8>, <16 x i8>* %arr
49 %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %arr.load)
78 %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr
79 %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32>
80 %rdx.minmax.cmp22 = icmp ugt <16 x i8> %rdx.minmax.select, %rdx.shuf
81 %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf
82 %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32>
83 %rdx.minmax.cmp25 = icmp ugt <16 x i8> %rdx.minmax.select23, %rdx.shuf24
84 %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
85 %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32>
86 %rdx.minmax.cmp28 = icmp ugt <16 x i8> %rdx.minmax.select26, %rdx.shuf27
87 %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
88 %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32>
89 %rdx.minmax.cmp31 = icmp ugt <16 x i8> %rdx.minmax.select29, %rdx.shuf30
90 %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
91 %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
92 %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
93 %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
5094 ret i8 %r
5195 }
5296
5397 ; CHECK-LABEL: umax_H
5498 ; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
5599 define i16 @umax_H(<8 x i16>* nocapture readonly %arr) {
56 %arr.load = load <8 x i16>, <8 x i16>* %arr
57 %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %arr.load)
100 %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
101 %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32>
102 %rdx.minmax.cmp23 = icmp ugt <8 x i16> %rdx.minmax.select, %rdx.shuf
103 %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
104 %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32>
105 %rdx.minmax.cmp26 = icmp ugt <8 x i16> %rdx.minmax.select24, %rdx.shuf25
106 %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
107 %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32>
108 %rdx.minmax.cmp29 = icmp ugt <8 x i16> %rdx.minmax.select27, %rdx.shuf28
109 %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
110 %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
111 %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
112 %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
58113 ret i16 %r
59114 }
60115
61116 ; CHECK-LABEL: umax_S
62117 ; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
63118 define i32 @umax_S(<4 x i32>* nocapture readonly %arr) {
64 %arr.load = load <4 x i32>, <4 x i32>* %arr
65 %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %arr.load)
66 ret i32 %r
67 }
119 %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
120 %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32>
121 %rdx.minmax.cmp18 = icmp ugt <4 x i32> %rdx.minmax.select, %rdx.shuf
122 %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
123 %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32>
124 %rdx.minmax.cmp21 = icmp ugt <4 x i32> %rdx.minmax.select19, %rdx.shuf20
125 %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
126 %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
127 %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
128 %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
129 ret i32 %r
130 }
131
132 ; CHECK-LABEL: umax_D
133 ; CHECK-NOT: umaxv
134 define i64 @umax_D(<2 x i64>* nocapture readonly %arr) {
135 %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
136 %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32>
137 %rdx.minmax.cmp18 = icmp ugt <2 x i64> %rdx.minmax.select, %rdx.shuf
138 %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
139 %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
140 %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
141 %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
142 ret i64 %r
143 }
144
68145
69146 ; CHECK-LABEL: smin_B
70147 ; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b
71148 define i8 @smin_B(<16 x i8>* nocapture readonly %arr) {
72 %arr.load = load <16 x i8>, <16 x i8>* %arr
73 %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %arr.load)
149 %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr
150 %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32>
151 %rdx.minmax.cmp22 = icmp slt <16 x i8> %rdx.minmax.select, %rdx.shuf
152 %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf
153 %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32>
154 %rdx.minmax.cmp25 = icmp slt <16 x i8> %rdx.minmax.select23, %rdx.shuf24
155 %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
156 %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32>
157 %rdx.minmax.cmp28 = icmp slt <16 x i8> %rdx.minmax.select26, %rdx.shuf27
158 %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
159 %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32>
160 %rdx.minmax.cmp31 = icmp slt <16 x i8> %rdx.minmax.select29, %rdx.shuf30
161 %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
162 %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
163 %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
164 %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
74165 ret i8 %r
75166 }
76167
77168 ; CHECK-LABEL: smin_H
78169 ; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h
79170 define i16 @smin_H(<8 x i16>* nocapture readonly %arr) {
80 %arr.load = load <8 x i16>, <8 x i16>* %arr
81 %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %arr.load)
171 %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
172 %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32>
173 %rdx.minmax.cmp23 = icmp slt <8 x i16> %rdx.minmax.select, %rdx.shuf
174 %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
175 %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32>
176 %rdx.minmax.cmp26 = icmp slt <8 x i16> %rdx.minmax.select24, %rdx.shuf25
177 %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
178 %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32>
179 %rdx.minmax.cmp29 = icmp slt <8 x i16> %rdx.minmax.select27, %rdx.shuf28
180 %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
181 %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
182 %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
183 %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
82184 ret i16 %r
83185 }
84186
85187 ; CHECK-LABEL: smin_S
86188 ; CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s
87189 define i32 @smin_S(<4 x i32>* nocapture readonly %arr) {
88 %arr.load = load <4 x i32>, <4 x i32>* %arr
89 %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %arr.load)
90 ret i32 %r
91 }
190 %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
191 %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32>
192 %rdx.minmax.cmp18 = icmp slt <4 x i32> %rdx.minmax.select, %rdx.shuf
193 %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
194 %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32>
195 %rdx.minmax.cmp21 = icmp slt <4 x i32> %rdx.minmax.select19, %rdx.shuf20
196 %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
197 %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
198 %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
199 %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
200 ret i32 %r
201 }
202
203 ; CHECK-LABEL: smin_D
204 ; CHECK-NOT: sminv
205 define i64 @smin_D(<2 x i64>* nocapture readonly %arr) {
206 %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
207 %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32>
208 %rdx.minmax.cmp18 = icmp slt <2 x i64> %rdx.minmax.select, %rdx.shuf
209 %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
210 %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
211 %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
212 %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
213 ret i64 %r
214 }
215
92216
93217 ; CHECK-LABEL: umin_B
94218 ; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b
95219 define i8 @umin_B(<16 x i8>* nocapture readonly %arr) {
96 %arr.load = load <16 x i8>, <16 x i8>* %arr
97 %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %arr.load)
220 %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr
221 %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32>
222 %rdx.minmax.cmp22 = icmp ult <16 x i8> %rdx.minmax.select, %rdx.shuf
223 %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf
224 %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32>
225 %rdx.minmax.cmp25 = icmp ult <16 x i8> %rdx.minmax.select23, %rdx.shuf24
226 %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
227 %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32>
228 %rdx.minmax.cmp28 = icmp ult <16 x i8> %rdx.minmax.select26, %rdx.shuf27
229 %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
230 %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32>
231 %rdx.minmax.cmp31 = icmp ult <16 x i8> %rdx.minmax.select29, %rdx.shuf30
232 %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
233 %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
234 %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
235 %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
98236 ret i8 %r
99237 }
100238
101239 ; CHECK-LABEL: umin_H
102240 ; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h
103241 define i16 @umin_H(<8 x i16>* nocapture readonly %arr) {
104 %arr.load = load <8 x i16>, <8 x i16>* %arr
105 %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %arr.load)
242 %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
243 %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32>
244 %rdx.minmax.cmp23 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf
245 %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
246 %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32>
247 %rdx.minmax.cmp26 = icmp ult <8 x i16> %rdx.minmax.select24, %rdx.shuf25
248 %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
249 %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32>
250 %rdx.minmax.cmp29 = icmp ult <8 x i16> %rdx.minmax.select27, %rdx.shuf28
251 %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
252 %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
253 %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
254 %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
106255 ret i16 %r
107256 }
108257
109258 ; CHECK-LABEL: umin_S
110259 ; CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s
111260 define i32 @umin_S(<4 x i32>* nocapture readonly %arr) {
112 %arr.load = load <4 x i32>, <4 x i32>* %arr
113 %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %arr.load)
114 ret i32 %r
261 %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
262 %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32>
263 %rdx.minmax.cmp18 = icmp ult <4 x i32> %rdx.minmax.select, %rdx.shuf
264 %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
265 %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32>
266 %rdx.minmax.cmp21 = icmp ult <4 x i32> %rdx.minmax.select19, %rdx.shuf20
267 %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
268 %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
269 %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
270 %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
271 ret i32 %r
272 }
273
274 ; CHECK-LABEL: umin_D
275 ; CHECK-NOT: uminv
276 define i64 @umin_D(<2 x i64>* nocapture readonly %arr) {
277 %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
278 %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32>
279 %rdx.minmax.cmp18 = icmp ult <2 x i64> %rdx.minmax.select, %rdx.shuf
280 %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
281 %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
282 %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
283 %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
284 ret i64 %r
115285 }
116286
117287 ; CHECK-LABEL: fmaxnm_S
118288 ; CHECK: fmaxnmv
119289 define float @fmaxnm_S(<4 x float>* nocapture readonly %arr) {
120 %arr.load = load <4 x float>, <4 x float>* %arr
121 %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %arr.load)
290 %rdx.minmax.select = load <4 x float>, <4 x float>* %arr
291 %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32>
292 %rdx.minmax.cmp = fcmp fast oge <4 x float> %rdx.minmax.select, %rdx.shuf
293 %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf
294 %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32>
295 %rdx.minmax.cmp1 = fcmp fast oge <4 x float> %rdx.minmax.select1, %rdx.shuf1
296 %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0
297 %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0
298 %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1
299 %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt
122300 ret float %r
123301 }
124302
125303 ; CHECK-LABEL: fminnm_S
126304 ; CHECK: fminnmv
127305 define float @fminnm_S(<4 x float>* nocapture readonly %arr) {
128 %arr.load = load <4 x float>, <4 x float>* %arr
129 %r = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %arr.load)
306 %rdx.minmax.select = load <4 x float>, <4 x float>* %arr
307 %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32>
308 %rdx.minmax.cmp = fcmp fast ole <4 x float> %rdx.minmax.select, %rdx.shuf
309 %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf
310 %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32>
311 %rdx.minmax.cmp1 = fcmp fast ole <4 x float> %rdx.minmax.select1, %rdx.shuf1
312 %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0
313 %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0
314 %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1
315 %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt
130316 ret float %r
131317 }
132
133 declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>)
134318
135319 define i16 @oversized_umax_256(<16 x i16>* nocapture readonly %arr) {
136320 ; CHECK-LABEL: oversized_umax_256
137321 ; CHECK: umax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
138322 ; CHECK: umaxv {{h[0-9]+}}, [[V0]]
139 %arr.load = load <16 x i16>, <16 x i16>* %arr
140 %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> %arr.load)
141 ret i16 %r
142 }
143
144 declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32>)
323 %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr
324 %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32>
325 %rdx.minmax.cmp22 = icmp ugt <16 x i16> %rdx.minmax.select, %rdx.shuf
326 %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf
327 %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32>
328 %rdx.minmax.cmp25 = icmp ugt <16 x i16> %rdx.minmax.select23, %rdx.shuf24
329 %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
330 %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32>
331 %rdx.minmax.cmp28 = icmp ugt <16 x i16> %rdx.minmax.select26, %rdx.shuf27
332 %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
333 %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32>
334 %rdx.minmax.cmp31 = icmp ugt <16 x i16> %rdx.minmax.select29, %rdx.shuf30
335 %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
336 %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
337 %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
338 %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
339 ret i16 %r
340 }
145341
146342 define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr) {
147343 ; CHECK-LABEL: oversized_umax_512
150346 ; CHECK-NEXT: umax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
151347 ; CHECK-NEXT: umaxv {{s[0-9]+}}, [[V0]]
152348 %arr.load = load <16 x i32>, <16 x i32>* %arr
153 %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> %arr.load)
154 ret i32 %r
155 }
156
157 declare i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16>)
349 %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32>
350 %rdx.minmax.cmp22 = icmp ugt <16 x i32> %arr.load, %rdx.shuf
351 %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
352 %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32>
353 %rdx.minmax.cmp25 = icmp ugt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
354 %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
355 %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32>
356 %rdx.minmax.cmp28 = icmp ugt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
357 %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
358 %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32>
359 %rdx.minmax.cmp31 = icmp ugt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
360 %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
361 %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
362 %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
363 %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
364 ret i32 %r
365 }
158366
159367 define i16 @oversized_umin_256(<16 x i16>* nocapture readonly %arr) {
160368 ; CHECK-LABEL: oversized_umin_256
161369 ; CHECK: umin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
162370 ; CHECK: uminv {{h[0-9]+}}, [[V0]]
163 %arr.load = load <16 x i16>, <16 x i16>* %arr
164 %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> %arr.load)
165 ret i16 %r
166 }
167
168 declare i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32>)
371 %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr
372 %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32>
373 %rdx.minmax.cmp22 = icmp ult <16 x i16> %rdx.minmax.select, %rdx.shuf
374 %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf
375 %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32>
376 %rdx.minmax.cmp25 = icmp ult <16 x i16> %rdx.minmax.select23, %rdx.shuf24
377 %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
378 %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32>
379 %rdx.minmax.cmp28 = icmp ult <16 x i16> %rdx.minmax.select26, %rdx.shuf27
380 %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
381 %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32>
382 %rdx.minmax.cmp31 = icmp ult <16 x i16> %rdx.minmax.select29, %rdx.shuf30
383 %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
384 %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
385 %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
386 %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
387 ret i16 %r
388 }
169389
170390 define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr) {
171391 ; CHECK-LABEL: oversized_umin_512
174394 ; CHECK-NEXT: umin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
175395 ; CHECK-NEXT: uminv {{s[0-9]+}}, [[V0]]
176396 %arr.load = load <16 x i32>, <16 x i32>* %arr
177 %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> %arr.load)
178 ret i32 %r
179 }
180
181 declare i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16>)
397 %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32>
398 %rdx.minmax.cmp22 = icmp ult <16 x i32> %arr.load, %rdx.shuf
399 %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
400 %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32>
401 %rdx.minmax.cmp25 = icmp ult <16 x i32> %rdx.minmax.select23, %rdx.shuf24
402 %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
403 %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32>
404 %rdx.minmax.cmp28 = icmp ult <16 x i32> %rdx.minmax.select26, %rdx.shuf27
405 %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
406 %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32>
407 %rdx.minmax.cmp31 = icmp ult <16 x i32> %rdx.minmax.select29, %rdx.shuf30
408 %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
409 %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
410 %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
411 %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
412 ret i32 %r
413 }
182414
183415 define i16 @oversized_smax_256(<16 x i16>* nocapture readonly %arr) {
184416 ; CHECK-LABEL: oversized_smax_256
185417 ; CHECK: smax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
186418 ; CHECK: smaxv {{h[0-9]+}}, [[V0]]
187419 %arr.load = load <16 x i16>, <16 x i16>* %arr
188 %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> %arr.load)
189 ret i16 %r
190 }
191
192 declare i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32>)
420 %rdx.shuf = shufflevector <16 x i16> %arr.load, <16 x i16> undef, <16 x i32>
421 %rdx.minmax.cmp22 = icmp sgt <16 x i16> %arr.load, %rdx.shuf
422 %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %arr.load, <16 x i16> %rdx.shuf
423 %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32>
424 %rdx.minmax.cmp25 = icmp sgt <16 x i16> %rdx.minmax.select23, %rdx.shuf24
425 %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
426 %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32>
427 %rdx.minmax.cmp28 = icmp sgt <16 x i16> %rdx.minmax.select26, %rdx.shuf27
428 %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
429 %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32>
430 %rdx.minmax.cmp31 = icmp sgt <16 x i16> %rdx.minmax.select29, %rdx.shuf30
431 %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
432 %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
433 %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
434 %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
435 ret i16 %r
436 }
193437
194438 define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr) {
195439 ; CHECK-LABEL: oversized_smax_512
198442 ; CHECK-NEXT: smax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
199443 ; CHECK-NEXT: smaxv {{s[0-9]+}}, [[V0]]
200444 %arr.load = load <16 x i32>, <16 x i32>* %arr
201 %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> %arr.load)
202 ret i32 %r
203 }
204
205 declare i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16>)
445 %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32>
446 %rdx.minmax.cmp22 = icmp sgt <16 x i32> %arr.load, %rdx.shuf
447 %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
448 %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32>
449 %rdx.minmax.cmp25 = icmp sgt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
450 %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
451 %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32>
452 %rdx.minmax.cmp28 = icmp sgt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
453 %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
454 %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32>
455 %rdx.minmax.cmp31 = icmp sgt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
456 %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
457 %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
458 %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
459 %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
460 ret i32 %r
461 }
206462
207463 define i16 @oversized_smin_256(<16 x i16>* nocapture readonly %arr) {
208464 ; CHECK-LABEL: oversized_smin_256
209465 ; CHECK: smin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
210466 ; CHECK: sminv {{h[0-9]+}}, [[V0]]
211 %arr.load = load <16 x i16>, <16 x i16>* %arr
212 %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> %arr.load)
213 ret i16 %r
214 }
215
216 declare i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32>)
467 %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr
468 %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32>
469 %rdx.minmax.cmp22 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf
470 %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf
471 %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32>
472 %rdx.minmax.cmp25 = icmp slt <16 x i16> %rdx.minmax.select23, %rdx.shuf24
473 %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
474 %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32>
475 %rdx.minmax.cmp28 = icmp slt <16 x i16> %rdx.minmax.select26, %rdx.shuf27
476 %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
477 %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32>
478 %rdx.minmax.cmp31 = icmp slt <16 x i16> %rdx.minmax.select29, %rdx.shuf30
479 %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
480 %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
481 %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
482 %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
483 ret i16 %r
484 }
217485
218486 define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr) {
219487 ; CHECK-LABEL: oversized_smin_512
222490 ; CHECK-NEXT: smin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
223491 ; CHECK-NEXT: sminv {{s[0-9]+}}, [[V0]]
224492 %arr.load = load <16 x i32>, <16 x i32>* %arr
225 %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> %arr.load)
226 ret i32 %r
227 }
493 %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32>
494 %rdx.minmax.cmp22 = icmp slt <16 x i32> %arr.load, %rdx.shuf
495 %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
496 %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32>
497 %rdx.minmax.cmp25 = icmp slt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
498 %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
499 %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32>
500 %rdx.minmax.cmp28 = icmp slt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
501 %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
502 %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32>
503 %rdx.minmax.cmp31 = icmp slt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
504 %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
505 %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
506 %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
507 %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
508 ret i32 %r
509 }
133133 ret <2 x i64> %tmp4
134134 }
135135
136 declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
137
138 define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) {
139 ; CHECK-LABEL: uabdl8h_rdx
136 define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) {
137 ; CHECK-LABEL: uabdl8h_log2_shuffle
140138 ; CHECK: uabdl2.8h
141139 ; CHECK: uabdl.8h
142140 %aload = load <16 x i8>, <16 x i8>* %a, align 1
147145 %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
148146 %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
149147 %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
150 %reduced_v = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %absel)
148 %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32>
149 %bin1.rdx = add <16 x i16> %absel, %rdx.shuf
150 %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32>
151 %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx
152 %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32>
153 %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136
154 %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32>
155 %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138
156 %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0
151157 ret i16 %reduced_v
152158 }
153159
154 declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
155
156 define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) {
157 ; CHECK-LABEL: uabdl4s_rdx
160 define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) {
161 ; CHECK-LABEL: uabdl4s_log2_shuffle
158162 ; CHECK: uabdl2.4s
159163 ; CHECK: uabdl.4s
160164 %aload = load <8 x i16>, <8 x i16>* %a, align 1
165169 %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
166170 %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
167171 %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
168 %reduced_v = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %absel)
172 %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32>
173 %bin.rdx = add <8 x i32> %absel, %rdx.shuf
174 %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32>
175 %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136
176 %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32>
177 %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138
178 %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0
169179 ret i32 %reduced_v
170180 }
171181
172 declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
173
174 define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
175 ; CHECK: uabdl2d_rdx
182 define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
183 ; CHECK: uabdl2d_log2_shuffle
176184 ; CHECK: uabdl2.2d
177185 ; CHECK: uabdl.2d
178186 %aload = load <4 x i32>, <4 x i32>* %a, align 1
183191 %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
184192 %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
185193 %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
186 %reduced_v = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> %absel)
194 %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32>
195 %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136
196 %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32>
197 %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138
198 %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0
187199 ret i64 %reduced_v
188200 }
189201
1919 ; CHECK: add <16 x i8>
2020 ;
2121 ; CHECK: middle.block:
22 ; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>
22 ; CHECK: shufflevector <16 x i8>
23 ; CHECK: add <16 x i8>
24 ; CHECK: shufflevector <16 x i8>
25 ; CHECK: add <16 x i8>
26 ; CHECK: shufflevector <16 x i8>
27 ; CHECK: add <16 x i8>
28 ; CHECK: shufflevector <16 x i8>
29 ; CHECK: add <16 x i8>
30 ; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <16 x i8>
2331 ; CHECK: zext i8 [[Rdx]] to i32
2432 ;
2533 define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
7482 ; CHECK: add <8 x i16>
7583 ;
7684 ; CHECK: middle.block:
77 ; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
85 ; CHECK: shufflevector <8 x i16>
86 ; CHECK: add <8 x i16>
87 ; CHECK: shufflevector <8 x i16>
88 ; CHECK: add <8 x i16>
89 ; CHECK: shufflevector <8 x i16>
90 ; CHECK: add <8 x i16>
91 ; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16>
7892 ; CHECK: zext i16 [[Rdx]] to i32
7993 ;
8094 define i16 @reduction_i16_1(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) {
131145 ; CHECK: add <8 x i16>
132146 ;
133147 ; CHECK: middle.block:
134 ; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
148 ; CHECK: shufflevector <8 x i16>
149 ; CHECK: add <8 x i16>
150 ; CHECK: shufflevector <8 x i16>
151 ; CHECK: add <8 x i16>
152 ; CHECK: shufflevector <8 x i16>
153 ; CHECK: add <8 x i16>
154 ; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16>
135155 ; CHECK: zext i16 [[Rdx]] to i32
136156 ;
137157 define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
1010 ; DEFAULT-LABEL: @PR28330(
1111 ; DEFAULT: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ]
1212 ; DEFAULT: %[[S0:.+]] = select <8 x i1> %1, <8 x i32> , <8 x i32>
13 ; DEFAULT: %[[Rdx:.+]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %[[S0]])
14 ; DEFAULT: %bin.extra = add i32 %[[Rdx]], %tmp17
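13 ; DEFAULT: %[[R0:.+]] = shufflevector <8 x i32> %[[S0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
14 ; DEFAULT: %[[R1:.+]] = add <8 x i32> %[[S0]], %[[R0]]
15 ; DEFAULT: %[[R2:.+]] = shufflevector <8 x i32> %[[R1]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
16 ; DEFAULT: %[[R3:.+]] = add <8 x i32> %[[R1]], %[[R2]]
17 ; DEFAULT: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>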
18 ; DEFAULT: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]]
19 ; DEFAULT: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0
20 ; DEFAULT: %bin.extra = add i32 %[[R6]], %tmp17
1521 ;
1622 ; GATHER-LABEL: @PR28330(
1723 ; GATHER: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ]
3137 ; GATHER: %[[I5:.+]] = insertelement <8 x i32> %[[I4]], i32 %tmp29, i32 5
3238 ; GATHER: %[[I6:.+]] = insertelement <8 x i32> %[[I5]], i32 %tmp31, i32 6
3339 ; GATHER: %[[I7:.+]] = insertelement <8 x i32> %[[I6]], i32 %tmp33, i32 7
34 ; GATHER: %[[Rdx:.+]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %[[I7]])
35 ; GATHER: %bin.extra = add i32 %[[Rdx]], %tmp17
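40 ; GATHER: %[[R0:.+]] = shufflevector <8 x i32> %[[I7]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
41 ; GATHER: %[[R1:.+]] = add <8 x i32> %[[I7]], %[[R0]]
42 ; GATHER: %[[R2:.+]] = shufflevector <8 x i32> %[[R1]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
43 ; GATHER: %[[R3:.+]] = add <8 x i32> %[[R1]], %[[R2]]
44 ; GATHER: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>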
45 ; GATHER: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]]
46 ; GATHER: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0
47 ; GATHER: %bin.extra = add i32 %[[R6]], %tmp17
3648 ;
3749 ; MAX-COST-LABEL: @PR28330(
3850 ; MAX-COST-NOT: shufflevector
94106 ; DEFAULT-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], undef
95107 ; DEFAULT-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], undef
96108 ; DEFAULT-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], undef
97 ; DEFAULT-NEXT: [[Rdx:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP2]])
98 ; DEFAULT-NEXT: [[BIN_EXTRA]] = add i32 [[Rdx]], -5
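109 ; DEFAULT-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
110 ; DEFAULT-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP2]], [[RDX_SHUF]]
111 ; DEFAULT-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
112 ; DEFAULT-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
113 ; DEFAULT-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>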
114 ; DEFAULT-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
115 ; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
116 ; DEFAULT-NEXT: [[BIN_EXTRA]] = add i32 [[TMP3]], -5
99117 ; DEFAULT-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], undef
100118 ; DEFAULT-NEXT: br label [[FOR_BODY]]
101119 ;
143161 ; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP29]], i32 5
144162 ; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP31]], i32 6
145163 ; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP33]], i32 7
146 ; GATHER-NEXT: [[Rdx:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP7]])
147 ; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[Rdx]], -5
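164 ; GATHER-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
165 ; GATHER-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP7]], [[RDX_SHUF]]
166 ; GATHER-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
167 ; GATHER-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
168 ; GATHER-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>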
169 ; GATHER-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
170 ; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
171 ; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP8]], -5
148172 ; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]]
149173 ; GATHER-NEXT: br label [[FOR_BODY]]
150174 ;