73#ifdef EXPENSIVE_CHECKS
105using namespace slpvectorizer;
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
110STATISTIC(NumVectorInstructions,
"Number of vector instructions generated");
114 cl::desc(
"Run the SLP vectorization passes"));
118 cl::desc(
"Only vectorize if you gain more than this "
123 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
124 "heuristics and makes vectorization decision via cost modeling."));
128 cl::desc(
"Attempt to vectorize horizontal reductions"));
133 "Attempt to vectorize horizontal reductions feeding into a store"));
139 cl::desc(
"Allow optimization of original scalar identity operations on "
140 "matched horizontal reductions."));
144 cl::desc(
"Attempt to vectorize for this register size in bits"));
148 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
156 cl::desc(
"Limit the size of the SLP scheduling region per block"));
160 cl::desc(
"Attempt to vectorize for this register size in bits"));
164 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
168 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
174 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
183 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
187 cl::desc(
"The minimum number of loads, which should be considered strided, "
188 "if the stride is > 1 or is runtime value"));
192 cl::desc(
"The maximum stride, considered to be profitable."));
196 cl::desc(
"Display the SLP trees with Graphviz"));
200 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
230 return VectorType::isValidElementType(Ty) && !Ty->
isX86_FP80Ty() &&
237 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
244 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
245 !isa<ExtractValueInst, UndefValue>(V))
247 auto *
I = dyn_cast<Instruction>(V);
248 if (!
I || isa<ExtractValueInst>(
I))
250 if (!isa<FixedVectorType>(
I->getOperand(0)->getType()))
252 if (isa<ExtractElementInst>(
I))
254 assert(isa<InsertElementInst>(V) &&
"Expected only insertelement.");
263 OS <<
"n=" << VL.
size() <<
" [" << *VL.
front() <<
", ..]";
279 for (
int I = 1, E = VL.
size();
I < E;
I++) {
280 auto *II = dyn_cast<Instruction>(VL[
I]);
301 Value *FirstNonUndef =
nullptr;
302 for (
Value *V : VL) {
303 if (isa<UndefValue>(V))
305 if (!FirstNonUndef) {
309 if (V != FirstNonUndef)
312 return FirstNonUndef !=
nullptr;
317 if (
auto *Cmp = dyn_cast<CmpInst>(
I))
318 return Cmp->isCommutative();
319 if (
auto *BO = dyn_cast<BinaryOperator>(
I))
320 return BO->isCommutative() ||
321 (BO->getOpcode() == Instruction::Sub &&
327 ICmpInst::Predicate Pred;
328 if (match(U.getUser(),
329 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
330 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
334 return match(U.getUser(),
335 m_Intrinsic<Intrinsic::abs>(
336 m_Specific(U.get()), m_ConstantInt(Flag))) &&
337 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
340 (BO->getOpcode() == Instruction::FSub &&
343 return match(U.getUser(),
344 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
346 return I->isCommutative();
354 if (
const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
355 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
358 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
361 if (CI->getValue().uge(VT->getNumElements()))
363 Index *= VT->getNumElements();
364 Index += CI->getZExtValue();
368 const auto *
IV = cast<InsertValueInst>(InsertInst);
369 Type *CurrentType =
IV->getType();
370 for (
unsigned I :
IV->indices()) {
371 if (
const auto *ST = dyn_cast<StructType>(CurrentType)) {
372 Index *= ST->getNumElements();
373 CurrentType = ST->getElementType(
I);
374 }
else if (
const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
375 Index *= AT->getNumElements();
376 CurrentType = AT->getElementType();
409 if (MaskArg == UseMask::UndefsAsMask)
413 if (MaskArg == UseMask::FirstArg &&
Value < VF)
414 UseMask.reset(
Value);
415 else if (MaskArg == UseMask::SecondArg &&
Value >= VF)
416 UseMask.reset(
Value - VF);
424template <
bool IsPoisonOnly = false>
428 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
431 auto *VecTy = dyn_cast<FixedVectorType>(
V->getType());
434 auto *
C = dyn_cast<Constant>(V);
436 if (!UseMask.empty()) {
438 while (
auto *II = dyn_cast<InsertElementInst>(
Base)) {
439 Base = II->getOperand(0);
440 if (isa<T>(II->getOperand(1)))
447 if (*
Idx < UseMask.size() && !UseMask.test(*
Idx))
455 Res &= isUndefVector<IsPoisonOnly>(
Base, SubMask);
462 for (
unsigned I = 0, E = VecTy->getNumElements();
I != E; ++
I) {
463 if (
Constant *Elem =
C->getAggregateElement(
I))
465 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
493static std::optional<TargetTransformInfo::ShuffleKind>
495 const auto *It =
find_if(VL, IsaPred<ExtractElementInst>);
498 auto *EI0 = cast<ExtractElementInst>(*It);
499 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
502 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
503 Value *Vec1 =
nullptr;
504 Value *Vec2 =
nullptr;
506 ShuffleMode CommonShuffleMode =
Unknown;
508 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
510 if (isa<UndefValue>(VL[
I]))
512 auto *EI = cast<ExtractElementInst>(VL[
I]);
513 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
515 auto *Vec = EI->getVectorOperand();
520 if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
Size)
522 if (isa<UndefValue>(EI->getIndexOperand()))
524 auto *
Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
530 unsigned IntIdx =
Idx->getValue().getZExtValue();
534 if (!Vec1 || Vec1 == Vec) {
536 }
else if (!Vec2 || Vec2 == Vec) {
542 if (CommonShuffleMode == Permute)
547 CommonShuffleMode = Permute;
550 CommonShuffleMode =
Select;
553 if (CommonShuffleMode ==
Select && Vec2)
564 assert((Opcode == Instruction::ExtractElement ||
565 Opcode == Instruction::ExtractValue) &&
566 "Expected extractelement or extractvalue instruction.");
567 if (Opcode == Instruction::ExtractElement) {
568 auto *CI = dyn_cast<ConstantInt>(E->
getOperand(1));
571 return CI->getZExtValue();
573 auto *EI = cast<ExtractValueInst>(E);
574 if (EI->getNumIndices() != 1)
576 return *EI->idx_begin();
582struct InstructionsState {
584 Value *OpValue =
nullptr;
595 unsigned getAltOpcode()
const {
600 bool isAltShuffle()
const {
return AltOp != MainOp; }
603 unsigned CheckedOpcode =
I->getOpcode();
604 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
607 InstructionsState() =
delete;
609 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
618 auto *
I = dyn_cast<Instruction>(
Op);
619 if (
I && S.isOpcodeOrAlt(
I))
638 unsigned BaseIndex = 0);
646 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
647 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
648 BaseOp0 == Op0 || BaseOp1 == Op1 ||
659 "Assessing comparisons of different types?");
669 return (BasePred == Pred &&
671 (BasePred == SwappedPred &&
680 unsigned BaseIndex) {
683 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
685 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
686 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
687 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
689 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
691 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
692 unsigned AltOpcode = Opcode;
693 unsigned AltIndex = BaseIndex;
695 bool SwappedPredsCompatible = [&]() {
699 UniquePreds.
insert(BasePred);
700 UniqueNonSwappedPreds.
insert(BasePred);
701 for (
Value *V : VL) {
702 auto *
I = dyn_cast<CmpInst>(V);
708 UniqueNonSwappedPreds.
insert(CurrentPred);
709 if (!UniquePreds.
contains(CurrentPred) &&
710 !UniquePreds.
contains(SwappedCurrentPred))
711 UniquePreds.
insert(CurrentPred);
716 return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
720 auto *IBase = cast<Instruction>(VL[BaseIndex]);
723 if (
auto *
CallBase = dyn_cast<CallInst>(IBase)) {
727 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
729 for (
int Cnt = 0, E = VL.
size(); Cnt < E; Cnt++) {
730 auto *
I = cast<Instruction>(VL[Cnt]);
731 unsigned InstOpcode =
I->getOpcode();
732 if (IsBinOp && isa<BinaryOperator>(
I)) {
733 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
737 AltOpcode = InstOpcode;
741 }
else if (IsCastOp && isa<CastInst>(
I)) {
742 Value *Op0 = IBase->getOperand(0);
744 Value *Op1 =
I->getOperand(0);
747 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
749 if (Opcode == AltOpcode) {
752 "Cast isn't safe for alternation, logic needs to be updated!");
753 AltOpcode = InstOpcode;
758 }
else if (
auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
759 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
760 Type *Ty0 = BaseInst->getOperand(0)->getType();
761 Type *Ty1 = Inst->getOperand(0)->getType();
763 assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
770 if ((E == 2 || SwappedPredsCompatible) &&
771 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
776 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
777 if (AltIndex != BaseIndex) {
780 }
else if (BasePred != CurrentPred) {
783 "CmpInst isn't safe for alternation, logic needs to be updated!");
788 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
789 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
792 }
else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
793 if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
794 if (Gep->getNumOperands() != 2 ||
795 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
796 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
797 }
else if (
auto *EI = dyn_cast<ExtractElementInst>(
I)) {
799 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
800 }
else if (
auto *LI = dyn_cast<LoadInst>(
I)) {
801 auto *BaseLI = cast<LoadInst>(IBase);
802 if (!LI->isSimple() || !BaseLI->isSimple())
803 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
804 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
805 auto *
CallBase = cast<CallInst>(IBase);
807 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
808 if (Call->hasOperandBundles() &&
809 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
810 Call->op_begin() + Call->getBundleOperandsEndIndex(),
813 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
816 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
819 if (Mappings.
size() != BaseMappings.
size() ||
820 Mappings.
front().ISA != BaseMappings.
front().ISA ||
821 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
822 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
823 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
824 Mappings.
front().Shape.Parameters !=
825 BaseMappings.
front().Shape.Parameters)
826 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
831 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
834 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
835 cast<Instruction>(VL[AltIndex]));
851 case Instruction::Load: {
852 LoadInst *LI = cast<LoadInst>(UserInst);
855 case Instruction::Store: {
856 StoreInst *SI = cast<StoreInst>(UserInst);
857 return (SI->getPointerOperand() == Scalar);
859 case Instruction::Call: {
860 CallInst *CI = cast<CallInst>(UserInst);
863 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
864 Arg.value().get() == Scalar;
876 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
883 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
884 return LI->isSimple();
886 return SI->isSimple();
888 return !
MI->isVolatile();
896 bool ExtendingManyInputs =
false) {
900 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
902 (SubMask.
size() == Mask.size() &&
903 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
904 [](
int Idx) { return Idx == PoisonMaskElem; }))) &&
905 "SubMask with many inputs support must be larger than the mask.");
907 Mask.append(SubMask.
begin(), SubMask.
end());
911 int TermValue = std::min(Mask.size(), SubMask.
size());
912 for (
int I = 0, E = SubMask.
size();
I < E; ++
I) {
914 (!ExtendingManyInputs &&
915 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
917 NewMask[
I] = Mask[SubMask[
I]];
933 const unsigned Sz = Order.
size();
936 for (
unsigned I = 0;
I < Sz; ++
I) {
938 UnusedIndices.
reset(Order[
I]);
940 MaskedIndices.
set(
I);
942 if (MaskedIndices.
none())
945 "Non-synced masked/available indices.");
949 assert(
Idx >= 0 &&
"Indices must be synced.");
961 const unsigned E = Indices.
size();
963 for (
unsigned I = 0;
I < E; ++
I)
964 Mask[Indices[
I]] =
I;
970 assert(!Mask.empty() &&
"Expected non-empty mask.");
974 for (
unsigned I = 0, E = Prev.
size();
I < E; ++
I)
976 Scalars[Mask[
I]] = Prev[
I];
984 auto *
I = dyn_cast<Instruction>(V);
989 auto *IO = dyn_cast<Instruction>(V);
992 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1001 auto *
I = dyn_cast<Instruction>(V);
1005 return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
1007 auto *IU = dyn_cast<Instruction>(U);
1010 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1026 return !VL.
empty() &&
1030namespace slpvectorizer {
1035 struct ScheduleData;
1060 : BatchAA(*Aa),
F(Func), SE(Se),
TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1061 AC(AC), DB(DB),
DL(
DL), ORE(ORE),
1115 return !VectorizableTree.
empty() &&
1116 !VectorizableTree.
front()->UserTreeIndices.empty();
1121 assert(!VectorizableTree.
empty() &&
"No graph to get the first node from");
1122 return VectorizableTree.
front()->Scalars;
1137 VectorizableTree.
clear();
1138 ScalarToTreeEntry.clear();
1139 MultiNodeScalars.clear();
1141 NonScheduledFirst.
clear();
1142 EntryToLastInstruction.clear();
1143 ExternalUses.
clear();
1144 ExternalUsesAsGEPs.clear();
1145 for (
auto &Iter : BlocksSchedules) {
1146 BlockScheduling *BS = Iter.second.get();
1150 ReductionBitWidth = 0;
1151 CastMaxMinBWSizes.reset();
1152 ExtraBitWidthNodes.
clear();
1153 InstrElementSize.clear();
1154 UserIgnoreList =
nullptr;
1155 PostponedGathers.
clear();
1156 ValueToGatherNodes.
clear();
1213 return MaxVecRegSize;
1218 return MinVecRegSize;
1226 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
1228 return MaxVF ? MaxVF : UINT_MAX;
1272 bool TryRecursiveCheck =
true)
const;
1296 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
1297 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
1319 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
1320 MaxLevel(MaxLevel) {}
1374 if (isa<LoadInst>(V1)) {
1376 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
1381 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
1383 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1386 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1389 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1391 ((
int)V1->getNumUses() == NumLanes ||
1392 AllUsersAreInternal(V1, V2)))
1398 auto CheckSameEntryOrFail = [&]() {
1399 if (
const TreeEntry *TE1 = R.getTreeEntry(V1);
1400 TE1 && TE1 == R.getTreeEntry(V2))
1405 auto *LI1 = dyn_cast<LoadInst>(V1);
1406 auto *LI2 = dyn_cast<LoadInst>(V2);
1408 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1410 return CheckSameEntryOrFail();
1413 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1414 LI2->getPointerOperand(),
DL, SE,
true);
1415 if (!Dist || *Dist == 0) {
1418 R.TTI->isLegalMaskedGather(
1422 return CheckSameEntryOrFail();
1426 if (std::abs(*Dist) > NumLanes / 2)
1435 auto *C1 = dyn_cast<Constant>(V1);
1436 auto *C2 = dyn_cast<Constant>(V2);
1450 if (isa<UndefValue>(V2))
1454 Value *EV2 =
nullptr;
1467 int Dist = Idx2 - Idx1;
1470 if (std::abs(Dist) == 0)
1472 if (std::abs(Dist) > NumLanes / 2)
1479 return CheckSameEntryOrFail();
1482 auto *I1 = dyn_cast<Instruction>(V1);
1483 auto *I2 = dyn_cast<Instruction>(V2);
1485 if (I1->getParent() != I2->getParent())
1486 return CheckSameEntryOrFail();
1493 if (S.getOpcode() &&
1494 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.
empty() ||
1495 !S.isAltShuffle()) &&
1497 return cast<Instruction>(V)->getNumOperands() ==
1498 S.MainOp->getNumOperands();
1504 if (isa<UndefValue>(V2))
1507 return CheckSameEntryOrFail();
1541 int ShallowScoreAtThisLevel =
1550 auto *I1 = dyn_cast<Instruction>(
LHS);
1551 auto *I2 = dyn_cast<Instruction>(
RHS);
1552 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1554 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1555 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1556 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1557 ShallowScoreAtThisLevel))
1558 return ShallowScoreAtThisLevel;
1559 assert(I1 && I2 &&
"Should have early exited.");
1566 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1567 OpIdx1 != NumOperands1; ++OpIdx1) {
1569 int MaxTmpScore = 0;
1570 unsigned MaxOpIdx2 = 0;
1571 bool FoundBest =
false;
1575 ? I2->getNumOperands()
1576 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1577 assert(FromIdx <= ToIdx &&
"Bad index");
1578 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1580 if (Op2Used.
count(OpIdx2))
1585 I1, I2, CurrLevel + 1, std::nullopt);
1588 TmpScore > MaxTmpScore) {
1589 MaxTmpScore = TmpScore;
1596 Op2Used.
insert(MaxOpIdx2);
1597 ShallowScoreAtThisLevel += MaxTmpScore;
1600 return ShallowScoreAtThisLevel;
1631 struct OperandData {
1632 OperandData() =
default;
1633 OperandData(
Value *V,
bool APO,
bool IsUsed)
1634 : V(V), APO(APO), IsUsed(IsUsed) {}
1644 bool IsUsed =
false;
1653 enum class ReorderingMode {
1670 const Loop *L =
nullptr;
1673 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
1674 return OpsVec[OpIdx][Lane];
1678 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
1679 return OpsVec[OpIdx][Lane];
1684 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
1685 OpIdx != NumOperands; ++OpIdx)
1686 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1688 OpsVec[OpIdx][Lane].IsUsed =
false;
1692 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
1693 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1705 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1706 Value *IdxLaneV = getData(
Idx, Lane).V;
1707 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1710 for (
unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1713 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1714 if (!isa<Instruction>(OpIdxLnV))
1716 Uniques.
insert(OpIdxLnV);
1718 int UniquesCount = Uniques.
size();
1719 int UniquesCntWithIdxLaneV =
1720 Uniques.
contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1721 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1722 int UniquesCntWithOpIdxLaneV =
1723 Uniques.
contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1724 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1727 UniquesCntWithOpIdxLaneV) -
1728 (
PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1737 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1738 Value *IdxLaneV = getData(
Idx, Lane).V;
1739 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1748 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1749 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1751 return R.areAllUsersVectorized(IdxLaneI)
1759 static const int ScoreScaleFactor = 10;
1767 int Lane,
unsigned OpIdx,
unsigned Idx,
1777 int SplatScore = getSplatScore(Lane, OpIdx,
Idx);
1778 if (Score <= -SplatScore) {
1783 Score += SplatScore;
1789 Score *= ScoreScaleFactor;
1790 Score += getExternalUseScore(Lane, OpIdx,
Idx);
1808 std::optional<unsigned>
1809 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
1812 unsigned NumOperands = getNumOperands();
1815 Value *OpLastLane = getData(OpIdx, LastLane).V;
1818 ReorderingMode RMode = ReorderingModes[OpIdx];
1819 if (RMode == ReorderingMode::Failed)
1820 return std::nullopt;
1823 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1829 std::optional<unsigned>
Idx;
1833 BestScoresPerLanes.
try_emplace(std::make_pair(OpIdx, Lane), 0)
1839 bool IsUsed = RMode == ReorderingMode::Splat ||
1840 RMode == ReorderingMode::Constant ||
1841 RMode == ReorderingMode::Load;
1843 for (
unsigned Idx = 0;
Idx != NumOperands; ++
Idx) {
1845 OperandData &OpData = getData(
Idx, Lane);
1847 bool OpAPO = OpData.APO;
1856 if (OpAPO != OpIdxAPO)
1861 case ReorderingMode::Load:
1862 case ReorderingMode::Opcode: {
1863 bool LeftToRight = Lane > LastLane;
1864 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
1865 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
1866 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1867 OpIdx,
Idx, IsUsed);
1868 if (Score >
static_cast<int>(BestOp.Score) ||
1869 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
1872 BestOp.Score = Score;
1873 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1877 case ReorderingMode::Constant:
1878 if (isa<Constant>(
Op) ||
1879 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
1881 if (isa<Constant>(
Op)) {
1883 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1886 if (isa<UndefValue>(
Op) || !isa<Constant>(
Op))
1890 case ReorderingMode::Splat:
1891 if (
Op == OpLastLane || (!BestOp.Score && isa<Constant>(
Op))) {
1892 IsUsed =
Op == OpLastLane;
1893 if (
Op == OpLastLane) {
1895 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1901 case ReorderingMode::Failed:
1907 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1911 return std::nullopt;
1918 unsigned getBestLaneToStartReordering()
const {
1919 unsigned Min = UINT_MAX;
1920 unsigned SameOpNumber = 0;
1931 for (
int I = getNumLanes();
I > 0; --
I) {
1932 unsigned Lane =
I - 1;
1933 OperandsOrderData NumFreeOpsHash =
1934 getMaxNumOperandsThatCanBeReordered(Lane);
1937 if (NumFreeOpsHash.NumOfAPOs < Min) {
1938 Min = NumFreeOpsHash.NumOfAPOs;
1939 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1941 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1942 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
1943 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1946 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1947 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1948 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
1949 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1950 auto *It = HashMap.
find(NumFreeOpsHash.Hash);
1951 if (It == HashMap.
end())
1952 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1958 unsigned BestLane = 0;
1959 unsigned CntMin = UINT_MAX;
1961 if (
Data.second.first < CntMin) {
1962 CntMin =
Data.second.first;
1963 BestLane =
Data.second.second;
1970 struct OperandsOrderData {
1973 unsigned NumOfAPOs = UINT_MAX;
1976 unsigned NumOpsWithSameOpcodeParent = 0;
1990 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
1991 unsigned CntTrue = 0;
1992 unsigned NumOperands = getNumOperands();
2002 bool AllUndefs =
true;
2003 unsigned NumOpsWithSameOpcodeParent = 0;
2007 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2008 const OperandData &OpData = getData(OpIdx, Lane);
2013 if (
auto *
I = dyn_cast<Instruction>(OpData.V)) {
2015 I->getParent() != Parent) {
2016 if (NumOpsWithSameOpcodeParent == 0) {
2017 NumOpsWithSameOpcodeParent = 1;
2019 Parent =
I->getParent();
2021 --NumOpsWithSameOpcodeParent;
2024 ++NumOpsWithSameOpcodeParent;
2028 Hash,
hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2029 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2033 OperandsOrderData
Data;
2034 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2035 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2043 assert((empty() || VL.
size() == getNumLanes()) &&
2044 "Expected same number of lanes");
2045 assert(isa<Instruction>(VL[0]) &&
"Expected instruction");
2046 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2047 constexpr unsigned IntrinsicNumOperands = 2;
2048 if (isa<IntrinsicInst>(VL[0]))
2049 NumOperands = IntrinsicNumOperands;
2050 OpsVec.
resize(NumOperands);
2051 unsigned NumLanes = VL.
size();
2052 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2053 OpsVec[OpIdx].
resize(NumLanes);
2054 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2055 assert(isa<Instruction>(VL[Lane]) &&
"Expected instruction");
2066 bool IsInverseOperation = !
isCommutative(cast<Instruction>(VL[Lane]));
2067 bool APO = (OpIdx == 0) ?
false : IsInverseOperation;
2068 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2075 unsigned getNumOperands()
const {
return OpsVec.
size(); }
2078 unsigned getNumLanes()
const {
return OpsVec[0].
size(); }
2081 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
2082 return getData(OpIdx, Lane).V;
2086 bool empty()
const {
return OpsVec.
empty(); }
2089 void clear() { OpsVec.
clear(); }
2094 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
2095 bool OpAPO = getData(OpIdx, Lane).APO;
2096 bool IsInvariant = L && L->isLoopInvariant(
Op);
2098 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2102 bool FoundCandidate =
false;
2103 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2104 OperandData &
Data = getData(OpI, Ln);
2105 if (
Data.APO != OpAPO ||
Data.IsUsed)
2107 Value *OpILane = getValue(OpI, Lane);
2108 bool IsConstantOp = isa<Constant>(OpILane);
2117 ((Lns > 2 && isa<Constant>(
Data.V)) ||
2123 isa<Constant>(
Data.V)))) ||
2130 (IsInvariant && !isa<Constant>(
Data.V) &&
2132 L->isLoopInvariant(
Data.V))) {
2133 FoundCandidate =
true;
2140 if (!FoundCandidate)
2143 return getNumLanes() == 2 || Cnt > 1;
2149 : TLI(*R.TLI),
DL(*R.
DL), SE(*R.SE), R(R),
2153 appendOperandsOfVL(RootVL);
2160 assert(OpsVec[OpIdx].
size() == getNumLanes() &&
2161 "Expected same num of lanes across all operands");
2162 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2163 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2171 unsigned NumOperands = getNumOperands();
2172 unsigned NumLanes = getNumLanes();
2192 unsigned FirstLane = getBestLaneToStartReordering();
2195 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2196 Value *OpLane0 = getValue(OpIdx, FirstLane);
2199 if (isa<LoadInst>(OpLane0))
2200 ReorderingModes[OpIdx] = ReorderingMode::Load;
2201 else if (isa<Instruction>(OpLane0)) {
2203 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2204 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2206 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2208 else if (isa<Constant>(OpLane0))
2209 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2210 else if (isa<Argument>(OpLane0))
2212 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2215 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2222 auto &&SkipReordering = [
this]() {
2225 for (
const OperandData &
Data : Op0)
2228 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
2247 if (SkipReordering())
2250 bool StrategyFailed =
false;
2258 for (
unsigned I = 0;
I < NumOperands; ++
I)
2259 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
2261 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2264 int Lane = FirstLane +
Direction * Distance;
2265 if (Lane < 0 || Lane >= (
int)NumLanes)
2268 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
2271 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2273 std::optional<unsigned> BestIdx = getBestOperand(
2274 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2281 swap(OpIdx, *BestIdx, Lane);
2284 StrategyFailed =
true;
2287 if (MainAltOps[OpIdx].
size() != 2) {
2288 OperandData &AltOp = getData(OpIdx, Lane);
2289 InstructionsState OpS =
2291 if (OpS.getOpcode() && OpS.isAltShuffle())
2298 if (!StrategyFailed)
2303#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2306 case ReorderingMode::Load:
2308 case ReorderingMode::Opcode:
2310 case ReorderingMode::Constant:
2312 case ReorderingMode::Splat:
2314 case ReorderingMode::Failed:
2335 const unsigned Indent = 2;
2338 OS <<
"Operand " << Cnt++ <<
"\n";
2339 for (
const OperandData &OpData : OpDataVec) {
2341 if (
Value *V = OpData.V)
2345 OS <<
", APO:" << OpData.APO <<
"}\n";
2367 int BestScore = Limit;
2368 std::optional<int>
Index;
2369 for (
int I : seq<int>(0, Candidates.size())) {
2371 Candidates[
I].second,
2374 if (Score > BestScore) {
2389 DeletedInstructions.insert(
I);
2395 return AnalyzedReductionsRoots.count(
I);
2400 AnalyzedReductionsRoots.insert(
I);
2414 AnalyzedReductionsRoots.clear();
2415 AnalyzedReductionVals.
clear();
2416 AnalyzedMinBWVals.
clear();
2428 return NonScheduledFirst.
contains(V);
2441 bool collectValuesToDemote(
const TreeEntry &E,
bool IsProfitableToDemoteRoot,
2445 unsigned &MaxDepthLevel,
2446 bool &IsProfitableToDemote,
2447 bool IsTruncRoot)
const;
2457 canReorderOperands(TreeEntry *UserTE,
2464 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
2468 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
2470 TreeEntry *TE =
nullptr;
2472 TE = getTreeEntry(V);
2473 if (TE &&
is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2475 auto It = MultiNodeScalars.find(V);
2476 if (It != MultiNodeScalars.end()) {
2477 for (TreeEntry *E : It->second) {
2478 if (
is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2486 if (It != VL.
end()) {
2487 assert(
TE->isSame(VL) &&
"Expected same scalars.");
2495 const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
2496 unsigned OpIdx)
const {
2497 return const_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
2498 const_cast<TreeEntry *
>(UserTE), OpIdx);
2502 bool areAllUsersVectorized(
2511 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
2515 getCastContextHint(
const TreeEntry &TE)
const;
2524 const EdgeInfo &EI);
2535 bool ResizeAllowed =
false)
const;
2546 Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
bool PostponedPHIs);
2551 template <
typename BVTy,
typename ResTy,
typename...
Args>
2552 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
2557 Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy);
2563 Instruction &getLastInstructionInBundle(
const TreeEntry *E);
2570 std::optional<TargetTransformInfo::ShuffleKind>
2582 unsigned NumParts)
const;
2594 std::optional<TargetTransformInfo::ShuffleKind>
2595 isGatherShuffledSingleRegisterEntry(
2612 isGatherShuffledEntry(
2615 unsigned NumParts,
bool ForOrder =
false);
2622 Type *ScalarTy)
const;
2626 void setInsertPointAfterBundle(
const TreeEntry *E);
2634 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
2647 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
2663 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
2667 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2684 [Scalars](
Value *V,
int Idx) {
2685 return (isa<UndefValue>(V) &&
2686 Idx == PoisonMaskElem) ||
2687 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2690 if (!ReorderIndices.empty()) {
2697 return IsSame(Scalars, Mask);
2698 if (VL.
size() == ReuseShuffleIndices.size()) {
2700 return IsSame(Scalars, Mask);
2704 return IsSame(Scalars, ReuseShuffleIndices);
2707 bool isOperandGatherNode(
const EdgeInfo &UserEI)
const {
2708 return State == TreeEntry::NeedToGather &&
2709 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2710 UserTreeIndices.front().UserTE == UserEI.UserTE;
2714 bool hasEqualOperands(
const TreeEntry &TE)
const {
2715 if (
TE.getNumOperands() != getNumOperands())
2718 for (
unsigned I = 0, E = getNumOperands();
I < E; ++
I) {
2719 unsigned PrevCount =
Used.count();
2720 for (
unsigned K = 0;
K < E; ++
K) {
2723 if (getOperand(K) ==
TE.getOperand(
I)) {
2729 if (PrevCount ==
Used.count())
2738 unsigned getVectorFactor()
const {
2739 if (!ReuseShuffleIndices.empty())
2740 return ReuseShuffleIndices.size();
2741 return Scalars.
size();
2776 VecTreeTy &Container;
2800 assert(Operands[OpIdx].empty() &&
"Already resized?");
2802 "Number of operands is greater than the number of scalars.");
2808 void setOperandsInOrder() {
2810 auto *I0 = cast<Instruction>(Scalars[0]);
2811 Operands.resize(I0->getNumOperands());
2812 unsigned NumLanes = Scalars.size();
2813 for (
unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2814 OpIdx != NumOperands; ++OpIdx) {
2816 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2817 auto *
I = cast<Instruction>(Scalars[Lane]);
2818 assert(
I->getNumOperands() == NumOperands &&
2819 "Expected same number of operands");
2820 Operands[OpIdx][Lane] =
I->getOperand(OpIdx);
2844 unsigned getNumOperands()
const {
return Operands.size(); }
2847 Value *getSingleOperand(
unsigned OpIdx)
const {
2849 assert(!Operands[OpIdx].empty() &&
"No operand available");
2854 bool isAltShuffle()
const {
return MainOp != AltOp; }
2857 unsigned CheckedOpcode =
I->getOpcode();
2858 return (getOpcode() == CheckedOpcode ||
2859 getAltOpcode() == CheckedOpcode);
2866 auto *
I = dyn_cast<Instruction>(
Op);
2867 if (
I && isOpcodeOrAlt(
I))
2872 void setOperations(
const InstructionsState &S) {
2886 unsigned getOpcode()
const {
2887 return MainOp ? MainOp->
getOpcode() : 0;
2890 unsigned getAltOpcode()
const {
2896 int findLaneForValue(
Value *V)
const {
2897 unsigned FoundLane = std::distance(Scalars.begin(),
find(Scalars, V));
2898 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
2899 if (!ReorderIndices.
empty())
2900 FoundLane = ReorderIndices[FoundLane];
2901 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
2902 if (!ReuseShuffleIndices.
empty()) {
2903 FoundLane = std::distance(ReuseShuffleIndices.
begin(),
2904 find(ReuseShuffleIndices, FoundLane));
2918 bool isNonPowOf2Vec()
const {
2920 assert((!IsNonPowerOf2 || ReuseShuffleIndices.
empty()) &&
2921 "Reshuffling not supported with non-power-of-2 vectors yet.");
2922 return IsNonPowerOf2;
2929 for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
2930 dbgs() <<
"Operand " << OpI <<
":\n";
2931 for (
const Value *V : Operands[OpI])
2934 dbgs() <<
"Scalars: \n";
2935 for (
Value *V : Scalars)
2937 dbgs() <<
"State: ";
2940 dbgs() <<
"Vectorize\n";
2942 case ScatterVectorize:
2943 dbgs() <<
"ScatterVectorize\n";
2945 case StridedVectorize:
2946 dbgs() <<
"StridedVectorize\n";
2949 dbgs() <<
"NeedToGather\n";
2952 dbgs() <<
"MainOp: ";
2954 dbgs() << *MainOp <<
"\n";
2957 dbgs() <<
"AltOp: ";
2959 dbgs() << *AltOp <<
"\n";
2962 dbgs() <<
"VectorizedValue: ";
2963 if (VectorizedValue)
2964 dbgs() << *VectorizedValue <<
"\n";
2967 dbgs() <<
"ReuseShuffleIndices: ";
2968 if (ReuseShuffleIndices.
empty())
2971 for (
int ReuseIdx : ReuseShuffleIndices)
2972 dbgs() << ReuseIdx <<
", ";
2974 dbgs() <<
"ReorderIndices: ";
2975 for (
unsigned ReorderIdx : ReorderIndices)
2976 dbgs() << ReorderIdx <<
", ";
2978 dbgs() <<
"UserTreeIndices: ";
2979 for (
const auto &EInfo : UserTreeIndices)
2980 dbgs() << EInfo <<
", ";
2987 void dumpTreeCosts(
const TreeEntry *E,
InstructionCost ReuseShuffleCost,
2990 dbgs() <<
"SLP: " << Banner <<
":\n";
2992 dbgs() <<
"SLP: Costs:\n";
2993 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
2994 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
2995 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
2996 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2997 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
3003 std::optional<ScheduleData *> Bundle,
3004 const InstructionsState &S,
3005 const EdgeInfo &UserTreeIdx,
3008 TreeEntry::EntryState EntryState =
3009 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3010 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3011 ReuseShuffleIndices, ReorderIndices);
3015 TreeEntry::EntryState EntryState,
3016 std::optional<ScheduleData *> Bundle,
3017 const InstructionsState &S,
3018 const EdgeInfo &UserTreeIdx,
3021 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3022 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3023 "Need to vectorize gather entry?");
3024 VectorizableTree.
push_back(std::make_unique<TreeEntry>(VectorizableTree));
3025 TreeEntry *
Last = VectorizableTree.
back().get();
3026 Last->Idx = VectorizableTree.
size() - 1;
3027 Last->State = EntryState;
3028 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3029 ReuseShuffleIndices.end());
3030 if (ReorderIndices.
empty()) {
3032 Last->setOperations(S);
3035 Last->Scalars.assign(VL.
size(),
nullptr);
3038 if (Idx >= VL.size())
3039 return UndefValue::get(VL.front()->getType());
3043 Last->setOperations(S);
3044 Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
3046 if (
Last->State != TreeEntry::NeedToGather) {
3047 for (
Value *V : VL) {
3048 const TreeEntry *
TE = getTreeEntry(V);
3050 "Scalar already in tree!");
3053 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
Last);
3056 ScalarToTreeEntry[
V] =
Last;
3059 ScheduleData *BundleMember = *Bundle;
3060 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3063 "Bundle and VL out of sync");
3065 for (
Value *V : VL) {
3070 BundleMember->TE =
Last;
3071 BundleMember = BundleMember->NextInBundle;
3074 assert(!BundleMember &&
"Bundle and VL out of sync");
3077 bool AllConstsOrCasts =
true;
3080 auto *
I = dyn_cast<CastInst>(V);
3081 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
3084 if (AllConstsOrCasts)
3086 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3087 MustGather.
insert(VL.begin(), VL.end());
3090 if (UserTreeIdx.UserTE) {
3091 Last->UserTreeIndices.push_back(UserTreeIdx);
3092 assert((!
Last->isNonPowOf2Vec() ||
Last->ReorderIndices.empty()) &&
3093 "Reordering isn't implemented for non-power-of-2 nodes yet");
3100 TreeEntry::VecTreeTy VectorizableTree;
3105 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3106 VectorizableTree[
Id]->dump();
3112 TreeEntry *getTreeEntry(
Value *V) {
return ScalarToTreeEntry.lookup(V); }
3114 const TreeEntry *getTreeEntry(
Value *V)
const {
3115 return ScalarToTreeEntry.lookup(V);
3124 bool areAltOperandsProfitable(
const InstructionsState &S,
3129 TreeEntry::EntryState getScalarsVectorizationState(
3162 using ValueToGatherNodesMap =
3164 ValueToGatherNodesMap ValueToGatherNodes;
3167 struct ExternalUser {
3191 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
3192 auto It = AliasCache.
find(Key);
3193 if (It != AliasCache.
end())
3198 AliasCache.
try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3202 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3234 UserList ExternalUses;
3254 struct ScheduleData {
3257 enum { InvalidDeps = -1 };
3259 ScheduleData() =
default;
3261 void init(
int BlockSchedulingRegionID,
Value *OpVal) {
3262 FirstInBundle =
this;
3263 NextInBundle =
nullptr;
3264 NextLoadStore =
nullptr;
3265 IsScheduled =
false;
3266 SchedulingRegionID = BlockSchedulingRegionID;
3267 clearDependencies();
3274 if (hasValidDependencies()) {
3275 assert(UnscheduledDeps <= Dependencies &&
"invariant");
3277 assert(UnscheduledDeps == Dependencies &&
"invariant");
3281 assert(isSchedulingEntity() &&
3282 "unexpected scheduled state");
3283 for (
const ScheduleData *BundleMember =
this; BundleMember;
3284 BundleMember = BundleMember->NextInBundle) {
3285 assert(BundleMember->hasValidDependencies() &&
3286 BundleMember->UnscheduledDeps == 0 &&
3287 "unexpected scheduled state");
3288 assert((BundleMember ==
this || !BundleMember->IsScheduled) &&
3289 "only bundle is marked scheduled");
3293 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3294 "all bundle members must be in same basic block");
3300 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
3304 bool isSchedulingEntity()
const {
return FirstInBundle ==
this; }
3308 bool isPartOfBundle()
const {
3309 return NextInBundle !=
nullptr || FirstInBundle !=
this ||
TE;
3314 bool isReady()
const {
3315 assert(isSchedulingEntity() &&
3316 "can't consider non-scheduling entity for ready list");
3317 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3323 int incrementUnscheduledDeps(
int Incr) {
3324 assert(hasValidDependencies() &&
3325 "increment of unscheduled deps would be meaningless");
3326 UnscheduledDeps += Incr;
3327 return FirstInBundle->unscheduledDepsInBundle();
3332 void resetUnscheduledDeps() {
3333 UnscheduledDeps = Dependencies;
3337 void clearDependencies() {
3338 Dependencies = InvalidDeps;
3339 resetUnscheduledDeps();
3340 MemoryDependencies.clear();
3341 ControlDependencies.clear();
3344 int unscheduledDepsInBundle()
const {
3345 assert(isSchedulingEntity() &&
"only meaningful on the bundle");
3347 for (
const ScheduleData *BundleMember =
this; BundleMember;
3348 BundleMember = BundleMember->NextInBundle) {
3349 if (BundleMember->UnscheduledDeps == InvalidDeps)
3351 Sum += BundleMember->UnscheduledDeps;
3357 if (!isSchedulingEntity()) {
3358 os <<
"/ " << *Inst;
3359 }
else if (NextInBundle) {
3361 ScheduleData *SD = NextInBundle;
3363 os <<
';' << *SD->Inst;
3364 SD = SD->NextInBundle;
3375 Value *OpValue =
nullptr;
3378 TreeEntry *
TE =
nullptr;
3382 ScheduleData *FirstInBundle =
nullptr;
3386 ScheduleData *NextInBundle =
nullptr;
3390 ScheduleData *NextLoadStore =
nullptr;
3404 int SchedulingRegionID = 0;
3407 int SchedulingPriority = 0;
3413 int Dependencies = InvalidDeps;
3419 int UnscheduledDeps = InvalidDeps;
3423 bool IsScheduled =
false;
3428 const BoUpSLP::ScheduleData &SD) {
3453 struct BlockScheduling {
3455 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
3459 ScheduleStart =
nullptr;
3460 ScheduleEnd =
nullptr;
3461 FirstLoadStoreInRegion =
nullptr;
3462 LastLoadStoreInRegion =
nullptr;
3463 RegionHasStackSave =
false;
3467 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3470 ScheduleRegionSize = 0;
3474 ++SchedulingRegionID;
3478 if (BB !=
I->getParent())
3481 ScheduleData *SD = ScheduleDataMap.lookup(
I);
3482 if (SD && isInSchedulingRegion(SD))
3487 ScheduleData *getScheduleData(
Value *V) {
3488 if (
auto *
I = dyn_cast<Instruction>(V))
3489 return getScheduleData(
I);
3493 ScheduleData *getScheduleData(
Value *V,
Value *Key) {
3495 return getScheduleData(V);
3496 auto I = ExtraScheduleDataMap.find(V);
3497 if (
I != ExtraScheduleDataMap.end()) {
3498 ScheduleData *SD =
I->second.lookup(Key);
3499 if (SD && isInSchedulingRegion(SD))
3505 bool isInSchedulingRegion(ScheduleData *SD)
const {
3506 return SD->SchedulingRegionID == SchedulingRegionID;
3511 template <
typename ReadyListType>
3512 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3513 SD->IsScheduled =
true;
3516 for (ScheduleData *BundleMember = SD; BundleMember;
3517 BundleMember = BundleMember->NextInBundle) {
3518 if (BundleMember->Inst != BundleMember->OpValue)
3524 auto &&DecrUnsched = [
this, &ReadyList](
Instruction *
I) {
3525 doForAllOpcodes(
I, [&ReadyList](ScheduleData *OpDef) {
3526 if (OpDef && OpDef->hasValidDependencies() &&
3527 OpDef->incrementUnscheduledDeps(-1) == 0) {
3531 ScheduleData *DepBundle = OpDef->FirstInBundle;
3532 assert(!DepBundle->IsScheduled &&
3533 "already scheduled bundle gets ready");
3534 ReadyList.insert(DepBundle);
3536 <<
"SLP: gets ready (def): " << *DepBundle <<
"\n");
3544 if (TreeEntry *TE = BundleMember->TE) {
3546 int Lane = std::distance(
TE->Scalars.begin(),
3547 find(
TE->Scalars, BundleMember->Inst));
3548 assert(Lane >= 0 &&
"Lane not set");
3556 auto *
In = BundleMember->Inst;
3559 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3560 In->getNumOperands() ==
TE->getNumOperands()) &&
3561 "Missed TreeEntry operands?");
3564 for (
unsigned OpIdx = 0, NumOperands =
TE->getNumOperands();
3565 OpIdx != NumOperands; ++OpIdx)
3566 if (
auto *
I = dyn_cast<Instruction>(
TE->getOperand(OpIdx)[Lane]))
3571 for (
Use &U : BundleMember->Inst->operands())
3572 if (
auto *
I = dyn_cast<Instruction>(
U.get()))
3576 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3577 if (MemoryDepSD->hasValidDependencies() &&
3578 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3581 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3582 assert(!DepBundle->IsScheduled &&
3583 "already scheduled bundle gets ready");
3584 ReadyList.insert(DepBundle);
3586 <<
"SLP: gets ready (mem): " << *DepBundle <<
"\n");
3590 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3591 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3594 ScheduleData *DepBundle = DepSD->FirstInBundle;
3595 assert(!DepBundle->IsScheduled &&
3596 "already scheduled bundle gets ready");
3597 ReadyList.insert(DepBundle);
3599 <<
"SLP: gets ready (ctl): " << *DepBundle <<
"\n");
3610 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3611 ScheduleStart->comesBefore(ScheduleEnd) &&
3612 "Not a valid scheduling region?");
3614 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3615 auto *SD = getScheduleData(
I);
3618 assert(isInSchedulingRegion(SD) &&
3619 "primary schedule data not in window?");
3620 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3621 "entire bundle in window!");
3623 doForAllOpcodes(
I, [](ScheduleData *SD) { SD->verify(); });
3626 for (
auto *SD : ReadyInsts) {
3627 assert(SD->isSchedulingEntity() && SD->isReady() &&
3628 "item in ready list not ready?");
3633 void doForAllOpcodes(
Value *V,
3635 if (ScheduleData *SD = getScheduleData(V))
3637 auto I = ExtraScheduleDataMap.find(V);
3638 if (
I != ExtraScheduleDataMap.end())
3639 for (
auto &
P :
I->second)
3640 if (isInSchedulingRegion(
P.second))
3645 template <
typename ReadyListType>
3646 void initialFillReadyList(ReadyListType &ReadyList) {
3647 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3648 doForAllOpcodes(
I, [&](ScheduleData *SD) {
3649 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3651 ReadyList.insert(SD);
3653 <<
"SLP: initially in ready list: " << *SD <<
"\n");
3668 std::optional<ScheduleData *>
3670 const InstructionsState &S);
3676 ScheduleData *allocateScheduleDataChunks();
3680 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
3685 ScheduleData *PrevLoadStore,
3686 ScheduleData *NextLoadStore);
3690 void calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
3694 void resetSchedule();
3715 ExtraScheduleDataMap;
3728 ScheduleData *FirstLoadStoreInRegion =
nullptr;
3732 ScheduleData *LastLoadStoreInRegion =
nullptr;
3737 bool RegionHasStackSave =
false;
3740 int ScheduleRegionSize = 0;
3749 int SchedulingRegionID = 1;
3757 void scheduleBlock(BlockScheduling *BS);
3764 struct OrdersTypeDenseMapInfo {
3777 static unsigned getHashValue(
const OrdersType &V) {
3798 unsigned MaxVecRegSize;
3799 unsigned MinVecRegSize;
3814 unsigned ReductionBitWidth = 0;
3818 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3837 struct ChildIteratorType
3839 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3850 return R.VectorizableTree[0].get();
3854 return {
N->UserTreeIndices.begin(),
N->Container};
3858 return {
N->UserTreeIndices.end(),
N->Container};
3863 class nodes_iterator {
3874 bool operator!=(
const nodes_iterator &N2)
const {
return N2.It != It; }
3878 return nodes_iterator(R->VectorizableTree.begin());
3882 return nodes_iterator(R->VectorizableTree.end());
3885 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
3896 OS << Entry->Idx <<
".\n";
3899 for (
auto *V : Entry->Scalars) {
3901 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
3902 return EU.Scalar == V;
3912 if (Entry->State == TreeEntry::NeedToGather)
3914 if (Entry->State == TreeEntry::ScatterVectorize ||
3915 Entry->State == TreeEntry::StridedVectorize)
3916 return "color=blue";
3925 for (
auto *
I : DeletedInstructions) {
3926 for (
Use &U :
I->operands()) {
3927 auto *
Op = dyn_cast<Instruction>(U.get());
3928 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
3932 I->dropAllReferences();
3934 for (
auto *
I : DeletedInstructions) {
3936 "trying to erase instruction with users.");
3937 I->eraseFromParent();
3943#ifdef EXPENSIVE_CHECKS
3954 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
3955 "Expected non-empty mask.");
3958 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
3960 Reuses[Mask[
I]] = Prev[
I];
3968 bool BottomOrder =
false) {
3969 assert(!Mask.empty() &&
"Expected non-empty mask.");
3970 unsigned Sz = Mask.size();
3973 if (Order.
empty()) {
3975 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
3977 PrevOrder.
swap(Order);
3980 for (
unsigned I = 0;
I < Sz; ++
I)
3982 Order[
I] = PrevOrder[Mask[
I]];
3984 return Data.value() == Sz ||
Data.index() ==
Data.value();
3993 if (Order.
empty()) {
3995 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
4005 for (
unsigned I = 0;
I < Sz; ++
I)
4007 Order[MaskOrder[
I]] =
I;
4011std::optional<BoUpSLP::OrdersType>
4013 assert(TE.State == TreeEntry::NeedToGather &&
"Expected gather node only.");
4017 Type *ScalarTy = GatheredScalars.
front()->getType();
4018 int NumScalars = GatheredScalars.
size();
4020 return std::nullopt;
4023 if (NumParts == 0 || NumParts >= NumScalars)
4029 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4031 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4034 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
4035 return std::nullopt;
4036 OrdersType CurrentOrder(NumScalars, NumScalars);
4037 if (GatherShuffles.
size() == 1 &&
4039 Entries.front().front()->isSame(TE.Scalars)) {
4042 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
4043 return CurrentOrder;
4047 return all_of(Mask, [&](
int I) {
4054 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
4055 (Entries.size() != 1 ||
4056 Entries.front().front()->ReorderIndices.empty())) ||
4057 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
4058 return std::nullopt;
4063 for (
int I : seq<int>(0, NumParts)) {
4064 if (ShuffledSubMasks.
test(
I))
4066 const int VF = GetVF(
I);
4071 if (
any_of(Slice, [&](
int I) {
return I != NumScalars; })) {
4072 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4073 ShuffledSubMasks.
set(
I);
4077 int FirstMin = INT_MAX;
4078 int SecondVecFound =
false;
4079 for (
int K : seq<int>(0, PartSz)) {
4080 int Idx = Mask[
I * PartSz + K];
4082 Value *V = GatheredScalars[
I * PartSz + K];
4084 SecondVecFound =
true;
4093 SecondVecFound =
true;
4097 FirstMin = (FirstMin / PartSz) * PartSz;
4099 if (SecondVecFound) {
4100 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4101 ShuffledSubMasks.
set(
I);
4104 for (
int K : seq<int>(0, PartSz)) {
4105 int Idx = Mask[
I * PartSz + K];
4109 if (
Idx >= PartSz) {
4110 SecondVecFound =
true;
4113 if (CurrentOrder[
I * PartSz +
Idx] >
4114 static_cast<unsigned>(
I * PartSz + K) &&
4115 CurrentOrder[
I * PartSz +
Idx] !=
4116 static_cast<unsigned>(
I * PartSz +
Idx))
4117 CurrentOrder[
I * PartSz +
Idx] =
I * PartSz + K;
4120 if (SecondVecFound) {
4121 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4122 ShuffledSubMasks.
set(
I);
4127 int PartSz = NumScalars / NumParts;
4128 if (!ExtractShuffles.
empty())
4129 TransformMaskToOrder(
4130 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
4131 if (!ExtractShuffles[
I])
4134 for (
unsigned Idx : seq<unsigned>(0, PartSz)) {
4135 int K =
I * PartSz +
Idx;
4138 if (!TE.ReuseShuffleIndices.empty())
4139 K = TE.ReuseShuffleIndices[K];
4140 if (!TE.ReorderIndices.empty())
4141 K = std::distance(TE.ReorderIndices.begin(),
4142 find(TE.ReorderIndices, K));
4143 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4146 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4148 .getKnownMinValue());
4153 if (GatherShuffles.
size() == 1 && NumParts != 1) {
4154 if (ShuffledSubMasks.
any())
4155 return std::nullopt;
4156 PartSz = NumScalars;
4159 if (!Entries.empty())
4160 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
4161 if (!GatherShuffles[
I])
4163 return std::max(Entries[
I].front()->getVectorFactor(),
4164 Entries[
I].back()->getVectorFactor());
4167 count_if(CurrentOrder, [&](
int Idx) {
return Idx == NumScalars; });
4168 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4169 return std::nullopt;
4170 return std::move(CurrentOrder);
4175 bool CompareOpcodes =
true) {
4178 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4181 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4184 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4188 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4193template <
typename T>
4195 Align CommonAlignment = cast<T>(VL.
front())->getAlign();
4197 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->
getAlign());
4198 return CommonAlignment;
4203 unsigned Sz = Order.
size();
4205 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4216static std::optional<Value *>
4222 const SCEV *PtrSCEVLowest =
nullptr;
4223 const SCEV *PtrSCEVHighest =
nullptr;
4229 return std::nullopt;
4231 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4232 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4236 if (isa<SCEVCouldNotCompute>(Diff))
4237 return std::nullopt;
4239 PtrSCEVLowest = PtrSCEV;
4243 if (isa<SCEVCouldNotCompute>(Diff1))
4244 return std::nullopt;
4246 PtrSCEVHighest = PtrSCEV;
4252 if (isa<SCEVCouldNotCompute>(Dist))
4253 return std::nullopt;
4254 int Size =
DL.getTypeStoreSize(ElemTy);
4255 auto TryGetStride = [&](
const SCEV *Dist,
4256 const SCEV *Multiplier) ->
const SCEV * {
4257 if (
const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4258 if (M->getOperand(0) == Multiplier)
4259 return M->getOperand(1);
4260 if (M->getOperand(1) == Multiplier)
4261 return M->getOperand(0);
4264 if (Multiplier == Dist)
4269 const SCEV *Stride =
nullptr;
4270 if (
Size != 1 || SCEVs.
size() > 2) {
4272 Stride = TryGetStride(Dist, Sz);
4274 return std::nullopt;
4276 if (!Stride || isa<SCEVConstant>(Stride))
4277 return std::nullopt;
4280 using DistOrdPair = std::pair<int64_t, int>;
4282 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
4284 bool IsConsecutive =
true;
4285 for (
const SCEV *PtrSCEV : SCEVs) {
4287 if (PtrSCEV != PtrSCEVLowest) {
4289 const SCEV *Coeff = TryGetStride(Diff, Stride);
4291 return std::nullopt;
4292 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4293 if (!SC || isa<SCEVCouldNotCompute>(SC))
4294 return std::nullopt;
4298 return std::nullopt;
4299 Dist = SC->getAPInt().getZExtValue();
4303 return std::nullopt;
4304 auto Res = Offsets.emplace(Dist, Cnt);
4306 return std::nullopt;
4308 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4311 if (Offsets.size() != SCEVs.
size())
4312 return std::nullopt;
4313 SortedIndices.
clear();
4314 if (!IsConsecutive) {
4318 for (
const std::pair<int64_t, int> &Pair : Offsets) {
4319 SortedIndices[Cnt] = Pair.second;
4329static std::pair<InstructionCost, InstructionCost>
4345 if (
DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
4351 const unsigned Sz = VL.
size();
4353 auto *POIter = PointerOps.
begin();
4354 for (
Value *V : VL) {
4355 auto *L = cast<LoadInst>(V);
4358 *POIter = L->getPointerOperand();
4369 "supported with VectorizeNonPowerOf2");
4373 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4384 if (Order.
empty()) {
4385 Ptr0 = PointerOps.
front();
4386 PtrN = PointerOps.
back();
4388 Ptr0 = PointerOps[Order.
front()];
4389 PtrN = PointerOps[Order.
back()];
4391 std::optional<int> Diff =
4394 if (
static_cast<unsigned>(*Diff) == Sz - 1)
4397 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4409 (
static_cast<unsigned>(std::abs(*Diff)) <=
4412 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4413 *Diff == -(
static_cast<int>(Sz) - 1))) {
4414 int Stride = *Diff /
static_cast<int>(Sz - 1);
4415 if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
4427 else if (
Ptr != Ptr0)
4432 if (((Dist / Stride) * Stride) != Dist ||
4433 !Dists.
insert(Dist).second)
4436 if (Dists.
size() == Sz)
4442 auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment) {
4443 unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
4445 unsigned MaxVF = std::max<unsigned>(
bit_floor(VL.
size() / 2), MinVF);
4446 MaxVF = std::min(
getMaximumVF(Sz, Instruction::Load), MaxVF);
4447 for (
unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4448 unsigned VectorizedCnt = 0;
4450 for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End;
4451 Cnt += VF, ++VectorizedCnt) {
4469 if (VectorizedCnt == VL.
size() / VF) {
4472 auto [ScalarGEPCost, VectorGEPCost] =
getGEPCosts(
4473 TTI, PointerOps, PointerOps.
front(), Instruction::GetElementPtr,
4477 Instruction::Load, VecTy,
4479 false, CommonAlignment,
CostKind) +
4480 VectorGEPCost - ScalarGEPCost;
4484 auto *LI0 = cast<LoadInst>(VL[
I * VF]);
4487 auto [ScalarGEPCost, VectorGEPCost] =
4489 LI0->getPointerOperand(), Instruction::Load,
4492 Instruction::Load, SubVecTy, LI0->getAlign(),
4493 LI0->getPointerAddressSpace(),
CostKind,
4495 VectorGEPCost - ScalarGEPCost;
4499 auto [ScalarGEPCost, VectorGEPCost] =
4501 LI0->getPointerOperand(), Instruction::Load,
4505 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4506 false, CommonAlignment,
CostKind) +
4507 VectorGEPCost - ScalarGEPCost;
4511 auto [ScalarGEPCost, VectorGEPCost] =
getGEPCosts(
4513 LI0->getPointerOperand(), Instruction::GetElementPtr,
4517 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4518 false, CommonAlignment,
CostKind) +
4519 VectorGEPCost - ScalarGEPCost;
4524 "Expected only consecutive, strided or masked gather loads.");
4527 for (
int Idx : seq<int>(0, VL.
size()))
4536 if (MaskedGatherCost >= VecLdCost)
4546 bool ProfitableGatherPointers =
4549 return L->isLoopInvariant(V);
4551 if (ProfitableGatherPointers ||
all_of(PointerOps, [IsSorted](
Value *
P) {
4552 auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
4554 (
GEP &&
GEP->getNumOperands() == 2 &&
4555 isa<Constant, Instruction>(
GEP->getOperand(1)));
4557 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4562 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4581 "Expected list of pointer operands.");
4586 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4591 std::optional<int> Diff =
4597 Base.second.emplace_back(
Ptr, *Diff, Cnt++);
4603 if (Bases.
size() > VL.
size() / 2 - 1)
4607 Bases[
Ptr].emplace_back(
Ptr, 0, Cnt++);
4613 bool AnyConsecutive =
false;
4614 for (
auto &
Base : Bases) {
4615 auto &Vec =
Base.second;
4616 if (Vec.size() > 1) {
4618 const std::tuple<Value *, int, unsigned> &
Y) {
4619 return std::get<1>(
X) < std::get<1>(
Y);
4621 int InitialOffset = std::get<1>(Vec[0]);
4623 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
4629 SortedIndices.
clear();
4630 if (!AnyConsecutive)
4633 for (
auto &
Base : Bases) {
4634 for (
auto &
T :
Base.second)
4639 "Expected SortedIndices to be the size of VL");
4643std::optional<BoUpSLP::OrdersType>
4645 assert(TE.State == TreeEntry::NeedToGather &&
"Expected gather node only.");
4646 Type *ScalarTy = TE.Scalars[0]->getType();
4649 Ptrs.
reserve(TE.Scalars.size());
4650 for (
Value *V : TE.Scalars) {
4651 auto *L = dyn_cast<LoadInst>(V);
4652 if (!L || !L->isSimple())
4653 return std::nullopt;
4659 return std::move(Order);
4660 return std::nullopt;
4671 if (VU->
getType() != V->getType())
4674 if (!VU->
hasOneUse() && !V->hasOneUse())
4680 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4686 cast<VectorType>(VU->
getType())->getElementCount().getKnownMinValue());
4687 bool IsReusedIdx =
false;
4689 if (IE2 == VU && !IE1)
4691 if (IE1 == V && !IE2)
4692 return V->hasOneUse();
4693 if (IE1 && IE1 != V) {
4695 IsReusedIdx |= ReusedIdx.
test(Idx1);
4696 ReusedIdx.
set(Idx1);
4697 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
4700 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4702 if (IE2 && IE2 != VU) {
4704 IsReusedIdx |= ReusedIdx.
test(Idx2);
4705 ReusedIdx.
set(Idx2);
4706 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4709 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4711 }
while (!IsReusedIdx && (IE1 || IE2));
4715std::optional<BoUpSLP::OrdersType>
4718 if (TE.isNonPowOf2Vec())
4719 return std::nullopt;
4723 if (!TE.ReuseShuffleIndices.empty()) {
4725 return std::nullopt;
4733 unsigned Sz = TE.Scalars.size();
4734 if (TE.State == TreeEntry::NeedToGather) {
4735 if (std::optional<OrdersType> CurrentOrder =
4740 ::addMask(Mask, TE.ReuseShuffleIndices);
4741 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4742 unsigned Sz = TE.Scalars.size();
4743 for (
int K = 0,
E = TE.getVectorFactor() / Sz; K <
E; ++K) {
4746 Res[
Idx + K * Sz] =
I + K * Sz;
4748 return std::move(Res);
4751 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4753 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4754 return std::nullopt;
4758 if (TE.ReorderIndices.empty())
4759 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
4762 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4763 unsigned VF = ReorderMask.
size();
4765 unsigned NumParts = VF / Sz;
4767 for (
unsigned I = 0;
I < VF;
I += Sz) {
4769 unsigned UndefCnt = 0;
4778 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
4780 return std::nullopt;
4782 for (
unsigned K = 0; K < NumParts; ++K)
4783 ResOrder[Val + Sz * K] =
I + K;
4785 return std::move(ResOrder);
4787 unsigned VF = TE.getVectorFactor();
4790 TE.ReuseShuffleIndices.end());
4791 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4793 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4794 return Idx && *Idx < Sz;
4797 if (TE.ReorderIndices.empty())
4798 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
4801 for (
unsigned I = 0;
I < VF; ++
I) {
4802 int &
Idx = ReusedMask[
I];
4805 Value *V = TE.Scalars[ReorderMask[
Idx]];
4807 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
4813 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
4814 auto *It = ResOrder.
begin();
4815 for (
unsigned K = 0; K < VF; K += Sz) {
4819 std::iota(SubMask.begin(), SubMask.end(), 0);
4821 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
4822 std::advance(It, Sz);
4824 if (TE.State == TreeEntry::NeedToGather &&
4826 [](
const auto &
Data) {
return Data.index() ==
Data.value(); }))
4827 return std::nullopt;
4828 return std::move(ResOrder);
4830 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4831 any_of(TE.UserTreeIndices,
4833 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4835 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
4836 return std::nullopt;
4837 if ((TE.State == TreeEntry::Vectorize ||
4838 TE.State == TreeEntry::StridedVectorize) &&
4839 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4840 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4842 return TE.ReorderIndices;
4843 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4844 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
4845 Value *V1 = TE.Scalars[I1];
4846 Value *V2 = TE.Scalars[I2];
4847 if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
4853 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->
user_begin());
4854 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4855 if (
auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4856 if (
auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4863 if (
auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4864 if (
auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4865 if (EE1->getOperand(0) != EE2->getOperand(0))
4871 auto IsIdentityOrder = [](
const OrdersType &Order) {
4872 for (
unsigned Idx : seq<unsigned>(0, Order.size()))
4877 if (!TE.ReorderIndices.empty())
4878 return TE.ReorderIndices;
4881 std::iota(Phis.begin(), Phis.end(), 0);
4883 for (
unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4886 for (
unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4887 ResOrder[Id] = PhiToId[Phis[Id]];
4888 if (IsIdentityOrder(ResOrder))
4889 return std::nullopt;
4890 return std::move(ResOrder);
4892 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4896 if ((TE.getOpcode() == Instruction::ExtractElement ||
4897 (
all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
4898 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
4900 auto *EE = dyn_cast<ExtractElementInst>(V);
4901 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4906 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4908 if (Reuse || !CurrentOrder.
empty())
4909 return std::move(CurrentOrder);
4917 int Sz = TE.Scalars.size();
4919 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
4921 find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
4922 if (It == TE.Scalars.begin())
4925 if (It != TE.Scalars.end()) {
4927 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4942 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4945 return std::move(Order);
4950 return std::nullopt;
4951 if (TE.Scalars.size() >= 4)
4955 return CurrentOrder;
4957 return std::nullopt;
4967 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
4969 if (Cluster != FirstCluster)
4975void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
4978 const unsigned Sz =
TE.Scalars.size();
4980 if (
TE.State != TreeEntry::NeedToGather ||
4987 addMask(NewMask,
TE.ReuseShuffleIndices);
4989 TE.ReorderIndices.clear();
4996 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
4997 *
End =
TE.ReuseShuffleIndices.end();
4998 It !=
End; std::advance(It, Sz))
4999 std::iota(It, std::next(It, Sz), 0);
5005 "Expected same size of orders");
5006 unsigned Sz = Order.
size();
5008 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5009 if (Order[
Idx] != Sz)
5010 UsedIndices.
set(Order[
Idx]);
5012 if (SecondaryOrder.
empty()) {
5013 for (
unsigned Idx : seq<unsigned>(0, Sz))
5014 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5017 for (
unsigned Idx : seq<unsigned>(0, Sz))
5018 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5019 !UsedIndices.
test(SecondaryOrder[
Idx]))
5020 Order[
Idx] = SecondaryOrder[
Idx];
5040 ExternalUserReorderMap;
5045 const std::unique_ptr<TreeEntry> &TE) {
5048 findExternalStoreUsersReorderIndices(TE.get());
5049 if (!ExternalUserReorderIndices.
empty()) {
5050 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5052 std::move(ExternalUserReorderIndices));
5058 if (TE->isAltShuffle()) {
5061 unsigned Opcode0 = TE->getOpcode();
5062 unsigned Opcode1 = TE->getAltOpcode();
5065 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
5066 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
5067 OpcodeMask.
set(Lane);
5069 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5070 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5076 if (std::optional<OrdersType> CurrentOrder =
5086 const TreeEntry *UserTE = TE.get();
5088 if (UserTE->UserTreeIndices.size() != 1)
5091 return EI.UserTE->State == TreeEntry::Vectorize &&
5092 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5095 UserTE = UserTE->UserTreeIndices.back().UserTE;
5098 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5099 if (!(TE->State == TreeEntry::Vectorize ||
5100 TE->State == TreeEntry::StridedVectorize) ||
5101 !TE->ReuseShuffleIndices.empty())
5102 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
5103 if (TE->State == TreeEntry::Vectorize &&
5104 TE->getOpcode() == Instruction::PHI)
5105 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
5110 for (
unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5112 auto It = VFToOrderedEntries.
find(VF);
5113 if (It == VFToOrderedEntries.
end())
5125 for (
const TreeEntry *OpTE : OrderedEntries) {
5128 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5131 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5133 if (OpTE->State == TreeEntry::NeedToGather ||
5134 !OpTE->ReuseShuffleIndices.empty()) {
5135 auto It = GathersToOrders.find(OpTE);
5136 if (It != GathersToOrders.end())
5139 if (OpTE->isAltShuffle()) {
5140 auto It = AltShufflesToOrders.find(OpTE);
5141 if (It != AltShufflesToOrders.end())
5144 if (OpTE->State == TreeEntry::Vectorize &&
5145 OpTE->getOpcode() == Instruction::PHI) {
5146 auto It = PhisToOrders.
find(OpTE);
5147 if (It != PhisToOrders.
end())
5150 return OpTE->ReorderIndices;
5153 auto It = ExternalUserReorderMap.
find(OpTE);
5154 if (It != ExternalUserReorderMap.
end()) {
5155 const auto &ExternalUserReorderIndices = It->second;
5159 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5160 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
5161 ExternalUserReorderIndices.size();
5163 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
5164 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5171 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5172 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5175 unsigned E = Order.size();
5178 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5181 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5183 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5186 if (OrdersUses.empty())
5189 const unsigned Sz = Order.size();
5190 for (
unsigned Idx : seq<unsigned>(0, Sz))
5191 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
5196 unsigned IdentityCnt = 0;
5197 unsigned FilledIdentityCnt = 0;
5199 for (
auto &Pair : OrdersUses) {
5200 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5201 if (!Pair.first.empty())
5202 FilledIdentityCnt += Pair.second;
5203 IdentityCnt += Pair.second;
5208 unsigned Cnt = IdentityCnt;
5209 for (
auto &Pair : OrdersUses) {
5213 if (Cnt < Pair.second ||
5214 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5215 Cnt == Pair.second && !BestOrder.
empty() &&
5216 IsIdentityOrder(BestOrder))) {
5218 BestOrder = Pair.first;
5225 if (IsIdentityOrder(BestOrder))
5231 unsigned E = BestOrder.
size();
5233 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5236 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5238 if (TE->Scalars.size() != VF) {
5239 if (TE->ReuseShuffleIndices.size() == VF) {
5245 return EI.UserTE->Scalars.size() == VF ||
5246 EI.UserTE->Scalars.size() ==
5249 "All users must be of VF size.");
5252 reorderNodeWithReuses(*TE, Mask);
5256 if ((TE->State == TreeEntry::Vectorize ||
5257 TE->State == TreeEntry::StridedVectorize) &&
5260 !TE->isAltShuffle()) {
5264 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5265 TE->reorderOperands(Mask);
5268 TE->reorderOperands(Mask);
5269 assert(TE->ReorderIndices.empty() &&
5270 "Expected empty reorder sequence.");
5273 if (!TE->ReuseShuffleIndices.empty()) {
5280 addMask(NewReuses, TE->ReuseShuffleIndices);
5281 TE->ReuseShuffleIndices.swap(NewReuses);
5287bool BoUpSLP::canReorderOperands(
5288 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5292 if (UserTE->isNonPowOf2Vec())
5295 for (
unsigned I = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
5296 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
5297 return OpData.first ==
I &&
5298 (OpData.second->State == TreeEntry::Vectorize ||
5299 OpData.second->State == TreeEntry::StridedVectorize);
5302 if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
5304 if (
any_of(TE->UserTreeIndices,
5305 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5309 Edges.emplace_back(
I, TE);
5315 if (TE->State != TreeEntry::Vectorize &&
5316 TE->State != TreeEntry::StridedVectorize &&
5317 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5321 TreeEntry *
Gather =
nullptr;
5323 [&
Gather, UserTE,
I](TreeEntry *TE) {
5324 assert(TE->State != TreeEntry::Vectorize &&
5325 TE->State != TreeEntry::StridedVectorize &&
5326 "Only non-vectorized nodes are expected.");
5327 if (
any_of(TE->UserTreeIndices,
5328 [UserTE,
I](
const EdgeInfo &EI) {
5329 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5331 assert(TE->isSame(UserTE->getOperand(
I)) &&
5332 "Operand entry does not match operands.");
5353 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5354 if (TE->State != TreeEntry::Vectorize &&
5355 TE->State != TreeEntry::StridedVectorize)
5357 if (std::optional<OrdersType> CurrentOrder =
5359 OrderedEntries.
insert(TE.get());
5360 if (!(TE->State == TreeEntry::Vectorize ||
5361 TE->State == TreeEntry::StridedVectorize) ||
5362 !TE->ReuseShuffleIndices.empty())
5363 GathersToOrders.
insert(TE.get());
5372 while (!OrderedEntries.
empty()) {
5377 for (TreeEntry *TE : OrderedEntries) {
5378 if (!(TE->State == TreeEntry::Vectorize ||
5379 TE->State == TreeEntry::StridedVectorize ||
5380 (TE->State == TreeEntry::NeedToGather &&
5382 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5385 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5387 !Visited.
insert(TE).second) {
5393 for (
EdgeInfo &EI : TE->UserTreeIndices) {
5394 TreeEntry *UserTE = EI.
UserTE;
5395 auto It =
Users.find(UserTE);
5396 if (It ==
Users.end())
5397 It =
Users.insert({UserTE, {}}).first;
5398 It->second.emplace_back(EI.
EdgeIdx, TE);
5402 for (TreeEntry *TE : Filtered)
5403 OrderedEntries.remove(TE);
5405 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5407 sort(UsersVec, [](
const auto &Data1,
const auto &Data2) {
5408 return Data1.first->Idx > Data2.first->Idx;
5410 for (
auto &
Data : UsersVec) {
5413 if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
5415 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5416 OrderedEntries.remove(
Op.second);
5429 for (
const auto &
Op :
Data.second) {
5430 TreeEntry *OpTE =
Op.second;
5431 if (!VisitedOps.
insert(OpTE).second)
5433 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5435 const auto Order = [&]() ->
const OrdersType {
5436 if (OpTE->State == TreeEntry::NeedToGather ||
5437 !OpTE->ReuseShuffleIndices.empty())
5440 return OpTE->ReorderIndices;
5444 if (Order.size() == 1)
5447 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
5448 return P.second == OpTE;
5451 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5452 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5455 unsigned E = Order.size();
5458 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5461 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5464 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5466 auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
5467 const auto AllowsReordering = [&](
const TreeEntry *TE) {
5469 if (TE->isNonPowOf2Vec())
5471 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5472 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5473 (IgnoreReorder && TE->Idx == 0))
5475 if (TE->State == TreeEntry::NeedToGather) {
5484 for (
const EdgeInfo &EI : OpTE->UserTreeIndices) {
5485 TreeEntry *UserTE = EI.
UserTE;
5486 if (!VisitedUsers.
insert(UserTE).second)
5491 if (AllowsReordering(UserTE))
5499 if (
static_cast<unsigned>(
count_if(
5500 Ops, [UserTE, &AllowsReordering](
5501 const std::pair<unsigned, TreeEntry *> &
Op) {
5502 return AllowsReordering(
Op.second) &&
5505 return EI.UserTE == UserTE;
5507 })) <= Ops.
size() / 2)
5508 ++Res.first->second;
5511 if (OrdersUses.empty()) {
5512 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5513 OrderedEntries.remove(
Op.second);
5517 const unsigned Sz = Order.size();
5518 for (
unsigned Idx : seq<unsigned>(0, Sz))
5519 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
5524 unsigned IdentityCnt = 0;
5525 unsigned VF =
Data.second.front().second->getVectorFactor();
5527 for (
auto &Pair : OrdersUses) {
5528 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5529 IdentityCnt += Pair.second;
5534 unsigned Cnt = IdentityCnt;
5535 for (
auto &Pair : OrdersUses) {
5539 if (Cnt < Pair.second) {
5541 BestOrder = Pair.first;
5548 if (IsIdentityOrder(BestOrder)) {
5549 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5550 OrderedEntries.remove(
Op.second);
5559 unsigned E = BestOrder.
size();
5561 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5563 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
5564 TreeEntry *TE =
Op.second;
5565 OrderedEntries.remove(TE);
5566 if (!VisitedOps.
insert(TE).second)
5568 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
5569 reorderNodeWithReuses(*TE, Mask);
5573 if (TE->State != TreeEntry::Vectorize &&
5574 TE->State != TreeEntry::StridedVectorize &&
5575 (TE->State != TreeEntry::ScatterVectorize ||
5576 TE->ReorderIndices.empty()))
5578 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
5579 TE->ReorderIndices.empty()) &&
5580 "Non-matching sizes of user/operand entries.");
5582 if (IgnoreReorder && TE == VectorizableTree.front().get())
5583 IgnoreReorder =
false;
5586 for (TreeEntry *
Gather : GatherOps) {
5588 "Unexpected reordering of gathers.");
5589 if (!
Gather->ReuseShuffleIndices.empty()) {
5595 OrderedEntries.remove(
Gather);
5599 if (
Data.first->State != TreeEntry::Vectorize ||
5600 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5601 Data.first->getMainOp()) ||
5602 Data.first->isAltShuffle())
5603 Data.first->reorderOperands(Mask);
5604 if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
5605 Data.first->isAltShuffle() ||
5606 Data.first->State == TreeEntry::StridedVectorize) {
5610 if (
Data.first->ReuseShuffleIndices.empty() &&
5611 !
Data.first->ReorderIndices.empty() &&
5612 !
Data.first->isAltShuffle()) {
5615 OrderedEntries.insert(
Data.first);
5623 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5624 VectorizableTree.front()->ReuseShuffleIndices.empty())
5625 VectorizableTree.front()->ReorderIndices.clear();
5632 for (
auto &TEPtr : VectorizableTree) {
5633 TreeEntry *Entry = TEPtr.get();
5636 if (Entry->State == TreeEntry::NeedToGather)
5640 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5641 Value *Scalar = Entry->Scalars[Lane];
5642 if (!isa<Instruction>(Scalar))
5645 auto It = ScalarToExtUses.
find(Scalar);
5646 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
5650 const auto *ExtI = ExternallyUsedValues.
find(Scalar);
5651 if (ExtI != ExternallyUsedValues.
end()) {
5652 int FoundLane = Entry->findLaneForValue(Scalar);
5653 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
5654 << FoundLane <<
" from " << *Scalar <<
".\n");
5655 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
5656 ExternalUses.emplace_back(Scalar,
nullptr, FoundLane);
5659 for (
User *U : Scalar->users()) {
5667 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5671 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5675 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5677 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5678 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
5680 assert(UseEntry->State != TreeEntry::NeedToGather &&
"Bad state");
5684 if (It != ScalarToExtUses.
end()) {
5685 ExternalUses[It->second].User =
nullptr;
5690 int FoundLane = Entry->findLaneForValue(Scalar);
5692 <<
" from lane " << FoundLane <<
" from " << *Scalar
5694 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
5695 ExternalUses.emplace_back(Scalar, U, FoundLane);
5704BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
5706 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5707 Value *V = TE->Scalars[Lane];
5713 for (
User *U : V->users()) {
5714 auto *SI = dyn_cast<StoreInst>(U);
5715 if (SI ==
nullptr || !SI->isSimple() ||
5719 if (getTreeEntry(U))
5723 auto &StoresVec = PtrToStoresMap[
Ptr];
5726 if (StoresVec.size() > Lane)
5729 if (!StoresVec.empty() &&
5730 SI->getParent() != StoresVec.back()->getParent())
5733 if (!StoresVec.empty() &&
5734 SI->getValueOperand()->getType() !=
5735 StoresVec.back()->getValueOperand()->getType())
5737 StoresVec.push_back(SI);
5740 return PtrToStoresMap;
5744 OrdersType &ReorderIndices)
const {
5752 StoreOffsetVec[0] = {S0, 0};
5755 for (
unsigned Idx : seq<unsigned>(1, StoresVec.
size())) {
5757 std::optional<int> Diff =
5759 SI->getPointerOperand(), *
DL, *SE,
5764 StoreOffsetVec[
Idx] = {StoresVec[
Idx], *Diff};
5769 stable_sort(StoreOffsetVec, [](
const std::pair<StoreInst *, int> &Pair1,
5770 const std::pair<StoreInst *, int> &Pair2) {
5771 int Offset1 = Pair1.second;
5772 int Offset2 = Pair2.second;
5773 return Offset1 < Offset2;
5777 for (
unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5778 if (StoreOffsetVec[
Idx].second != StoreOffsetVec[
Idx - 1].second + 1)
5783 ReorderIndices.reserve(StoresVec.
size());
5786 [SI](
const std::pair<StoreInst *, int> &Pair) {
5787 return Pair.first ==
SI;
5789 StoreOffsetVec.begin();
5790 ReorderIndices.push_back(
Idx);
5795 auto IsIdentityOrder = [](
const OrdersType &Order) {
5796 for (
unsigned Idx : seq<unsigned>(0, Order.size()))
5801 if (IsIdentityOrder(ReorderIndices))
5802 ReorderIndices.clear();
5809 for (
unsigned Idx : Order)
5816BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE)
const {
5817 unsigned NumLanes =
TE->Scalars.size();
5820 collectUserStores(TE);
5829 for (
const auto &Pair : PtrToStoresMap) {
5830 auto &StoresVec = Pair.second;
5832 if (StoresVec.size() != NumLanes)
5837 if (!canFormVector(StoresVec, ReorderIndices))
5842 ExternalReorderIndices.
push_back(ReorderIndices);
5844 return ExternalReorderIndices;
5850 UserIgnoreList = &UserIgnoreLst;
5853 buildTree_rec(Roots, 0,
EdgeInfo());
5860 buildTree_rec(Roots, 0,
EdgeInfo());
5867 Value *NeedsScheduling =
nullptr;
5868 for (
Value *V : VL) {
5871 if (!NeedsScheduling) {
5872 NeedsScheduling = V;
5877 return NeedsScheduling;
5888 bool AllowAlternate) {
5892 if (
auto *LI = dyn_cast<LoadInst>(V)) {
5895 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
5900 if (isa<ExtractElementInst, UndefValue>(V))
5902 if (
auto *EI = dyn_cast<ExtractElementInst>(V)) {
5904 !isa<UndefValue>(EI->getIndexOperand()))
5907 }
else if (
auto *
I = dyn_cast<Instruction>(V)) {
5910 if ((isa<BinaryOperator, CastInst>(
I)) &&
5920 : cast<CastInst>(
I)->getOperand(0)->getType()));
5922 if (isa<CastInst>(
I)) {
5923 std::pair<size_t, size_t> OpVals =
5929 }
else if (
auto *CI = dyn_cast<CmpInst>(
I)) {
5931 if (CI->isCommutative())
5937 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
5951 }
else if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
5952 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5953 SubKey =
hash_value(Gep->getPointerOperand());
5957 !isa<ConstantInt>(
I->getOperand(1))) {
5965 return std::make_pair(Key, SubKey);
5975bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
5977 unsigned Opcode0 = S.getOpcode();
5978 unsigned Opcode1 = S.getAltOpcode();
5981 for (
unsigned Lane : seq<unsigned>(0, VL.
size()))
5982 if (cast<Instruction>(VL[Lane])->
getOpcode() == Opcode1)
5983 OpcodeMask.set(Lane);
5986 Opcode0, Opcode1, OpcodeMask))
5989 for (
unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
5993 Operands.back().push_back(cast<Instruction>(V)->getOperand(
I));
5997 for (
unsigned I : seq<unsigned>(0, VL.size() - 1)) {
6003 switch (Res.value_or(0)) {
6018 constexpr unsigned NumAltInsts = 3;
6019 unsigned NonInstCnt = 0;
6022 unsigned UndefCnt = 0;
6024 unsigned ExtraShuffleInsts = 0;
6033 return is_contained(Operands.back(), V);
6036 ++ExtraShuffleInsts;
6053 if (isa<Constant, ExtractElementInst>(V) ||
6054 getTreeEntry(V) || (L &&
L->isLoopInvariant(V))) {
6055 if (isa<UndefValue>(V))
6061 if (!Res.second && Res.first->second == 1)
6062 ++ExtraShuffleInsts;
6063 ++Res.first->getSecond();
6064 if (
auto *
I = dyn_cast<Instruction>(V))
6065 UniqueOpcodes.
insert(
I->getOpcode());
6066 else if (Res.second)
6069 return none_of(Uniques, [&](
const auto &
P) {
6070 return P.first->hasNUsesOrMore(
P.second + 1) &&
6072 return getTreeEntry(U) || Uniques.contains(U);
6081 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6082 (UniqueOpcodes.
size() + NonInstCnt + ExtraShuffleInsts +
6083 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6086BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6089 assert(S.MainOp &&
"Expected instructions with same/alternate opcodes only.");
6091 unsigned ShuffleOrOp =
6092 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
6093 auto *VL0 = cast<Instruction>(S.OpValue);
6094 switch (ShuffleOrOp) {
6095 case Instruction::PHI: {
6098 return TreeEntry::NeedToGather;
6101 for (
Value *
Incoming : cast<PHINode>(V)->incoming_values()) {
6103 if (Term &&
Term->isTerminator()) {
6105 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
6106 return TreeEntry::NeedToGather;
6110 return TreeEntry::Vectorize;
6112 case Instruction::ExtractValue:
6113 case Instruction::ExtractElement: {
6114 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6117 return TreeEntry::NeedToGather;
6118 if (Reuse || !CurrentOrder.empty())
6119 return TreeEntry::Vectorize;
6121 return TreeEntry::NeedToGather;
6123 case Instruction::InsertElement: {
6127 for (
Value *V : VL) {
6128 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
6130 "Non-constant or undef index?");
6134 return !SourceVectors.contains(V);
6137 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
6138 "different source vectors.\n");
6139 return TreeEntry::NeedToGather;
6142 return TreeEntry::Vectorize;
6144 case Instruction::Load: {
6153 return TreeEntry::Vectorize;
6155 return TreeEntry::ScatterVectorize;
6157 return TreeEntry::StridedVectorize;
6160 Type *ScalarTy = VL0->getType();
6161 if (
DL->getTypeSizeInBits(ScalarTy) !=
6162 DL->getTypeAllocSizeInBits(ScalarTy))
6163 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
6165 [](
Value *V) {
return !cast<LoadInst>(V)->isSimple(); }))
6170 return TreeEntry::NeedToGather;
6174 case Instruction::ZExt:
6175 case Instruction::SExt:
6176 case Instruction::FPToUI:
6177 case Instruction::FPToSI:
6178 case Instruction::FPExt:
6179 case Instruction::PtrToInt:
6180 case Instruction::IntToPtr:
6181 case Instruction::SIToFP:
6182 case Instruction::UIToFP:
6183 case Instruction::Trunc:
6184 case Instruction::FPTrunc:
6185 case Instruction::BitCast: {
6186 Type *SrcTy = VL0->getOperand(0)->getType();
6187 for (
Value *V : VL) {
6188 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6191 dbgs() <<
"SLP: Gathering casts with different src types.\n");
6192 return TreeEntry::NeedToGather;
6195 return TreeEntry::Vectorize;
6197 case Instruction::ICmp:
6198 case Instruction::FCmp: {
6202 Type *ComparedTy = VL0->getOperand(0)->getType();
6203 for (
Value *V : VL) {
6205 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
6206 Cmp->getOperand(0)->getType() != ComparedTy) {
6207 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
6208 return TreeEntry::NeedToGather;
6211 return TreeEntry::Vectorize;
6213 case Instruction::Select:
6214 case Instruction::FNeg:
6215 case Instruction::Add:
6216 case Instruction::FAdd:
6217 case Instruction::Sub:
6218 case Instruction::FSub:
6219 case Instruction::Mul:
6220 case Instruction::FMul:
6221 case Instruction::UDiv:
6222 case Instruction::SDiv:
6223 case Instruction::FDiv:
6224 case Instruction::URem:
6225 case Instruction::SRem:
6226 case Instruction::FRem:
6227 case Instruction::Shl:
6228 case Instruction::LShr:
6229 case Instruction::AShr:
6230 case Instruction::And:
6231 case Instruction::Or:
6232 case Instruction::Xor:
6233 return TreeEntry::Vectorize;
6234 case Instruction::GetElementPtr: {
6236 for (
Value *V : VL) {
6237 auto *
I = dyn_cast<GetElementPtrInst>(V);
6240 if (
I->getNumOperands() != 2) {
6241 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
6242 return TreeEntry::NeedToGather;
6248 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6249 for (
Value *V : VL) {
6250 auto *
GEP = dyn_cast<GEPOperator>(V);
6253 Type *CurTy =
GEP->getSourceElementType();
6255 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
6256 return TreeEntry::NeedToGather;
6261 Type *Ty1 = VL0->getOperand(1)->getType();
6262 for (
Value *V : VL) {
6263 auto *
I = dyn_cast<GetElementPtrInst>(V);
6266 auto *
Op =
I->getOperand(1);
6267 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
6268 (
Op->getType() != Ty1 &&
6269 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
6270 Op->getType()->getScalarSizeInBits() >
6271 DL->getIndexSizeInBits(
6272 V->getType()->getPointerAddressSpace())))) {
6274 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
6275 return TreeEntry::NeedToGather;
6279 return TreeEntry::Vectorize;
6281 case Instruction::Store: {
6283 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6286 if (
DL->getTypeSizeInBits(ScalarTy) !=
6287 DL->getTypeAllocSizeInBits(ScalarTy)) {
6288 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
6289 return TreeEntry::NeedToGather;
6293 for (
Value *V : VL) {
6294 auto *
SI = cast<StoreInst>(V);
6295 if (!
SI->isSimple()) {
6297 return TreeEntry::NeedToGather;
6306 if (CurrentOrder.empty()) {
6307 Ptr0 = PointerOps.
front();
6308 PtrN = PointerOps.
back();
6310 Ptr0 = PointerOps[CurrentOrder.front()];
6311 PtrN = PointerOps[CurrentOrder.back()];
6313 std::optional<int> Dist =
6316 if (
static_cast<unsigned>(*Dist) == VL.size() - 1)
6317 return TreeEntry::Vectorize;
6321 return TreeEntry::NeedToGather;
6323 case Instruction::Call: {
6326 CallInst *CI = cast<CallInst>(VL0);
6337 return TreeEntry::NeedToGather;
6342 for (
unsigned J = 0; J != NumArgs; ++J)
6345 for (
Value *V : VL) {
6346 CallInst *CI2 = dyn_cast<CallInst>(V);
6352 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
6354 return TreeEntry::NeedToGather;
6358 for (
unsigned J = 0; J != NumArgs; ++J) {
6361 if (ScalarArgs[J] != A1J) {
6363 <<
"SLP: mismatched arguments in call:" << *CI
6364 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
6365 return TreeEntry::NeedToGather;
6374 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
6375 <<
"!=" << *V <<
'\n');
6376 return TreeEntry::NeedToGather;
6380 return TreeEntry::Vectorize;
6382 case Instruction::ShuffleVector: {
6385 if (!S.isAltShuffle()) {
6386 LLVM_DEBUG(
dbgs() <<
"SLP: ShuffleVector are not vectorized.\n");
6387 return TreeEntry::NeedToGather;
6392 <<
"SLP: ShuffleVector not vectorized, operands are buildvector and "
6393 "the whole alt sequence is not profitable.\n");
6394 return TreeEntry::NeedToGather;
6397 return TreeEntry::Vectorize;
6401 return TreeEntry::NeedToGather;
6415 PHIHandler() =
delete;
6417 : DT(DT), Main(Main), Phis(Phis),
6418 Operands(Main->getNumIncomingValues(),
6420 void buildOperands() {
6421 constexpr unsigned FastLimit = 4;
6431 auto *
P = cast<PHINode>(V);
6432 if (
P->getIncomingBlock(
I) == InBB)
6447 Blocks.try_emplace(InBB).first->second.push_back(
I);
6450 auto *
P = cast<PHINode>(V);
6451 for (
unsigned I : seq<unsigned>(0,
P->getNumIncomingValues())) {
6459 auto It =
Blocks.find(InBB);
6465 for (
const auto &
P :
Blocks) {
6466 if (
P.getSecond().size() <= 1)
6468 unsigned BasicI =
P.getSecond().front();
6471 [&](
const auto &Data) {
6472 return !Data.value() ||
6473 Data.value() ==
Operands[BasicI][Data.index()];
6475 "Expected empty operands list.");
6485 const EdgeInfo &UserTreeIdx) {
6491 auto TryToFindDuplicates = [&](
const InstructionsState &S,
6492 bool DoNotFail =
false) {
6495 for (
Value *V : VL) {
6502 auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
6507 size_t NumUniqueScalarValues = UniqueValues.
size();
6508 if (NumUniqueScalarValues == VL.size()) {
6509 ReuseShuffleIndicies.
clear();
6512 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6513 LLVM_DEBUG(
dbgs() <<
"SLP: Reshuffling scalars not yet supported "
6514 "for nodes with padding.\n");
6515 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6519 if (NumUniqueScalarValues <= 1 ||
6520 (UniquePositions.size() == 1 &&
all_of(UniqueValues,
6522 return isa<UndefValue>(V) ||
6525 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6526 if (DoNotFail && UniquePositions.size() > 1 &&
6527 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6529 return isa<ExtractElementInst>(V) ||
6530 areAllUsersVectorized(cast<Instruction>(V),
6534 if (PWSz == VL.size()) {
6535 ReuseShuffleIndicies.
clear();
6537 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
6538 NonUniqueValueVL.
append(PWSz - UniqueValues.
size(),
6539 UniqueValues.
back());
6540 VL = NonUniqueValueVL;
6545 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6556 if (!EphValues.
empty()) {
6557 for (
Value *V : VL) {
6558 if (EphValues.
count(V)) {
6560 <<
") is ephemeral.\n");
6561 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6571 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6576 cast<Instruction>(
I)->getOpcode() ==
6577 cast<Instruction>(S.MainOp)->getOpcode();
6579 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
6580 if (TryToFindDuplicates(S))
6581 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6582 ReuseShuffleIndicies);
6587 if (S.getOpcode() == Instruction::ExtractElement &&
6588 isa<ScalableVectorType>(
6589 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6590 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
6591 if (TryToFindDuplicates(S))
6592 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6593 ReuseShuffleIndicies);
6598 if (S.OpValue->getType()->isVectorTy() &&
6599 !isa<InsertElementInst>(S.OpValue)) {
6601 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6605 if (
StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6606 if (
SI->getValueOperand()->getType()->isVectorTy()) {
6607 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to store vector type.\n");
6608 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6617 auto &&NotProfitableForVectorization = [&S,
this,
6619 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6628 for (
Value *V : VL) {
6629 auto *
I = cast<Instruction>(V);
6631 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6635 if ((IsCommutative &&
6636 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6638 all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
6640 assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
6642 auto *
I1 = cast<Instruction>(VL.front());
6643 auto *I2 = cast<Instruction>(VL.back());
6646 I2->getOperand(
Op));
6647 if (
static_cast<unsigned>(
count_if(
6648 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
6650 })) >= S.MainOp->getNumOperands() / 2)
6652 if (S.MainOp->getNumOperands() > 2)
6654 if (IsCommutative) {
6659 I2->getOperand((
Op + 1) % E));
6661 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
6670 bool IsScatterVectorizeUserTE =
6671 UserTreeIdx.UserTE &&
6672 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6673 bool AreAllSameInsts =
6675 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6679 auto *
I = dyn_cast<GetElementPtrInst>(V);
6683 BB =
I->getParent();
6684 return BB ==
I->getParent() &&
I->getNumOperands() == 2;
6687 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6690 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6693 NotProfitableForVectorization(VL)) {
6694 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
6695 if (TryToFindDuplicates(S))
6696 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6697 ReuseShuffleIndicies);
6705 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6706 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.OpValue <<
".\n");
6707 if (!E->isSame(VL)) {
6708 auto It = MultiNodeScalars.
find(S.OpValue);
6709 if (It != MultiNodeScalars.
end()) {
6710 auto *TEIt =
find_if(It->getSecond(),
6711 [&](TreeEntry *ME) { return ME->isSame(VL); });
6712 if (TEIt != It->getSecond().end())
6722 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to partial overlap.\n");
6723 if (TryToFindDuplicates(S))
6724 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6725 ReuseShuffleIndicies);
6731 E->UserTreeIndices.push_back(UserTreeIdx);
6732 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.OpValue
6739 for (
Value *V : VL) {
6740 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6743 if (getTreeEntry(V)) {
6745 <<
") is already in tree.\n");
6746 if (TryToFindDuplicates(S))
6747 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6748 ReuseShuffleIndicies);
6754 if (UserIgnoreList && !UserIgnoreList->empty()) {
6755 for (
Value *V : VL) {
6756 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6757 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
6758 if (TryToFindDuplicates(S))
6759 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6760 ReuseShuffleIndicies);
6768 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6769 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6771 assert(S.OpValue->getType()->isPointerTy() &&
6772 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6773 "Expected pointers only.");
6775 const auto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
6776 assert(It != VL.end() &&
"Expected at least one GEP.");
6782 auto *VL0 = cast<Instruction>(S.OpValue);
6789 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6798 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6803 if (!TryToFindDuplicates(S,
true))
6809 TreeEntry::EntryState State = getScalarsVectorizationState(
6810 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6811 if (State == TreeEntry::NeedToGather) {
6812 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6813 ReuseShuffleIndicies);
6817 auto &BSRef = BlocksSchedules[BB];
6819 BSRef = std::make_unique<BlockScheduling>(BB);
6821 BlockScheduling &BS = *BSRef;
6823 std::optional<ScheduleData *> Bundle =
6824 BS.tryScheduleBundle(UniqueValues,
this, S);
6825#ifdef EXPENSIVE_CHECKS
6830 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
6831 assert((!BS.getScheduleData(VL0) ||
6832 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6833 "tryScheduleBundle should cancelScheduling on failure");
6834 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6835 ReuseShuffleIndicies);
6836 NonScheduledFirst.insert(VL.front());
6839 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
6841 unsigned ShuffleOrOp = S.isAltShuffle() ?
6842 (
unsigned) Instruction::ShuffleVector : S.getOpcode();
6843 switch (ShuffleOrOp) {
6844 case Instruction::PHI: {
6845 auto *PH = cast<PHINode>(VL0);
6848 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
6852 PHIHandler Handler(*DT, PH, VL);
6853 Handler.buildOperands();
6854 for (
unsigned I : seq<unsigned>(0, PH->getNumOperands()))
6855 TE->setOperand(
I, Handler.getOperands(
I));
6856 for (
unsigned I : seq<unsigned>(0, PH->getNumOperands()))
6857 buildTree_rec(Handler.getOperands(
I),
Depth + 1, {TE, I});
6860 case Instruction::ExtractValue:
6861 case Instruction::ExtractElement: {
6862 if (CurrentOrder.empty()) {
6863 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
6864 newTreeEntry(VL, Bundle , S, UserTreeIdx,
6865 ReuseShuffleIndicies);
6869 Op0.
assign(VL.size(), VL0->getOperand(0));
6870 VectorizableTree.back()->setOperand(0, Op0);
6874 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
6876 for (
unsigned Idx : CurrentOrder)
6883 newTreeEntry(VL, Bundle , S, UserTreeIdx,
6884 ReuseShuffleIndicies, CurrentOrder);
6888 Op0.
assign(VL.size(), VL0->getOperand(0));
6889 VectorizableTree.back()->setOperand(0, Op0);
6892 case Instruction::InsertElement: {
6893 assert(ReuseShuffleIndicies.
empty() &&
"All inserts should be unique");
6895 auto OrdCompare = [](
const std::pair<int, int> &P1,
6896 const std::pair<int, int> &P2) {
6897 return P1.first > P2.first;
6900 decltype(OrdCompare)>
6901 Indices(OrdCompare);
6902 for (
int I = 0, E = VL.size();
I < E; ++
I) {
6904 Indices.emplace(
Idx,
I);
6906 OrdersType CurrentOrder(VL.size(), VL.size());
6907 bool IsIdentity =
true;
6908 for (
int I = 0, E = VL.size();
I < E; ++
I) {
6909 CurrentOrder[Indices.top().second] =
I;
6910 IsIdentity &= Indices.top().second ==
I;
6914 CurrentOrder.clear();
6915 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6916 std::nullopt, CurrentOrder);
6919 constexpr int NumOps = 2;
6921 for (
int I = 0;
I < NumOps; ++
I) {
6923 VectorOperands[
I].
push_back(cast<Instruction>(V)->getOperand(
I));
6925 TE->setOperand(
I, VectorOperands[
I]);
6927 buildTree_rec(VectorOperands[NumOps - 1],
Depth + 1, {
TE, NumOps - 1});
6930 case Instruction::Load: {
6937 TreeEntry *
TE =
nullptr;
6940 case TreeEntry::Vectorize:
6941 if (CurrentOrder.empty()) {
6943 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6944 ReuseShuffleIndicies);
6948 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6949 ReuseShuffleIndicies, CurrentOrder);
6952 TE->setOperandsInOrder();
6954 case TreeEntry::StridedVectorize:
6956 if (CurrentOrder.empty()) {
6957 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6958 UserTreeIdx, ReuseShuffleIndicies);
6960 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6961 UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
6963 TE->setOperandsInOrder();
6966 case TreeEntry::ScatterVectorize:
6968 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6969 UserTreeIdx, ReuseShuffleIndicies);
6970 TE->setOperandsInOrder();
6971 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
6972 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of non-consecutive loads.\n");
6974 case TreeEntry::NeedToGather:
6979 case Instruction::ZExt:
6980 case Instruction::SExt:
6981 case Instruction::FPToUI:
6982 case Instruction::FPToSI:
6983 case Instruction::FPExt:
6984 case Instruction::PtrToInt:
6985 case Instruction::IntToPtr:
6986 case Instruction::SIToFP:
6987 case Instruction::UIToFP:
6988 case Instruction::Trunc:
6989 case Instruction::FPTrunc:
6990 case Instruction::BitCast: {
6991 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
6992 std::make_pair(std::numeric_limits<unsigned>::min(),
6993 std::numeric_limits<unsigned>::max()));
6994 if (ShuffleOrOp == Instruction::ZExt ||
6995 ShuffleOrOp == Instruction::SExt) {
6996 CastMaxMinBWSizes = std::make_pair(
7002 }
else if (ShuffleOrOp == Instruction::Trunc) {
7003 CastMaxMinBWSizes = std::make_pair(
7009 ExtraBitWidthNodes.
insert(VectorizableTree.size() + 1);
7010 }
else if (ShuffleOrOp == Instruction::SIToFP ||
7011 ShuffleOrOp == Instruction::UIToFP) {
7012 unsigned NumSignBits =
7014 if (
auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
7016 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
7018 if (NumSignBits * 2 >=
7020 ExtraBitWidthNodes.
insert(VectorizableTree.size() + 1);
7022 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7023 ReuseShuffleIndicies);
7026 TE->setOperandsInOrder();
7027 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7031 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
7037 case Instruction::ICmp:
7038 case Instruction::FCmp: {
7041 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7042 ReuseShuffleIndicies);
7050 "Commutative Predicate mismatch");
7051 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7054 for (
Value *V : VL) {
7055 auto *
Cmp = cast<CmpInst>(V);
7058 if (
Cmp->getPredicate() != P0)
7060 Left.push_back(LHS);
7061 Right.push_back(RHS);
7068 if (ShuffleOrOp == Instruction::ICmp) {
7069 unsigned NumSignBits0 =
7071 if (NumSignBits0 * 2 >=
7073 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
7074 unsigned NumSignBits1 =
7076 if (NumSignBits1 * 2 >=
7078 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 1)->
Idx);
7082 case Instruction::Select:
7083 case Instruction::FNeg:
7084 case Instruction::Add:
7085 case Instruction::FAdd:
7086 case Instruction::Sub:
7087 case Instruction::FSub:
7088 case Instruction::Mul:
7089 case Instruction::FMul:
7090 case Instruction::UDiv:
7091 case Instruction::SDiv:
7092 case Instruction::FDiv:
7093 case Instruction::URem:
7094 case Instruction::SRem:
7095 case Instruction::FRem:
7096 case Instruction::Shl:
7097 case Instruction::LShr:
7098 case Instruction::AShr:
7099 case Instruction::And:
7100 case Instruction::Or:
7101 case Instruction::Xor: {
7102 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7103 ReuseShuffleIndicies);
7110 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7118 TE->setOperandsInOrder();
7119 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7123 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
7129 case Instruction::GetElementPtr: {
7130 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7131 ReuseShuffleIndicies);
7135 for (
Value *V : VL) {
7136 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
7141 Operands.front().push_back(
GEP->getPointerOperand());
7150 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
7152 [VL0Ty, IndexIdx](
Value *V) {
7153 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
7156 return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
7160 ->getPointerOperandType()
7163 for (
Value *V : VL) {
7164 auto *
I = dyn_cast<GetElementPtrInst>(V);
7167 ConstantInt::get(Ty, 0,
false));
7170 auto *
Op =
I->getOperand(IndexIdx);
7171 auto *CI = dyn_cast<ConstantInt>(
Op);
7176 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7180 for (
unsigned I = 0, Ops =
Operands.size();
I < Ops; ++
I)
7184 case Instruction::Store: {
7188 for (
Value *V : VL) {
7189 auto *
SI = cast<StoreInst>(V);
7190 *OIter =
SI->getValueOperand();
7194 if (CurrentOrder.empty()) {
7196 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7197 ReuseShuffleIndicies);
7198 TE->setOperandsInOrder();
7203 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7204 ReuseShuffleIndicies, CurrentOrder);
7205 TE->setOperandsInOrder();
7207 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of jumbled stores.\n");
7211 case Instruction::Call: {
7214 CallInst *CI = cast<CallInst>(VL0);
7217 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7218 ReuseShuffleIndicies);
7223 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7227 for (
unsigned I : seq<unsigned>(2, CI->
arg_size())) {
7231 for (
Value *V : VL) {
7232 auto *CI2 = cast<CallInst>(V);
7239 for (
unsigned I : seq<unsigned>(2, CI->
arg_size())) {
7246 TE->setOperandsInOrder();
7247 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
7254 for (
Value *V : VL) {
7255 auto *CI2 = cast<CallInst>(V);
7262 case Instruction::ShuffleVector: {
7263 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7264 ReuseShuffleIndicies);
7268 auto *CI = dyn_cast<CmpInst>(VL0);
7269 if (isa<BinaryOperator>(VL0) || CI) {
7272 return cast<CmpInst>(V)->isCommutative();
7274 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7276 auto *MainCI = cast<CmpInst>(S.MainOp);
7277 auto *AltCI = cast<CmpInst>(S.AltOp);
7281 "Expected different main/alternate predicates.");
7284 for (
Value *V : VL) {
7285 auto *
Cmp = cast<CmpInst>(V);
7296 Left.push_back(LHS);
7297 Right.push_back(RHS);
7307 TE->setOperandsInOrder();
7308 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7312 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
7328 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7329 if (
auto *ST = dyn_cast<StructType>(EltTy)) {
7331 for (
const auto *Ty : ST->elements())
7332 if (Ty != *ST->element_begin())
7334 N *= ST->getNumElements();
7335 EltTy = *ST->element_begin();
7336 }
else if (
auto *AT = dyn_cast<ArrayType>(EltTy)) {
7337 N *= AT->getNumElements();
7338 EltTy = AT->getElementType();
7340 auto *VT = cast<FixedVectorType>(EltTy);
7341 N *= VT->getNumElements();
7342 EltTy = VT->getElementType();
7349 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7357 bool ResizeAllowed)
const {
7358 const auto *It =
find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7359 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
7360 auto *E0 = cast<Instruction>(*It);
7362 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7366 Value *Vec = E0->getOperand(0);
7368 CurrentOrder.
clear();
7372 if (E0->getOpcode() == Instruction::ExtractValue) {
7377 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7381 NElts = cast<FixedVectorType>(Vec->
getType())->getNumElements();
7384 unsigned E = VL.
size();
7385 if (!ResizeAllowed && NElts != E)
7388 unsigned MinIdx = NElts, MaxIdx = 0;
7390 auto *Inst = dyn_cast<Instruction>(V);
7393 if (Inst->getOperand(0) != Vec)
7395 if (
auto *EE = dyn_cast<ExtractElementInst>(Inst))
7396 if (isa<UndefValue>(EE->getIndexOperand()))
7401 const unsigned ExtIdx = *
Idx;
7402 if (ExtIdx >= NElts)
7404 Indices[
I] = ExtIdx;
7405 if (MinIdx > ExtIdx)
7407 if (MaxIdx < ExtIdx)
7410 if (MaxIdx - MinIdx + 1 > E)
7412 if (MaxIdx + 1 <= E)
7416 bool ShouldKeepOrder =
true;
7422 CurrentOrder.
assign(E, E);
7423 for (
unsigned I = 0;
I < E; ++
I) {
7426 const unsigned ExtIdx = Indices[
I] - MinIdx;
7427 if (CurrentOrder[ExtIdx] != E) {
7428 CurrentOrder.
clear();
7431 ShouldKeepOrder &= ExtIdx ==
I;
7432 CurrentOrder[ExtIdx] =
I;
7434 if (ShouldKeepOrder)
7435 CurrentOrder.
clear();
7437 return ShouldKeepOrder;
7440bool BoUpSLP::areAllUsersVectorized(
7442 return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
7444 return ScalarToTreeEntry.contains(U) ||
7445 isVectorLikeInstWithConstOps(U) ||
7446 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7450static std::pair<InstructionCost, InstructionCost>
7458 if (
auto *FPCI = dyn_cast<FPMathOperator>(CI))
7459 FMF = FPCI->getFastMathFlags();
7462 dyn_cast<IntrinsicInst>(CI));
7463 auto IntrinsicCost =
7470 auto LibCost = IntrinsicCost;
7477 return {IntrinsicCost, LibCost};
7480void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7484 unsigned Sz = Scalars.size();
7487 if (!ReorderIndices.empty())
7489 for (
unsigned I = 0;
I < Sz; ++
I) {
7491 if (!ReorderIndices.empty())
7493 auto *OpInst = cast<Instruction>(Scalars[
Idx]);
7494 if (IsAltOp(OpInst)) {
7504 if (!ReuseShuffleIndices.empty()) {
7507 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7517 if (
auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7518 auto *AltCI = cast<CmpInst>(AltOp);
7521 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
7522 auto *CI = cast<CmpInst>(
I);
7530 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
7531 "CmpInst expected to match either main or alternate predicate or "
7534 return MainP !=
P && MainP != SwappedP;
7541 const auto *Op0 = Ops.
front();
7547 const bool IsUniform =
all_of(Ops, [=](
Value *V) {
7551 const bool IsPowerOfTwo =
all_of(Ops, [](
Value *V) {
7553 if (
auto *CI = dyn_cast<ConstantInt>(V))
7554 return CI->getValue().isPowerOf2();
7557 const bool IsNegatedPowerOfTwo =
all_of(Ops, [](
Value *V) {
7559 if (
auto *CI = dyn_cast<ConstantInt>(V))
7560 return CI->getValue().isNegatedPowerOf2();
7565 if (IsConstant && IsUniform)
7567 else if (IsConstant)
7581class BaseShuffleAnalysis {
7588 int Limit =
Mask.size();
7600 if (Limit % VF == 0 &&
all_of(seq<int>(0, Limit / VF), [=](
int Idx) {
7616 unsigned VF =
Mask.size();
7618 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
7621 int MaskedIdx =
Mask[ExtMask[
I] % VF];
7662 bool SinglePermute) {
7666 while (
auto *SV = dyn_cast<ShuffleVectorInst>(
Op)) {
7668 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7674 if (isIdentityMask(Mask, SVTy,
false)) {
7675 if (!IdentityOp || !SinglePermute ||
7676 (isIdentityMask(Mask, SVTy,
true) &&
7678 IdentityMask.
size()))) {
7683 IdentityMask.
assign(Mask);
7703 if (SV->isZeroEltSplat()) {
7705 IdentityMask.
assign(Mask);
7707 int LocalVF =
Mask.size();
7709 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7710 LocalVF = SVOpTy->getNumElements();
7714 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
7716 ExtMask[
Idx] = SV->getMaskValue(
I);
7726 if (!IsOp1Undef && !IsOp2Undef) {
7728 for (
int &
I : Mask) {
7731 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
7738 SV->getShuffleMask().end());
7739 combineMasks(LocalVF, ShuffleMask, Mask);
7740 Mask.swap(ShuffleMask);
7742 Op = SV->getOperand(0);
7744 Op = SV->getOperand(1);
7746 if (
auto *OpTy = dyn_cast<FixedVectorType>(
Op->getType());
7747 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7752 "Expected masks of same sizes.");
7757 Mask.swap(IdentityMask);
7758 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7759 return SinglePermute &&
7760 (isIdentityMask(Mask, cast<FixedVectorType>(
V->getType()),
7762 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
7763 Shuffle->isZeroEltSplat() &&
7776 template <
typename T,
typename ShuffleBuilderTy>
7778 ShuffleBuilderTy &Builder) {
7779 assert(V1 &&
"Expected at least one vector value.");
7781 Builder.resizeToMatch(V1, V2);
7782 int VF =
Mask.size();
7783 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
7784 VF = FTy->getNumElements();
7791 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
7794 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
7796 CombinedMask1[
I] =
Mask[
I];
7798 CombinedMask2[
I] =
Mask[
I] - VF;
7805 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
7806 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
7809 if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7810 if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7815 ExtMask1[
Idx] = SV1->getMaskValue(
I);
7818 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7820 ExtMask1, UseMask::SecondArg);
7825 ExtMask2[
Idx] = SV2->getMaskValue(
I);
7828 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7830 ExtMask2, UseMask::SecondArg);
7831 if (SV1->getOperand(0)->getType() ==
7832 SV2->getOperand(0)->getType() &&
7833 SV1->getOperand(0)->getType() != SV1->getType() &&
7836 Op1 = SV1->getOperand(0);
7837 Op2 = SV2->getOperand(0);
7839 SV1->getShuffleMask().end());
7840 int LocalVF = ShuffleMask1.size();
7841 if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
7842 LocalVF = FTy->getNumElements();
7843 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7844 CombinedMask1.swap(ShuffleMask1);
7846 SV2->getShuffleMask().end());
7847 LocalVF = ShuffleMask2.size();
7848 if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
7849 LocalVF = FTy->getNumElements();
7850 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7851 CombinedMask2.swap(ShuffleMask2);
7854 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
7855 Builder.resizeToMatch(Op1, Op2);
7856 VF = std::max(cast<VectorType>(Op1->
getType())
7858 .getKnownMinValue(),
7859 cast<VectorType>(Op2->
getType())
7861 .getKnownMinValue());
7862 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
7865 "Expected undefined mask element");
7866 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
7872 isa<ShuffleVectorInst>(Op1) &&
7873 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7875 return Builder.createIdentity(Op1);
7876 return Builder.createShuffleVector(
7880 if (isa<PoisonValue>(V1))
7881 return Builder.createPoison(
7882 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
7884 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
7885 assert(V1 &&
"Expected non-null value after looking through shuffles.");
7888 return Builder.createShuffleVector(V1, NewMask);
7889 return Builder.createIdentity(V1);
7905 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7908 Mask, NumSrcElts, NumSubElts,
Index)) {
7909 if (
Index + NumSubElts > NumSrcElts &&
7910 Index + NumSrcElts <=
static_cast<int>(Mask.size()))
7920static std::pair<InstructionCost, InstructionCost>
7931 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7941 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7945 for (
Value *V : Ptrs) {
7950 auto *
Ptr = dyn_cast<GetElementPtrInst>(V);
7955 if (!
Ptr || !
Ptr->hasOneUse())
7959 if (PtrsRetainedInVecCode.
size() == Ptrs.size()) {
7965 TTI::PointersChainInfo::getKnownStride(),
7975 [](
const Value *V) {
7976 auto *
Ptr = dyn_cast<GetElementPtrInst>(V);
7977 return Ptr && !
Ptr->hasAllConstantIndices();
7979 ? TTI::PointersChainInfo::getUnknownStride()
7980 : TTI::PointersChainInfo::getKnownStride();
7984 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
7986 auto *It =
find_if(Ptrs, IsaPred<GEPOperator>);
7987 if (It != Ptrs.
end())
7988 BaseGEP = cast<GEPOperator>(*It);
7993 BaseGEP->getPointerOperand(), Indices, VecTy,
7998 return std::make_pair(ScalarCost, VecCost);
8003 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8004 TreeEntry &E = *TE.get();
8005 switch (E.getOpcode()) {
8006 case Instruction::Load: {
8007 Type *ScalarTy = E.getMainOp()->getType();
8009 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
8016 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
8023 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
8024 false, CommonAlignment,
CostKind, BaseLI);
8025 if (StridedCost < OriginalVecCost)
8028 E.State = TreeEntry::StridedVectorize;
8032 case Instruction::Store: {
8034 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
8036 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
8043 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
8050 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
8051 false, CommonAlignment,
CostKind, BaseSI);
8052 if (StridedCost < OriginalVecCost)
8055 E.State = TreeEntry::StridedVectorize;
8072 bool IsFinalized =
false;
8075 Type *ScalarTy =
nullptr;
8086 bool SameNodesEstimated =
true;
8095 if (
auto *VTy = dyn_cast<VectorType>(Ty))
8111 const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
8112 unsigned MinVF = R.getMinVF(2 * Sz);
8113 if (VL.
size() > 2 &&
8114 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
8115 (InVectors.
empty() &&
8118 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
8119 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
8120 return S.getOpcode() == Instruction::Load &&
8123 !
all_of(Gathers, [&](
Value *V) {
return R.getTreeEntry(V); }) &&
8129 unsigned StartIdx = 0;
8130 unsigned VF = VL.
size() / 2;
8131 for (; VF >= MinVF; VF /= 2) {
8132 for (
unsigned Cnt = StartIdx,
End = VL.
size(); Cnt + VF <=
End;
8135 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
8137 if (SliceS.getOpcode() != Instruction::Load ||
8138 SliceS.isAltShuffle())
8146 CurrentOrder, PointerOps);
8156 CurrentOrder.
empty()) ||
8165 if (Cnt == StartIdx)
8174 if (StartIdx >= VL.
size())
8177 if (!VectorizedLoads.
empty())
8180 if (!VectorizedLoads.
empty()) {
8182 bool NeedInsertSubvectorAnalysis =
8183 !NumParts || (VL.
size() / VF) > NumParts;
8189 getBuildVectorCost(VL.
slice(
I, std::min(
End -
I, VF)), Root);
8196 for (
Value *V : VectorizedLoads) {
8197 auto *LI = cast<LoadInst>(V);
8204 for (
const std::pair<unsigned, LoadsState> &
P : VectorizedStarts) {
8205 auto *LI = cast<LoadInst>(VL[
P.first]);
8214 false, Alignment, CostKind, LI);
8218 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
8219 auto [ScalarGEPCost, VectorGEPCost] =
8221 Instruction::Load, CostKind, LI->
getType(), LoadTy);
8222 GatherCost += VectorGEPCost - ScalarGEPCost;
8224 for (
unsigned P : ScatterVectorized) {
8225 auto *LI0 = cast<LoadInst>(VL[
P]);
8227 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8229 Instruction::Load, LoadTy, LI0->getPointerOperand(),
8230 false, CommonAlignment, CostKind, LI0);
8234 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
8242 auto [ScalarGEPCost, VectorGEPCost] =
8244 CostKind, ScalarTy, VecTy);
8245 GatherCost += VectorGEPCost - ScalarGEPCost;
8246 if (!Order.
empty()) {
8250 VecTy, Mask, CostKind);
8253 GatherCost += R.getGatherCost(PointerOps,
true,
8254 PointerOps.
front()->getType());
8257 if (NeedInsertSubvectorAnalysis) {
8260 for (
unsigned I = VF, E = VL.
size();
I < E;
I += VF) {
8261 for (
unsigned Idx : seq<unsigned>(0, E))
8264 ShuffleMask, CostKind,
I, LoadTy);
8267 GatherCost -= ScalarsCost;
8269 GatherCost = std::min(BaseCost, GatherCost);
8270 }
else if (!Root &&
isSplat(VL)) {
8273 const auto *It =
find_if_not(VL, IsaPred<UndefValue>);
8274 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
8277 count(VL, *It) > 1 &&
8281 CostKind, std::distance(VL.
begin(), It),
8286 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8292 VecTy, ShuffleMask, CostKind,
8297 (
all_of(Gathers, IsaPred<UndefValue>)
8299 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
8307 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8308 unsigned NumParts) {
8309 assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
8311 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
8312 auto *EE = dyn_cast<ExtractElementInst>(V);
8315 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8318 return std::max(Sz, VecTy->getNumElements());
8320 unsigned NumSrcRegs =
8322 if (NumSrcRegs == 0)
8327 auto CheckPerRegistersShuffle =
8332 int FirstRegId = -1;
8333 for (
int &
I : Mask) {
8336 int RegId = (
I / NumElts) * NumParts + (
I % NumElts) / EltsPerVector;
8339 RegIndices.
insert(RegId);
8340 if (RegIndices.
size() > 2)
8341 return std::nullopt;
8342 if (RegIndices.
size() == 2)
8344 I = (
I % NumElts) % EltsPerVector +
8345 (RegId == FirstRegId ? 0 : EltsPerVector);
8354 for (
unsigned Part = 0; Part < NumParts; ++Part) {
8355 if (!ShuffleKinds[Part])
8358 Mask.slice(Part * EltsPerVector,
8359 (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
8360 ? Mask.size() % EltsPerVector
8364 std::optional<TTI::ShuffleKind> RegShuffleKind =
8365 CheckPerRegistersShuffle(SubMask);
8366 if (!RegShuffleKind) {
8385 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8392 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
8394 unsigned SliceSize) {
8395 if (SameNodesEstimated) {
8401 if ((InVectors.
size() == 2 &&
8402 InVectors.
front().get<
const TreeEntry *>() == &E1 &&
8403 InVectors.
back().get<
const TreeEntry *>() == E2) ||
8404 (!E2 && InVectors.
front().get<
const TreeEntry *>() == &E1)) {
8407 "Expected all poisoned elements.");
8410 copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
8415 Cost += createShuffle(InVectors.
front(),
8416 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
8418 transformMaskAfterShuffle(CommonMask, CommonMask);
8420 SameNodesEstimated =
false;
8421 if (!E2 && InVectors.
size() == 1) {
8422 unsigned VF = E1.getVectorFactor();
8425 cast<FixedVectorType>(V1->
getType())->getNumElements());
8427 const auto *E = InVectors.
front().get<
const TreeEntry *>();
8428 VF = std::max(VF, E->getVectorFactor());
8430 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8432 CommonMask[
Idx] = Mask[
Idx] + VF;
8433 Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
8434 transformMaskAfterShuffle(CommonMask, CommonMask);
8436 Cost += createShuffle(&E1, E2, Mask);
8437 transformMaskAfterShuffle(CommonMask, Mask);
8441 class ShuffleCostBuilder {
8444 static bool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
8446 return Mask.empty() ||
8447 (VF == Mask.size() &&
8455 ~ShuffleCostBuilder() =
default;
8460 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
8461 if (isEmptyOrIdentity(Mask, VF))
8464 cast<VectorType>(V1->
getType()), Mask);
8469 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
8470 if (isEmptyOrIdentity(Mask, VF))
8473 cast<VectorType>(V1->
getType()), Mask);
8479 void resizeToMatch(
Value *&,
Value *&)
const {}
8489 ShuffleCostBuilder Builder(
TTI);
8492 unsigned CommonVF = Mask.size();
8494 auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
8496 if (E.State == TreeEntry::NeedToGather &&
allConstant(E.Scalars))
8498 Type *EScalarTy = E.Scalars.front()->getType();
8499 bool IsSigned =
true;
8500 if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
8502 IsSigned = It->second.second;
8504 if (EScalarTy != ScalarTy) {
8505 unsigned CastOpcode = Instruction::Trunc;
8506 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8507 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8509 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8518 if (isa<Constant>(V))
8520 auto *VecTy = cast<VectorType>(V->getType());
8522 if (EScalarTy != ScalarTy) {
8524 unsigned CastOpcode = Instruction::Trunc;
8525 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8526 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8528 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8535 if (!V1 && !V2 && !P2.
isNull()) {
8537 const TreeEntry *E = P1.
get<
const TreeEntry *>();
8538 unsigned VF = E->getVectorFactor();
8539 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
8540 CommonVF = std::max(VF, E2->getVectorFactor());
8543 return Idx < 2 * static_cast<int>(CommonVF);
8545 "All elements in mask must be less than 2 * CommonVF.");
8546 if (E->Scalars.size() == E2->Scalars.size()) {
8550 for (
int &
Idx : CommonMask) {
8553 if (
Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
8555 else if (
Idx >=
static_cast<int>(CommonVF))
8556 Idx = (E2Mask.
empty() ?
Idx - CommonVF : E2Mask[
Idx - CommonVF]) +
8560 CommonVF = E->Scalars.size();
8561 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8562 GetNodeMinBWAffectedCost(*E2, CommonVF);
8564 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8565 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8569 }
else if (!V1 && P2.
isNull()) {
8571 const TreeEntry *E = P1.
get<
const TreeEntry *>();
8572 unsigned VF = E->getVectorFactor();
8576 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
8577 "All elements in mask must be less than CommonVF.");
8578 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8580 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
8581 for (
int &
Idx : CommonMask) {
8585 CommonVF = E->Scalars.size();
8587 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8590 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8591 CommonVF == CommonMask.
size() &&
8593 [](
const auto &&
P) {
8595 static_cast<unsigned>(
P.value()) !=
P.index();
8603 }
else if (V1 && P2.
isNull()) {
8605 ExtraCost += GetValueMinBWAffectedCost(V1);
8606 CommonVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8609 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
8610 "All elements in mask must be less than CommonVF.");
8611 }
else if (V1 && !V2) {
8613 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8614 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
8615 CommonVF = std::max(VF, E2->getVectorFactor());
8618 return Idx < 2 * static_cast<int>(CommonVF);
8620 "All elements in mask must be less than 2 * CommonVF.");
8621 if (E2->Scalars.size() == VF && VF != CommonVF) {
8623 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
8624 for (
int &
Idx : CommonMask) {
8627 if (
Idx >=
static_cast<int>(CommonVF))
8628 Idx = E2Mask[
Idx - CommonVF] + VF;
8632 ExtraCost += GetValueMinBWAffectedCost(V1);
8634 ExtraCost += GetNodeMinBWAffectedCost(
8635 *E2, std::min(CommonVF, E2->getVectorFactor()));
8637 }
else if (!V1 && V2) {
8639 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8640 const TreeEntry *E1 = P1.
get<
const TreeEntry *>();
8641 CommonVF = std::max(VF, E1->getVectorFactor());
8644 return Idx < 2 * static_cast<int>(CommonVF);
8646 "All elements in mask must be less than 2 * CommonVF.");
8647 if (E1->Scalars.size() == VF && VF != CommonVF) {
8649 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
8650 for (
int &
Idx : CommonMask) {
8653 if (
Idx >=
static_cast<int>(CommonVF))
8654 Idx = E1Mask[
Idx - CommonVF] + VF;
8660 ExtraCost += GetNodeMinBWAffectedCost(
8661 *E1, std::min(CommonVF, E1->getVectorFactor()));
8663 ExtraCost += GetValueMinBWAffectedCost(V2);
8666 assert(V1 && V2 &&
"Expected both vectors.");
8667 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8669 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8672 return Idx < 2 * static_cast<int>(CommonVF);
8674 "All elements in mask must be less than 2 * CommonVF.");
8676 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8677 if (V1->
getType() != V2->getType()) {
8681 if (cast<VectorType>(V1->
getType())->getElementType() != ScalarTy)
8683 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
8689 if (InVectors.
size() == 2)
8691 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8692 V1, V2, CommonMask, Builder);
8699 : ScalarTy(ScalarTy),
TTI(
TTI),
8700 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8701 CheckedExtracts(CheckedExtracts) {}
8703 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8704 unsigned NumParts,
bool &UseVecBaseAsInput) {
8705 UseVecBaseAsInput =
false;
8708 Value *VecBase =
nullptr;
8711 if (NumParts == VL.
size())
8715 bool PrevNodeFound =
any_of(
8717 [&](
const std::unique_ptr<TreeEntry> &TE) {
8718 return ((!TE->isAltShuffle() &&
8719 TE->getOpcode() == Instruction::ExtractElement) ||
8720 TE->State == TreeEntry::NeedToGather) &&
8721 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8722 return VL.size() > Data.index() &&
8723 (Mask[Data.index()] == PoisonMaskElem ||
8724 isa<UndefValue>(VL[Data.index()]) ||
8725 Data.value() == VL[Data.index()]);
8729 unsigned SliceSize = VL.
size() / NumParts;
8730 for (
unsigned Part = 0; Part < NumParts; ++Part) {
8731 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
8732 for (
auto [
I, V] :
enumerate(VL.
slice(Part * SliceSize, SliceSize))) {
8734 if (isa<UndefValue>(V) ||
8743 auto *EE = cast<ExtractElementInst>(V);
8744 VecBase = EE->getVectorOperand();
8745 UniqueBases.
insert(VecBase);
8746 const TreeEntry *VE = R.getTreeEntry(V);
8747 if (!CheckedExtracts.
insert(V).second ||
8748 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8751 return isa<GetElementPtrInst>(U) &&
8752 !R.areAllUsersVectorized(cast<Instruction>(U),
8760 unsigned Idx = *EEIdx;
8762 if (EE->hasOneUse() || !PrevNodeFound) {
8764 if (isa<SExtInst, ZExtInst>(Ext) &&
8765 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8770 EE->getVectorOperandType(),
Idx);
8773 Ext->getOpcode(), Ext->getType(), EE->getType(),
8789 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8792 transformMaskAfterShuffle(CommonMask, CommonMask);
8793 SameNodesEstimated =
false;
8794 if (NumParts != 1 && UniqueBases.
size() != 1) {
8795 UseVecBaseAsInput =
true;
8803 std::optional<InstructionCost>
8807 return std::nullopt;
8813 return Idx < static_cast<int>(E1.getVectorFactor());
8815 "Expected single vector shuffle mask.");
8819 if (InVectors.
empty()) {
8820 CommonMask.
assign(Mask.begin(), Mask.end());
8821 InVectors.
assign({&E1, &E2});
8824 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
8827 if (NumParts == 0 || NumParts >= Mask.size())
8829 unsigned SliceSize = Mask.size() / NumParts;
8832 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8833 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8836 if (InVectors.
empty()) {
8837 CommonMask.
assign(Mask.begin(), Mask.end());
8838 InVectors.
assign(1, &E1);
8841 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
8844 if (NumParts == 0 || NumParts >= Mask.size())
8846 unsigned SliceSize = Mask.size() / NumParts;
8849 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8850 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
8851 if (!SameNodesEstimated && InVectors.
size() == 1)
8864 cast<ExtractElementInst>(InVectors.
front()
8865 .get<
const TreeEntry *>()
8866 ->Scalars[
P.index()]);
8867 return EI->getVectorOperand() == V1 ||
8868 EI->getVectorOperand() == V2;
8870 "Expected extractelement vectors.");
8874 if (InVectors.
empty()) {
8876 "Expected empty input mask/vectors.");
8877 CommonMask.
assign(Mask.begin(), Mask.end());
8884 InVectors.
front().is<
const TreeEntry *>() && !CommonMask.
empty() &&
8888 .get<const TreeEntry *>()
8889 ->Scalars[
P.index()];
8891 return P.value() == Mask[
P.index()] ||
8892 isa<UndefValue>(Scalar);
8893 if (isa<Constant>(V1))
8895 auto *EI = cast<ExtractElementInst>(Scalar);
8896 return EI->getVectorOperand() == V1;
8898 "Expected only tree entry for extractelement vectors.");
8902 "Expected only tree entries from extracts/reused buildvectors.");
8903 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8904 if (InVectors.
size() == 2) {
8905 Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
8906 transformMaskAfterShuffle(CommonMask, CommonMask);
8907 VF = std::max<unsigned>(VF, CommonMask.
size());
8908 }
else if (
const auto *InTE =
8909 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
8910 VF = std::max(VF, InTE->getVectorFactor());
8914 ->getNumElements());
8917 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8919 CommonMask[
Idx] = Mask[
Idx] + VF;
8922 Value *Root =
nullptr) {
8923 Cost += getBuildVectorCost(VL, Root);
8927 unsigned VF = VL.
size();
8929 VF = std::min(VF, MaskVF);
8931 if (isa<UndefValue>(V)) {
8941 cast<FixedVectorType>(Root->
getType())->getNumElements()),
8942 getAllOnesValue(*R.DL, ScalarTy));
8952 if (InVectors.
size() == 2)
8953 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
8955 Cost += createShuffle(Vec,
nullptr, CommonMask);
8956 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8960 "Expected vector length for the final value before action.");
8962 Action(V, CommonMask);
8963 InVectors.
front() = V;
8966 if (CommonMask.
empty()) {
8967 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
8971 createShuffle(InVectors.
front(),
8972 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
8978 "Shuffle construction must be finalized.");
8982const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
8983 unsigned Idx)
const {
8985 if (
const TreeEntry *TE = getTreeEntry(
Op)) {
8986 if (
find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
8987 return EI.EdgeIdx == Idx && EI.UserTE == E;
8988 }) != TE->UserTreeIndices.end())
8990 auto MIt = MultiNodeScalars.
find(
Op);
8991 if (MIt != MultiNodeScalars.
end()) {
8992 for (
const TreeEntry *TE : MIt->second) {
8993 if (
find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
8994 return EI.EdgeIdx == Idx && EI.UserTE == E;
8995 }) != TE->UserTreeIndices.end())
9001 find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9002 return TE->State == TreeEntry::NeedToGather &&
9003 find_if(
TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
9004 return EI.EdgeIdx == Idx && EI.UserTE == E;
9005 }) !=
TE->UserTreeIndices.end();
9007 assert(It != VectorizableTree.end() &&
"Expected vectorizable entry.");
9012 if (
TE.State == TreeEntry::ScatterVectorize ||
9013 TE.State == TreeEntry::StridedVectorize)
9015 if (
TE.State == TreeEntry::Vectorize &&
TE.getOpcode() == Instruction::Load &&
9016 !
TE.isAltShuffle()) {
9017 if (
TE.ReorderIndices.empty())
9056 Type *ScalarTy = VL[0]->getType();
9057 if (E->State != TreeEntry::NeedToGather) {
9058 if (
auto *SI = dyn_cast<StoreInst>(VL[0]))
9059 ScalarTy =
SI->getValueOperand()->getType();
9060 else if (
auto *CI = dyn_cast<CmpInst>(VL[0]))
9062 else if (
auto *IE = dyn_cast<InsertElementInst>(VL[0]))
9063 ScalarTy =
IE->getOperand(1)->getType();
9072 auto It = MinBWs.
find(E);
9073 Type *OrigScalarTy = ScalarTy;
9074 if (It != MinBWs.
end()) {
9078 unsigned EntryVF = E->getVectorFactor();
9081 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
9082 if (E->State == TreeEntry::NeedToGather) {
9085 if (isa<InsertElementInst>(VL[0]))
9087 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
9088 E, ScalarTy, *
TTI, VectorizedVals, *
this, CheckedExtracts);
9093 if (!E->ReorderIndices.empty() &&
9094 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
9096 if (E->getOpcode() == Instruction::Store) {
9098 NewMask.
resize(E->ReorderIndices.size());
9099 copy(E->ReorderIndices, NewMask.
begin());
9105 if (NeedToShuffleReuses)
9106 ::addMask(Mask, E->ReuseShuffleIndices);
9110 assert((E->State == TreeEntry::Vectorize ||
9111 E->State == TreeEntry::ScatterVectorize ||
9112 E->State == TreeEntry::StridedVectorize) &&
9116 (E->getOpcode() == Instruction::GetElementPtr &&
9117 E->getMainOp()->getType()->isPointerTy())) &&
9120 unsigned ShuffleOrOp =
9121 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
9123 const unsigned Sz = UniqueValues.
size();
9125 for (
unsigned I = 0;
I < Sz; ++
I) {
9126 if (getTreeEntry(UniqueValues[
I]) == E)
9130 auto GetCastContextHint = [&](
Value *
V) {
9131 if (
const TreeEntry *OpTE = getTreeEntry(V))
9132 return getCastContextHint(*OpTE);
9133 InstructionsState SrcState =
getSameOpcode(E->getOperand(0), *TLI);
9134 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
9143 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
9147 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
9149 for (
unsigned I = 0;
I < Sz; ++
I) {
9150 if (UsedScalars.test(
I))
9152 ScalarCost += ScalarEltCost(
I);
9160 const EdgeInfo &EI = E->UserTreeIndices.front();
9161 if ((EI.UserTE->getOpcode() != Instruction::Select ||
9163 It != MinBWs.
end()) {
9164 auto UserBWIt = MinBWs.
find(EI.UserTE);
9165 Type *UserScalarTy =
9166 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
9167 if (UserBWIt != MinBWs.
end())
9169 UserBWIt->second.first);
9170 if (ScalarTy != UserScalarTy) {
9171 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
9172 unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
9177 VecOpcode = Instruction::Trunc;
9180 It->second.second ? Instruction::SExt : Instruction::ZExt;
9187 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
9188 ScalarCost,
"Calculated costs for Tree"));
9189 return VecCost - ScalarCost;
9194 assert((E->State == TreeEntry::Vectorize ||
9195 E->State == TreeEntry::StridedVectorize) &&
9196 "Entry state expected to be Vectorize or StridedVectorize here.");
9200 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
9201 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9202 "Calculated GEPs cost for Tree"));
9204 return VecCost - ScalarCost;
9207 switch (ShuffleOrOp) {
9208 case Instruction::PHI: {
9212 for (
Value *V : UniqueValues) {
9213 auto *
PHI = dyn_cast<PHINode>(V);
9218 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
9222 if (
const TreeEntry *OpTE = getTreeEntry(
Operands.front()))
9224 if (!OpTE->ReuseShuffleIndices.empty())
9225 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9226 OpTE->Scalars.size());
9229 return CommonCost - ScalarCost;
9231 case Instruction::ExtractValue:
9232 case Instruction::ExtractElement: {
9233 auto GetScalarCost = [&](
unsigned Idx) {
9234 auto *
I = cast<Instruction>(UniqueValues[
Idx]);
9236 if (ShuffleOrOp == Instruction::ExtractElement) {
9237 auto *EE = cast<ExtractElementInst>(
I);
9238 SrcVecTy = EE->getVectorOperandType();
9240 auto *EV = cast<ExtractValueInst>(
I);
9241 Type *AggregateTy = EV->getAggregateOperand()->getType();
9243 if (
auto *ATy = dyn_cast<ArrayType>(AggregateTy))
9244 NumElts = ATy->getNumElements();
9249 if (
I->hasOneUse()) {
9251 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9252 all_of(
Ext->users(), IsaPred<GetElementPtrInst>)) {
9259 Ext->getOpcode(),
Ext->getType(),
I->getType(),
9267 auto GetVectorCost = [](
InstructionCost CommonCost) {
return CommonCost; };
9268 return GetCostDiff(GetScalarCost, GetVectorCost);
9270 case Instruction::InsertElement: {
9271 assert(E->ReuseShuffleIndices.empty() &&
9272 "Unique insertelements only are expected.");
9273 auto *SrcVecTy = cast<FixedVectorType>(VL0->
getType());
9274 unsigned const NumElts = SrcVecTy->getNumElements();
9275 unsigned const NumScalars = VL.
size();
9281 unsigned OffsetEnd = OffsetBeg;
9282 InsertMask[OffsetBeg] = 0;
9285 if (OffsetBeg >
Idx)
9287 else if (OffsetEnd <
Idx)
9289 InsertMask[
Idx] =
I + 1;
9293 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9294 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9296 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9297 unsigned InsertVecSz = std::min<unsigned>(
9299 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9300 bool IsWholeSubvector =
9301 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9305 if (OffsetBeg + InsertVecSz > VecSz) {
9308 InsertVecSz = VecSz;
9314 if (!E->ReorderIndices.empty()) {
9319 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
9321 bool IsIdentity =
true;
9323 Mask.swap(PrevMask);
9324 for (
unsigned I = 0;
I < NumScalars; ++
I) {
9326 DemandedElts.
setBit(InsertIdx);
9327 IsIdentity &= InsertIdx - OffsetBeg ==
I;
9328 Mask[InsertIdx - OffsetBeg] =
I;
9330 assert(
Offset < NumElts &&
"Failed to find vector index offset");
9345 auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
9346 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9354 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9355 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
9356 if (InsertVecSz != VecSz) {
9368 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
9377 case Instruction::ZExt:
9378 case Instruction::SExt:
9379 case Instruction::FPToUI:
9380 case Instruction::FPToSI:
9381 case Instruction::FPExt:
9382 case Instruction::PtrToInt:
9383 case Instruction::IntToPtr:
9384 case Instruction::SIToFP:
9385 case Instruction::UIToFP:
9386 case Instruction::Trunc:
9387 case Instruction::FPTrunc:
9388 case Instruction::BitCast: {
9389 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
9392 unsigned Opcode = ShuffleOrOp;
9393 unsigned VecOpcode = Opcode;
9395 (SrcIt != MinBWs.
end() || It != MinBWs.
end())) {
9397 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
9398 if (SrcIt != MinBWs.
end()) {
9399 SrcBWSz = SrcIt->second.first;
9403 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
9404 if (BWSz == SrcBWSz) {
9405 VecOpcode = Instruction::BitCast;
9406 }
else if (BWSz < SrcBWSz) {
9407 VecOpcode = Instruction::Trunc;
9408 }
else if (It != MinBWs.
end()) {
9409 assert(BWSz > SrcBWSz &&
"Invalid cast!");
9410 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9411 }
else if (SrcIt != MinBWs.
end()) {
9412 assert(BWSz > SrcBWSz &&
"Invalid cast!");
9414 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9416 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
9417 !SrcIt->second.second) {
9418 VecOpcode = Instruction::UIToFP;
9421 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9429 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9431 auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
9435 VecOpcode == Opcode ? VI :
nullptr);
9437 return GetCostDiff(GetScalarCost, GetVectorCost);
9439 case Instruction::FCmp:
9440 case Instruction::ICmp:
9441 case Instruction::Select: {
9445 match(VL0, MatchCmp))
9451 auto GetScalarCost = [&](
unsigned Idx) {
9452 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9458 !
match(VI, MatchCmp)) ||
9459 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9465 Builder.getInt1Ty(), CurrentPred,
CostKind,
9472 E->getOpcode(), VecTy, MaskTy, VecPred,
CostKind, VL0);
9484 if (IntrinsicAndUse.second)
9487 VecCost = std::min(VecCost, IntrinsicCost);
9489 return VecCost + CommonCost;
9491 return GetCostDiff(GetScalarCost, GetVectorCost);
9493 case Instruction::FNeg:
9494 case Instruction::Add:
9495 case Instruction::FAdd:
9496 case Instruction::Sub:
9497 case Instruction::FSub:
9498 case Instruction::Mul:
9499 case Instruction::FMul:
9500 case Instruction::UDiv:
9501 case Instruction::SDiv:
9502 case Instruction::FDiv:
9503 case Instruction::URem:
9504 case Instruction::SRem:
9505 case Instruction::FRem:
9506 case Instruction::Shl:
9507 case Instruction::LShr:
9508 case Instruction::AShr:
9509 case Instruction::And:
9510 case Instruction::Or:
9511 case Instruction::Xor: {
9512 auto GetScalarCost = [&](
unsigned Idx) {
9513 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9514 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9523 if (ShuffleOrOp == Instruction::And && It != MinBWs.
end()) {
9524 for (
unsigned I : seq<unsigned>(0, E->getNumOperands())) {
9527 auto *CI = dyn_cast<ConstantInt>(
Op);
9528 return CI && CI->getValue().countr_one() >= It->second.first;
9533 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9537 Op2Info, std::nullopt,
nullptr, TLI) +
9540 return GetCostDiff(GetScalarCost, GetVectorCost);
9542 case Instruction::GetElementPtr: {
9543 return CommonCost + GetGEPCostDiff(VL, VL0);
9545 case Instruction::Load: {
9546 auto GetScalarCost = [&](
unsigned Idx) {
9547 auto *
VI = cast<LoadInst>(UniqueValues[
Idx]);
9549 VI->getAlign(),
VI->getPointerAddressSpace(),
9552 auto *LI0 = cast<LoadInst>(VL0);
9555 if (E->State == TreeEntry::Vectorize) {
9557 Instruction::Load, VecTy, LI0->getAlign(),
9559 }
else if (E->State == TreeEntry::StridedVectorize) {
9560 Align CommonAlignment =
9561 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9563 Instruction::Load, VecTy, LI0->getPointerOperand(),
9566 assert(E->State == TreeEntry::ScatterVectorize &&
"Unknown EntryState");
9567 Align CommonAlignment =
9568 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9570 Instruction::Load, VecTy, LI0->getPointerOperand(),
9573 return VecLdCost + CommonCost;
9579 if (E->State == TreeEntry::ScatterVectorize)
9585 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
9586 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9588 case Instruction::Store: {
9589 bool IsReorder = !E->ReorderIndices.empty();
9590 auto GetScalarCost = [=](
unsigned Idx) {
9591 auto *
VI = cast<StoreInst>(VL[
Idx]);
9594 VI->getAlign(),
VI->getPointerAddressSpace(),
9598 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9602 if (E->State == TreeEntry::StridedVectorize) {
9603 Align CommonAlignment =
9604 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
9606 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9609 assert(E->State == TreeEntry::Vectorize &&
9610 "Expected either strided or consecutive stores.");
9613 Instruction::Store, VecTy, BaseSI->getAlign(),
9614 BaseSI->getPointerAddressSpace(),
CostKind, OpInfo);
9616 return VecStCost + CommonCost;
9620 unsigned Idx = IsReorder ? E->ReorderIndices[
I] :
I;
9621 PointerOps[
Idx] = cast<StoreInst>(V)->getPointerOperand();
9624 return GetCostDiff(GetScalarCost, GetVectorCost) +
9625 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9627 case Instruction::Call: {
9628 auto GetScalarCost = [&](
unsigned Idx) {
9629 auto *CI = cast<CallInst>(UniqueValues[
Idx]);
9640 auto *CI = cast<CallInst>(VL0);
9644 It != MinBWs.
end() ? It->second.first : 0);
9646 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9648 return GetCostDiff(GetScalarCost, GetVectorCost);
9650 case Instruction::ShuffleVector: {
9651 assert(E->isAltShuffle() &&
9656 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9657 "Invalid Shuffle Vector Operand");
9660 auto TryFindNodeWithEqualOperands = [=]() {
9661 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9664 if (
TE->isAltShuffle() &&
9665 ((
TE->getOpcode() == E->getOpcode() &&
9666 TE->getAltOpcode() == E->getAltOpcode()) ||
9667 (
TE->getOpcode() == E->getAltOpcode() &&
9668 TE->getAltOpcode() == E->getOpcode())) &&
9669 TE->hasEqualOperands(*E))
9674 auto GetScalarCost = [&](
unsigned Idx) {
9675 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9676 assert(E->isOpcodeOrAlt(VI) &&
"Unexpected main/alternate opcode");
9686 if (TryFindNodeWithEqualOperands()) {
9688 dbgs() <<
"SLP: diamond match for alternate node found.\n";
9695 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy,
CostKind);
9697 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
9698 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9700 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9701 CI0->getPredicate(),
CostKind, VL0);
9702 VecCost += TTIRef.getCmpSelInstrCost(
9703 E->getOpcode(), VecTy, MaskTy,
9704 cast<CmpInst>(E->getAltOp())->getPredicate(),
CostKind,
9707 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9710 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
9711 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
9713 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9714 if (SrcIt != MinBWs.
end()) {
9715 SrcBWSz = SrcIt->second.first;
9719 if (BWSz <= SrcBWSz) {
9722 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9726 <<
"SLP: alternate extension, which should be truncated.\n";
9732 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9735 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9739 E->buildAltOpShuffleMask(
9741 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
9742 return I->getOpcode() == E->getAltOpcode();
9751 unsigned Opcode0 = E->getOpcode();
9752 unsigned Opcode1 = E->getAltOpcode();
9755 for (
unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
9756 if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
9757 OpcodeMask.set(Lane);
9760 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9762 VecTy, Opcode0, Opcode1, OpcodeMask,
CostKind);
9763 return AltVecCost < VecCost ? AltVecCost : VecCost;
9768 return GetCostDiff(GetScalarCost, GetVectorCost);
9775bool BoUpSLP::isFullyVectorizableTinyTree(
bool ForReduction)
const {
9777 << VectorizableTree.size() <<
" is fully vectorizable .\n");
9779 auto &&AreVectorizableGathers = [
this](
const TreeEntry *
TE,
unsigned Limit) {
9781 return TE->State == TreeEntry::NeedToGather &&
9783 [
this](
Value *V) { return EphValues.contains(V); }) &&
9785 TE->Scalars.size() < Limit ||
9786 ((
TE->getOpcode() == Instruction::ExtractElement ||
9787 all_of(
TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
9789 (
TE->State == TreeEntry::NeedToGather &&
9790 TE->getOpcode() == Instruction::Load && !
TE->isAltShuffle()));
9794 if (VectorizableTree.size() == 1 &&
9795 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9797 AreVectorizableGathers(VectorizableTree[0].
get(),
9798 VectorizableTree[0]->Scalars.size()) &&
9799 VectorizableTree[0]->getVectorFactor() > 2)))
9802 if (VectorizableTree.size() != 2)
9810 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9811 AreVectorizableGathers(VectorizableTree[1].
get(),
9812 VectorizableTree[0]->Scalars.size()))
9816 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9817 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9818 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9819 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9827 bool MustMatchOrInst) {
9831 Value *ZextLoad = Root;
9832 const APInt *ShAmtC;
9833 bool FoundOr =
false;
9834 while (!isa<ConstantExpr>(ZextLoad) &&
9837 ShAmtC->
urem(8) == 0))) {
9838 auto *BinOp = cast<BinaryOperator>(ZextLoad);
9839 ZextLoad = BinOp->getOperand(0);
9840 if (BinOp->getOpcode() == Instruction::Or)
9845 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9852 Type *SrcTy = Load->getType();
9859 LLVM_DEBUG(
dbgs() <<
"SLP: Assume load combining for tree starting at "
9860 << *(cast<Instruction>(Root)) <<
"\n");
9869 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9870 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9878 unsigned NumElts = Stores.
size();
9879 for (
Value *Scalar : Stores) {
9890 if (VectorizableTree.size() == 2 &&
9891 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9892 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9893 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9894 !(
isSplat(VectorizableTree[1]->Scalars) ||
9902 constexpr int Limit = 4;
9904 !VectorizableTree.empty() &&
9905 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9906 return (TE->State == TreeEntry::NeedToGather &&
9907 TE->getOpcode() != Instruction::ExtractElement &&
9908 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
9909 TE->getOpcode() == Instruction::PHI;
9920 if (isFullyVectorizableTinyTree(ForReduction))
9925 bool IsAllowedSingleBVNode =
9926 VectorizableTree.size() > 1 ||
9927 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9928 !VectorizableTree.front()->isAltShuffle() &&
9929 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9930 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9932 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9933 return TE->State == TreeEntry::NeedToGather &&
9935 return isa<ExtractElementInst, UndefValue>(V) ||
9936 (IsAllowedSingleBVNode &&
9937 !V->hasNUsesOrMore(UsesLimit) &&
9938 any_of(V->users(), IsaPred<InsertElementInst>));
9943 assert(VectorizableTree.empty()
9944 ? ExternalUses.empty()
9945 :
true &&
"We shouldn't have any external users");
9957 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9970 for (
const auto &TEPtr : VectorizableTree) {
9971 if (TEPtr->State != TreeEntry::Vectorize)
9973 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
9979 auto *NodeA = DT->
getNode(
A->getParent());
9980 auto *NodeB = DT->
getNode(
B->getParent());
9981 assert(NodeA &&
"Should only process reachable instructions");
9982 assert(NodeB &&
"Should only process reachable instructions");
9983 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9984 "Different nodes should have different DFS numbers");
9986 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9987 return B->comesBefore(
A);
9997 LiveValues.
erase(PrevInst);
9998 for (
auto &J : PrevInst->
operands()) {
9999 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
10000 LiveValues.
insert(cast<Instruction>(&*J));
10004 dbgs() <<
"SLP: #LV: " << LiveValues.
size();
10005 for (
auto *
X : LiveValues)
10006 dbgs() <<
" " <<
X->getName();
10007 dbgs() <<
", Looking at ";
10012 unsigned NumCalls = 0;
10016 while (InstIt != PrevInstIt) {
10018 PrevInstIt = Inst->getParent()->rbegin();
10023 if (
auto *II = dyn_cast<IntrinsicInst>(
I)) {
10024 if (II->isAssumeLikeIntrinsic())
10028 for (
auto &ArgOp : II->args())
10030 if (
auto *FPMO = dyn_cast<FPMathOperator>(II))
10031 FMF = FPMO->getFastMathFlags();
10038 if (IntrCost < CallCost)
10045 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10046 &*PrevInstIt != PrevInst)
10054 for (
auto *II : LiveValues) {
10055 auto *ScalarTy = II->getType();
10056 if (
auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
10057 ScalarTy = VectorTy->getElementType();
10075 const auto *I1 = IE1;
10076 const auto *I2 = IE2;
10088 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10090 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
10091 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
10093 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
10094 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10101struct ValueSelect {
10102 template <
typename U>
10103 static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
10106 template <
typename U>
10107 static std::enable_if_t<!std::is_same_v<Value *, U>,
U>
get(
Value *) {
10125template <
typename T>
10131 assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
10133 auto VMIt = std::next(ShuffleMask.begin());
10136 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
10138 if (!IsBaseUndef.
all()) {
10140 std::pair<T *, bool> Res =
10141 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
10143 for (
unsigned Idx = 0, VF = Mask.size();
Idx < VF; ++
Idx) {
10147 Mask[
Idx] = (Res.second ?
Idx : Mask[
Idx]) + VF;
10149 auto *V = ValueSelect::get<T *>(
Base);
10151 assert((!V || GetVF(V) == Mask.size()) &&
10152 "Expected base vector of VF number of elements.");
10153 Prev = Action(Mask, {
nullptr, Res.first});
10154 }
else if (ShuffleMask.size() == 1) {
10157 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10163 Prev = Action(Mask, {ShuffleMask.begin()->first});
10167 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10168 unsigned Vec2VF = GetVF(VMIt->first);
10169 if (Vec1VF == Vec2VF) {
10173 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
10176 Mask[
I] = SecMask[
I] + Vec1VF;
10179 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10182 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10184 std::pair<T *, bool> Res2 =
10185 ResizeAction(VMIt->first, VMIt->second,
false);
10187 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
10194 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
10197 Prev = Action(Mask, {Res1.first, Res2.first});
10199 VMIt = std::next(VMIt);
10201 bool IsBaseNotUndef = !IsBaseUndef.
all();
10202 (void)IsBaseNotUndef;
10204 for (
auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10206 std::pair<T *, bool> Res =
10207 ResizeAction(VMIt->first, VMIt->second,
false);
10209 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
10212 "Multiple uses of scalars.");
10213 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
10218 Prev = Action(Mask, {Prev, Res.first});
10226 << VectorizableTree.size() <<
".\n");
10228 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10231 for (
unsigned I = 0, E = VectorizableTree.size();
I < E; ++
I) {
10232 TreeEntry &TE = *VectorizableTree[
I];
10233 if (TE.State == TreeEntry::NeedToGather) {
10234 if (
const TreeEntry *E = getTreeEntry(TE.getMainOp());
10235 E && E->getVectorFactor() == TE.getVectorFactor() &&
10236 E->isSame(TE.Scalars)) {
10241 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10250 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10260 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10261 for (ExternalUser &EU : ExternalUses) {
10263 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10264 !ExtractCostCalculated.
insert(EU.Scalar).second)
10270 if (EphValues.
count(EU.User))
10274 if (isa<FixedVectorType>(EU.Scalar->getType()))
10279 if (
auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
10280 if (
auto *FTy = dyn_cast<FixedVectorType>(VU->
getType())) {
10281 if (!UsedInserts.
insert(VU).second)
10285 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10288 [
this, VU](
const std::pair<Value *, const TreeEntry *> &Pair) {
10290 VU, cast<InsertElementInst>(Pair.first),
10292 Value *Op0 = II->getOperand(0);
10293 if (getTreeEntry(II) && !getTreeEntry(Op0))
10299 if (It == FirstUsers.
end()) {
10306 while (
auto *IEBase = dyn_cast<InsertElementInst>(
Base)) {
10307 if (IEBase != EU.User &&
10308 (!IEBase->hasOneUse() ||
10312 if (
const TreeEntry *E = getTreeEntry(IEBase)) {
10315 IEBase = cast<InsertElementInst>(
Base);
10318 "InsertElementInstruction used already.");
10320 Base = IEBase->getOperand(0);
10321 }
while (E == getTreeEntry(
Base));
10324 Base = cast<InsertElementInst>(
Base)->getOperand(0);
10328 VecId = FirstUsers.
size() - 1;
10329 auto It = MinBWs.
find(ScalarTE);
10330 if (It != MinBWs.
end() &&
10332 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
10334 unsigned BWSz = It->second.first;
10335 unsigned DstBWSz =
DL->getTypeSizeInBits(FTy->getElementType());
10336 unsigned VecOpcode;
10337 if (DstBWSz < BWSz)
10338 VecOpcode = Instruction::Trunc;
10341 It->second.second ? Instruction::SExt : Instruction::ZExt;
10347 FTy->getNumElements()),
10350 <<
" for extending externally used vector with "
10351 "non-equal minimum bitwidth.\n");
10357 VecId = std::distance(FirstUsers.
begin(), It);
10359 int InIdx = *InsertIdx;
10363 Mask[InIdx] = EU.Lane;
10364 DemandedElts[VecId].setBit(InIdx);
10372 if (
auto *
GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10373 if (!ValueToExtUses) {
10374 ValueToExtUses.emplace();
10376 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
10382 if (!getTreeEntry(V))
10384 auto It = ValueToExtUses->find(V);
10385 if (It != ValueToExtUses->end()) {
10387 ExternalUses[It->second].User = nullptr;
10392 if (CanBeUsedAsGEP) {
10394 ExternalUsesAsGEPs.
insert(EU.Scalar);
10403 auto It = MinBWs.
find(getTreeEntry(EU.Scalar));
10404 if (It != MinBWs.
end()) {
10407 It->second.second ? Instruction::SExt : Instruction::ZExt;
10417 if (!VectorizedVals.
empty()) {
10418 const TreeEntry &Root = *VectorizableTree.front().get();
10419 auto BWIt = MinBWs.find(&Root);
10420 if (BWIt != MinBWs.end()) {
10421 Type *DstTy = Root.Scalars.front()->getType();
10422 unsigned OriginalSz =
DL->getTypeSizeInBits(DstTy);
10424 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10425 if (OriginalSz != SrcSz) {
10426 unsigned Opcode = Instruction::Trunc;
10427 if (OriginalSz > SrcSz)
10428 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10438 Cost += SpillCost + ExtractCost;
10442 unsigned VF =
Mask.size();
10443 unsigned VecVF =
TE->getVectorFactor();
10445 (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); }) ||
10448 std::copy(
Mask.begin(), std::next(
Mask.begin(), std::min(VF, VecVF)),
10454 dbgs() <<
"SLP: Adding cost " <<
C
10455 <<
" for final shuffle of insertelement external users.\n";
10456 TE->dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10458 return std::make_pair(TE,
true);
10460 return std::make_pair(TE,
false);
10463 for (
int I = 0, E = FirstUsers.size();
I < E; ++
I) {
10464 Value *
Base = cast<Instruction>(FirstUsers[
I].first)->getOperand(0);
10465 auto Vector = ShuffleMasks[
I].takeVector();
10469 assert((TEs.size() == 1 || TEs.size() == 2) &&
10470 "Expected exactly 1 or 2 tree entries.");
10471 if (TEs.size() == 1) {
10473 VF = TEs.front()->getVectorFactor();
10479 (
Data.index() < VF &&
10480 static_cast<int>(
Data.index()) ==
Data.value());
10485 <<
" for final shuffle of insertelement "
10486 "external users.\n";
10487 TEs.front()->
dump();
10488 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10494 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10495 VF = TEs.front()->getVectorFactor();
10504 <<
" for final shuffle of vector node and external "
10505 "insertelement users.\n";
10506 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10507 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10513 (void)performExtractsShuffleAction<const TreeEntry>(
10515 [](
const TreeEntry *E) {
return E->getVectorFactor(); }, ResizeToVF,
10516 EstimateShufflesCost);
10518 cast<FixedVectorType>(FirstUsers[
I].first->getType()), DemandedElts[
I],
10520 Cost -= InsertCost;
10524 if (ReductionBitWidth != 0) {
10525 assert(UserIgnoreList &&
"Expected reduction tree.");
10526 const TreeEntry &E = *VectorizableTree.front().get();
10527 auto It = MinBWs.find(&E);
10528 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10529 unsigned SrcSize = It->second.first;
10530 unsigned DstSize = ReductionBitWidth;
10531 unsigned Opcode = Instruction::Trunc;
10532 if (SrcSize < DstSize)
10533 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10540 switch (E.getOpcode()) {
10541 case Instruction::SExt:
10542 case Instruction::ZExt:
10543 case Instruction::Trunc: {
10544 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10545 CCH = getCastContextHint(*OpTE);
10555 <<
" for final resize for reduction from " << SrcVecTy
10556 <<
" to " << DstVecTy <<
"\n";
10557 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10565 OS <<
"SLP: Spill Cost = " << SpillCost <<
".\n"
10566 <<
"SLP: Extract Cost = " << ExtractCost <<
".\n"
10567 <<
"SLP: Total Cost = " <<
Cost <<
".\n";
10571 ViewGraph(
this,
"SLP" +
F->getName(),
false, Str);
10582std::optional<TTI::ShuffleKind>
10583BoUpSLP::tryToGatherSingleRegisterExtractElements(
10589 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
10590 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
10592 if (isa<UndefValue>(VL[
I]))
10596 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10597 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10606 ExtractMask.reset(*
Idx);
10611 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
10615 for (
const auto &
Data : VectorOpToIdx)
10616 VFToVector[cast<FixedVectorType>(
Data.first->getType())->getNumElements()]
10617 .push_back(
Data.first);
10618 for (
auto &
Data : VFToVector) {
10620 return VectorOpToIdx.find(V1)->second.size() >
10621 VectorOpToIdx.find(V2)->second.size();
10626 const int UndefSz = UndefVectorExtracts.
size();
10627 unsigned SingleMax = 0;
10628 Value *SingleVec =
nullptr;
10629 unsigned PairMax = 0;
10630 std::pair<Value *, Value *> PairVec(
nullptr,
nullptr);
10631 for (
auto &
Data : VFToVector) {
10633 if (SingleMax < VectorOpToIdx[V1].
size() + UndefSz) {
10634 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10638 if (
Data.second.size() > 1)
10639 V2 = *std::next(
Data.second.begin());
10640 if (V2 && PairMax < VectorOpToIdx[V1].
size() + VectorOpToIdx[V2].
size() +
10642 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[
V2].size() + UndefSz;
10643 PairVec = std::make_pair(V1, V2);
10646 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10647 return std::nullopt;
10653 if (SingleMax >= PairMax && SingleMax) {
10654 for (
int Idx : VectorOpToIdx[SingleVec])
10657 for (
Value *V : {PairVec.first, PairVec.second})
10658 for (
int Idx : VectorOpToIdx[V])
10662 for (
int Idx : UndefVectorExtracts)
10666 std::optional<TTI::ShuffleKind> Res =
10672 return std::nullopt;
10676 for (
int I = 0, E = GatheredExtracts.size();
I < E; ++
I) {
10677 if (Mask[
I] ==
PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[
I]) &&
10678 isa<UndefValue>(GatheredExtracts[
I])) {
10682 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
10683 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10684 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10699 unsigned NumParts)
const {
10700 assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
10703 unsigned SliceSize = VL.
size() / NumParts;
10704 for (
unsigned Part = 0; Part < NumParts; ++Part) {
10710 std::optional<TTI::ShuffleKind> Res =
10711 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10712 ShufflesRes[Part] = Res;
10713 copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
10715 if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
10716 return Res.has_value();
10718 ShufflesRes.clear();
10719 return ShufflesRes;
10722std::optional<TargetTransformInfo::ShuffleKind>
10723BoUpSLP::isGatherShuffledSingleRegisterEntry(
10729 const EdgeInfo &TEUseEI =
TE->UserTreeIndices.front();
10730 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10734 if (
auto *
PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10735 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10738 TEInsertBlock = TEInsertPt->
getParent();
10741 return std::nullopt;
10742 auto *NodeUI = DT->
getNode(TEInsertBlock);
10743 assert(NodeUI &&
"Should only process reachable instructions");
10745 auto CheckOrdering = [&](
const Instruction *InsertPt) {
10759 auto *NodeEUI = DT->
getNode(InsertBlock);
10762 assert((NodeUI == NodeEUI) ==
10763 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10764 "Different nodes should have different DFS numbers");
10766 if (TEInsertPt->
getParent() != InsertBlock &&
10769 if (TEInsertPt->
getParent() == InsertBlock &&
10783 for (
Value *V : VL) {
10788 for (
const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10792 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
10793 "Must contain at least single gathered value.");
10794 assert(TEPtr->UserTreeIndices.size() == 1 &&
10795 "Expected only single user of a gather node.");
10796 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10798 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10801 : &getLastInstructionInBundle(UseEI.UserTE);
10802 if (TEInsertPt == InsertPt) {
10806 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10810 if (TEUseEI.UserTE != UseEI.UserTE &&
10811 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10817 if ((TEInsertBlock != InsertPt->
getParent() ||
10818 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10819 !CheckOrdering(InsertPt))
10823 if (
const TreeEntry *VTE = getTreeEntry(V)) {
10825 if (VTE->State != TreeEntry::Vectorize) {
10826 auto It = MultiNodeScalars.
find(V);
10827 if (It == MultiNodeScalars.
end())
10829 VTE = *It->getSecond().begin();
10831 auto *MIt =
find_if(It->getSecond(), [](
const TreeEntry *MTE) {
10832 return MTE->State == TreeEntry::Vectorize;
10834 if (MIt == It->getSecond().end())
10839 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10840 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10844 if (VToTEs.
empty())
10846 if (UsedTEs.
empty()) {
10860 if (!VToTEs.
empty()) {
10866 VToTEs = SavedVToTEs;
10875 if (UsedTEs.
size() == 2)
10877 UsedTEs.push_back(SavedVToTEs);
10884 if (UsedTEs.
empty()) {
10886 return std::nullopt;
10890 if (UsedTEs.
size() == 1) {
10893 UsedTEs.front().
end());
10894 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10895 return TE1->Idx < TE2->Idx;
10898 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
10899 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
10901 if (It != FirstEntries.end() &&
10902 ((*It)->getVectorFactor() == VL.size() ||
10903 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
10904 TE->ReuseShuffleIndices.size() == VL.size() &&
10905 (*It)->isSame(
TE->Scalars)))) {
10906 Entries.push_back(*It);
10907 if ((*It)->getVectorFactor() == VL.size()) {
10908 std::iota(std::next(
Mask.begin(), Part * VL.size()),
10909 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
10915 for (
int I = 0, Sz = VL.size();
I < Sz; ++
I)
10916 if (isa<PoisonValue>(VL[
I]))
10922 Entries.push_back(FirstEntries.front());
10925 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
10928 for (
const TreeEntry *TE : UsedTEs.front()) {
10929 unsigned VF =
TE->getVectorFactor();
10930 auto It = VFToTE.
find(VF);
10931 if (It != VFToTE.
end()) {
10932 if (It->second->Idx >
TE->Idx)
10933 It->getSecond() =
TE;
10940 UsedTEs.back().
end());
10941 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10942 return TE1->Idx < TE2->Idx;
10944 for (
const TreeEntry *TE : SecondEntries) {
10945 auto It = VFToTE.
find(
TE->getVectorFactor());
10946 if (It != VFToTE.
end()) {
10948 Entries.push_back(It->second);
10949 Entries.push_back(TE);
10955 if (Entries.empty()) {
10957 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10958 return TE1->Idx < TE2->Idx;
10960 Entries.push_back(SecondEntries.front());
10961 VF = std::max(Entries.front()->getVectorFactor(),
10962 Entries.back()->getVectorFactor());
10966 bool IsSplatOrUndefs =
isSplat(VL) ||
all_of(VL, IsaPred<UndefValue>);
10969 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
10970 auto *
PHI = cast<PHINode>(V);
10971 auto *PHI1 = cast<PHINode>(V1);
10976 for (
int I = 0, E =
PHI->getNumIncomingValues();
I < E; ++
I) {
10978 Value *In1 = PHI1->getIncomingValue(
I);
10983 if (cast<Instruction>(In)->
getParent() !=
10993 auto MightBeIgnored = [=](
Value *
V) {
10994 auto *
I = dyn_cast<Instruction>(V);
10995 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.
count(
I) &&
10997 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
11002 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
11004 bool UsedInSameVTE =
false;
11005 auto It = UsedValuesEntry.
find(V1);
11006 if (It != UsedValuesEntry.
end())
11007 UsedInSameVTE = It->second == UsedValuesEntry.
find(V)->second;
11008 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11010 cast<Instruction>(V)->getParent() ==
11011 cast<Instruction>(V1)->getParent() &&
11012 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
11017 for (
int I = 0, E = VL.size();
I < E; ++
I) {
11019 auto It = UsedValuesEntry.
find(V);
11020 if (It == UsedValuesEntry.
end())
11026 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
11027 (
I != E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
11029 unsigned Idx = It->second;
11036 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
11037 if (!UsedIdxs.test(
I))
11043 for (std::pair<unsigned, int> &Pair : EntryLanes)
11044 if (Pair.first ==
I)
11045 Pair.first = TempEntries.
size();
11048 Entries.swap(TempEntries);
11049 if (EntryLanes.size() == Entries.size() &&
11051 .
slice(Part * VL.size(),
11052 std::min<int>(VL.size(),
TE->Scalars.size())))) {
11058 return std::nullopt;
11061 bool IsIdentity = Entries.size() == 1;
11064 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
11065 unsigned Idx = Part * VL.size() + Pair.second;
11068 (ForOrder ? std::distance(
11069 Entries[Pair.first]->Scalars.begin(),
11070 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
11071 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
11072 IsIdentity &=
Mask[
Idx] == Pair.second;
11074 switch (Entries.size()) {
11076 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11080 if (EntryLanes.size() > 2 || VL.size() <= 2)
11088 std::fill(std::next(
Mask.begin(), Part * VL.size()),
11090 return std::nullopt;
11094BoUpSLP::isGatherShuffledEntry(
11098 assert(NumParts > 0 && NumParts < VL.
size() &&
11099 "Expected positive number of registers.");
11102 if (TE == VectorizableTree.front().get())
11105 if (
TE->isNonPowOf2Vec())
11108 assert(
TE->UserTreeIndices.size() == 1 &&
11109 "Expected only single user of the gather node.");
11111 "Number of scalars must be divisible by NumParts.");
11112 unsigned SliceSize = VL.
size() / NumParts;
11114 for (
unsigned Part = 0; Part < NumParts; ++Part) {
11117 std::optional<TTI::ShuffleKind> SubRes =
11118 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
11121 SubEntries.
clear();
11124 SubEntries.
front()->getVectorFactor() == VL.
size() &&
11125 (SubEntries.
front()->isSame(
TE->Scalars) ||
11126 SubEntries.
front()->isSame(VL))) {
11128 LocalSubEntries.
swap(SubEntries);
11131 std::iota(
Mask.begin(),
Mask.end(), 0);
11133 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
11134 if (isa<PoisonValue>(VL[
I]))
11136 Entries.emplace_back(1, LocalSubEntries.
front());
11142 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
11150 Type *ScalarTy)
const {
11152 bool DuplicateNonConst =
false;
11160 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
11161 if (
V->getType() != ScalarTy) {
11172 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
11175 if ((ForPoisonSrc &&
isConstant(V)) || isa<UndefValue>(V)) {
11183 EstimateInsertCost(
I, V);
11184 ShuffleMask[
I] =
I;
11188 DuplicateNonConst =
true;
11190 ShuffleMask[
I] = Res.first->second;
11196 if (DuplicateNonConst)
11198 VecTy, ShuffleMask);
11210 VLOperands Ops(VL, R);
11213 Left = Ops.getVL(0);
11214 Right = Ops.getVL(1);
11217Instruction &BoUpSLP::getLastInstructionInBundle(
const TreeEntry *E) {
11220 return *Res.second;
11224 auto *Front = E->getMainOp();
11227 if (E->getOpcode() == Instruction::GetElementPtr &&
11228 !isa<GetElementPtrInst>(V))
11230 auto *I = cast<Instruction>(V);
11231 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11232 isVectorLikeInstWithConstOps(I);
11235 auto FindLastInst = [&]() {
11237 for (
Value *V : E->Scalars) {
11238 auto *
I = dyn_cast<Instruction>(V);
11241 if (LastInst->
getParent() ==
I->getParent()) {
11246 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11247 !isa<GetElementPtrInst>(
I)) ||
11250 "Expected vector-like or non-GEP in GEP node insts only.");
11258 auto *NodeB = DT->
getNode(
I->getParent());
11259 assert(NodeA &&
"Should only process reachable instructions");
11260 assert(NodeB &&
"Should only process reachable instructions");
11261 assert((NodeA == NodeB) ==
11262 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11263 "Different nodes should have different DFS numbers");
11264 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11271 auto FindFirstInst = [&]() {
11273 for (
Value *V : E->Scalars) {
11274 auto *
I = dyn_cast<Instruction>(V);
11277 if (FirstInst->
getParent() ==
I->getParent()) {
11278 if (
I->comesBefore(FirstInst))
11282 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11283 !isa<GetElementPtrInst>(
I)) ||
11286 "Expected vector-like or non-GEP in GEP node insts only.");
11294 auto *NodeB = DT->
getNode(
I->getParent());
11295 assert(NodeA &&
"Should only process reachable instructions");
11296 assert(NodeB &&
"Should only process reachable instructions");
11297 assert((NodeA == NodeB) ==
11298 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11299 "Different nodes should have different DFS numbers");
11300 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11309 (E->State != TreeEntry::NeedToGather &&
11311 if ((E->getOpcode() == Instruction::GetElementPtr &&
11314 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11318 return !isVectorLikeInstWithConstOps(V) &&
11319 isUsedOutsideBlock(V);
11321 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
11323 return isa<ExtractElementInst, UndefValue>(V) ||
11324 areAllOperandsNonInsts(V);
11326 Res.second = FindLastInst();
11328 Res.second = FindFirstInst();
11329 return *Res.second;
11336 if (BlocksSchedules.count(BB)) {
11337 Value *
V = E->isOneOf(E->Scalars.back());
11340 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11341 if (Bundle && Bundle->isPartOfBundle())
11342 for (; Bundle; Bundle = Bundle->NextInBundle)
11343 if (Bundle->OpValue == Bundle->Inst)
11344 Res.second = Bundle->Inst;
11366 Res.second = FindLastInst();
11367 assert(Res.second &&
"Failed to find last instruction in bundle");
11368 return *Res.second;
11371void BoUpSLP::setInsertPointAfterBundle(
const TreeEntry *E) {
11372 auto *Front = E->getMainOp();
11373 Instruction *LastInst = &getLastInstructionInBundle(E);
11374 assert(LastInst &&
"Failed to find last instruction in bundle");
11377 bool IsPHI = isa<PHINode>(LastInst);
11380 if (IsPHI || (E->State != TreeEntry::NeedToGather &&
11382 Builder.SetInsertPoint(LastInst->
getParent(), LastInstIt);
11386 Builder.SetInsertPoint(
11390 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11400 Loop *
L = LI->getLoopFor(Builder.GetInsertBlock());
11403 while (InsertBB && InsertBB != InstBB && Visited.
insert(InsertBB).second)
11404 InsertBB = InsertBB->getSinglePredecessor();
11405 return InsertBB && InsertBB == InstBB;
11407 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
11408 if (
auto *Inst = dyn_cast<Instruction>(VL[
I]))
11409 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11410 getTreeEntry(Inst) ||
11411 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
11412 PostponedIndices.
insert(
I).second)
11416 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
11419 if (
Scalar->getType() != Ty) {
11421 "Expected integer types only.");
11423 if (
auto *CI = dyn_cast<CastInst>(Scalar);
11424 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
11426 if (
auto *IOp = dyn_cast<Instruction>(
Op);
11427 !IOp || !(
isDeleted(IOp) || getTreeEntry(IOp)))
11430 Scalar = Builder.CreateIntCast(
11434 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11435 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11438 GatherShuffleExtractSeq.
insert(InsElt);
11439 CSEBlocks.
insert(InsElt->getParent());
11441 if (isa<Instruction>(V)) {
11442 if (TreeEntry *Entry = getTreeEntry(V)) {
11444 User *UserOp =
nullptr;
11446 if (
auto *SI = dyn_cast<Instruction>(Scalar))
11452 unsigned FoundLane = Entry->findLaneForValue(V);
11453 ExternalUses.emplace_back(V, UserOp, FoundLane);
11463 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
11471 if (!isa<UndefValue>(VL[
I])) {
11475 if (isa<PoisonValue>(VL[
I]))
11477 if (
auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11482 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
11485 for (
int I : NonConsts)
11486 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
11489 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11490 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11528 bool IsFinalized =
false;
11538 Type *ScalarTy =
nullptr;
11542 class ShuffleIRBuilder {
11555 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11556 CSEBlocks(CSEBlocks),
DL(
DL) {}
11557 ~ShuffleIRBuilder() =
default;
11560 if (V1->
getType() != V2->getType()) {
11563 "Expected integer vector types only.");
11564 if (V1->
getType() != V2->getType()) {
11565 if (cast<VectorType>(V2->getType())
11567 ->getIntegerBitWidth() < cast<VectorType>(V1->
getType())
11569 ->getIntegerBitWidth())
11578 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
11579 GatherShuffleExtractSeq.
insert(
I);
11580 CSEBlocks.
insert(
I->getParent());
11589 unsigned VF = Mask.size();
11590 unsigned LocalVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
11594 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
11595 GatherShuffleExtractSeq.
insert(
I);
11596 CSEBlocks.
insert(
I->getParent());
11600 Value *createIdentity(
Value *V) {
return V; }
11601 Value *createPoison(
Type *Ty,
unsigned VF) {
11606 void resizeToMatch(
Value *&V1,
Value *&V2) {
11607 if (V1->
getType() == V2->getType())
11609 int V1VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
11610 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11611 int VF = std::max(V1VF, V2VF);
11612 int MinVF = std::min(V1VF, V2VF);
11614 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
11616 Value *&
Op = MinVF == V1VF ? V1 : V2;
11618 if (
auto *
I = dyn_cast<Instruction>(
Op)) {
11619 GatherShuffleExtractSeq.
insert(
I);
11620 CSEBlocks.
insert(
I->getParent());
11633 assert(V1 &&
"Expected at least one vector value.");
11634 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11635 R.CSEBlocks, *R.DL);
11636 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11644 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11652 std::optional<bool> IsSigned = std::nullopt) {
11653 auto *VecTy = cast<VectorType>(V->getType());
11663 : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
11667 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11668 unsigned NumParts,
bool &UseVecBaseAsInput) {
11669 UseVecBaseAsInput =
false;
11671 Value *VecBase =
nullptr;
11672 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
11676 auto *EI = cast<ExtractElementInst>(E->Scalars[
I]);
11677 VecBase = EI->getVectorOperand();
11678 if (
const TreeEntry *TE = R.getTreeEntry(VecBase))
11679 VecBase = TE->VectorizedValue;
11680 assert(VecBase &&
"Expected vectorized value.");
11681 UniqueBases.
insert(VecBase);
11684 if (!EI->hasOneUse() || (NumParts != 1 &&
count(E->Scalars, EI) > 1) ||
11686 const TreeEntry *UTE = R.getTreeEntry(U);
11687 return !UTE || R.MultiNodeScalars.contains(U) ||
11688 (isa<GetElementPtrInst>(U) &&
11689 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
11690 count_if(R.VectorizableTree,
11691 [&](const std::unique_ptr<TreeEntry> &TE) {
11692 return any_of(TE->UserTreeIndices,
11693 [&](const EdgeInfo &Edge) {
11694 return Edge.UserTE == UTE;
11696 is_contained(TE->Scalars, EI);
11700 R.eraseInstruction(EI);
11702 if (NumParts == 1 || UniqueBases.
size() == 1) {
11703 VecBase = castToScalarTyElem(VecBase);
11706 UseVecBaseAsInput =
true;
11716 Value *Vec =
nullptr;
11718 unsigned SliceSize = E->Scalars.size() / NumParts;
11719 for (
unsigned Part = 0; Part < NumParts; ++Part) {
11723 constexpr int MaxBases = 2;
11731 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11732 if (
const TreeEntry *TE = R.getTreeEntry(VecOp))
11733 VecOp = TE->VectorizedValue;
11734 assert(VecOp &&
"Expected vectorized value.");
11736 cast<FixedVectorType>(VecOp->
getType())->getNumElements();
11738 assert((PrevSize ==
Size || PrevSize == 0) &&
11739 "Expected vectors of the same size.");
11742 VecOp = castToScalarTyElem(VecOp);
11743 Bases[SubMask[
I] <
Size ? 0 : 1] = VecOp;
11745 if (!Bases.front())
11748 if (Bases.back()) {
11749 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11750 TransformToIdentity(SubMask);
11752 SubVec = Bases.front();
11759 Mask.slice(
P * SliceSize, SliceSize);
11764 "Expected first part or all previous parts masked.");
11765 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11767 unsigned VF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
11769 unsigned SubVecVF =
11770 cast<FixedVectorType>(SubVec->
getType())->getNumElements();
11771 VF = std::max(VF, SubVecVF);
11774 for (
int &
Idx : SubMask)
11777 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11778 Vec = createShuffle(Vec, SubVec, VecMask);
11779 TransformToIdentity(VecMask);
11787 std::optional<Value *>
11793 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
11795 return std::nullopt;
11807 Value *V1 = E1.VectorizedValue;
11809 V1 = castToScalarTyElem(V1,
all_of(E1.Scalars, [&](
Value *V) {
11810 return !isKnownNonNegative(
11811 V, SimplifyQuery(*R.DL));
11813 Value *V2 = E2.VectorizedValue;
11814 if (V2->getType()->isIntOrIntVectorTy())
11815 V2 = castToScalarTyElem(V2,
all_of(E2.Scalars, [&](
Value *V) {
11816 return !isKnownNonNegative(
11817 V, SimplifyQuery(*R.DL));
11824 Value *V1 = E1.VectorizedValue;
11826 V1 = castToScalarTyElem(V1,
all_of(E1.Scalars, [&](
Value *V) {
11827 return !isKnownNonNegative(
11828 V, SimplifyQuery(*R.DL));
11834 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
11835 V1 = castToScalarTyElem(V1);
11836 V2 = castToScalarTyElem(V2);
11837 if (InVectors.
empty()) {
11840 CommonMask.
assign(Mask.begin(), Mask.end());
11844 if (InVectors.
size() == 2) {
11845 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
11846 transformMaskAfterShuffle(CommonMask, CommonMask);
11847 }
else if (cast<FixedVectorType>(Vec->
getType())->getNumElements() !=
11849 Vec = createShuffle(Vec,
nullptr, CommonMask);
11850 transformMaskAfterShuffle(CommonMask, CommonMask);
11852 V1 = createShuffle(V1, V2, Mask);
11853 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11855 CommonMask[
Idx] =
Idx + Sz;
11856 InVectors.
front() = Vec;
11857 if (InVectors.
size() == 2)
11858 InVectors.
back() = V1;
11864 V1 = castToScalarTyElem(V1);
11865 if (InVectors.
empty()) {
11866 if (!isa<FixedVectorType>(V1->
getType())) {
11867 V1 = createShuffle(V1,
nullptr, CommonMask);
11869 transformMaskAfterShuffle(CommonMask, Mask);
11872 CommonMask.
assign(Mask.begin(), Mask.end());
11875 const auto *It =
find(InVectors, V1);
11876 if (It == InVectors.
end()) {
11877 if (InVectors.
size() == 2 ||
11879 !isa<FixedVectorType>(V1->
getType())) {
11881 if (InVectors.
size() == 2) {
11882 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
11883 transformMaskAfterShuffle(CommonMask, CommonMask);
11884 }
else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
11885 CommonMask.
size()) {
11886 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
11887 transformMaskAfterShuffle(CommonMask, CommonMask);
11889 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11892 V->getType() != V1->
getType()
11894 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
11895 ->getNumElements();
11896 if (V->getType() != V1->
getType())
11897 V1 = createShuffle(V1,
nullptr, Mask);
11898 InVectors.
front() = V;
11899 if (InVectors.
size() == 2)
11900 InVectors.
back() = V1;
11907 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11913 int VF = CommonMask.
size();
11914 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
11915 VF = FTy->getNumElements();
11916 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11918 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.
begin() ? 0 : VF);
11927 Value *Root =
nullptr) {
11928 return R.gather(VL, Root, ScalarTy);
11937 IsFinalized =
true;
11940 if (InVectors.
size() == 2) {
11941 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
11944 Vec = createShuffle(Vec,
nullptr, CommonMask);
11946 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11950 "Expected vector length for the final value before action.");
11951 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
11954 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
11955 Vec = createShuffle(Vec,
nullptr, ResizeMask);
11957 Action(Vec, CommonMask);
11958 InVectors.
front() = Vec;
11960 if (!ExtMask.
empty()) {
11961 if (CommonMask.
empty()) {
11965 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
11968 NewMask[
I] = CommonMask[ExtMask[
I]];
11970 CommonMask.
swap(NewMask);
11973 if (CommonMask.
empty()) {
11974 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
11975 return InVectors.
front();
11977 if (InVectors.
size() == 2)
11978 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
11979 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
11984 "Shuffle construction must be finalized.");
11988Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
11989 bool PostponedPHIs) {
11990 ValueList &VL = E->getOperand(NodeIdx);
11991 const unsigned VF = VL.size();
11994 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
11995 const auto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
11996 if (It != VL.end())
11999 if (S.getOpcode()) {
12000 auto CheckSameVE = [&](
const TreeEntry *VE) {
12001 return VE->isSame(VL) &&
12002 (
any_of(VE->UserTreeIndices,
12003 [E, NodeIdx](
const EdgeInfo &EI) {
12004 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12006 any_of(VectorizableTree,
12007 [E, NodeIdx, VE](
const std::unique_ptr<TreeEntry> &TE) {
12008 return TE->isOperandGatherNode({E, NodeIdx}) &&
12009 VE->isSame(TE->Scalars);
12012 TreeEntry *VE = getTreeEntry(S.OpValue);
12013 bool IsSameVE = VE && CheckSameVE(VE);
12015 auto It = MultiNodeScalars.
find(S.OpValue);
12016 if (It != MultiNodeScalars.
end()) {
12017 auto *
I =
find_if(It->getSecond(), [&](
const TreeEntry *TE) {
12018 return TE != VE && CheckSameVE(TE);
12020 if (
I != It->getSecond().end()) {
12028 ShuffleInstructionBuilder ShuffleBuilder(
12029 cast<VectorType>(
V->getType())->getElementType(), Builder, *
this);
12030 ShuffleBuilder.add(V, Mask);
12031 return ShuffleBuilder.finalize(std::nullopt);
12034 if (VF != cast<FixedVectorType>(
V->getType())->getNumElements()) {
12035 if (!VE->ReuseShuffleIndices.empty()) {
12056 if (isa<PoisonValue>(V))
12058 Mask[
I] = VE->findLaneForValue(V);
12060 V = FinalShuffle(V, Mask);
12062 assert(VF < cast<FixedVectorType>(
V->getType())->getNumElements() &&
12063 "Expected vectorization factor less "
12064 "than original vector size.");
12066 std::iota(UniformMask.begin(), UniformMask.end(), 0);
12067 V = FinalShuffle(V, UniformMask);
12073 if (
find_if(VE->UserTreeIndices, [&](
const EdgeInfo &EI) {
12074 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12075 }) == VE->UserTreeIndices.end()) {
12077 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
12078 return TE->State == TreeEntry::NeedToGather &&
12079 TE->UserTreeIndices.front().UserTE == E &&
12080 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12082 assert(It != VectorizableTree.end() &&
"Expected gather node operand.");
12083 (*It)->VectorizedValue =
V;
12092 auto *
I =
find_if(VectorizableTree,
12093 [E, NodeIdx](
const std::unique_ptr<TreeEntry> &TE) {
12094 return TE->isOperandGatherNode({E, NodeIdx});
12096 assert(
I != VectorizableTree.end() &&
"Gather node is not in the graph.");
12097 assert(
I->get()->UserTreeIndices.size() == 1 &&
12098 "Expected only single user for the gather node.");
12099 assert(
I->get()->isSame(VL) &&
"Expected same list of scalars.");
12103template <
typename BVTy,
typename ResTy,
typename...
Args>
12104ResTy BoUpSLP::processBuildVector(
const TreeEntry *E,
Type *ScalarTy,
12106 assert(E->State == TreeEntry::NeedToGather &&
"Expected gather node.");
12107 unsigned VF = E->getVectorFactor();
12109 bool NeedFreeze =
false;
12111 E->ReuseShuffleIndices.end());
12117 if (!ReorderMask.
empty())
12120 unsigned I,
unsigned SliceSize) {
12122 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12125 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12126 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12127 if (UserTE->getNumOperands() != 2)
12130 find_if(VectorizableTree, [=](
const std::unique_ptr<TreeEntry> &TE) {
12131 return find_if(
TE->UserTreeIndices, [=](
const EdgeInfo &EI) {
12132 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12133 }) !=
TE->UserTreeIndices.end();
12135 if (It == VectorizableTree.end())
12138 if ((
Mask.size() < InputVF &&
12141 (
Mask.size() == InputVF &&
12143 std::iota(std::next(
Mask.begin(),
I * SliceSize),
12144 std::next(
Mask.begin(), (
I + 1) * SliceSize), 0);
12148 std::fill(std::next(
Mask.begin(),
I * SliceSize),
12149 std::next(
Mask.begin(), (
I + 1) * SliceSize), IVal);
12153 BVTy ShuffleBuilder(ScalarTy, Params...);
12154 ResTy Res = ResTy();
12158 Value *ExtractVecBase =
nullptr;
12159 bool UseVecBaseAsInput =
false;
12162 Type *OrigScalarTy = GatheredScalars.front()->getType();
12165 if (NumParts == 0 || NumParts >= GatheredScalars.size())
12167 if (!
all_of(GatheredScalars, IsaPred<UndefValue>)) {
12169 bool Resized =
false;
12171 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
12172 if (!ExtractShuffles.
empty()) {
12177 if (
const auto *TE = getTreeEntry(
12178 cast<ExtractElementInst>(E->Scalars[
Idx])->getVectorOperand()))
12181 if (std::optional<ResTy> Delayed =
12182 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12184 PostponedGathers.
insert(E);
12189 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
12190 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12191 ExtractVecBase = VecBase;
12192 if (
auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
12193 if (VF == VecBaseTy->getNumElements() &&
12194 GatheredScalars.size() != VF) {
12196 GatheredScalars.append(VF - GatheredScalars.size(),
12202 if (!ExtractShuffles.
empty() || E->getOpcode() != Instruction::Load ||
12203 E->isAltShuffle() ||
12204 all_of(E->Scalars, [
this](
Value *V) { return getTreeEntry(V); }) ||
12206 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12208 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
12210 if (!GatherShuffles.
empty()) {
12211 if (std::optional<ResTy> Delayed =
12212 ShuffleBuilder.needToDelay(E, Entries)) {
12214 PostponedGathers.
insert(E);
12219 if (GatherShuffles.
size() == 1 &&
12221 Entries.front().front()->isSame(E->Scalars)) {
12226 <<
"SLP: perfect diamond match for gather bundle "
12229 Mask.resize(E->Scalars.size());
12230 const TreeEntry *FrontTE = Entries.front().front();
12231 if (FrontTE->ReorderIndices.empty() &&
12232 ((FrontTE->ReuseShuffleIndices.empty() &&
12233 E->Scalars.size() == FrontTE->Scalars.size()) ||
12234 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12235 std::iota(
Mask.begin(),
Mask.end(), 0);
12238 if (isa<PoisonValue>(V)) {
12242 Mask[
I] = FrontTE->findLaneForValue(V);
12245 ShuffleBuilder.add(*FrontTE, Mask);
12246 Res = ShuffleBuilder.finalize(E->getCommonMask());
12250 if (GatheredScalars.size() != VF &&
12252 return any_of(TEs, [&](
const TreeEntry *TE) {
12253 return TE->getVectorFactor() == VF;
12256 GatheredScalars.append(VF - GatheredScalars.size(),
12260 for (
int I = 0, Sz =
Mask.size();
I < Sz; ++
I) {
12268 bool IsRootPoison) {
12271 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
12278 int NumNonConsts = 0;
12281 if (isa<UndefValue>(V)) {
12282 if (!isa<PoisonValue>(V)) {
12297 Scalars.
front() = OrigV;
12300 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
12301 Scalars[Res.first->second] = OrigV;
12302 ReuseMask[
I] = Res.first->second;
12305 if (NumNonConsts == 1) {
12310 if (!UndefPos.
empty() && UndefPos.
front() == 0)
12313 ReuseMask[SinglePos] = SinglePos;
12314 }
else if (!UndefPos.
empty() && IsSplat) {
12319 return !isa<UndefValue>(V) &&
12321 (E->UserTreeIndices.size() == 1 &&
12325 return E->UserTreeIndices.front().EdgeIdx !=
12326 U.getOperandNo() &&
12328 E->UserTreeIndices.front().UserTE->Scalars,
12332 if (It != Scalars.
end()) {
12334 int Pos = std::distance(Scalars.
begin(), It);
12335 for (
int I : UndefPos) {
12337 ReuseMask[
I] = Pos;
12346 for (
int I : UndefPos) {
12348 if (isa<UndefValue>(Scalars[
I]))
12355 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
12356 bool IsNonPoisoned =
true;
12357 bool IsUsedInExpr =
true;
12358 Value *Vec1 =
nullptr;
12359 if (!ExtractShuffles.
empty()) {
12363 Value *Vec2 =
nullptr;
12364 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
12368 if (UseVecBaseAsInput) {
12369 Vec1 = ExtractVecBase;
12371 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
12374 if (isa<UndefValue>(E->Scalars[
I]))
12376 auto *EI = cast<ExtractElementInst>(E->Scalars[
I]);
12377 Value *VecOp = EI->getVectorOperand();
12378 if (
const auto *TE = getTreeEntry(VecOp))
12379 if (
TE->VectorizedValue)
12380 VecOp =
TE->VectorizedValue;
12383 }
else if (Vec1 != VecOp) {
12384 assert((!Vec2 || Vec2 == VecOp) &&
12385 "Expected only 1 or 2 vectors shuffle.");
12391 IsUsedInExpr =
false;
12394 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12396 IsUsedInExpr &= FindReusedSplat(
12398 cast<FixedVectorType>(Vec1->
getType())->getNumElements(), 0,
12399 ExtractMask.size());
12400 ShuffleBuilder.add(Vec1, ExtractMask,
true);
12403 IsUsedInExpr =
false;
12408 if (!GatherShuffles.
empty()) {
12409 unsigned SliceSize = E->Scalars.size() / NumParts;
12411 for (
const auto [
I, TEs] :
enumerate(Entries)) {
12414 "No shuffles with empty entries list expected.");
12418 "Expected shuffle of 1 or 2 entries.");
12421 copy(SubMask, std::next(VecMask.begin(),
I * SliceSize));
12422 if (TEs.
size() == 1) {
12423 IsUsedInExpr &= FindReusedSplat(
12424 VecMask, TEs.
front()->getVectorFactor(),
I, SliceSize);
12425 ShuffleBuilder.add(*TEs.
front(), VecMask);
12426 if (TEs.
front()->VectorizedValue)
12430 IsUsedInExpr =
false;
12431 ShuffleBuilder.add(*TEs.
front(), *TEs.
back(), VecMask);
12432 if (TEs.
front()->VectorizedValue && TEs.
back()->VectorizedValue)
12443 int EMSz = ExtractMask.size();
12444 int MSz =
Mask.size();
12447 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
12448 bool IsIdentityShuffle =
12449 ((UseVecBaseAsInput ||
12451 [](
const std::optional<TTI::ShuffleKind> &SK) {
12455 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
12457 (!GatherShuffles.
empty() &&
12459 [](
const std::optional<TTI::ShuffleKind> &SK) {
12463 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
12465 bool EnoughConstsForShuffle =
12469 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12473 return isa<Constant>(V) && !isa<UndefValue>(V);
12475 (!IsIdentityShuffle ||
12476 (GatheredScalars.size() == 2 &&
12478 [](
Value *V) {
return !isa<UndefValue>(V); })) ||
12480 return isa<Constant>(V) && !isa<PoisonValue>(V);
12484 for (
int I = 0, Sz = GatheredScalars.size();
I < Sz; ++
I) {
12485 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[
I]))
12491 if (!
all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12493 TryPackScalars(GatheredScalars, BVMask,
true);
12494 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12495 ShuffleBuilder.add(BV, BVMask);
12498 return isa<PoisonValue>(V) ||
12499 (IsSingleShuffle && ((IsIdentityShuffle &&
12500 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12502 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12504 Res = ShuffleBuilder.finalize(
12505 E->ReuseShuffleIndices, E->Scalars.size(),
12507 TryPackScalars(NonConstants, Mask,
false);
12508 Vec = ShuffleBuilder.gather(NonConstants,
Mask.size(), Vec);
12513 TryPackScalars(GatheredScalars, ReuseMask,
true);
12514 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.
size());
12515 ShuffleBuilder.add(BV, ReuseMask);
12516 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12521 if (!isa<PoisonValue>(V))
12524 Value *BV = ShuffleBuilder.gather(E->Scalars);
12525 ShuffleBuilder.add(BV, Mask);
12526 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12530 Res = ShuffleBuilder.createFreeze(Res);
12534Value *BoUpSLP::createBuildVector(
const TreeEntry *E,
Type *ScalarTy) {
12535 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12542 if (E->VectorizedValue &&
12543 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12544 E->isAltShuffle())) {
12545 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *E->Scalars[0] <<
".\n");
12546 return E->VectorizedValue;
12549 Value *
V = E->Scalars.front();
12550 Type *ScalarTy =
V->getType();
12551 if (
auto *Store = dyn_cast<StoreInst>(V))
12552 ScalarTy =
Store->getValueOperand()->getType();
12553 else if (
auto *IE = dyn_cast<InsertElementInst>(V))
12554 ScalarTy =
IE->getOperand(1)->getType();
12555 auto It = MinBWs.
find(E);
12556 if (It != MinBWs.
end())
12559 if (E->State == TreeEntry::NeedToGather) {
12561 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12562 setInsertPointAfterBundle(E);
12563 Value *Vec = createBuildVector(E, ScalarTy);
12564 E->VectorizedValue = Vec;
12569 auto FinalShuffle = [&](
Value *
V,
const TreeEntry *E,
VectorType *VecTy) {
12570 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *
this);
12571 if (E->getOpcode() == Instruction::Store &&
12572 E->State == TreeEntry::Vectorize) {
12574 ArrayRef(
reinterpret_cast<const int *
>(E->ReorderIndices.begin()),
12575 E->ReorderIndices.size());
12576 ShuffleBuilder.add(V, Mask);
12577 }
else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12578 ShuffleBuilder.addOrdered(V, std::nullopt);
12580 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12582 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12585 assert((E->State == TreeEntry::Vectorize ||
12586 E->State == TreeEntry::ScatterVectorize ||
12587 E->State == TreeEntry::StridedVectorize) &&
12588 "Unhandled state");
12589 unsigned ShuffleOrOp =
12590 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
12592 auto GetOperandSignedness = [&](
unsigned Idx) {
12593 const TreeEntry *OpE = getOperandEntry(E,
Idx);
12594 bool IsSigned =
false;
12595 auto It = MinBWs.
find(OpE);
12596 if (It != MinBWs.
end())
12597 IsSigned = It->second.second;
12600 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12604 switch (ShuffleOrOp) {
12605 case Instruction::PHI: {
12606 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12607 E != VectorizableTree.front().get() ||
12608 !E->UserTreeIndices.empty()) &&
12609 "PHI reordering is free.");
12610 if (PostponedPHIs && E->VectorizedValue)
12611 return E->VectorizedValue;
12612 auto *PH = cast<PHINode>(VL0);
12614 PH->getParent()->getFirstNonPHIIt());
12616 if (PostponedPHIs || !E->VectorizedValue) {
12623 PH->getParent()->getFirstInsertionPt());
12626 V = FinalShuffle(V, E, VecTy);
12628 E->VectorizedValue =
V;
12632 PHINode *NewPhi = cast<PHINode>(E->PHI);
12641 for (
unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12647 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12651 if (!VisitedBBs.
insert(IBB).second) {
12658 Value *Vec = vectorizeOperand(E,
I,
true);
12659 if (VecTy != Vec->
getType()) {
12661 getOperandEntry(E,
I)->State == TreeEntry::NeedToGather ||
12662 MinBWs.
contains(getOperandEntry(E,
I))) &&
12663 "Expected item in MinBWs.");
12664 Vec = Builder.
CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
12670 "Invalid number of incoming values");
12674 case Instruction::ExtractElement: {
12675 Value *
V = E->getSingleOperand(0);
12676 if (
const TreeEntry *TE = getTreeEntry(V))
12677 V =
TE->VectorizedValue;
12678 setInsertPointAfterBundle(E);
12679 V = FinalShuffle(V, E, VecTy);
12680 E->VectorizedValue =
V;
12683 case Instruction::ExtractValue: {
12684 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12689 NewV = FinalShuffle(NewV, E, VecTy);
12690 E->VectorizedValue = NewV;
12693 case Instruction::InsertElement: {
12694 assert(E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
12696 Value *
V = vectorizeOperand(E, 1, PostponedPHIs);
12698 Type *ScalarTy =
Op.front()->getType();
12699 if (cast<VectorType>(
V->getType())->getElementType() != ScalarTy) {
12701 std::pair<unsigned, bool> Res = MinBWs.
lookup(getOperandEntry(E, 1));
12702 assert(Res.first > 0 &&
"Expected item in MinBWs.");
12707 cast<FixedVectorType>(
V->getType())->getNumElements()),
12712 auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
12713 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12715 const unsigned NumElts =
12716 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12717 const unsigned NumScalars = E->Scalars.size();
12720 assert(
Offset < NumElts &&
"Failed to find vector index offset");
12724 if (!E->ReorderIndices.empty()) {
12729 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
12732 bool IsIdentity =
true;
12734 Mask.swap(PrevMask);
12735 for (
unsigned I = 0;
I < NumScalars; ++
I) {
12738 IsIdentity &= InsertIdx -
Offset ==
I;
12741 if (!IsIdentity || NumElts != NumScalars) {
12745 if (NumElts != NumScalars &&
Offset == 0) {
12754 InsertMask[*InsertIdx] = *InsertIdx;
12755 if (!
Ins->hasOneUse())
12757 Ins = dyn_cast_or_null<InsertElementInst>(
12758 Ins->getUniqueUndroppableUser());
12761 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12763 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12766 if (!IsFirstPoison.
all()) {
12768 for (
unsigned I = 0;
I < NumElts;
I++) {
12770 IsFirstUndef.
test(
I)) {
12771 if (IsVNonPoisonous) {
12772 InsertMask[
I] =
I < NumScalars ?
I : 0;
12777 if (
Idx >= NumScalars)
12778 Idx = NumScalars - 1;
12779 InsertMask[
I] = NumScalars +
Idx;
12793 if (
auto *
I = dyn_cast<Instruction>(V)) {
12794 GatherShuffleExtractSeq.
insert(
I);
12795 CSEBlocks.
insert(
I->getParent());
12800 for (
unsigned I = 0;
I < NumElts;
I++) {
12805 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12808 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
12809 NumElts != NumScalars) {
12810 if (IsFirstUndef.
all()) {
12813 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12814 if (!IsFirstPoison.
all()) {
12815 for (
unsigned I = 0;
I < NumElts;
I++) {
12817 InsertMask[
I] =
I + NumElts;
12824 InsertMask, cast<Instruction>(E->Scalars.back())->
getName());
12825 if (
auto *
I = dyn_cast<Instruction>(V)) {
12826 GatherShuffleExtractSeq.
insert(
I);
12827 CSEBlocks.
insert(
I->getParent());
12832 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12833 for (
unsigned I = 0;
I < NumElts;
I++) {
12837 InsertMask[
I] += NumElts;
12840 FirstInsert->getOperand(0), V, InsertMask,
12841 cast<Instruction>(E->Scalars.back())->getName());
12842 if (
auto *
I = dyn_cast<Instruction>(V)) {
12843 GatherShuffleExtractSeq.
insert(
I);
12844 CSEBlocks.
insert(
I->getParent());
12849 ++NumVectorInstructions;
12850 E->VectorizedValue =
V;
12853 case Instruction::ZExt:
12854 case Instruction::SExt:
12855 case Instruction::FPToUI:
12856 case Instruction::FPToSI:
12857 case Instruction::FPExt:
12858 case Instruction::PtrToInt:
12859 case Instruction::IntToPtr:
12860 case Instruction::SIToFP:
12861 case Instruction::UIToFP:
12862 case Instruction::Trunc:
12863 case Instruction::FPTrunc:
12864 case Instruction::BitCast: {
12865 setInsertPointAfterBundle(E);
12867 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
12868 if (E->VectorizedValue) {
12869 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12870 return E->VectorizedValue;
12873 auto *CI = cast<CastInst>(VL0);
12875 Type *SrcScalarTy = cast<VectorType>(InVec->
getType())->getElementType();
12876 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
12878 (SrcIt != MinBWs.
end() || It != MinBWs.
end() ||
12881 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
12882 if (SrcIt != MinBWs.
end())
12883 SrcBWSz = SrcIt->second.first;
12884 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
12885 if (BWSz == SrcBWSz) {
12886 VecOpcode = Instruction::BitCast;
12887 }
else if (BWSz < SrcBWSz) {
12888 VecOpcode = Instruction::Trunc;
12889 }
else if (It != MinBWs.
end()) {
12890 assert(BWSz > SrcBWSz &&
"Invalid cast!");
12891 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12892 }
else if (SrcIt != MinBWs.
end()) {
12893 assert(BWSz > SrcBWSz &&
"Invalid cast!");
12895 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12897 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
12898 !SrcIt->second.second) {
12899 VecOpcode = Instruction::UIToFP;
12901 Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12903 : Builder.
CreateCast(VecOpcode, InVec, VecTy);
12904 V = FinalShuffle(V, E, VecTy);
12906 E->VectorizedValue =
V;
12907 ++NumVectorInstructions;
12910 case Instruction::FCmp:
12911 case Instruction::ICmp: {
12912 setInsertPointAfterBundle(E);
12914 Value *
L = vectorizeOperand(E, 0, PostponedPHIs);
12915 if (E->VectorizedValue) {
12916 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12917 return E->VectorizedValue;
12919 Value *
R = vectorizeOperand(E, 1, PostponedPHIs);
12920 if (E->VectorizedValue) {
12921 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12922 return E->VectorizedValue;
12924 if (
L->getType() !=
R->getType()) {
12925 assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12926 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12927 MinBWs.
contains(getOperandEntry(E, 0)) ||
12928 MinBWs.
contains(getOperandEntry(E, 1))) &&
12929 "Expected item in MinBWs.");
12930 if (cast<VectorType>(
L->getType())
12932 ->getIntegerBitWidth() < cast<VectorType>(
R->getType())
12934 ->getIntegerBitWidth()) {
12935 Type *CastTy =
R->getType();
12938 Type *CastTy =
L->getType();
12947 VecTy = cast<FixedVectorType>(
V->getType());
12948 V = FinalShuffle(V, E, VecTy);
12950 E->VectorizedValue =
V;
12951 ++NumVectorInstructions;
12954 case Instruction::Select: {
12955 setInsertPointAfterBundle(E);
12957 Value *
Cond = vectorizeOperand(E, 0, PostponedPHIs);
12958 if (E->VectorizedValue) {
12959 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12960 return E->VectorizedValue;
12962 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
12963 if (E->VectorizedValue) {
12964 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12965 return E->VectorizedValue;
12967 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
12968 if (E->VectorizedValue) {
12969 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12970 return E->VectorizedValue;
12974 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12975 getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
12976 MinBWs.
contains(getOperandEntry(E, 1)) ||
12977 MinBWs.
contains(getOperandEntry(E, 2))) &&
12978 "Expected item in MinBWs.");
12979 if (True->
getType() != VecTy)
12980 True = Builder.
CreateIntCast(True, VecTy, GetOperandSignedness(1));
12981 if (False->
getType() != VecTy)
12982 False = Builder.
CreateIntCast(False, VecTy, GetOperandSignedness(2));
12986 V = FinalShuffle(V, E, VecTy);
12988 E->VectorizedValue =
V;
12989 ++NumVectorInstructions;
12992 case Instruction::FNeg: {
12993 setInsertPointAfterBundle(E);
12995 Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
12997 if (E->VectorizedValue) {
12998 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12999 return E->VectorizedValue;
13005 if (
auto *
I = dyn_cast<Instruction>(V))
13008 V = FinalShuffle(V, E, VecTy);
13010 E->VectorizedValue =
V;
13011 ++NumVectorInstructions;
13015 case Instruction::Add:
13016 case Instruction::FAdd:
13017 case Instruction::Sub:
13018 case Instruction::FSub:
13019 case Instruction::Mul:
13020 case Instruction::FMul:
13021 case Instruction::UDiv:
13022 case Instruction::SDiv:
13023 case Instruction::FDiv:
13024 case Instruction::URem:
13025 case Instruction::SRem:
13026 case Instruction::FRem:
13027 case Instruction::Shl:
13028 case Instruction::LShr:
13029 case Instruction::AShr:
13030 case Instruction::And:
13031 case Instruction::Or:
13032 case Instruction::Xor: {
13033 setInsertPointAfterBundle(E);
13035 Value *
LHS = vectorizeOperand(E, 0, PostponedPHIs);
13036 if (E->VectorizedValue) {
13037 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13038 return E->VectorizedValue;
13040 Value *
RHS = vectorizeOperand(E, 1, PostponedPHIs);
13041 if (E->VectorizedValue) {
13042 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13043 return E->VectorizedValue;
13045 if (ShuffleOrOp == Instruction::And && It != MinBWs.
end()) {
13046 for (
unsigned I : seq<unsigned>(0, E->getNumOperands())) {
13049 auto *CI = dyn_cast<ConstantInt>(
Op);
13050 return CI && CI->getValue().countr_one() >= It->second.first;
13052 V = FinalShuffle(
I == 0 ? RHS : LHS, E, VecTy);
13053 E->VectorizedValue =
V;
13054 ++NumVectorInstructions;
13061 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13062 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13063 MinBWs.
contains(getOperandEntry(E, 0)) ||
13064 MinBWs.
contains(getOperandEntry(E, 1))) &&
13065 "Expected item in MinBWs.");
13076 if (
auto *
I = dyn_cast<Instruction>(V)) {
13079 if (!MinBWs.
contains(E) && ShuffleOrOp == Instruction::Sub &&
13081 return isCommutative(cast<Instruction>(V));
13083 I->setHasNoUnsignedWrap(
false);
13086 V = FinalShuffle(V, E, VecTy);
13088 E->VectorizedValue =
V;
13089 ++NumVectorInstructions;
13093 case Instruction::Load: {
13096 setInsertPointAfterBundle(E);
13098 LoadInst *LI = cast<LoadInst>(VL0);
13101 if (E->State == TreeEntry::Vectorize) {
13103 }
else if (E->State == TreeEntry::StridedVectorize) {
13104 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
13105 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
13106 PO = IsReverseOrder ? PtrN : Ptr0;
13112 int Stride = *Diff / (
static_cast<int>(E->Scalars.size()) - 1);
13114 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
13115 DL->getTypeAllocSize(ScalarTy));
13119 return cast<LoadInst>(V)->getPointerOperand();
13122 std::optional<Value *> Stride =
13131 (IsReverseOrder ? -1 : 1) *
13132 static_cast<int>(
DL->getTypeAllocSize(ScalarTy))));
13134 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13136 Intrinsic::experimental_vp_strided_load,
13137 {VecTy, PO->
getType(), StrideTy},
13139 Builder.
getInt32(E->Scalars.size())});
13145 assert(E->State == TreeEntry::ScatterVectorize &&
"Unhandled state");
13146 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
13147 if (E->VectorizedValue) {
13148 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13149 return E->VectorizedValue;
13152 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13157 V = FinalShuffle(V, E, VecTy);
13158 E->VectorizedValue =
V;
13159 ++NumVectorInstructions;
13162 case Instruction::Store: {
13163 auto *
SI = cast<StoreInst>(VL0);
13165 setInsertPointAfterBundle(E);
13167 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
13168 if (VecValue->
getType() != VecTy)
13170 Builder.
CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
13171 VecValue = FinalShuffle(VecValue, E, VecTy);
13175 if (E->State == TreeEntry::Vectorize) {
13178 assert(E->State == TreeEntry::StridedVectorize &&
13179 "Expected either strided or conseutive stores.");
13180 if (!E->ReorderIndices.empty()) {
13181 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
13182 Ptr =
SI->getPointerOperand();
13184 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
13185 Type *StrideTy =
DL->getIndexType(
SI->getPointerOperandType());
13187 Intrinsic::experimental_vp_strided_store,
13188 {VecTy,
Ptr->getType(), StrideTy},
13191 StrideTy, -
static_cast<int>(
DL->getTypeAllocSize(ScalarTy))),
13193 Builder.
getInt32(E->Scalars.size())});
13202 E->VectorizedValue =
V;
13203 ++NumVectorInstructions;
13206 case Instruction::GetElementPtr: {
13207 auto *GEP0 = cast<GetElementPtrInst>(VL0);
13208 setInsertPointAfterBundle(E);
13210 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
13211 if (E->VectorizedValue) {
13212 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13213 return E->VectorizedValue;
13217 for (
int J = 1,
N = GEP0->getNumOperands(); J <
N; ++J) {
13218 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
13219 if (E->VectorizedValue) {
13220 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13221 return E->VectorizedValue;
13226 Value *
V = Builder.
CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
13227 if (
Instruction *
I = dyn_cast<GetElementPtrInst>(V)) {
13229 for (
Value *V : E->Scalars) {
13230 if (isa<GetElementPtrInst>(V))
13236 V = FinalShuffle(V, E, VecTy);
13238 E->VectorizedValue =
V;
13239 ++NumVectorInstructions;
13243 case Instruction::Call: {
13244 CallInst *CI = cast<CallInst>(VL0);
13245 setInsertPointAfterBundle(E);
13251 It != MinBWs.
end() ? It->second.first : 0);
13254 VecCallCosts.first <= VecCallCosts.second;
13256 Value *ScalarArg =
nullptr;
13262 auto *CEI = cast<CallInst>(VL0);
13263 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
13268 ScalarArg = CEI->getArgOperand(
I);
13271 if (
ID == Intrinsic::abs && It != MinBWs.
end() &&
13272 It->second.first <
DL->getTypeSizeInBits(CEI->getType()))
13280 Value *OpVec = vectorizeOperand(E,
I, PostponedPHIs);
13281 if (E->VectorizedValue) {
13282 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13283 return E->VectorizedValue;
13285 ScalarArg = CEI->getArgOperand(
I);
13286 if (cast<VectorType>(OpVec->
getType())->getElementType() !=
13288 It == MinBWs.
end()) {
13291 OpVec = Builder.
CreateIntCast(OpVec, CastTy, GetOperandSignedness(
I));
13292 }
else if (It != MinBWs.
end()) {
13293 OpVec = Builder.
CreateIntCast(OpVec, VecTy, GetOperandSignedness(
I));
13302 if (!UseIntrinsic) {
13318 V = FinalShuffle(V, E, VecTy);
13320 E->VectorizedValue =
V;
13321 ++NumVectorInstructions;
13324 case Instruction::ShuffleVector: {
13325 assert(E->isAltShuffle() &&
13330 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13331 "Invalid Shuffle Vector Operand");
13335 setInsertPointAfterBundle(E);
13336 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13337 if (E->VectorizedValue) {
13338 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13339 return E->VectorizedValue;
13341 RHS = vectorizeOperand(E, 1, PostponedPHIs);
13343 setInsertPointAfterBundle(E);
13344 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13346 if (E->VectorizedValue) {
13347 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13348 return E->VectorizedValue;
13355 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13356 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13357 MinBWs.
contains(getOperandEntry(E, 0)) ||
13358 MinBWs.
contains(getOperandEntry(E, 1))) &&
13359 "Expected item in MinBWs.");
13360 Type *CastTy = VecTy;
13364 ->getIntegerBitWidth() < cast<VectorType>(
RHS->
getType())
13366 ->getIntegerBitWidth())
13383 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13384 V0 = Builder.
CreateCmp(CI0->getPredicate(), LHS, RHS);
13385 auto *AltCI = cast<CmpInst>(E->getAltOp());
13387 V1 = Builder.
CreateCmp(AltPred, LHS, RHS);
13390 unsigned SrcBWSz =
DL->getTypeSizeInBits(
13391 cast<VectorType>(
LHS->
getType())->getElementType());
13392 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
13393 if (BWSz <= SrcBWSz) {
13394 if (BWSz < SrcBWSz)
13397 if (
auto *
I = dyn_cast<Instruction>(LHS))
13399 E->VectorizedValue =
LHS;
13400 ++NumVectorInstructions;
13411 for (
Value *V : {V0, V1}) {
13412 if (
auto *
I = dyn_cast<Instruction>(V)) {
13413 GatherShuffleExtractSeq.
insert(
I);
13414 CSEBlocks.
insert(
I->getParent());
13423 E->buildAltOpShuffleMask(
13425 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
13429 Mask, &OpScalars, &AltScalars);
13433 auto DropNuwFlag = [&](
Value *Vec,
unsigned Opcode) {
13435 if (
auto *
I = dyn_cast<Instruction>(Vec);
13436 I && Opcode == Instruction::Sub && !MinBWs.
contains(E) &&
13438 auto *IV = cast<Instruction>(V);
13439 return IV->getOpcode() == Instruction::Sub &&
13440 isCommutative(cast<Instruction>(IV));
13442 I->setHasNoUnsignedWrap(
false);
13444 DropNuwFlag(V0, E->getOpcode());
13445 DropNuwFlag(V1, E->getAltOpcode());
13448 if (
auto *
I = dyn_cast<Instruction>(V)) {
13450 GatherShuffleExtractSeq.
insert(
I);
13451 CSEBlocks.
insert(
I->getParent());
13454 E->VectorizedValue =
V;
13455 ++NumVectorInstructions;
13468 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13474struct ShuffledInsertData {
13487 for (
auto &BSIter : BlocksSchedules) {
13488 scheduleBlock(BSIter.second.get());
13492 EntryToLastInstruction.
clear();
13502 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13503 if (TE->State == TreeEntry::Vectorize &&
13504 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13505 TE->VectorizedValue)
13511 for (
const TreeEntry *E : PostponedNodes) {
13512 auto *TE =
const_cast<TreeEntry *
>(E);
13513 if (
auto *VecTE = getTreeEntry(TE->Scalars.front()))
13514 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13515 TE->UserTreeIndices.front().EdgeIdx)) &&
13516 VecTE->isSame(TE->Scalars))
13520 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13521 TE->VectorizedValue =
nullptr;
13523 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13532 if (isa<PHINode>(UserI)) {
13535 for (
User *U : PrevVec->users()) {
13538 auto *UI = dyn_cast<Instruction>(U);
13539 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->
getParent())
13541 if (UI->comesBefore(InsertPt))
13550 if (Vec->
getType() != PrevVec->getType()) {
13552 PrevVec->getType()->isIntOrIntVectorTy() &&
13553 "Expected integer vector types only.");
13554 std::optional<bool> IsSigned;
13555 for (
Value *V : TE->Scalars) {
13556 if (
const TreeEntry *BaseTE = getTreeEntry(V)) {
13557 auto It = MinBWs.
find(BaseTE);
13558 if (It != MinBWs.
end()) {
13559 IsSigned = IsSigned.value_or(
false) || It->second.second;
13563 for (
const TreeEntry *MNTE : MultiNodeScalars.
lookup(V)) {
13564 auto It = MinBWs.
find(MNTE);
13565 if (It != MinBWs.
end()) {
13566 IsSigned = IsSigned.value_or(
false) || It->second.second;
13571 if (IsSigned.value_or(
false))
13574 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13575 auto It = MinBWs.
find(BVE);
13576 if (It != MinBWs.
end()) {
13577 IsSigned = IsSigned.value_or(
false) || It->second.second;
13582 if (IsSigned.value_or(
false))
13584 if (
auto *EE = dyn_cast<ExtractElementInst>(V)) {
13586 IsSigned.value_or(
false) ||
13590 if (IsSigned.value_or(
false))
13594 if (IsSigned.value_or(
false)) {
13596 auto It = MinBWs.
find(TE->UserTreeIndices.front().UserTE);
13597 if (It != MinBWs.
end())
13598 IsSigned = It->second.second;
13601 "Expected user node or perfect diamond match in MinBWs.");
13605 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
13608 auto It = PostponedValues.
find(PrevVec);
13609 if (It != PostponedValues.
end()) {
13610 for (TreeEntry *VTE : It->getSecond())
13611 VTE->VectorizedValue = Vec;
13631 for (
const auto &ExternalUse : ExternalUses) {
13632 Value *Scalar = ExternalUse.Scalar;
13639 TreeEntry *E = getTreeEntry(Scalar);
13640 assert(E &&
"Invalid scalar");
13641 assert(E->State != TreeEntry::NeedToGather &&
13642 "Extracting from a gather list");
13644 if (E->getOpcode() == Instruction::GetElementPtr &&
13645 !isa<GetElementPtrInst>(Scalar))
13648 Value *Vec = E->VectorizedValue;
13649 assert(Vec &&
"Can't find vectorizable value");
13652 auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
13653 if (Scalar->getType() != Vec->
getType()) {
13654 Value *Ex =
nullptr;
13655 Value *ExV =
nullptr;
13656 auto *
GEP = dyn_cast<GetElementPtrInst>(Scalar);
13658 auto It = ScalarToEEs.find(Scalar);
13659 if (It != ScalarToEEs.end()) {
13663 if (EEIt != It->second.end()) {
13669 if (
auto *CI = EEIt->second.second)
13673 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13678 if (
auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13679 Value *V = ES->getVectorOperand();
13680 if (
const TreeEntry *ETE = getTreeEntry(V))
13681 V = ETE->VectorizedValue;
13683 }
else if (ReplaceGEP) {
13686 auto *CloneGEP =
GEP->clone();
13687 if (isa<Instruction>(Vec))
13691 CloneGEP->insertBefore(
GEP);
13692 if (
GEP->hasName())
13693 CloneGEP->takeName(
GEP);
13701 if (Scalar->getType() != Ex->
getType())
13703 MinBWs.
find(E)->second.second);
13704 if (
auto *
I = dyn_cast<Instruction>(Ex))
13705 ScalarToEEs[Scalar].try_emplace(
13707 std::make_pair(
I, cast<Instruction>(ExV)));
13711 if (
auto *ExI = dyn_cast<Instruction>(Ex)) {
13712 GatherShuffleExtractSeq.
insert(ExI);
13713 CSEBlocks.
insert(ExI->getParent());
13717 assert(isa<FixedVectorType>(Scalar->getType()) &&
13718 isa<InsertElementInst>(Scalar) &&
13719 "In-tree scalar of vector type is not insertelement?");
13720 auto *IE = cast<InsertElementInst>(Scalar);
13728 if (!ScalarsWithNullptrUser.
insert(Scalar).second)
13733 if (ExternalUsesAsGEPs.contains(U))
13735 TreeEntry *UseEntry = getTreeEntry(U);
13737 (UseEntry->State == TreeEntry::Vectorize ||
13739 TreeEntry::StridedVectorize) &&
13740 (E->State == TreeEntry::Vectorize ||
13741 E->State == TreeEntry::StridedVectorize) &&
13742 doesInTreeUserNeedToExtract(
13744 cast<Instruction>(UseEntry->Scalars.front()),
13747 "Scalar with nullptr User must be registered in "
13748 "ExternallyUsedValues map or remain as scalar in vectorized "
13750 if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
13751 if (
auto *
PHI = dyn_cast<PHINode>(VecI))
13753 PHI->getParent()->getFirstNonPHIIt());
13756 std::next(VecI->getIterator()));
13760 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13762 Scalar->replaceAllUsesWith(NewInst);
13763 ReplacedExternals.emplace_back(Scalar, NewInst);
13767 if (
auto *VU = dyn_cast<InsertElementInst>(
User)) {
13769 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13770 if (
auto *FTy = dyn_cast<FixedVectorType>(
User->
getType())) {
13771 if (!UsedInserts.
insert(VU).second)
13774 auto BWIt = MinBWs.
find(E);
13776 auto *ScalarTy = FTy->getElementType();
13777 auto Key = std::make_pair(Vec, ScalarTy);
13778 auto VecIt = VectorCasts.
find(Key);
13779 if (VecIt == VectorCasts.
end()) {
13781 if (
auto *IVec = dyn_cast<PHINode>(Vec))
13783 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
13784 else if (
auto *IVec = dyn_cast<Instruction>(Vec))
13790 cast<FixedVectorType>(Vec->
getType())->getNumElements()),
13791 BWIt->second.second);
13794 Vec = VecIt->second;
13801 find_if(ShuffledInserts, [VU](
const ShuffledInsertData &
Data) {
13808 unsigned Idx = *InsertIdx;
13809 if (It == ShuffledInserts.
end()) {
13811 It = std::next(ShuffledInserts.
begin(),
13812 ShuffledInserts.
size() - 1);
13818 while (
auto *IEBase = dyn_cast<InsertElementInst>(
Base)) {
13819 if (IEBase !=
User &&
13820 (!IEBase->hasOneUse() ||
13824 if (
const TreeEntry *E = getTreeEntry(IEBase)) {
13826 IEBase = cast<InsertElementInst>(
Base);
13829 "InsertElementInstruction used already.");
13830 Mask[IEIdx] = IEIdx;
13831 Base = IEBase->getOperand(0);
13832 }
while (E == getTreeEntry(
Base));
13835 Base = cast<InsertElementInst>(
Base)->getOperand(0);
13839 auto It = VectorToInsertElement.
find(
Base);
13840 if (It != VectorToInsertElement.
end())
13847 Mask[
Idx] = ExternalUse.Lane;
13848 It->InsertElements.push_back(cast<InsertElementInst>(
User));
13857 if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
13859 for (
unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
13860 if (PH->getIncomingValue(
I) == Scalar) {
13862 PH->getIncomingBlock(
I)->getTerminator();
13863 if (isa<CatchSwitchInst>(IncomingTerminator)) {
13865 std::next(VecI->getIterator()));
13869 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13870 PH->setOperand(
I, NewInst);
13875 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13880 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13890 int VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
13891 for (
int I = 0, E = Mask.size();
I < E; ++
I) {
13893 CombinedMask1[
I] = Mask[
I];
13895 CombinedMask2[
I] = Mask[
I] - VF;
13898 cast<VectorType>(V1->
getType())->getElementType(), Builder, *
this);
13899 ShuffleBuilder.
add(V1, CombinedMask1);
13901 ShuffleBuilder.
add(V2, CombinedMask2);
13902 return ShuffleBuilder.
finalize(std::nullopt);
13906 bool ForSingleMask) {
13907 unsigned VF = Mask.size();
13908 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
13910 if (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); })) {
13911 Vec = CreateShuffle(Vec,
nullptr, Mask);
13912 return std::make_pair(Vec,
true);
13914 if (!ForSingleMask) {
13916 for (
unsigned I = 0;
I < VF; ++
I) {
13918 ResizeMask[Mask[
I]] = Mask[
I];
13920 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
13924 return std::make_pair(Vec,
false);
13928 for (
int I = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
13934 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
13935 Value *NewInst = performExtractsShuffleAction<Value>(
13939 return cast<VectorType>(Vec->getType())
13940 ->getElementCount()
13941 .getKnownMinValue();
13946 assert((Vals.size() == 1 || Vals.size() == 2) &&
13947 "Expected exactly 1 or 2 input values.");
13948 if (Vals.size() == 1) {
13951 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
13952 ->getNumElements() ||
13953 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
13954 return CreateShuffle(Vals.front(), nullptr, Mask);
13955 return Vals.front();
13957 return CreateShuffle(Vals.
front() ? Vals.
front()
13959 Vals.
back(), Mask);
13961 auto It = ShuffledInserts[
I].InsertElements.
rbegin();
13964 if (It != ShuffledInserts[
I].InsertElements.
rend())
13967 while (It != ShuffledInserts[
I].InsertElements.
rend()) {
13968 assert(II &&
"Must be an insertelement instruction.");
13972 Inserts.
push_back(cast<Instruction>(II));
13973 II = dyn_cast<InsertElementInst>(II->
getOperand(0));
13977 if (
auto *NewI = dyn_cast<Instruction>(NewInst))
13984 IE->replaceUsesOfWith(IE->getOperand(0),
13986 IE->replaceUsesOfWith(IE->getOperand(1),
13990 CSEBlocks.
insert(LastInsert->getParent());
13995 for (
auto &TEPtr : VectorizableTree) {
13996 TreeEntry *Entry = TEPtr.get();
13999 if (Entry->State == TreeEntry::NeedToGather)
14002 assert(Entry->VectorizedValue &&
"Can't find vectorizable value");
14005 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
14006 Value *Scalar = Entry->Scalars[Lane];
14008 if (Entry->getOpcode() == Instruction::GetElementPtr &&
14009 !isa<GetElementPtrInst>(Scalar))
14012 Type *Ty = Scalar->getType();
14014 for (
User *U : Scalar->users()) {
14018 assert((getTreeEntry(U) ||
14019 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14020 (isa_and_nonnull<Instruction>(U) &&
14021 isDeleted(cast<Instruction>(U)))) &&
14022 "Deleting out-of-tree value");
14026 LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
14031 RemovedInsts.
push_back(cast<Instruction>(Scalar));
14037 if (
auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
14038 V->mergeDIAssignID(RemovedInsts);
14041 InstrElementSize.
clear();
14043 const TreeEntry &RootTE = *VectorizableTree.front().get();
14044 Value *Vec = RootTE.VectorizedValue;
14045 if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
14046 It != MinBWs.end() &&
14047 ReductionBitWidth != It->second.first) {
14050 ReductionRoot->getIterator());
14054 cast<VectorType>(Vec->
getType())->getElementCount()),
14055 It->second.second);
14062 <<
" gather sequences instructions.\n");
14069 Loop *L = LI->getLoopFor(
I->getParent());
14074 BasicBlock *PreHeader = L->getLoopPreheader();
14082 auto *OpI = dyn_cast<Instruction>(V);
14083 return OpI && L->contains(OpI);
14089 CSEBlocks.
insert(PreHeader);
14104 assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
14105 "Different nodes should have different DFS numbers");
14106 return A->getDFSNumIn() <
B->getDFSNumIn();
14116 if (I1->getType() != I2->getType())
14118 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
14119 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
14121 return I1->isIdenticalTo(I2);
14122 if (SI1->isIdenticalTo(SI2))
14124 for (
int I = 0, E = SI1->getNumOperands();
I < E; ++
I)
14125 if (SI1->getOperand(
I) != SI2->getOperand(
I))
14128 NewMask.
assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
14132 unsigned LastUndefsCnt = 0;
14133 for (
int I = 0, E = NewMask.
size();
I < E; ++
I) {
14139 NewMask[
I] != SM1[
I])
14142 NewMask[
I] = SM1[
I];
14146 return SM1.
size() - LastUndefsCnt > 1 &&
14150 SM1.
size() - LastUndefsCnt));
14156 for (
auto I = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
14159 "Worklist not sorted properly!");
14165 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
14166 !GatherShuffleExtractSeq.contains(&In))
14171 bool Replaced =
false;
14174 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
14175 DT->
dominates(V->getParent(), In.getParent())) {
14176 In.replaceAllUsesWith(V);
14178 if (
auto *SI = dyn_cast<ShuffleVectorInst>(V))
14179 if (!NewMask.
empty())
14180 SI->setShuffleMask(NewMask);
14184 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
14185 GatherShuffleExtractSeq.contains(V) &&
14186 IsIdenticalOrLessDefined(V, &In, NewMask) &&
14187 DT->
dominates(In.getParent(), V->getParent())) {
14189 V->replaceAllUsesWith(&In);
14191 if (
auto *SI = dyn_cast<ShuffleVectorInst>(&In))
14192 if (!NewMask.
empty())
14193 SI->setShuffleMask(NewMask);
14201 Visited.push_back(&In);
14206 GatherShuffleExtractSeq.clear();
14209BoUpSLP::ScheduleData *
14211 ScheduleData *Bundle =
nullptr;
14212 ScheduleData *PrevInBundle =
nullptr;
14213 for (
Value *V : VL) {
14216 ScheduleData *BundleMember = getScheduleData(V);
14218 "no ScheduleData for bundle member "
14219 "(maybe not in same basic block)");
14220 assert(BundleMember->isSchedulingEntity() &&
14221 "bundle member already part of other bundle");
14222 if (PrevInBundle) {
14223 PrevInBundle->NextInBundle = BundleMember;
14225 Bundle = BundleMember;
14229 BundleMember->FirstInBundle = Bundle;
14230 PrevInBundle = BundleMember;
14232 assert(Bundle &&
"Failed to find schedule bundle");
14238std::optional<BoUpSLP::ScheduleData *>
14240 const InstructionsState &S) {
14251 auto TryScheduleBundleImpl = [
this, OldScheduleEnd, SLP](
bool ReSchedule,
14252 ScheduleData *Bundle) {
14258 if (ScheduleEnd != OldScheduleEnd) {
14259 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode())
14260 doForAllOpcodes(
I, [](ScheduleData *SD) { SD->clearDependencies(); });
14265 <<
" in block " << BB->
getName() <<
"\n");
14266 calculateDependencies(Bundle,
true, SLP);
14271 initialFillReadyList(ReadyInsts);
14278 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14279 !ReadyInsts.empty()) {
14280 ScheduleData *Picked = ReadyInsts.pop_back_val();
14281 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14282 "must be ready to schedule");
14283 schedule(Picked, ReadyInsts);
14289 for (
Value *V : VL) {
14292 if (!extendSchedulingRegion(V, S)) {
14299 TryScheduleBundleImpl(
false,
nullptr);
14300 return std::nullopt;
14304 bool ReSchedule =
false;
14305 for (
Value *V : VL) {
14308 ScheduleData *BundleMember = getScheduleData(V);
14310 "no ScheduleData for bundle member (maybe not in same basic block)");
14314 ReadyInsts.remove(BundleMember);
14316 if (!BundleMember->IsScheduled)
14321 LLVM_DEBUG(
dbgs() <<
"SLP: reset schedule because " << *BundleMember
14322 <<
" was already scheduled\n");
14326 auto *Bundle = buildBundle(VL);
14327 TryScheduleBundleImpl(ReSchedule, Bundle);
14328 if (!Bundle->isReady()) {
14329 cancelScheduling(VL, S.OpValue);
14330 return std::nullopt;
14343 ScheduleData *Bundle = getScheduleData(OpValue);
14344 LLVM_DEBUG(
dbgs() <<
"SLP: cancel scheduling of " << *Bundle <<
"\n");
14345 assert(!Bundle->IsScheduled &&
14346 "Can't cancel bundle which is already scheduled");
14347 assert(Bundle->isSchedulingEntity() &&
14349 "tried to unbundle something which is not a bundle");
14352 if (Bundle->isReady())
14353 ReadyInsts.remove(Bundle);
14356 ScheduleData *BundleMember = Bundle;
14357 while (BundleMember) {
14358 assert(BundleMember->FirstInBundle == Bundle &&
"corrupt bundle links");
14359 BundleMember->FirstInBundle = BundleMember;
14360 ScheduleData *Next = BundleMember->NextInBundle;
14361 BundleMember->NextInBundle =
nullptr;
14362 BundleMember->TE =
nullptr;
14363 if (BundleMember->unscheduledDepsInBundle() == 0) {
14364 ReadyInsts.insert(BundleMember);
14366 BundleMember = Next;
14370BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14372 if (ChunkPos >= ChunkSize) {
14373 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
14376 return &(ScheduleDataChunks.back()[ChunkPos++]);
14379bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
Value *V,
14380 const InstructionsState &S) {
14381 if (getScheduleData(V,
isOneOf(S, V)))
14384 assert(
I &&
"bundle member must be an instruction");
14387 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14389 auto &&CheckScheduleForI = [
this, &S](
Instruction *
I) ->
bool {
14390 ScheduleData *ISD = getScheduleData(
I);
14393 assert(isInSchedulingRegion(ISD) &&
14394 "ScheduleData not in scheduling region");
14395 ScheduleData *SD = allocateScheduleDataChunks();
14397 SD->init(SchedulingRegionID, S.OpValue);
14398 ExtraScheduleDataMap[
I][S.OpValue] = SD;
14401 if (CheckScheduleForI(
I))
14403 if (!ScheduleStart) {
14405 initScheduleData(
I,
I->getNextNode(),
nullptr,
nullptr);
14407 ScheduleEnd =
I->getNextNode();
14409 CheckScheduleForI(
I);
14410 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
14411 LLVM_DEBUG(
dbgs() <<
"SLP: initialize schedule region to " << *
I <<
"\n");
14419 ++ScheduleStart->getIterator().getReverse();
14424 if (
auto *II = dyn_cast<IntrinsicInst>(&
I))
14425 return II->isAssumeLikeIntrinsic();
14428 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14429 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14430 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter !=
I &&
14432 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14433 LLVM_DEBUG(
dbgs() <<
"SLP: exceeded schedule region size limit\n");
14440 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14441 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14443 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter ==
I)) {
14444 assert(
I->getParent() == ScheduleStart->getParent() &&
14445 "Instruction is in wrong basic block.");
14446 initScheduleData(
I, ScheduleStart,
nullptr, FirstLoadStoreInRegion);
14449 CheckScheduleForI(
I);
14454 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter ==
I)) &&
14455 "Expected to reach top of the basic block or instruction down the "
14457 assert(
I->getParent() == ScheduleEnd->getParent() &&
14458 "Instruction is in wrong basic block.");
14459 initScheduleData(ScheduleEnd,
I->getNextNode(), LastLoadStoreInRegion,
14461 ScheduleEnd =
I->getNextNode();
14463 CheckScheduleForI(
I);
14464 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
14465 LLVM_DEBUG(
dbgs() <<
"SLP: extend schedule region end to " << *
I <<
"\n");
14469void BoUpSLP::BlockScheduling::initScheduleData(
Instruction *FromI,
14471 ScheduleData *PrevLoadStore,
14472 ScheduleData *NextLoadStore) {
14473 ScheduleData *CurrentLoadStore = PrevLoadStore;
14478 ScheduleData *SD = ScheduleDataMap.lookup(
I);
14480 SD = allocateScheduleDataChunks();
14481 ScheduleDataMap[
I] = SD;
14484 assert(!isInSchedulingRegion(SD) &&
14485 "new ScheduleData already in scheduling region");
14486 SD->init(SchedulingRegionID,
I);
14488 if (
I->mayReadOrWriteMemory() &&
14489 (!isa<IntrinsicInst>(
I) ||
14490 (cast<IntrinsicInst>(
I)->getIntrinsicID() != Intrinsic::sideeffect &&
14491 cast<IntrinsicInst>(
I)->getIntrinsicID() !=
14492 Intrinsic::pseudoprobe))) {
14494 if (CurrentLoadStore) {
14495 CurrentLoadStore->NextLoadStore = SD;
14497 FirstLoadStoreInRegion = SD;
14499 CurrentLoadStore = SD;
14502 if (
match(
I, m_Intrinsic<Intrinsic::stacksave>()) ||
14503 match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
14504 RegionHasStackSave =
true;
14506 if (NextLoadStore) {
14507 if (CurrentLoadStore)
14508 CurrentLoadStore->NextLoadStore = NextLoadStore;
14510 LastLoadStoreInRegion = CurrentLoadStore;
14514void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14515 bool InsertInReadyList,
14517 assert(SD->isSchedulingEntity());
14522 while (!WorkList.
empty()) {
14524 for (ScheduleData *BundleMember = SD; BundleMember;
14525 BundleMember = BundleMember->NextInBundle) {
14526 assert(isInSchedulingRegion(BundleMember));
14527 if (BundleMember->hasValidDependencies())
14532 BundleMember->Dependencies = 0;
14533 BundleMember->resetUnscheduledDeps();
14536 if (BundleMember->OpValue != BundleMember->Inst) {
14537 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14538 BundleMember->Dependencies++;
14539 ScheduleData *DestBundle = UseSD->FirstInBundle;
14540 if (!DestBundle->IsScheduled)
14541 BundleMember->incrementUnscheduledDeps(1);
14542 if (!DestBundle->hasValidDependencies())
14546 for (
User *U : BundleMember->Inst->
users()) {
14547 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14548 BundleMember->Dependencies++;
14549 ScheduleData *DestBundle = UseSD->FirstInBundle;
14550 if (!DestBundle->IsScheduled)
14551 BundleMember->incrementUnscheduledDeps(1);
14552 if (!DestBundle->hasValidDependencies())
14559 auto *DepDest = getScheduleData(
I);
14560 assert(DepDest &&
"must be in schedule window");
14561 DepDest->ControlDependencies.push_back(BundleMember);
14562 BundleMember->Dependencies++;
14563 ScheduleData *DestBundle = DepDest->FirstInBundle;
14564 if (!DestBundle->IsScheduled)
14565 BundleMember->incrementUnscheduledDeps(1);
14566 if (!DestBundle->hasValidDependencies())
14574 for (
Instruction *
I = BundleMember->Inst->getNextNode();
14575 I != ScheduleEnd;
I =
I->getNextNode()) {
14580 MakeControlDependent(
I);
14588 if (RegionHasStackSave) {
14592 if (
match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14593 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14594 for (
Instruction *
I = BundleMember->Inst->getNextNode();
14595 I != ScheduleEnd;
I =
I->getNextNode()) {
14596 if (
match(
I, m_Intrinsic<Intrinsic::stacksave>()) ||
14597 match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
14602 if (!isa<AllocaInst>(
I))
14606 MakeControlDependent(
I);
14615 if (isa<AllocaInst>(BundleMember->Inst) ||
14616 BundleMember->Inst->mayReadOrWriteMemory()) {
14617 for (
Instruction *
I = BundleMember->Inst->getNextNode();
14618 I != ScheduleEnd;
I =
I->getNextNode()) {
14619 if (!
match(
I, m_Intrinsic<Intrinsic::stacksave>()) &&
14620 !
match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
14624 MakeControlDependent(
I);
14631 ScheduleData *DepDest = BundleMember->NextLoadStore;
14636 "NextLoadStore list for non memory effecting bundle?");
14638 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14639 unsigned NumAliased = 0;
14640 unsigned DistToSrc = 1;
14642 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14643 assert(isInSchedulingRegion(DepDest));
14653 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14655 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14662 DepDest->MemoryDependencies.push_back(BundleMember);
14663 BundleMember->Dependencies++;
14664 ScheduleData *DestBundle = DepDest->FirstInBundle;
14665 if (!DestBundle->IsScheduled) {
14666 BundleMember->incrementUnscheduledDeps(1);
14668 if (!DestBundle->hasValidDependencies()) {
14691 if (InsertInReadyList && SD->isReady()) {
14692 ReadyInsts.insert(SD);
14699void BoUpSLP::BlockScheduling::resetSchedule() {
14701 "tried to reset schedule on block which has not been scheduled");
14702 for (
Instruction *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
14703 doForAllOpcodes(
I, [&](ScheduleData *SD) {
14704 assert(isInSchedulingRegion(SD) &&
14705 "ScheduleData not in scheduling region");
14706 SD->IsScheduled =
false;
14707 SD->resetUnscheduledDeps();
14710 ReadyInsts.clear();
14713void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14714 if (!BS->ScheduleStart)
14717 LLVM_DEBUG(
dbgs() <<
"SLP: schedule block " << BS->BB->getName() <<
"\n");
14724 BS->resetSchedule();
14731 struct ScheduleDataCompare {
14732 bool operator()(ScheduleData *SD1, ScheduleData *SD2)
const {
14733 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14736 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14741 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
14742 I =
I->getNextNode()) {
14743 BS->doForAllOpcodes(
I, [
this, &
Idx, BS](ScheduleData *SD) {
14744 TreeEntry *SDTE = getTreeEntry(SD->Inst);
14747 SD->isPartOfBundle() ==
14749 "scheduler and vectorizer bundle mismatch");
14750 SD->FirstInBundle->SchedulingPriority =
Idx++;
14752 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14753 BS->calculateDependencies(SD,
false,
this);
14756 BS->initialFillReadyList(ReadyInsts);
14758 Instruction *LastScheduledInst = BS->ScheduleEnd;
14761 while (!ReadyInsts.empty()) {
14762 ScheduleData *Picked = *ReadyInsts.begin();
14763 ReadyInsts.erase(ReadyInsts.begin());
14767 for (ScheduleData *BundleMember = Picked; BundleMember;
14768 BundleMember = BundleMember->NextInBundle) {
14772 LastScheduledInst = PickedInst;
14775 BS->schedule(Picked, ReadyInsts);
14779#ifdef EXPENSIVE_CHECKS
14783#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
14785 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
I =
I->getNextNode()) {
14786 BS->doForAllOpcodes(
I, [&](ScheduleData *SD) {
14787 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
14788 assert(SD->IsScheduled &&
"must be scheduled at this point");
14795 BS->ScheduleStart =
nullptr;
14802 if (
auto *Store = dyn_cast<StoreInst>(V))
14803 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
14805 if (
auto *IEI = dyn_cast<InsertElementInst>(V))
14808 auto E = InstrElementSize.
find(V);
14809 if (E != InstrElementSize.
end())
14818 if (
auto *
I = dyn_cast<Instruction>(V)) {
14826 Value *FirstNonBool =
nullptr;
14827 while (!Worklist.
empty()) {
14832 auto *Ty =
I->getType();
14833 if (isa<VectorType>(Ty))
14835 if (Ty != Builder.
getInt1Ty() && !FirstNonBool)
14842 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(
I))
14843 Width = std::max<unsigned>(Width,
DL->getTypeSizeInBits(Ty));
14851 for (
Use &U :
I->operands()) {
14852 if (
auto *J = dyn_cast<Instruction>(U.get()))
14853 if (Visited.
insert(J).second &&
14854 (isa<PHINode>(
I) || J->getParent() == Parent)) {
14858 if (!FirstNonBool && U.get()->getType() != Builder.
getInt1Ty())
14859 FirstNonBool = U.get();
14870 if (V->getType() == Builder.
getInt1Ty() && FirstNonBool)
14872 Width =
DL->getTypeSizeInBits(V->getType());
14876 InstrElementSize[
I] = Width;
14881bool BoUpSLP::collectValuesToDemote(
14882 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
14884 unsigned &MaxDepthLevel,
bool &IsProfitableToDemote,
14885 bool IsTruncRoot)
const {
14887 if (
all_of(E.Scalars, IsaPred<Constant>))
14890 unsigned OrigBitWidth =
DL->getTypeSizeInBits(E.Scalars.front()->getType());
14899 auto IsPotentiallyTruncated = [&](
Value *V,
unsigned &
BitWidth) ->
bool {
14908 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
14912 if (
auto *
I = dyn_cast<Instruction>(V)) {
14914 unsigned BitWidth2 =
14915 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
14916 while (!IsSigned && BitWidth2 < OrigBitWidth) {
14922 BitWidth1 = std::min(BitWidth1, BitWidth2);
14927 using namespace std::placeholders;
14928 auto FinalAnalysis = [&]() {
14929 if (!IsProfitableToDemote)
14932 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(
BitWidth)));
14934 if (Res && E.State == TreeEntry::NeedToGather) {
14938 for (
Value *V : E.Scalars) {
14939 auto *EE = dyn_cast<ExtractElementInst>(V);
14942 UniqueBases.
insert(EE->getVectorOperand());
14944 const unsigned VF = E.Scalars.size();
14945 Type *OrigScalarTy = E.Scalars.front()->getType();
14946 if (UniqueBases.
size() <= 2 ||
14954 if (E.State == TreeEntry::NeedToGather || !Visited.
insert(&E).second ||
14956 return all_of(V->users(), [&](User *U) {
14957 return isa<InsertElementInst>(U) && !getTreeEntry(U);
14960 return FinalAnalysis();
14963 return !all_of(V->users(), [=](User *U) {
14964 return getTreeEntry(U) ||
14965 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14966 (!isa<CmpInst>(U) && U->getType()->isSized() &&
14967 !U->getType()->isScalableTy() &&
14968 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
14969 }) && !IsPotentiallyTruncated(V,
BitWidth);
14974 bool &NeedToExit) {
14975 NeedToExit =
false;
14976 unsigned InitLevel = MaxDepthLevel;
14978 unsigned Level = InitLevel;
14979 if (!collectValuesToDemote(*
Op, IsProfitableToDemoteRoot,
BitWidth,
14980 ToDemote, Visited, Level, IsProfitableToDemote,
14982 if (!IsProfitableToDemote)
14985 if (!FinalAnalysis())
14989 MaxDepthLevel = std::max(MaxDepthLevel, Level);
14993 auto AttemptCheckBitwidth =
14996 NeedToExit =
false;
14997 unsigned BestFailBitwidth = 0;
14999 if (Checker(
BitWidth, OrigBitWidth))
15001 if (BestFailBitwidth == 0 && FinalAnalysis())
15005 if (BestFailBitwidth == 0) {
15016 auto TryProcessInstruction =
15023 (void)
for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15028 if (E.UserTreeIndices.size() > 1 &&
15029 !
all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15032 bool NeedToExit =
false;
15033 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
15037 if (!ProcessOperands(
Operands, NeedToExit))
15046 return IsProfitableToDemote;
15048 switch (E.getOpcode()) {
15052 case Instruction::Trunc:
15053 if (IsProfitableToDemoteRoot)
15054 IsProfitableToDemote =
true;
15055 return TryProcessInstruction(
BitWidth);
15056 case Instruction::ZExt:
15057 case Instruction::SExt:
15058 IsProfitableToDemote =
true;
15059 return TryProcessInstruction(
BitWidth);
15063 case Instruction::Add:
15064 case Instruction::Sub:
15065 case Instruction::Mul:
15066 case Instruction::And:
15067 case Instruction::Or:
15068 case Instruction::Xor: {
15069 return TryProcessInstruction(
15070 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
15072 case Instruction::Shl: {
15077 auto *I = cast<Instruction>(V);
15078 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15079 return AmtKnownBits.getMaxValue().ult(BitWidth);
15082 return TryProcessInstruction(
15083 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
15085 case Instruction::LShr: {
15089 auto LShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
15091 auto *I = cast<Instruction>(V);
15092 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15093 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15094 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15095 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
15096 SimplifyQuery(*DL));
15099 return TryProcessInstruction(
15100 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15103 case Instruction::AShr: {
15107 auto AShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
15109 auto *I = cast<Instruction>(V);
15110 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15111 unsigned ShiftedBits = OrigBitWidth - BitWidth;
15112 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15113 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15117 return TryProcessInstruction(
15118 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15121 case Instruction::UDiv:
15122 case Instruction::URem: {
15124 auto Checker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
15127 auto *I = cast<Instruction>(V);
15128 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15129 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
15130 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15133 return TryProcessInstruction(
15134 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
15138 case Instruction::Select: {
15139 return TryProcessInstruction(
15140 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
15145 case Instruction::PHI: {
15146 const unsigned NumOps = E.getNumOperands();
15149 std::bind(&BoUpSLP::getOperandEntry,
this, &E, _1));
15151 return TryProcessInstruction(
BitWidth, Ops);
15154 case Instruction::Call: {
15155 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
15159 if (
ID != Intrinsic::abs &&
ID != Intrinsic::smin &&
15160 ID != Intrinsic::smax &&
ID != Intrinsic::umin &&
ID != Intrinsic::umax)
15164 auto CompChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
15167 auto *I = cast<Instruction>(V);
15168 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
15169 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15170 return MaskedValueIsZero(I->getOperand(0), Mask,
15171 SimplifyQuery(*DL)) &&
15172 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15174 assert((
ID == Intrinsic::smin ||
ID == Intrinsic::smax) &&
15175 "Expected min/max intrinsics only.");
15176 unsigned SignBits = OrigBitWidth -
BitWidth;
15182 return SignBits <= Op0SignBits &&
15183 ((SignBits != Op0SignBits &&
15187 SignBits <= Op1SignBits &&
15188 ((SignBits != Op1SignBits &&
15193 if (
ID != Intrinsic::abs) {
15194 Operands.push_back(getOperandEntry(&E, 1));
15195 CallChecker = CompChecker;
15198 std::numeric_limits<InstructionCost::CostType>::max();
15200 unsigned VF = E.Scalars.size();
15210 if (
Cost < BestCost) {
15216 [[maybe_unused]]
bool NeedToExit;
15217 (void)AttemptCheckBitwidth(Checker, NeedToExit);
15227 return FinalAnalysis();
15234 bool IsStoreOrInsertElt =
15235 VectorizableTree.front()->getOpcode() == Instruction::Store ||
15236 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
15237 if ((IsStoreOrInsertElt || UserIgnoreList) &&
15238 ExtraBitWidthNodes.
size() <= 1 &&
15239 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
15240 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
15243 unsigned NodeIdx = 0;
15244 if (IsStoreOrInsertElt &&
15245 VectorizableTree.front()->State != TreeEntry::NeedToGather)
15249 if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
15250 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.
empty()) ||
15251 (NodeIdx != 0 &&
any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15254 static_cast<int>(NodeIdx);
15260 bool IsTruncRoot =
false;
15261 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
15263 if (NodeIdx != 0 &&
15264 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15265 VectorizableTree[NodeIdx]->
getOpcode() == Instruction::Trunc) {
15266 assert(IsStoreOrInsertElt &&
"Expected store/insertelement seeded graph.");
15267 IsTruncRoot =
true;
15269 IsProfitableToDemoteRoot =
true;
15274 if (AnalyzedMinBWVals.
contains(VectorizableTree[NodeIdx]->Scalars.front()))
15278 auto ComputeMaxBitWidth = [&](
const TreeEntry &E,
bool IsTopRoot,
15279 bool IsProfitableToDemoteRoot,
unsigned Opcode,
15280 unsigned Limit,
bool IsTruncRoot,
15281 bool IsSignedCmp) {
15283 unsigned VF = E.getVectorFactor();
15284 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
15285 if (!TreeRootIT || !Opcode)
15289 [&](
Value *V) { return AnalyzedMinBWVals.contains(V); }))
15292 unsigned NumParts =
15298 unsigned MaxBitWidth = 1u;
15306 bool IsKnownPositive = !IsSignedCmp &&
all_of(E.Scalars, [&](
Value *R) {
15307 KnownBits Known = computeKnownBits(R, *DL);
15308 return Known.isNonNegative();
15313 for (
Value *Root : E.Scalars) {
15316 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15332 if (!IsKnownPositive)
15336 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15338 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
15341 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15346 if (NumParts > 1 &&
15352 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
15353 Opcode == Instruction::SExt ||
15354 Opcode == Instruction::ZExt || NumParts > 1;
15359 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
15360 bool NeedToDemote = IsProfitableToDemote;
15362 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
15363 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
15365 (MaxDepthLevel <= Limit &&
15366 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
15367 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
15368 DL->getTypeSizeInBits(TreeRootIT) /
15369 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
15375 MaxBitWidth =
bit_ceil(MaxBitWidth);
15377 return MaxBitWidth;
15384 if (UserIgnoreList &&
15385 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
15386 for (
Value *V : *UserIgnoreList) {
15388 auto NumTypeBits =
DL->getTypeSizeInBits(V->getType());
15389 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15392 unsigned BitWidth2 = BitWidth1;
15395 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15397 ReductionBitWidth =
15398 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15400 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15401 ReductionBitWidth = 8;
15403 ReductionBitWidth =
bit_ceil(ReductionBitWidth);
15405 bool IsTopRoot = NodeIdx == 0;
15406 while (NodeIdx < VectorizableTree.size() &&
15407 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15408 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15411 IsTruncRoot =
true;
15413 bool IsSignedCmp =
false;
15414 while (NodeIdx < VectorizableTree.size()) {
15416 unsigned Limit = 2;
15417 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15419 ReductionBitWidth ==
15420 DL->getTypeSizeInBits(
15421 VectorizableTree.front()->Scalars.front()->getType()))
15423 unsigned MaxBitWidth = ComputeMaxBitWidth(
15424 *VectorizableTree[NodeIdx].
get(), IsTopRoot, IsProfitableToDemoteRoot,
15425 Opcode, Limit, IsTruncRoot, IsSignedCmp);
15426 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.
empty())) {
15427 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15428 ReductionBitWidth =
bit_ceil(MaxBitWidth);
15429 else if (MaxBitWidth == 0)
15430 ReductionBitWidth = 0;
15433 for (
unsigned Idx : RootDemotes) {
15435 uint32_t OrigBitWidth =
DL->getTypeSizeInBits(V->getType());
15436 if (OrigBitWidth > MaxBitWidth) {
15444 RootDemotes.clear();
15446 IsProfitableToDemoteRoot =
true;
15448 if (ExtraBitWidthNodes.
empty()) {
15449 NodeIdx = VectorizableTree.size();
15451 unsigned NewIdx = 0;
15453 NewIdx = *ExtraBitWidthNodes.
begin();
15454 ExtraBitWidthNodes.
erase(ExtraBitWidthNodes.
begin());
15455 }
while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.
empty());
15458 NodeIdx < VectorizableTree.size() &&
15459 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15462 EI.
UserTE->getOpcode() == Instruction::Trunc &&
15463 !EI.
UserTE->isAltShuffle();
15466 NodeIdx < VectorizableTree.size() &&
15467 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15469 return EI.
UserTE->getOpcode() == Instruction::ICmp &&
15471 auto *IC = dyn_cast<ICmpInst>(V);
15474 !isKnownNonNegative(IC->getOperand(0),
15475 SimplifyQuery(*DL)) ||
15476 !isKnownNonNegative(IC->getOperand(1),
15477 SimplifyQuery(*DL)));
15484 if (MaxBitWidth == 0 ||
15486 cast<IntegerType>(TreeRoot.
front()->getType())->getBitWidth()) {
15487 if (UserIgnoreList)
15494 for (
unsigned Idx : ToDemote) {
15495 TreeEntry *TE = VectorizableTree[
Idx].get();
15498 bool IsSigned =
any_of(TE->Scalars, [&](
Value *R) {
15499 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15517 bool Changed =
runImpl(
F, SE,
TTI, TLI, AA, LI, DT, AC, DB, ORE);
15542 DL = &
F.getParent()->getDataLayout();
15546 bool Changed =
false;
15552 dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
15557 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
15560 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
15564 BoUpSLP R(&
F, SE,
TTI, TLI, AA, LI, DT, AC, DB,
DL, ORE_);
15573 for (
auto *BB :
post_order(&
F.getEntryBlock())) {
15575 R.clearReductionData();
15576 collectSeedInstructions(BB);
15579 if (!Stores.empty()) {
15581 <<
" underlying objects.\n");
15582 Changed |= vectorizeStoreChains(R);
15586 Changed |= vectorizeChainsInBlock(BB, R);
15591 if (!GEPs.
empty()) {
15593 <<
" underlying objects.\n");
15594 Changed |= vectorizeGEPIndices(BB, R);
15599 R.optimizeGatherSequence();
15607 unsigned Idx,
unsigned MinVF,
15612 const unsigned Sz = R.getVectorElementSize(Chain[0]);
15613 unsigned VF = Chain.
size();
15627 for (
Value *V : Chain)
15628 ValOps.
insert(cast<StoreInst>(V)->getValueOperand());
15631 if (
all_of(ValOps, IsaPred<Instruction>) && ValOps.
size() > 1) {
15636 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
15637 (!S.MainOp->isSafeToRemove() ||
15640 return !isa<ExtractElementInst>(V) &&
15641 (V->getNumUses() > Chain.size() ||
15642 any_of(V->users(), [&](User *U) {
15643 return !Stores.contains(U);
15646 (ValOps.
size() > Chain.size() / 2 && !S.getOpcode())) {
15647 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
15651 if (
R.isLoadCombineCandidate(Chain))
15653 R.buildTree(Chain);
15655 if (
R.isTreeTinyAndNotFullyVectorizable()) {
15656 if (
R.isGathered(Chain.front()) ||
15657 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
15658 return std::nullopt;
15659 Size =
R.getTreeSize();
15662 R.reorderTopToBottom();
15663 R.reorderBottomToTop();
15664 R.buildExternalUses();
15666 R.computeMinimumValueSizes();
15667 R.transformNodes();
15669 Size =
R.getTreeSize();
15670 if (S.getOpcode() == Instruction::Load)
15678 using namespace ore;
15681 cast<StoreInst>(Chain[0]))
15682 <<
"Stores SLP vectorized with cost " <<
NV(
"Cost",
Cost)
15683 <<
" and with tree size "
15684 <<
NV(
"TreeSize",
R.getTreeSize()));
15698 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
15699 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
15700 unsigned Size = First ? Val.first : Val.second;
15712 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
15713 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
15714 unsigned P = First ? Val.first : Val.second;
15717 return V + (P - Mean) * (P - Mean);
15720 return Dev * 81 / (Mean * Mean) == 0;
15723bool SLPVectorizerPass::vectorizeStores(
15725 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
15730 bool Changed =
false;
15732 struct StoreDistCompare {
15733 bool operator()(
const std::pair<unsigned, int> &Op1,
15734 const std::pair<unsigned, int> &Op2)
const {
15735 return Op1.second < Op2.second;
15740 using StoreIndexToDistSet =
15741 std::set<std::pair<unsigned, int>, StoreDistCompare>;
15742 auto TryToVectorize = [&](
const StoreIndexToDistSet &Set) {
15747 if (
Operands.empty() ||
Data.second - PrevDist == 1) {
15749 PrevDist =
Data.second;
15750 if (
Idx != Set.size() - 1)
15755 Operands.push_back(Stores[DataVar.first]);
15756 PrevDist = DataVar.second;
15761 .
insert({Operands.front(),
15762 cast<StoreInst>(Operands.front())->getValueOperand(),
15764 cast<StoreInst>(Operands.back())->getValueOperand(),
15769 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
15770 unsigned EltSize =
R.getVectorElementSize(
Operands[0]);
15774 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
15775 unsigned MaxRegVF = MaxVF;
15777 Type *StoreTy =
Store->getValueOperand()->getType();
15778 Type *ValueTy = StoreTy;
15779 if (
auto *Trunc = dyn_cast<TruncInst>(
Store->getValueOperand()))
15780 ValueTy = Trunc->getSrcTy();
15781 if (ValueTy == StoreTy &&
15782 R.getVectorElementSize(
Store->getValueOperand()) <= EltSize)
15784 unsigned MinVF = std::max<unsigned>(
15786 R.getMinVF(
DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
15789 if (MaxVF < MinVF) {
15790 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
15792 <<
"MinVF (" << MinVF <<
")\n");
15796 unsigned NonPowerOf2VF = 0;
15801 unsigned CandVF =
Operands.size();
15803 NonPowerOf2VF = CandVF;
15808 unsigned Size = MinVF;
15810 VF =
Size > MaxVF ? NonPowerOf2VF :
Size;
15814 unsigned Repeat = 0;
15815 constexpr unsigned MaxAttempts = 4;
15817 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &
P) {
15818 P.first =
P.second = 1;
15821 auto IsNotVectorized = [](
bool First,
15822 const std::pair<unsigned, unsigned> &
P) {
15823 return First ?
P.first > 0 :
P.second > 0;
15825 auto IsVectorized = [](
bool First,
15826 const std::pair<unsigned, unsigned> &
P) {
15827 return First ?
P.first == 0 :
P.second == 0;
15829 auto VFIsProfitable = [](
bool First,
unsigned Size,
15830 const std::pair<unsigned, unsigned> &
P) {
15833 auto FirstSizeSame = [](
unsigned Size,
15834 const std::pair<unsigned, unsigned> &
P) {
15835 return Size ==
P.first;
15839 bool RepeatChanged =
false;
15840 bool AnyProfitableGraph;
15841 for (
unsigned Size : CandidateVFs) {
15842 AnyProfitableGraph =
false;
15843 unsigned StartIdx = std::distance(
15844 RangeSizes.begin(),
15845 find_if(RangeSizes, std::bind(IsNotVectorized,
Size >= MaxRegVF,
15846 std::placeholders::_1)));
15847 while (StartIdx <
End) {
15849 std::distance(RangeSizes.begin(),
15850 find_if(RangeSizes.drop_front(StartIdx),
15851 std::bind(IsVectorized,
Size >= MaxRegVF,
15852 std::placeholders::_1)));
15853 unsigned Sz = EndIdx >=
End ?
End : EndIdx;
15854 for (
unsigned Cnt = StartIdx; Cnt +
Size <= Sz;) {
15856 Size >= MaxRegVF)) {
15863 return cast<StoreInst>(V)
15864 ->getValueOperand()
15866 cast<StoreInst>(Slice.
front())
15867 ->getValueOperand()
15870 "Expected all operands of same type.");
15871 if (!NonSchedulable.empty()) {
15872 auto [NonSchedSizeMax, NonSchedSizeMin] =
15873 NonSchedulable.lookup(Slice.
front());
15874 if (NonSchedSizeMax > 0 && NonSchedSizeMin <=
Size) {
15875 Cnt += NonSchedSizeMax;
15880 std::optional<bool> Res =
15881 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
15885 .first->getSecond()
15893 AnyProfitableGraph = RepeatChanged = Changed =
true;
15897 [](std::pair<unsigned, unsigned> &
P) {
15898 P.first = P.second = 0;
15900 if (Cnt < StartIdx + MinVF) {
15901 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
15902 [](std::pair<unsigned, unsigned> &
P) {
15903 P.first = P.second = 0;
15905 StartIdx = Cnt +
Size;
15907 if (Cnt > Sz -
Size - MinVF) {
15909 [](std::pair<unsigned, unsigned> &
P) {
15910 P.first = P.second = 0;
15919 if (
Size > 2 && Res &&
15921 std::bind(VFIsProfitable,
Size >= MaxRegVF, TreeSize,
15922 std::placeholders::_1))) {
15928 if (
Size > MaxRegVF && TreeSize > 1 &&
15930 std::bind(FirstSizeSame, TreeSize,
15931 std::placeholders::_1))) {
15933 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
15939 [&](std::pair<unsigned, unsigned> &
P) {
15940 if (Size >= MaxRegVF)
15941 P.second = std::max(P.second, TreeSize);
15943 P.first = std::max(P.first, TreeSize);
15946 AnyProfitableGraph =
true;
15948 if (StartIdx >=
End)
15950 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
15951 AnyProfitableGraph =
true;
15952 StartIdx = std::distance(
15953 RangeSizes.begin(),
15954 find_if(RangeSizes.drop_front(Sz),
15955 std::bind(IsNotVectorized,
Size >= MaxRegVF,
15956 std::placeholders::_1)));
15958 if (!AnyProfitableGraph &&
Size >= MaxRegVF)
15962 if (
all_of(RangeSizes, [](
const std::pair<unsigned, unsigned> &
P) {
15963 return P.first == 0 &&
P.second == 0;
15967 if (Repeat >= MaxAttempts ||
15968 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
15970 constexpr unsigned StoresLimit = 64;
15971 const unsigned MaxTotalNum =
bit_floor(std::min<unsigned>(
15973 static_cast<unsigned>(
15976 RangeSizes.begin(),
15977 find_if(RangeSizes, std::bind(IsNotVectorized,
true,
15978 std::placeholders::_1))) +
15981 if (VF > MaxTotalNum || VF >= StoresLimit)
15983 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &
P) {
15985 P.first = std::max(
P.second,
P.first);
15989 CandidateVFs.clear();
15990 CandidateVFs.push_back(VF);
16037 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16039 Stores[Set.first]->getValueOperand()->getType(),
16040 Stores[Set.first]->getPointerOperand(),
16041 SI->getValueOperand()->getType(),
SI->getPointerOperand(), *
DL, *SE,
16045 auto It = Set.second.find(std::make_pair(
Idx, *Diff));
16046 if (It == Set.second.end()) {
16047 Set.second.emplace(
Idx, *Diff);
16051 TryToVectorize(Set.second);
16052 StoreIndexToDistSet PrevSet;
16053 PrevSet.swap(Set.second);
16055 Set.second.emplace(
Idx, 0);
16058 unsigned StartIdx = It->first + 1;
16063 for (
const std::pair<unsigned, int> &Pair :
reverse(PrevSet)) {
16065 if (Pair.first <= It->first ||
16066 VectorizedStores.
contains(Stores[Pair.first]))
16068 unsigned BI = Pair.first - StartIdx;
16069 UsedStores.set(BI);
16070 Dists[BI] = Pair.second - It->second;
16072 for (
unsigned I = StartIdx;
I <
Idx; ++
I) {
16073 unsigned BI =
I - StartIdx;
16074 if (UsedStores.test(BI))
16075 Set.second.emplace(
I, Dists[BI]);
16079 auto &Res = SortedStores.emplace_back();
16081 Res.second.emplace(
Idx, 0);
16087 SI->getValueOperand()->getType()) {
16088 for (
auto &Set : SortedStores)
16089 TryToVectorize(Set.second);
16090 SortedStores.clear();
16093 FillStoresSet(
I, SI);
16097 for (
auto &Set : SortedStores)
16098 TryToVectorize(Set.second);
16103void SLPVectorizerPass::collectSeedInstructions(
BasicBlock *BB) {
16114 if (
auto *SI = dyn_cast<StoreInst>(&
I)) {
16115 if (!
SI->isSimple())
16125 else if (
auto *
GEP = dyn_cast<GetElementPtrInst>(&
I)) {
16126 if (
GEP->getNumIndices() != 1)
16129 if (isa<Constant>(
Idx))
16133 if (
GEP->getType()->isVectorTy())
16145 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = "
16146 << VL.
size() <<
".\n");
16151 if (!S.getOpcode())
16157 for (
Value *V : VL) {
16158 Type *Ty =
V->getType();
16162 R.getORE()->emit([&]() {
16163 std::string TypeStr;
16167 <<
"Cannot SLP vectorize list: type "
16168 << rso.str() +
" is unsupported by vectorizer";
16174 unsigned Sz =
R.getVectorElementSize(I0);
16175 unsigned MinVF =
R.getMinVF(Sz);
16176 unsigned MaxVF = std::max<unsigned>(
llvm::bit_floor(VL.size()), MinVF);
16177 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
16179 R.getORE()->emit([&]() {
16181 <<
"Cannot SLP vectorize list: vectorization factor "
16182 <<
"less than 2 is not supported";
16187 bool Changed =
false;
16188 bool CandidateFound =
false;
16190 Type *ScalarTy = VL[0]->getType();
16191 if (
auto *IE = dyn_cast<InsertElementInst>(VL[0]))
16192 ScalarTy =
IE->getOperand(1)->getType();
16194 unsigned NextInst = 0, MaxInst = VL.size();
16195 for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
16202 for (
unsigned I = NextInst;
I < MaxInst; ++
I) {
16203 unsigned ActualVF = std::min(MaxInst -
I, VF);
16208 if (MaxVFOnly && ActualVF < MaxVF)
16210 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
16216 auto *
I = dyn_cast<Instruction>(V);
16217 return I &&
R.isDeleted(
I);
16221 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations "
16225 if (
R.isTreeTinyAndNotFullyVectorizable())
16227 R.reorderTopToBottom();
16228 R.reorderBottomToTop(
16229 !isa<InsertElementInst>(Ops.
front()) &&
16230 !
R.doesRootHaveInTreeUses());
16231 R.buildExternalUses();
16233 R.computeMinimumValueSizes();
16234 R.transformNodes();
16236 CandidateFound =
true;
16237 MinCost = std::min(MinCost,
Cost);
16240 <<
" for VF=" << ActualVF <<
"\n");
16244 cast<Instruction>(Ops[0]))
16245 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
16246 <<
" and with tree size "
16247 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
16258 if (!Changed && CandidateFound) {
16259 R.getORE()->emit([&]() {
16261 <<
"List vectorization was possible but not beneficial with cost "
16262 <<
ore::NV(
"Cost", MinCost) <<
" >= "
16265 }
else if (!Changed) {
16266 R.getORE()->emit([&]() {
16268 <<
"Cannot SLP vectorize list: vectorization was impossible"
16269 <<
" with available vectorization factors";
16279 if (!isa<BinaryOperator, CmpInst>(
I) || isa<VectorType>(
I->getType()))
16285 auto *Op0 = dyn_cast<Instruction>(
I->getOperand(0));
16286 auto *Op1 = dyn_cast<Instruction>(
I->getOperand(1));
16287 if (!Op0 || !Op1 || Op0->getParent() !=
P || Op1->getParent() !=
P)
16294 auto *
A = dyn_cast<BinaryOperator>(Op0);
16295 auto *
B = dyn_cast<BinaryOperator>(Op1);
16297 if (
A &&
B &&
B->hasOneUse()) {
16298 auto *B0 = dyn_cast<BinaryOperator>(
B->getOperand(0));
16299 auto *B1 = dyn_cast<BinaryOperator>(
B->getOperand(1));
16300 if (B0 && B0->getParent() ==
P)
16302 if (B1 && B1->getParent() ==
P)
16306 if (
B &&
A &&
A->hasOneUse()) {
16307 auto *A0 = dyn_cast<BinaryOperator>(
A->getOperand(0));
16308 auto *A1 = dyn_cast<BinaryOperator>(
A->getOperand(1));
16309 if (A0 && A0->getParent() ==
P)
16311 if (A1 && A1->getParent() ==
P)
16315 if (Candidates.
size() == 1)
16316 return tryToVectorizeList({Op0, Op1},
R);
16319 std::optional<int> BestCandidate =
R.findBestRootPair(Candidates);
16320 if (!BestCandidate)
16322 return tryToVectorizeList(
16323 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
R);
16357 ReductionOpsListType ReductionOps;
16369 bool IsSupportedHorRdxIdentityOp =
false;
16380 return isa<SelectInst>(
I) &&
16386 if (Kind == RecurKind::None)
16394 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16398 return I->getFastMathFlags().noNaNs();
16401 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16404 return I->isAssociative();
16413 return I->getOperand(2);
16414 return I->getOperand(
Index);
16422 case RecurKind::Or:
16428 case RecurKind::And:
16434 case RecurKind::Add:
16435 case RecurKind::Mul:
16436 case RecurKind::Xor:
16437 case RecurKind::FAdd:
16438 case RecurKind::FMul:
16441 case RecurKind::FMax:
16443 case RecurKind::FMin:
16445 case RecurKind::FMaximum:
16447 case RecurKind::FMinimum:
16449 case RecurKind::SMax:
16455 case RecurKind::SMin:
16461 case RecurKind::UMax:
16467 case RecurKind::UMin:
16482 const ReductionOpsListType &ReductionOps) {
16483 bool UseSelect = ReductionOps.size() == 2 ||
16485 (ReductionOps.size() == 1 &&
16486 any_of(ReductionOps.front(), IsaPred<SelectInst>));
16487 assert((!UseSelect || ReductionOps.size() != 2 ||
16488 isa<SelectInst>(ReductionOps[1][0])) &&
16489 "Expected cmp + select pairs for reduction");
16492 if (
auto *Sel = dyn_cast<SelectInst>(
Op)) {
16506 auto *
I = dyn_cast<Instruction>(V);
16508 return RecurKind::None;
16510 return RecurKind::Add;
16512 return RecurKind::Mul;
16515 return RecurKind::And;
16518 return RecurKind::Or;
16520 return RecurKind::Xor;
16522 return RecurKind::FAdd;
16524 return RecurKind::FMul;
16527 return RecurKind::FMax;
16529 return RecurKind::FMin;
16532 return RecurKind::FMaximum;
16534 return RecurKind::FMinimum;
16540 return RecurKind::SMax;
16542 return RecurKind::SMin;
16544 return RecurKind::UMax;
16546 return RecurKind::UMin;
16548 if (
auto *
Select = dyn_cast<SelectInst>(
I)) {
16570 if (!isa<ExtractElementInst>(
RHS) ||
16572 return RecurKind::None;
16574 if (!isa<ExtractElementInst>(
LHS) ||
16576 return RecurKind::None;
16578 if (!isa<ExtractElementInst>(
LHS) || !isa<ExtractElementInst>(
RHS))
16579 return RecurKind::None;
16583 return RecurKind::None;
16588 return RecurKind::None;
16591 return RecurKind::SMax;
16594 return RecurKind::SMin;
16597 return RecurKind::UMax;
16600 return RecurKind::UMin;
16603 return RecurKind::None;
16607 static unsigned getFirstOperandIndex(
Instruction *
I) {
16608 return isCmpSelMinMax(
I) ? 1 : 0;
16614 return isCmpSelMinMax(
I) ? 3 : 2;
16620 if (isCmpSelMinMax(
I) || isBoolLogicOp(
I)) {
16621 auto *Sel = cast<SelectInst>(
I);
16622 auto *
Cmp = dyn_cast<Instruction>(Sel->getCondition());
16623 return Sel->getParent() == BB &&
Cmp &&
Cmp->getParent() == BB;
16625 return I->getParent() == BB;
16629 static bool hasRequiredNumberOfUses(
bool IsCmpSelMinMax,
Instruction *
I) {
16630 if (IsCmpSelMinMax) {
16633 if (
auto *Sel = dyn_cast<SelectInst>(
I))
16634 return Sel->
hasNUses(2) && Sel->getCondition()->hasOneUse();
16635 return I->hasNUses(2);
16639 return I->hasOneUse();
16644 if (isCmpSelMinMax(
I))
16645 ReductionOps.assign(2, ReductionOpsType());
16647 ReductionOps.assign(1, ReductionOpsType());
16652 if (isCmpSelMinMax(
I)) {
16653 ReductionOps[0].emplace_back(cast<SelectInst>(
I)->getCondition());
16654 ReductionOps[1].emplace_back(
I);
16656 ReductionOps[0].emplace_back(
I);
16661 int Sz = Data.size();
16662 auto *
I = dyn_cast<Instruction>(Data.front());
16663 return Sz > 1 ||
isConstant(Data.front()) ||
16674 RdxKind = HorizontalReduction::getRdxKind(Root);
16675 if (!isVectorizable(RdxKind, Root))
16686 if (
auto *Sel = dyn_cast<SelectInst>(Root))
16687 if (!Sel->getCondition()->hasOneUse())
16690 ReductionRoot = Root;
16695 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
16704 for (
int I = getFirstOperandIndex(TreeN),
16705 End = getNumberOfOperands(TreeN);
16707 Value *EdgeVal = getRdxOperand(TreeN,
I);
16708 ReducedValsToOps[EdgeVal].push_back(TreeN);
16709 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
16712 !hasSameParent(EdgeInst, BB)) {
16713 ExtraArgs.push_back(EdgeVal);
16720 if (!EdgeInst ||
getRdxKind(EdgeInst) != RdxKind ||
16721 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
16722 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
16723 !isVectorizable(RdxKind, EdgeInst) ||
16724 (
R.isAnalyzedReductionRoot(EdgeInst) &&
16725 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
16726 PossibleReducedVals.push_back(EdgeVal);
16729 ReductionOps.push_back(EdgeInst);
16738 PossibleReducedVals;
16739 initReductionOps(Root);
16744 auto GenerateLoadsSubkey = [&](
size_t Key,
LoadInst *LI) {
16747 auto LIt = LoadsMap.
find(
Ptr);
16748 if (LIt != LoadsMap.
end()) {
16749 for (
LoadInst *RLI : LIt->second) {
16755 for (
LoadInst *RLI : LIt->second) {
16759 DoNotReverseVals.
insert(RLI);
16763 if (LIt->second.size() > 2) {
16765 hash_value(LIt->second.back()->getPointerOperand());
16766 DoNotReverseVals.
insert(LIt->second.back());
16771 LoadKeyUsed.
insert(Key);
16776 while (!Worklist.empty()) {
16781 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
16784 if (
Args.size() < 2) {
16785 addReductionOps(TreeN);
16787 if (!
Args.empty()) {
16788 assert(
Args.size() == 1 &&
"Expected only single argument.");
16789 ExtraArgs[TreeN] =
Args.front();
16793 for (
Value *V : PossibleRedVals) {
16797 ++PossibleReducedVals[
Key][
Idx]
16798 .
insert(std::make_pair(V, 0))
16801 Worklist.append(PossibleReductionOps.
rbegin(),
16802 PossibleReductionOps.
rend());
16807 ++PossibleReducedVals[
Key][
Idx]
16808 .
insert(std::make_pair(TreeN, 0))
16812 auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
16815 for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
16816 auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
16818 for (
auto It = PossibleRedVals.begin(),
E = PossibleRedVals.end();
16821 auto RedValsVect = It->second.takeVector();
16823 for (
const std::pair<Value *, unsigned> &Data : RedValsVect)
16824 PossibleRedValsVect.
back().append(Data.second, Data.first);
16826 stable_sort(PossibleRedValsVect, [](
const auto &P1,
const auto &P2) {
16827 return P1.size() > P2.size();
16831 if (isGoodForReduction(Data) ||
16832 (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
16833 isa<LoadInst>(ReducedVals[NewIdx].front()) &&
16835 cast<LoadInst>(Data.front())->getPointerOperand()) ==
16839 NewIdx = ReducedVals.
size();
16842 if (DoNotReverseVals.
contains(Data.front()))
16843 ReducedVals[NewIdx].
append(Data.begin(), Data.end());
16845 ReducedVals[NewIdx].
append(Data.rbegin(), Data.rend());
16847 ReducedVals.
emplace_back().append(Data.rbegin(), Data.rend());
16862 constexpr int ReductionLimit = 4;
16863 constexpr unsigned RegMaxNumber = 4;
16864 constexpr unsigned RedValsMaxNumber = 128;
16868 unsigned NumReducedVals =
16869 std::accumulate(ReducedVals.
begin(), ReducedVals.
end(), 0,
16871 if (!isGoodForReduction(Vals))
16873 return Num + Vals.size();
16875 if (NumReducedVals < ReductionLimit &&
16880 for (ReductionOpsType &RdxOps : ReductionOps)
16881 for (
Value *RdxOp : RdxOps)
16882 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16893 ReducedVals.
size() * ReducedVals.
front().size() + ExtraArgs.size());
16896 ExternallyUsedValues.
reserve(ExtraArgs.size() + 1);
16899 for (
const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16900 assert(Pair.first &&
"DebugLoc must be set.");
16901 ExternallyUsedValues[Pair.second].push_back(Pair.first);
16902 TrackedVals.
try_emplace(Pair.second, Pair.second);
16907 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
16908 assert(isa<SelectInst>(RdxRootInst) &&
16909 "Expected min/max reduction to have select root instruction");
16910 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
16911 assert(isa<Instruction>(ScalarCond) &&
16912 "Expected min/max reduction to have compare condition");
16913 return cast<Instruction>(ScalarCond);
16917 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
16918 if (VectorizedTree) {
16921 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
16922 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
16925 auto It = ReducedValsToOps.
find(Res);
16926 if (It != ReducedValsToOps.
end() &&
16932 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
16938 bool AnyBoolLogicOp =
16940 return isBoolLogicOp(cast<Instruction>(V));
16944 ExternallyUsedValues[ReductionRoot];
16946 ReductionOps.front().size());
16947 for (ReductionOpsType &RdxOps : ReductionOps)
16948 for (
Value *RdxOp : RdxOps) {
16951 IgnoreList.insert(RdxOp);
16956 for (
Value *U : IgnoreList)
16957 if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
16958 RdxFMF &= FPMO->getFastMathFlags();
16959 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
16964 for (
Value *V : Candidates)
16965 TrackedVals.try_emplace(V, V);
16971 Value *VectorizedTree =
nullptr;
16972 bool CheckForReusedReductionOps =
false;
16974 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
16980 for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
16981 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
16986 auto *Inst = dyn_cast<Instruction>(RdxVal);
16988 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
16989 (S.getOpcode() && !Inst))
16992 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
16994 bool ShuffledExtracts =
false;
16996 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
16998 InstructionsState NextS =
getSameOpcode(ReducedVals[
I + 1], TLI);
16999 if (NextS.getOpcode() == Instruction::ExtractElement &&
17000 !NextS.isAltShuffle()) {
17002 for (
Value *RV : ReducedVals[
I + 1]) {
17003 Value *RdxVal = TrackedVals.find(RV)->second;
17007 if (
auto *Inst = dyn_cast<Instruction>(RdxVal))
17008 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
17010 CommonCandidates.push_back(RdxVal);
17011 TrackedToOrig.try_emplace(RdxVal, RV);
17016 Candidates.
swap(CommonCandidates);
17017 ShuffledExtracts =
true;
17026 ++VectorizedVals.try_emplace(Candidates.
front(), 0).first->getSecond();
17028 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
17029 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
17030 if (
auto *ResI = dyn_cast<Instruction>(Res))
17031 V.analyzedReductionRoot(ResI);
17033 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17037 unsigned NumReducedVals = Candidates.
size();
17038 if (NumReducedVals < ReductionLimit &&
17045 IsSupportedHorRdxIdentityOp =
17047 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17050 if (IsSupportedHorRdxIdentityOp)
17051 for (
Value *V : Candidates)
17052 ++SameValuesCounter.
insert(std::make_pair(V, 0)).first->second;
17063 bool SameScaleFactor =
false;
17064 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17065 SameValuesCounter.
size() != Candidates.size();
17066 if (OptReusedScalars) {
17068 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17069 RdxKind == RecurKind::Xor) &&
17071 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
17072 return P.second == SameValuesCounter.
front().second;
17074 Candidates.resize(SameValuesCounter.
size());
17075 transform(SameValuesCounter, Candidates.begin(),
17076 [](
const auto &
P) { return P.first; });
17077 NumReducedVals = Candidates.size();
17079 if (NumReducedVals == 1) {
17080 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
17081 unsigned Cnt = SameValuesCounter.
lookup(OrigV);
17083 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
17084 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17085 VectorizedVals.try_emplace(OrigV, Cnt);
17090 unsigned MaxVecRegSize =
V.getMaxVecRegSize();
17091 unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
17095 unsigned ReduxWidth = std::min<unsigned>(
17097 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
17098 RegMaxNumber * RedValsMaxNumber));
17099 unsigned Start = 0;
17100 unsigned Pos = Start;
17102 unsigned PrevReduxWidth = ReduxWidth;
17103 bool CheckForReusedReductionOpsLocal =
false;
17104 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17105 &CheckForReusedReductionOpsLocal,
17106 &PrevReduxWidth, &
V,
17107 &IgnoreList](
bool IgnoreVL =
false) {
17108 bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
17109 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
17112 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17115 if (Pos < NumReducedVals - ReduxWidth + 1)
17116 return IsAnyRedOpGathered;
17119 return IsAnyRedOpGathered;
17121 bool AnyVectorized =
false;
17122 while (Pos < NumReducedVals - ReduxWidth + 1 &&
17123 ReduxWidth >= ReductionLimit) {
17126 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17128 CheckForReusedReductionOps =
true;
17131 PrevReduxWidth = ReduxWidth;
17134 if (
V.areAnalyzedReductionVals(VL)) {
17135 (void)AdjustReducedVals(
true);
17141 auto *RedValI = dyn_cast<Instruction>(RedVal);
17144 return V.isDeleted(RedValI);
17147 V.buildTree(VL, IgnoreList);
17148 if (
V.isTreeTinyAndNotFullyVectorizable(
true)) {
17149 if (!AdjustReducedVals())
17150 V.analyzedReductionVals(VL);
17153 if (
V.isLoadCombineReductionCandidate(RdxKind)) {
17154 if (!AdjustReducedVals())
17155 V.analyzedReductionVals(VL);
17158 V.reorderTopToBottom();
17160 V.reorderBottomToTop(
true);
17164 ExternallyUsedValues);
17165 for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
17166 if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
17168 for (
Value *V : ReducedVals[Cnt])
17169 if (isa<Instruction>(V))
17170 LocalExternallyUsedValues[TrackedVals[
V]];
17172 if (!IsSupportedHorRdxIdentityOp) {
17175 "Reused values counter map is not empty");
17176 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17177 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17179 Value *
V = Candidates[Cnt];
17180 Value *OrigV = TrackedToOrig.find(V)->second;
17181 ++SameValuesCounter[OrigV];
17187 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17188 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17190 Value *RdxVal = Candidates[Cnt];
17191 if (!Visited.
insert(RdxVal).second)
17195 if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
17196 LocalExternallyUsedValues[RdxVal];
17199 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17201 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
17202 if (NumOps != ReducedValsToOps.
find(OrigV)->second.size())
17203 LocalExternallyUsedValues[RdxVal];
17206 if (!IsSupportedHorRdxIdentityOp)
17207 SameValuesCounter.
clear();
17208 for (
Value *RdxVal : VL)
17209 if (RequiredExtract.
contains(RdxVal))
17210 LocalExternallyUsedValues[RdxVal];
17214 for (
const std::pair<Value *, Value *> &Pair : ReplacedExternals)
17215 ReplacementToExternal.
try_emplace(Pair.second, Pair.first);
17216 for (
const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
17218 auto RIt = ReplacementToExternal.
find(Ext);
17219 while (RIt != ReplacementToExternal.
end()) {
17221 RIt = ReplacementToExternal.
find(Ext);
17223 auto *It = ExternallyUsedValues.
find(Ext);
17224 if (It == ExternallyUsedValues.
end())
17226 LocalExternallyUsedValues[Pair.second].append(It->second);
17228 V.buildExternalUses(LocalExternallyUsedValues);
17230 V.computeMinimumValueSizes();
17231 V.transformNodes();
17236 getReductionCost(
TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
17239 <<
" for reduction\n");
17243 V.getORE()->emit([&]() {
17245 SV_NAME,
"HorSLPNotBeneficial",
17246 ReducedValsToOps.
find(VL[0])->second.front())
17247 <<
"Vectorizing horizontal reduction is possible "
17248 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
17249 <<
" and threshold "
17252 if (!AdjustReducedVals())
17253 V.analyzedReductionVals(VL);
17257 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:"
17258 <<
Cost <<
". (HorRdx)\n");
17259 V.getORE()->emit([&]() {
17261 SV_NAME,
"VectorizedHorizontalReduction",
17262 ReducedValsToOps.
find(VL[0])->second.front())
17263 <<
"Vectorized horizontal reduction with cost "
17264 <<
ore::NV(
"Cost",
Cost) <<
" and with tree size "
17265 <<
ore::NV(
"TreeSize",
V.getTreeSize());
17272 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
17274 if (IsCmpSelMinMax)
17275 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17278 Value *VectorizedRoot =
V.vectorizeTree(LocalExternallyUsedValues,
17279 ReplacedExternals, InsertPt);
17286 if ((isBoolLogicOp(RdxRootInst) ||
17287 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17289 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
17292 if (OptReusedScalars && !SameScaleFactor) {
17294 emitReusedOps(VectorizedRoot, Builder,
V.getRootNodeScalars(),
17295 SameValuesCounter, TrackedToOrig);
17298 Value *ReducedSubTree =
17299 emitReduction(VectorizedRoot, Builder, ReduxWidth,
TTI);
17300 if (ReducedSubTree->
getType() != VL.front()->getType()) {
17302 ReducedSubTree, VL.front()->getType(),
any_of(VL, [&](
Value *R) {
17304 R, cast<Instruction>(ReductionOps.front().front())
17306 ->getDataLayout());
17314 if (OptReusedScalars && SameScaleFactor)
17315 ReducedSubTree = emitScaleForReusedOps(
17316 ReducedSubTree, Builder, SameValuesCounter.
front().second);
17318 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17320 for (
Value *RdxVal : VL) {
17321 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17322 if (IsSupportedHorRdxIdentityOp) {
17323 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17326 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17327 if (!
V.isVectorized(RdxVal))
17328 RequiredExtract.
insert(RdxVal);
17333 AnyVectorized =
true;
17335 if (OptReusedScalars && !AnyVectorized) {
17336 for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
17337 Value *RedVal = emitScaleForReusedOps(
P.first, Builder,
P.second);
17338 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17339 Value *OrigV = TrackedToOrig.find(
P.first)->second;
17340 VectorizedVals.try_emplace(OrigV,
P.second);
17345 if (VectorizedTree) {
17366 if (!AnyBoolLogicOp)
17368 if (isBoolLogicOp(RedOp1) &&
17369 ((!InitStep &&
LHS == VectorizedTree) ||
17372 if (isBoolLogicOp(RedOp2) && ((!InitStep &&
RHS == VectorizedTree) ||
17373 getRdxOperand(RedOp2, 0) ==
RHS ||
17378 if (
LHS != VectorizedTree)
17389 unsigned Sz = InstVals.
size();
17392 for (
unsigned I = 0,
E = (Sz / 2) * 2;
I <
E;
I += 2) {
17395 Value *RdxVal1 = InstVals[
I].second;
17396 Value *StableRdxVal1 = RdxVal1;
17397 auto It1 = TrackedVals.find(RdxVal1);
17398 if (It1 != TrackedVals.end())
17399 StableRdxVal1 = It1->second;
17400 Value *RdxVal2 = InstVals[
I + 1].second;
17401 Value *StableRdxVal2 = RdxVal2;
17402 auto It2 = TrackedVals.find(RdxVal2);
17403 if (It2 != TrackedVals.end())
17404 StableRdxVal2 = It2->second;
17408 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
17410 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17411 StableRdxVal2,
"op.rdx", ReductionOps);
17412 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
17415 ExtraReds[Sz / 2] = InstVals.
back();
17419 ExtraReductions.
emplace_back(cast<Instruction>(ReductionRoot),
17423 for (
Value *RdxVal : Candidates) {
17424 if (!Visited.
insert(RdxVal).second)
17426 unsigned NumOps = VectorizedVals.lookup(RdxVal);
17433 for (
auto &Pair : ExternallyUsedValues) {
17435 for (
auto *
I : Pair.second)
17439 bool InitStep =
true;
17440 while (ExtraReductions.
size() > 1) {
17441 VectorizedTree = ExtraReductions.
front().second;
17443 FinalGen(ExtraReductions, InitStep);
17444 ExtraReductions.
swap(NewReds);
17447 VectorizedTree = ExtraReductions.
front().second;
17449 ReductionRoot->replaceAllUsesWith(VectorizedTree);
17458 IgnoreSet.
insert(RdxOps.begin(), RdxOps.end());
17465 for (
auto *U :
Ignore->users()) {
17467 "All users must be either in the reduction ops list.");
17470 if (!
Ignore->use_empty()) {
17472 Ignore->replaceAllUsesWith(Undef);
17474 V.eraseInstruction(cast<Instruction>(
Ignore));
17477 }
else if (!CheckForReusedReductionOps) {
17478 for (ReductionOpsType &RdxOps : ReductionOps)
17479 for (
Value *RdxOp : RdxOps)
17480 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17482 return VectorizedTree;
17489 bool IsCmpSelMinMax,
unsigned ReduxWidth,
17492 Type *ScalarTy = ReducedVals.
front()->getType();
17501 int Cnt = ReducedVals.
size();
17502 for (
Value *RdxVal : ReducedVals) {
17507 Cost += GenCostFn();
17512 auto *RdxOp = cast<Instruction>(U);
17513 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
17521 Cost += ScalarCost;
17523 Cost += GenCostFn();
17528 case RecurKind::Add:
17529 case RecurKind::Mul:
17530 case RecurKind::Or:
17531 case RecurKind::And:
17532 case RecurKind::Xor:
17533 case RecurKind::FAdd:
17534 case RecurKind::FMul: {
17539 ScalarCost = EvaluateScalarCost([&]() {
17544 case RecurKind::FMax:
17545 case RecurKind::FMin:
17546 case RecurKind::FMaximum:
17547 case RecurKind::FMinimum:
17548 case RecurKind::SMax:
17549 case RecurKind::SMin:
17550 case RecurKind::UMax:
17551 case RecurKind::UMin: {
17555 ScalarCost = EvaluateScalarCost([&]() {
17565 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " << VectorCost - ScalarCost
17567 <<
" (It is a splitting reduction)\n");
17568 return VectorCost - ScalarCost;
17574 assert(VectorizedValue &&
"Need to have a vectorized tree node");
17576 "We only handle power-of-two reductions for now");
17577 assert(RdxKind != RecurKind::FMulAdd &&
17578 "A call to the llvm.fmuladd intrinsic is not handled yet");
17580 ++NumVectorInstructions;
17587 assert(IsSupportedHorRdxIdentityOp &&
17588 "The optimization of matched scalar identity horizontal reductions "
17589 "must be supported.");
17591 case RecurKind::Add: {
17593 Value *Scale = ConstantInt::get(VectorizedValue->
getType(), Cnt);
17595 << VectorizedValue <<
". (HorRdx)\n");
17596 return Builder.
CreateMul(VectorizedValue, Scale);
17598 case RecurKind::Xor: {
17600 LLVM_DEBUG(
dbgs() <<
"SLP: Xor " << Cnt <<
"of " << VectorizedValue
17601 <<
". (HorRdx)\n");
17604 return VectorizedValue;
17606 case RecurKind::FAdd: {
17608 Value *Scale = ConstantFP::get(VectorizedValue->
getType(), Cnt);
17610 << VectorizedValue <<
". (HorRdx)\n");
17611 return Builder.
CreateFMul(VectorizedValue, Scale);
17613 case RecurKind::And:
17614 case RecurKind::Or:
17615 case RecurKind::SMax:
17616 case RecurKind::SMin:
17617 case RecurKind::UMax:
17618 case RecurKind::UMin:
17619 case RecurKind::FMax:
17620 case RecurKind::FMin:
17621 case RecurKind::FMaximum:
17622 case RecurKind::FMinimum:
17624 return VectorizedValue;
17625 case RecurKind::Mul:
17626 case RecurKind::FMul:
17627 case RecurKind::FMulAdd:
17628 case RecurKind::IAnyOf:
17629 case RecurKind::FAnyOf:
17630 case RecurKind::None:
17642 assert(IsSupportedHorRdxIdentityOp &&
17643 "The optimization of matched scalar identity horizontal reductions "
17644 "must be supported.");
17645 auto *VTy = cast<FixedVectorType>(VectorizedValue->
getType());
17646 if (VTy->getElementType() != VL.
front()->getType()) {
17652 R, cast<Instruction>(ReductionOps.front().front())
17654 ->getDataLayout());
17659 case RecurKind::Add: {
17662 for (
Value *V : VL) {
17663 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
17664 Vals.
push_back(ConstantInt::get(
V->getType(), Cnt,
false));
17668 << VectorizedValue <<
". (HorRdx)\n");
17669 return Builder.
CreateMul(VectorizedValue, Scale);
17671 case RecurKind::And:
17672 case RecurKind::Or:
17675 <<
". (HorRdx)\n");
17676 return VectorizedValue;
17677 case RecurKind::SMax:
17678 case RecurKind::SMin:
17679 case RecurKind::UMax:
17680 case RecurKind::UMin:
17681 case RecurKind::FMax:
17682 case RecurKind::FMin:
17683 case RecurKind::FMaximum:
17684 case RecurKind::FMinimum:
17687 <<
". (HorRdx)\n");
17688 return VectorizedValue;
17689 case RecurKind::Xor: {
17695 cast<FixedVectorType>(VectorizedValue->
getType())->getNumElements(),
17697 std::iota(
Mask.begin(),
Mask.end(), 0);
17698 bool NeedShuffle =
false;
17699 for (
unsigned I = 0, VF = VL.size();
I < VF; ++
I) {
17701 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
17702 if (Cnt % 2 == 0) {
17704 NeedShuffle =
true;
17710 dbgs() <<
"> of " << VectorizedValue <<
". (HorRdx)\n");
17714 ConstantVector::getNullValue(VectorizedValue->
getType()),
Mask);
17715 return VectorizedValue;
17717 case RecurKind::FAdd: {
17720 for (
Value *V : VL) {
17721 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
17722 Vals.
push_back(ConstantFP::get(
V->getType(), Cnt));
17725 return Builder.
CreateFMul(VectorizedValue, Scale);
17727 case RecurKind::Mul:
17728 case RecurKind::FMul:
17729 case RecurKind::FMulAdd:
17730 case RecurKind::IAnyOf:
17731 case RecurKind::FAnyOf:
17732 case RecurKind::None:
17742 return HorizontalReduction::getRdxKind(V);
17745 if (
auto *IE = dyn_cast<InsertElementInst>(InsertInst))
17746 return cast<FixedVectorType>(IE->getType())->getNumElements();
17748 unsigned AggregateSize = 1;
17749 auto *
IV = cast<InsertValueInst>(InsertInst);
17750 Type *CurrentType =
IV->getType();
17752 if (
auto *ST = dyn_cast<StructType>(CurrentType)) {
17753 for (
auto *Elt : ST->elements())
17754 if (Elt != ST->getElementType(0))
17755 return std::nullopt;
17756 AggregateSize *= ST->getNumElements();
17757 CurrentType = ST->getElementType(0);
17758 }
else if (
auto *AT = dyn_cast<ArrayType>(CurrentType)) {
17759 AggregateSize *= AT->getNumElements();
17760 CurrentType = AT->getElementType();
17761 }
else if (
auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
17762 AggregateSize *= VT->getNumElements();
17763 return AggregateSize;
17765 return AggregateSize;
17767 return std::nullopt;
17776 unsigned OperandOffset) {
17779 std::optional<unsigned> OperandIndex =
17783 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
17785 BuildVectorOpds, InsertElts, *OperandIndex);
17788 BuildVectorOpds[*OperandIndex] = InsertedOperand;
17789 InsertElts[*OperandIndex] = LastInsertInst;
17791 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->
getOperand(0));
17792 }
while (LastInsertInst !=
nullptr &&
17793 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
17816 assert((isa<InsertElementInst>(LastInsertInst) ||
17817 isa<InsertValueInst>(LastInsertInst)) &&
17818 "Expected insertelement or insertvalue instruction!");
17821 "Expected empty result vectors!");
17824 if (!AggregateSize)
17826 BuildVectorOpds.
resize(*AggregateSize);
17827 InsertElts.
resize(*AggregateSize);
17832 if (BuildVectorOpds.
size() >= 2)
17850 auto DominatedReduxValue = [&](
Value *R) {
17851 return isa<Instruction>(R) &&
17852 DT->
dominates(
P->getParent(), cast<Instruction>(R)->getParent());
17858 if (
P->getIncomingBlock(0) == ParentBB) {
17859 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(0));
17860 }
else if (
P->getIncomingBlock(1) == ParentBB) {
17861 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(1));
17864 if (Rdx && DominatedReduxValue(Rdx))
17877 if (
P->getIncomingBlock(0) == BBLatch) {
17878 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(0));
17879 }
else if (
P->getIncomingBlock(1) == BBLatch) {
17880 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(1));
17883 if (Rdx && DominatedReduxValue(Rdx))
17917 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
17918 isa<IntrinsicInst>(Root)) &&
17919 "Expected binop, select, or intrinsic for reduction matching");
17921 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root));
17923 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
17925 return dyn_cast<Instruction>(
RHS);
17927 return dyn_cast<Instruction>(
LHS);
17934 Value *Op0 =
nullptr;
17935 Value *Op1 =
nullptr;
17938 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
17944 Value *B0 =
nullptr, *B1 =
nullptr;
17949bool SLPVectorizerPass::vectorizeHorReduction(
17954 bool TryOperandsAsNewSeeds =
P && isa<BinaryOperator>(Root);
17956 if (Root->
getParent() != BB || isa<PHINode>(Root))
17960 auto SelectRoot = [&]() {
17979 std::queue<std::pair<Instruction *, unsigned>>
Stack;
17980 Stack.emplace(SelectRoot(), 0);
17984 if (
R.isAnalyzedReductionRoot(Inst))
17989 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *
DL, *TLI))
17991 return HorRdx.tryToReduce(R, *
DL,
TTI, *TLI);
17993 auto TryAppendToPostponedInsts = [&](
Instruction *FutureSeed) {
17994 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
18001 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
18006 while (!
Stack.empty()) {
18009 std::tie(Inst, Level) =
Stack.front();
18014 if (
R.isDeleted(Inst))
18016 if (
Value *VectorizedV = TryToReduce(Inst)) {
18018 if (
auto *
I = dyn_cast<Instruction>(VectorizedV)) {
18020 Stack.emplace(
I, Level);
18025 if (!TryAppendToPostponedInsts(Inst)) {
18036 if (VisitedInstrs.
insert(
Op).second)
18037 if (
auto *
I = dyn_cast<Instruction>(
Op))
18040 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(
I) &&
18041 !
R.isDeleted(
I) &&
I->getParent() == BB)
18042 Stack.emplace(
I, Level);
18051 bool Res = vectorizeHorReduction(
P, Root, BB, R,
TTI, PostponedInsts);
18052 Res |= tryToVectorize(PostponedInsts, R);
18059 for (
Value *V : Insts)
18060 if (
auto *Inst = dyn_cast<Instruction>(V); Inst && !
R.isDeleted(Inst))
18061 Res |= tryToVectorize(Inst, R);
18065bool SLPVectorizerPass::vectorizeInsertValueInst(
InsertValueInst *IVI,
18067 if (!
R.canMapToVector(IVI->
getType()))
18075 LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IVI <<
"\n");
18077 return tryToVectorizeList(BuildVectorOpds, R);
18086 (
llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
18090 LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IEI <<
"\n");
18091 return tryToVectorizeList(BuildVectorInsts, R);
18094template <
typename T>
18099 bool MaxVFOnly,
BoUpSLP &R) {
18100 bool Changed =
false;
18109 auto *SameTypeIt = IncIt;
18110 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
18114 unsigned NumElts = (SameTypeIt - IncIt);
18115 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize starting at nodes ("
18116 << NumElts <<
")\n");
18127 TryToVectorizeHelper(
ArrayRef(IncIt, NumElts), MaxVFOnly)) {
18133 auto GetMinNumElements = [&R](
Value *V) {
18134 unsigned EltSize = R.getVectorElementSize(V);
18135 return std::max(2U, R.getMaxVecRegSize() / EltSize);
18137 if (NumElts < GetMinNumElements(*IncIt) &&
18138 (Candidates.
empty() ||
18139 Candidates.
front()->getType() == (*IncIt)->getType())) {
18140 Candidates.
append(IncIt, std::next(IncIt, NumElts));
18144 if (Candidates.
size() > 1 &&
18145 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18146 if (TryToVectorizeHelper(Candidates,
false)) {
18149 }
else if (MaxVFOnly) {
18151 for (
auto *It = Candidates.
begin(), *
End = Candidates.
end();
18153 auto *SameTypeIt = It;
18154 while (SameTypeIt !=
End && AreCompatible(*SameTypeIt, *It))
18156 unsigned NumElts = (SameTypeIt - It);
18157 if (NumElts > 1 && TryToVectorizeHelper(
ArrayRef(It, NumElts),
18163 Candidates.
clear();
18167 IncIt = SameTypeIt;
18179template <
bool IsCompatibility>
18184 "Expected valid element types only.");
18186 return IsCompatibility;
18187 auto *CI1 = cast<CmpInst>(V);
18188 auto *CI2 = cast<CmpInst>(V2);
18189 if (CI1->getOperand(0)->getType()->getTypeID() <
18191 return !IsCompatibility;
18192 if (CI1->getOperand(0)->getType()->getTypeID() >
18201 if (BasePred1 < BasePred2)
18202 return !IsCompatibility;
18203 if (BasePred1 > BasePred2)
18206 bool CI1Preds = Pred1 == BasePred1;
18207 bool CI2Preds = Pred2 == BasePred1;
18208 for (
int I = 0, E = CI1->getNumOperands();
I < E; ++
I) {
18209 auto *Op1 = CI1->getOperand(CI1Preds ?
I : E -
I - 1);
18210 auto *Op2 = CI2->
getOperand(CI2Preds ?
I : E -
I - 1);
18214 return !IsCompatibility;
18217 if (
auto *I1 = dyn_cast<Instruction>(Op1))
18218 if (
auto *I2 = dyn_cast<Instruction>(Op2)) {
18219 if (IsCompatibility) {
18220 if (I1->getParent() != I2->getParent())
18227 return NodeI2 !=
nullptr;
18230 assert((NodeI1 == NodeI2) ==
18232 "Different nodes should have different DFS numbers");
18233 if (NodeI1 != NodeI2)
18237 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18239 if (IsCompatibility)
18241 if (I1->getOpcode() != I2->getOpcode())
18242 return I1->getOpcode() < I2->getOpcode();
18245 return IsCompatibility;
18248template <
typename ItT>
18251 bool Changed =
false;
18254 if (
R.isDeleted(
I))
18257 if (
auto *RootOp = dyn_cast<Instruction>(
Op))
18258 Changed |= vectorizeRootInstruction(
nullptr, RootOp, BB, R,
TTI);
18262 if (
R.isDeleted(
I))
18264 Changed |= tryToVectorize(
I, R);
18271 return compareCmp<false>(V, V2, *TLI, *DT);
18274 auto AreCompatibleCompares = [&](
Value *V1,
Value *
V2) {
18277 return compareCmp<true>(V1, V2, *TLI, *DT);
18284 if (Vals.
size() <= 1)
18286 Changed |= tryToVectorizeSequence<Value>(
18287 Vals, CompareSorter, AreCompatibleCompares,
18290 bool ArePossiblyReducedInOtherBlock =
any_of(Candidates, [](
Value *V) {
18292 auto *Select = dyn_cast<SelectInst>(U);
18294 Select->getParent() != cast<Instruction>(V)->getParent();
18297 if (ArePossiblyReducedInOtherBlock)
18299 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18305bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18307 assert(
all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18308 "This function only accepts Insert instructions");
18309 bool OpsChanged =
false;
18312 for (
auto *
I :
reverse(Instructions)) {
18313 if (
R.isDeleted(
I))
18315 OpsChanged |= vectorizeHorReduction(
nullptr,
I, BB, R,
TTI, PostponedInsts);
18318 for (
auto *
I :
reverse(Instructions)) {
18319 if (
R.isDeleted(
I) || isa<CmpInst>(
I))
18321 if (
auto *LastInsertValue = dyn_cast<InsertValueInst>(
I)) {
18322 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
18323 }
else if (
auto *LastInsertElem = dyn_cast<InsertElementInst>(
I)) {
18324 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
18328 OpsChanged |= tryToVectorize(PostponedInsts, R);
18335 bool Changed =
false;
18342 auto PHICompare = [
this, &PHIToOpcodes](
Value *V1,
Value *
V2) {
18345 "Expected vectorizable types only.");
18354 if (Opcodes1.
size() < Opcodes2.
size())
18356 if (Opcodes1.
size() > Opcodes2.
size())
18358 for (
int I = 0, E = Opcodes1.
size();
I < E; ++
I) {
18361 auto *
I1 = dyn_cast<Instruction>(Opcodes1[
I]);
18362 auto *I2 = dyn_cast<Instruction>(Opcodes2[
I]);
18367 return NodeI2 !=
nullptr;
18370 assert((NodeI1 == NodeI2) ==
18372 "Different nodes should have different DFS numbers");
18373 if (NodeI1 != NodeI2)
18376 if (S.getOpcode() && !S.isAltShuffle())
18378 return I1->getOpcode() < I2->getOpcode();
18387 bool C1 = isa<Constant>(Opcodes1[
I]) && !isa<UndefValue>(Opcodes1[
I]);
18388 bool C2 = isa<Constant>(Opcodes2[
I]) && !isa<UndefValue>(Opcodes2[
I]);
18396 bool U1 = isa<UndefValue>(Opcodes1[
I]);
18397 bool U2 = isa<UndefValue>(Opcodes2[
I]);
18401 auto ValID1 = Opcodes1[
I]->getValueID();
18402 auto ValID2 = Opcodes2[
I]->getValueID();
18403 if (ValID1 == ValID2)
18405 if (ValID1 < ValID2)
18407 if (ValID1 > ValID2)
18416 assert(U1 && U2 &&
"The only thing left should be undef & undef.");
18421 auto AreCompatiblePHIs = [&PHIToOpcodes,
this](
Value *V1,
Value *
V2) {
18424 if (V1->getType() !=
V2->getType())
18428 if (Opcodes1.
size() != Opcodes2.
size())
18430 for (
int I = 0, E = Opcodes1.
size();
I < E; ++
I) {
18432 if (isa<UndefValue>(Opcodes1[
I]) || isa<UndefValue>(Opcodes2[
I]))
18434 if (
auto *I1 = dyn_cast<Instruction>(Opcodes1[
I]))
18435 if (
auto *I2 = dyn_cast<Instruction>(Opcodes2[
I])) {
18436 if (
I1->getParent() != I2->getParent())
18443 if (isa<Constant>(Opcodes1[
I]) && isa<Constant>(Opcodes2[
I]))
18445 if (Opcodes1[
I]->getValueID() != Opcodes2[
I]->getValueID())
18451 bool HaveVectorizedPhiNodes =
false;
18456 auto *
P = dyn_cast<PHINode>(&
I);
18462 if (!VisitedInstrs.
count(
P) && !
R.isDeleted(
P) &&
18475 if (!Opcodes.
empty())
18479 while (!Nodes.empty()) {
18480 auto *
PHI = cast<PHINode>(Nodes.pop_back_val());
18483 for (
Value *V :
PHI->incoming_values()) {
18484 if (
auto *PHI1 = dyn_cast<PHINode>((V))) {
18485 Nodes.push_back(PHI1);
18493 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18494 Incoming, PHICompare, AreCompatiblePHIs,
18496 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18499 Changed |= HaveVectorizedPhiNodes;
18501 }
while (HaveVectorizedPhiNodes);
18503 VisitedInstrs.
clear();
18505 InstSetVector PostProcessInserts;
18509 auto VectorizeInsertsAndCmps = [&](
bool VectorizeCmps) {
18510 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
18511 if (VectorizeCmps) {
18512 Changed |= vectorizeCmpInsts(
reverse(PostProcessCmps), BB, R);
18513 PostProcessCmps.
clear();
18515 PostProcessInserts.clear();
18520 if (
auto *Cmp = dyn_cast<CmpInst>(
I))
18521 return PostProcessCmps.
contains(Cmp);
18522 return isa<InsertElementInst, InsertValueInst>(
I) &&
18523 PostProcessInserts.contains(
I);
18529 return I->use_empty() &&
18530 (
I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(
I));
18535 if (isa<ScalableVectorType>(It->getType()))
18539 if (
R.isDeleted(&*It))
18542 if (!VisitedInstrs.
insert(&*It).second) {
18543 if (HasNoUsers(&*It) &&
18544 VectorizeInsertsAndCmps(It->isTerminator())) {
18554 if (isa<DbgInfoIntrinsic>(It))
18558 if (
PHINode *
P = dyn_cast<PHINode>(It)) {
18560 if (
P->getNumIncomingValues() == 2) {
18563 if (Root && vectorizeRootInstruction(
P, Root, BB, R,
TTI)) {
18572 for (
unsigned I = 0, E =
P->getNumIncomingValues();
I != E;
I++) {
18577 if (BB ==
P->getIncomingBlock(
I) ||
18583 if (
auto *PI = dyn_cast<Instruction>(
P->getIncomingValue(
I));
18584 PI && !IsInPostProcessInstrs(PI))
18585 Changed |= vectorizeRootInstruction(
nullptr, PI,
18586 P->getIncomingBlock(
I), R,
TTI);
18591 if (HasNoUsers(&*It)) {
18592 bool OpsChanged =
false;
18593 auto *
SI = dyn_cast<StoreInst>(It);
18603 TryToVectorizeRoot |= (
I == Stores.
end() ||
I->second.size() == 1) &&
18604 SI->getValueOperand()->hasOneUse();
18606 if (TryToVectorizeRoot) {
18607 for (
auto *V : It->operand_values()) {
18610 if (
auto *VI = dyn_cast<Instruction>(V);
18611 VI && !IsInPostProcessInstrs(VI))
18613 OpsChanged |= vectorizeRootInstruction(
nullptr, VI, BB, R,
TTI);
18620 VectorizeInsertsAndCmps(It->isTerminator());
18631 if (isa<InsertElementInst, InsertValueInst>(It))
18632 PostProcessInserts.insert(&*It);
18633 else if (isa<CmpInst>(It))
18634 PostProcessCmps.
insert(cast<CmpInst>(&*It));
18641 auto Changed =
false;
18642 for (
auto &Entry : GEPs) {
18645 if (Entry.second.size() < 2)
18648 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing a getelementptr list of length "
18649 << Entry.second.size() <<
".\n");
18656 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
18657 unsigned EltSize =
R.getVectorElementSize(*Entry.second[0]->idx_begin());
18658 if (MaxVecRegSize < EltSize)
18661 unsigned MaxElts = MaxVecRegSize / EltSize;
18662 for (
unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
18663 auto Len = std::min<unsigned>(BE - BI, MaxElts);
18676 Candidates.remove_if([&R](
Value *
I) {
18677 return R.isDeleted(cast<Instruction>(
I)) ||
18678 isa<Constant>(cast<GetElementPtrInst>(
I)->idx_begin()->
get());
18686 for (
int I = 0, E = GEPList.size();
I < E && Candidates.
size() > 1; ++
I) {
18687 auto *GEPI = GEPList[
I];
18688 if (!Candidates.count(GEPI))
18690 auto *SCEVI = SE->
getSCEV(GEPList[
I]);
18691 for (
int J =
I + 1; J < E && Candidates.
size() > 1; ++J) {
18692 auto *GEPJ = GEPList[J];
18693 auto *SCEVJ = SE->
getSCEV(GEPList[J]);
18694 if (isa<SCEVConstant>(SE->
getMinusSCEV(SCEVI, SCEVJ))) {
18695 Candidates.remove(GEPI);
18696 Candidates.remove(GEPJ);
18697 }
else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
18698 Candidates.remove(GEPJ);
18705 if (Candidates.
size() < 2)
18712 auto BundleIndex = 0
u;
18713 for (
auto *V : Candidates) {
18714 auto *
GEP = cast<GetElementPtrInst>(V);
18715 auto *GEPIdx =
GEP->idx_begin()->get();
18716 assert(
GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
18717 Bundle[BundleIndex++] = GEPIdx;
18729 Changed |= tryToVectorizeList(Bundle, R);
18735bool SLPVectorizerPass::vectorizeStoreChains(
BoUpSLP &R) {
18736 bool Changed =
false;
18741 if (
V->getValueOperand()->getType()->getTypeID() <
18742 V2->getValueOperand()->getType()->getTypeID())
18744 if (
V->getValueOperand()->getType()->getTypeID() >
18745 V2->getValueOperand()->getType()->getTypeID())
18747 if (
V->getPointerOperandType()->getTypeID() <
18748 V2->getPointerOperandType()->getTypeID())
18750 if (
V->getPointerOperandType()->getTypeID() >
18751 V2->getPointerOperandType()->getTypeID())
18754 if (isa<UndefValue>(
V->getValueOperand()) ||
18755 isa<UndefValue>(
V2->getValueOperand()))
18757 if (
auto *I1 = dyn_cast<Instruction>(
V->getValueOperand()))
18758 if (
auto *I2 = dyn_cast<Instruction>(
V2->getValueOperand())) {
18762 DT->
getNode(I2->getParent());
18763 assert(NodeI1 &&
"Should only process reachable instructions");
18764 assert(NodeI2 &&
"Should only process reachable instructions");
18765 assert((NodeI1 == NodeI2) ==
18767 "Different nodes should have different DFS numbers");
18768 if (NodeI1 != NodeI2)
18773 return I1->getOpcode() < I2->getOpcode();
18775 if (isa<Constant>(
V->getValueOperand()) &&
18776 isa<Constant>(
V2->getValueOperand()))
18778 return V->getValueOperand()->getValueID() <
18779 V2->getValueOperand()->getValueID();
18791 isa<UndefValue>(
V2->getValueOperand()))
18794 if (
auto *I2 = dyn_cast<Instruction>(
V2->getValueOperand())) {
18795 if (
I1->getParent() != I2->getParent())
18798 return S.getOpcode() > 0;
18801 isa<Constant>(
V2->getValueOperand()))
18804 V2->getValueOperand()->getValueID();
18809 for (
auto &Pair : Stores) {
18810 if (Pair.second.size() < 2)
18814 << Pair.second.size() <<
".\n");
18823 Pair.second.rend());
18824 Changed |= tryToVectorizeSequence<StoreInst>(
18825 ReversedStores, StoreSorter, AreCompatibleStores,
18827 return vectorizeStores(Candidates, R, Attempted);
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Loop::LoopBounds::Direction Direction
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instruction is followed by the IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
value_type & FindAndConstruct(const KeyT &Key)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
const BasicBlock * getParent() const
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
size_type count(const KeyT &Key) const
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T get() const
Returns the value of the specified pointer type.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
void clear()
Completely clear the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Value * getOperand(unsigned i) const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cummulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
\Returns a value vector with the operands across all lanes for the opearnd at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
unsigned getTreeSize() const
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not schedule.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are are tuples (A,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
testing::Matcher< const detail::ErrorHolder & > Failed()
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
auto reverse(ContainerTy &&C)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
constexpr int PoisonMaskElem
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
auto max_element(R &&Range)
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const