[DAGCombiner] Slice a big load into two loads when the elements are next to
each other in memory and the target has paired loads and performs post-isel
load combining.

E.g., this optimization will transform something like this:
  a = load i64* addr
  b = trunc i64 a to i32
  c = lshr i64 a, 32
  d = trunc i64 c to i32
into:
  b = load i32* addr1
  d = load i32* addr2
where addr1 = addr2 +/- sizeof(i32), if the target supports paired loads and
performs post-isel load combining. One should overload
TargetLowering::hasPairedLoad to provide this information. The default is false.

<rdar://problem/14477220>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@192471 91177308-0d34-0410-b5e6-96231b3b80d8

Quentin Colombet, 6 years ago
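For illustration, a backend that wants to opt in would override the new TargetLowering::hasPairedLoad hook, roughly as sketched below. FooTargetLowering, the i32-only restriction, and the 4-byte alignment requirement are hypothetical; the actual constraints are target-specific.

  // In a hypothetical FooTargetLowering : public TargetLowering subclass.
  // Sketch only: claim support for combining two adjacent i32 loads into a
  // paired load when the access is at least 4-byte aligned.
  virtual bool hasPairedLoad(EVT LoadedType,
                             unsigned &RequiredAlignment) const {
    RequiredAlignment = 4;             // hypothetical alignment constraint
    return LoadedType == MVT::i32;     // only pair 32-bit slices
  }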
3 changed files with 743 additions and 2 deletions.
include/llvm/Target/TargetLowering.h
11821182 return false;
11831183 }
11841184
1185 /// Return true if the target supplies and combines to a paired load
1186 /// two loaded values of type LoadedType next to each other in memory.
1187 /// RequiredAlignment gives the minimal alignment constraints that must be met to
1188 /// be able to select this paired load.
1189 ///
1190 /// This information is *not* used to generate actual paired loads, but it is used
1191 /// to generate a sequence of loads that is easier to combine into a paired load.
1192 /// For instance, something like this:
1193 /// a = load i64* addr
1194 /// b = trunc i64 a to i32
1195 /// c = lshr i64 a, 32
1196 /// d = trunc i64 c to i32
1197 /// will be optimized into:
1198 /// b = load i32* addr1
1199 /// d = load i32* addr2
1200 /// Where addr1 = addr2 +/- sizeof(i32).
1201 ///
1202 /// In other words, unless the target performs a post-isel load combining, this
1203 /// information should not be provided because it will generate more loads.
1204 virtual bool hasPairedLoad(Type * /*LoadedType*/,
1205 unsigned & /*RequiredAligment*/) const {
1206 return false;
1207 }
1208
1209 virtual bool hasPairedLoad(EVT /*LoadedType*/,
1210 unsigned & /*RequiredAligment*/) const {
1211 return false;
1212 }
1213
11851214 /// Return true if zero-extending the specific node Val to type VT2 is free
11861215 /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
11871216 /// because it's folded such as X86 zero-extending loads).
3434 #include "llvm/Target/TargetLowering.h"
3535 #include "llvm/Target/TargetMachine.h"
3636 #include "llvm/Target/TargetOptions.h"
37 #include "llvm/Target/TargetRegisterInfo.h"
3738 #include "llvm/Target/TargetSubtargetInfo.h"
3839 #include <algorithm>
3940 using namespace llvm;
4344 STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
4445 STATISTIC(OpsNarrowed , "Number of load/op/store narrowed");
4546 STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int");
47 STATISTIC(SlicedLoads, "Number of load sliced");
4648
4749 namespace {
4850 static cl::opt<bool>
5254 static cl::opt<bool>
5355 CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
5456 cl::desc("Include global information in alias analysis"));
57
58 /// Hidden option to stress test load slicing, i.e., when this option
59 /// is enabled, load slicing bypasses most of its profitability guards.
60 static cl::opt<bool>
61 StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
62 cl::desc("Bypass the profitability model of load "
63 "slicing"),
64 cl::init(false));
5565
5666 //------------------------------ DAGCombiner ---------------------------------//
5767
6272 CodeGenOpt::Level OptLevel;
6373 bool LegalOperations;
6474 bool LegalTypes;
75 bool ForCodeSize;
6576
6677 // Worklist of all of the nodes that need to be simplified.
6778 //
144155
145156 bool CombineToPreIndexedLoadStore(SDNode *N);
146157 bool CombineToPostIndexedLoadStore(SDNode *N);
158 bool SliceUpLoad(SDNode *N);
147159
148160 void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
149161 SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
315327
316328 public:
317329 DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
318 : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
319 OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {}
330 : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
331 OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
332 AttributeSet FnAttrs =
333 DAG.getMachineFunction().getFunction()->getAttributes();
334 ForCodeSize =
335 FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
336 Attribute::OptimizeForSize) ||
337 FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
338 }
320339
321340 /// Run - runs the dag combiner on all nodes in the work list
322341 void Run(CombineLevel AtLevel);
75787597 if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
75797598 return SDValue(N, 0);
75807599
7600 // Try to slice up N to more direct loads if the slices are mapped to
7601 // different register banks or pairing can take place.
7602 if (SliceUpLoad(N))
7603 return SDValue(N, 0);
7604
75817605 return SDValue();
7606 }
7607
7608 namespace {
7609 /// \brief Helper structure used to slice a load in smaller loads.
7610 /// Basically a slice is obtained from the following sequence:
7611 /// Origin = load Ty1, Base
7612 /// Shift = srl Ty1 Origin, CstTy Amount
7613 /// Inst = trunc Shift to Ty2
7614 ///
7615 /// Then, it will be rewritten into:
7616 /// Slice = load SliceTy, Base + SliceOffset
7617 /// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
7618 ///
7619 /// SliceTy is deduced from the number of bits that are actually used to
7620 /// build Inst.
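/// For example, Origin = (load i64), Shift = 32, and Inst = (trunc ... to i32)
/// give SliceTy == i32, i.e., a slice covering the upper four bytes of the
/// originally loaded value.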
7621 struct LoadedSlice {
7622 /// \brief Helper structure used to compute the cost of a slice.
7623 struct Cost {
7624 /// Are we optimizing for code size.
7625 bool ForCodeSize;
7626 /// Various costs.
7627 unsigned Loads;
7628 unsigned Truncates;
7629 unsigned CrossRegisterBanksCopies;
7630 unsigned ZExts;
7631 unsigned Shift;
7632
7633 Cost(bool ForCodeSize = false)
7634 : ForCodeSize(ForCodeSize), Loads(0), Truncates(0),
7635 CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {}
7636
7637 /// \brief Get the cost of one isolated slice.
7638 Cost(const LoadedSlice &LS, bool ForCodeSize = false)
7639 : ForCodeSize(ForCodeSize), Loads(1), Truncates(0),
7640 CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {
7641 EVT TruncType = LS.Inst->getValueType(0);
7642 EVT LoadedType = LS.getLoadedType();
7643 if (TruncType != LoadedType &&
7644 !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
7645 ZExts = 1;
7646 }
7647
7648 /// \brief Account for slicing gain in the current cost.
7649 /// Slicing provides a few gains, like removing a shift or a
7650 /// truncate. This method grows the cost of the original
7651 /// load with the gain from this slice.
7652 void addSliceGain(const LoadedSlice &LS) {
7653 // Each slice saves a truncate.
7654 const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
7655 if (!TLI.isTruncateFree(LS.Inst->getValueType(0),
7656 LS.Inst->getOperand(0).getValueType()))
7657 ++Truncates;
7658 // If there is a shift amount, this slice gets rid of it.
7659 if (LS.Shift)
7660 ++Shift;
7661 // If this slice can merge a cross register bank copy, account for it.
7662 if (LS.canMergeExpensiveCrossRegisterBankCopy())
7663 ++CrossRegisterBanksCopies;
7664 }
7665
7666 Cost &operator+=(const Cost &RHS) {
7667 Loads += RHS.Loads;
7668 Truncates += RHS.Truncates;
7669 CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
7670 ZExts += RHS.ZExts;
7671 Shift += RHS.Shift;
7672 return *this;
7673 }
7674
7675 bool operator==(const Cost &RHS) const {
7676 return Loads == RHS.Loads && Truncates == RHS.Truncates &&
7677 CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
7678 ZExts == RHS.ZExts && Shift == RHS.Shift;
7679 }
7680
7681 bool operator!=(const Cost &RHS) const { return !(*this == RHS); }
7682
7683 bool operator<(const Cost &RHS) const {
7684 // Assume cross register banks copies are as expensive as loads.
7685 // FIXME: Do we want some more target hooks?
7686 unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
7687 unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
7688 // Unless we are optimizing for code size, consider the
7689 // expensive operation first.
7690 if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
7691 return ExpensiveOpsLHS < ExpensiveOpsRHS;
7692 return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
7693 (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
7694 }
7695
7696 bool operator>(const Cost &RHS) const { return RHS < *this; }
7697
7698 bool operator<=(const Cost &RHS) const { return !(RHS < *this); }
7699
7700 bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
7701 };
7702 // The last instruction that represents the slice. This should be a
7703 // truncate instruction.
7704 SDNode *Inst;
7705 // The original load instruction.
7706 LoadSDNode *Origin;
7707 // The right shift amount in bits from the original load.
7708 unsigned Shift;
7709 // The DAG from which Origin comes.
7710 // This is used to get some contextual information about legal types, etc.
7711 SelectionDAG *DAG;
7712
7713 LoadedSlice(SDNode *Inst = NULL, LoadSDNode *Origin = NULL,
7714 unsigned Shift = 0, SelectionDAG *DAG = NULL)
7715 : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}
7716
7717 LoadedSlice(const LoadedSlice &LS)
7718 : Inst(LS.Inst), Origin(LS.Origin), Shift(LS.Shift), DAG(LS.DAG) {}
7719
7720 /// \brief Get the bits used in a chunk of bits \p BitWidth large.
7721 /// \return A value \p BitWidth bits wide, with used bits set to 1 and
7722 /// unused bits set to 0.
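/// For example, an i64 origin with Shift == 32 and an i32 truncate yields
/// the 64-bit mask 0xFFFFFFFF00000000.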
7723 APInt getUsedBits() const {
7724 // Reproduce the trunc(lshr) sequence:
7725 // - Start from the truncated value.
7726 // - Zero extend to the desired bit width.
7727 // - Shift left.
7728 assert(Origin && "No original load to compare against.");
7729 unsigned BitWidth = Origin->getValueSizeInBits(0);
7730 assert(Inst && "This slice is not bound to an instruction");
7731 assert(Inst->getValueSizeInBits(0) <= BitWidth &&
7732 "Extracted slice is bigger than the whole type!");
7733 APInt UsedBits(Inst->getValueSizeInBits(0), 0);
7734 UsedBits.setAllBits();
7735 UsedBits = UsedBits.zext(BitWidth);
7736 UsedBits <<= Shift;
7737 return UsedBits;
7738 }
7739
7740 /// \brief Get the size of the slice to be loaded in bytes.
7741 unsigned getLoadedSize() const {
7742 unsigned SliceSize = getUsedBits().countPopulation();
7743 assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
7744 return SliceSize / 8;
7745 }
7746
7747 /// \brief Get the type that will be loaded for this slice.
7748 /// Note: This may not be the final type for the slice.
7749 EVT getLoadedType() const {
7750 assert(DAG && "Missing context");
7751 LLVMContext &Ctxt = *DAG->getContext();
7752 return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
7753 }
7754
7755 /// \brief Get the alignment of the load used for this slice.
7756 unsigned getAlignment() const {
7757 unsigned Alignment = Origin->getAlignment();
7758 unsigned Offset = getOffsetFromBase();
7759 if (Offset != 0)
7760 Alignment = MinAlign(Alignment, Alignment + Offset);
7761 return Alignment;
7762 }
7763
7764 /// \brief Check if this slice can be rewritten with legal operations.
7765 bool isLegal() const {
7766 // An invalid slice is not legal.
7767 if (!Origin || !Inst || !DAG)
7768 return false;
7769
7770 // Offsets are for indexed loads only; we do not handle that.
7771 if (Origin->getOffset().getOpcode() != ISD::UNDEF)
7772 return false;
7773
7774 const TargetLowering &TLI = DAG->getTargetLoweringInfo();
7775
7776 // Check that the type is legal.
7777 EVT SliceType = getLoadedType();
7778 if (!TLI.isTypeLegal(SliceType))
7779 return false;
7780
7781 // Check that the load is legal for this type.
7782 if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
7783 return false;
7784
7785 // Check that the offset can be computed.
7786 // 1. Check its type.
7787 EVT PtrType = Origin->getBasePtr().getValueType();
7788 if (PtrType == MVT::Untyped || PtrType.isExtended())
7789 return false;
7790
7791 // 2. Check that it fits in the immediate.
7792 if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
7793 return false;
7794
7795 // 3. Check that the computation is legal.
7796 if (!TLI.isOperationLegal(ISD::ADD, PtrType))
7797 return false;
7798
7799 // Check that the zext is legal if it needs one.
7800 EVT TruncateType = Inst->getValueType(0);
7801 if (TruncateType != SliceType &&
7802 !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
7803 return false;
7804
7805 return true;
7806 }
7807
7808 /// \brief Get the offset in bytes of this slice in the original chunk of
7809 /// bits.
7810 /// \pre DAG != NULL.
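/// For example, a 2-byte slice taken with Shift == 16 out of an i64 load
/// starts at byte 2 on a little-endian target and at byte 8 - 2 - 2 == 4 on
/// a big-endian target.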
7811 uint64_t getOffsetFromBase() const {
7812 assert(DAG && "Missing context.");
7813 bool IsBigEndian =
7814 DAG->getTargetLoweringInfo().getDataLayout()->isBigEndian();
7815 assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
7816 uint64_t Offset = Shift / 8;
7817 unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
7818 assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
7819 "The size of the original loaded type is not a multiple of a"
7820 " byte.");
7821 // If Offset is bigger than TySizeInBytes, it means we are loading all
7822 // zeros. This should have been optimized before in the process.
7823 assert(TySizeInBytes > Offset &&
7824 "Invalid shift amount for given loaded size");
7825 if (IsBigEndian)
7826 Offset = TySizeInBytes - Offset - getLoadedSize();
7827 return Offset;
7828 }
7829
7830 /// \brief Generate the sequence of instructions to load the slice
7831 /// represented by this object and redirect the uses of this slice to
7832 /// this new sequence of instructions.
7833 /// \pre this->Inst && this->Origin are valid Instructions and this
7834 /// object passed the legal check: LoadedSlice::isLegal returned true.
7835 /// \return The last instruction of the sequence used to load the slice.
7836 SDValue loadSlice() const {
7837 assert(Inst && Origin && "Unable to replace a non-existing slice.");
7838 const SDValue &OldBaseAddr = Origin->getBasePtr();
7839 SDValue BaseAddr = OldBaseAddr;
7840 // Get the offset in that chunk of bytes w.r.t. the endianness.
7841 int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
7842 assert(Offset >= 0 && "Offset too big to fit in int64_t!");
7843 if (Offset) {
7844 // BaseAddr = BaseAddr + Offset.
7845 EVT ArithType = BaseAddr.getValueType();
7846 BaseAddr = DAG->getNode(ISD::ADD, SDLoc(Origin), ArithType, BaseAddr,
7847 DAG->getConstant(Offset, ArithType));
7848 }
7849
7850 // Create the type of the loaded slice according to its size.
7851 EVT SliceType = getLoadedType();
7852
7853 // Create the load for the slice.
7854 SDValue LastInst = DAG->getLoad(
7855 SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
7856 Origin->getPointerInfo().getWithOffset(Offset), Origin->isVolatile(),
7857 Origin->isNonTemporal(), Origin->isInvariant(), getAlignment());
7858 // If the final type is not the same as the loaded type, this means that
7859 // we have to pad with zero. Create a zero extend for that.
7860 EVT FinalType = Inst->getValueType(0);
7861 if (SliceType != FinalType)
7862 LastInst =
7863 DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
7864 return LastInst;
7865 }
7866
7867 /// \brief Check if this slice can be merged with an expensive cross register
7868 /// bank copy. E.g.,
7869 /// i = load i32
7870 /// f = bitcast i32 i to float
7871 bool canMergeExpensiveCrossRegisterBankCopy() const {
7872 if (!Inst || !Inst->hasOneUse())
7873 return false;
7874 SDNode *Use = *Inst->use_begin();
7875 if (Use->getOpcode() != ISD::BITCAST)
7876 return false;
7877 assert(DAG && "Missing context");
7878 const TargetLowering &TLI = DAG->getTargetLoweringInfo();
7879 EVT ResVT = Use->getValueType(0);
7880 const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT());
7881 const TargetRegisterClass *ArgRC =
7882 TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT());
7883 if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
7884 return false;
7885
7886 // At this point, we know that we perform a cross-register-bank copy.
7887 // Check if it is expensive.
7888 const TargetRegisterInfo *TRI = TLI.getTargetMachine().getRegisterInfo();
7889 // Assume bitcasts are cheap, unless the two register classes do not
7890 // explicitly share a common subclass.
7891 if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
7892 return false;
7893
7894 // Check if it will be merged with the load.
7895 // 1. Check the alignment constraint.
7896 unsigned RequiredAlignment = TLI.getDataLayout()->getABITypeAlignment(
7897 ResVT.getTypeForEVT(*DAG->getContext()));
7898
7899 if (RequiredAlignment > getAlignment())
7900 return false;
7901
7902 // 2. Check that the load is a legal operation for that type.
7903 if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
7904 return false;
7905
7906 // 3. Check that we do not have a zext in the way.
7907 if (Inst->getValueType(0) != getLoadedType())
7908 return false;
7909
7910 return true;
7911 }
7912 };
7913 }
7914
7915 /// \brief Sorts LoadedSlices according to their offset.
7916 struct LoadedSliceSorter {
7917 bool operator()(const LoadedSlice &LHS, const LoadedSlice &RHS) {
7918 assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
7919 return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
7920 }
7921 };
7922
7923 /// \brief Check that all bits set in \p UsedBits form a dense region, i.e.,
7924 /// \p UsedBits looks like 0..0 1..1 0..0.
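/// For example, 0x00FFFF00 is dense, whereas 0x00FF00FF is not.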
7925 static bool areUsedBitsDense(const APInt &UsedBits) {
7926 // If all the bits are one, this is dense!
7927 if (UsedBits.isAllOnesValue())
7928 return true;
7929
7930 // Get rid of the unused bits on the right.
7931 APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
7932 // Get rid of the unused bits on the left.
7933 if (NarrowedUsedBits.countLeadingZeros())
7934 NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
7935 // Check that the chunk of bits is completely used.
7936 return NarrowedUsedBits.isAllOnesValue();
7937 }
7938
7939 /// \brief Check whether or not \p First and \p Second are next to each other
7940 /// in memory. This means that there is no hole between the bits loaded
7941 /// by \p First and the bits loaded by \p Second.
7942 static bool areSlicesNextToEachOther(const LoadedSlice &First,
7943 const LoadedSlice &Second) {
7944 assert(First.Origin == Second.Origin && First.Origin &&
7945 "Unable to match different memory origins.");
7946 APInt UsedBits = First.getUsedBits();
7947 assert((UsedBits & Second.getUsedBits()) == 0 &&
7948 "Slices are not supposed to overlap.");
7949 UsedBits |= Second.getUsedBits();
7950 return areUsedBitsDense(UsedBits);
7951 }
7952
7953 /// \brief Adjust the \p GlobalLSCost according to the target
7954 /// pairing capabilities and the layout of the slices.
7955 /// \pre \p GlobalLSCost should account for at least as many loads as
7956 /// there are in the slices in \p LoadedSlices.
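/// For example, two adjacent slices of a type for which the target reports
/// hasPairedLoad are charged as one load instead of two.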
7957 static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
7958 LoadedSlice::Cost &GlobalLSCost) {
7959 unsigned NumberOfSlices = LoadedSlices.size();
7960 // If there are fewer than 2 elements, no pairing is possible.
7961 if (NumberOfSlices < 2)
7962 return;
7963
7964 // Sort the slices so that elements that are likely to be next to each
7965 // other in memory are next to each other in the list.
7966 std::sort(LoadedSlices.begin(), LoadedSlices.end(), LoadedSliceSorter());
7967 const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
7968 // First (resp. Second) is the first (resp. second) potential candidate
7969 // to be placed in a paired load.
7970 const LoadedSlice *First = NULL;
7971 const LoadedSlice *Second = NULL;
7972 for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
7973 // Set the beginning of the pair.
7974 First = Second) {
7975
7976 Second = &LoadedSlices[CurrSlice];
7977
7978 // If First is NULL, it means we start a new pair.
7979 // Get to the next slice.
7980 if (!First)
7981 continue;
7982
7983 EVT LoadedType = First->getLoadedType();
7984
7985 // If the types of the slices are different, we cannot pair them.
7986 if (LoadedType != Second->getLoadedType())
7987 continue;
7988
7989 // Check if the target supplies paired loads for this type.
7990 unsigned RequiredAlignment = 0;
7991 if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
7992 // Move to the next pair; this type is hopeless.
7993 Second = NULL;
7994 continue;
7995 }
7996 // Check if we meet the alignment requirement.
7997 if (RequiredAlignment > First->getAlignment())
7998 continue;
7999
8000 // Check that both loads are next to each other in memory.
8001 if (!areSlicesNextToEachOther(*First, *Second))
8002 continue;
8003
8004 assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
8005 --GlobalLSCost.Loads;
8006 // Move to the next pair.
8007 Second = NULL;
8008 }
8009 }
8010
8011 /// \brief Check the profitability of all involved LoadedSlices.
8012 /// Currently, it is considered profitable if there are exactly two
8013 /// involved slices (1) which are (2) next to each other in memory, and
8014 /// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
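/// For the motivating i64 case split into two i32 slices, slicing costs two
/// loads (one if the target can pair them), whereas the original configuration
/// is charged one load plus the truncates and the shift the slices remove;
/// slicing is thus deemed profitable when the loads can be paired or when the
/// slices fold expensive cross-register-bank copies.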
8015 ///
8016 /// Note: The order of the elements in \p LoadedSlices may be modified, but not
8017 /// the elements themselves.
8018 ///
8019 /// FIXME: When the cost model is mature enough, we can relax
8020 /// constraints (1) and (2).
8021 static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
8022 const APInt &UsedBits, bool ForCodeSize) {
8023 unsigned NumberOfSlices = LoadedSlices.size();
8024 if (StressLoadSlicing)
8025 return NumberOfSlices > 1;
8026
8027 // Check (1).
8028 if (NumberOfSlices != 2)
8029 return false;
8030
8031 // Check (2).
8032 if (!areUsedBitsDense(UsedBits))
8033 return false;
8034
8035 // Check (3).
8036 LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
8037 // The original code has one big load.
8038 OrigCost.Loads = 1;
8039 for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) {
8040 const LoadedSlice &LS = LoadedSlices[CurrSlice];
8041 // Accumulate the cost of all the slices.
8042 LoadedSlice::Cost SliceCost(LS, ForCodeSize);
8043 GlobalSlicingCost += SliceCost;
8044
8045 // Account as cost in the original configuration the gain obtained
8046 // with the current slices.
8047 OrigCost.addSliceGain(LS);
8048 }
8049
8050 // If the target supports paired load, adjust the cost accordingly.
8051 adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
8052 return OrigCost > GlobalSlicingCost;
8053 }
8054
8055 /// \brief If the given load, \p LI, is used only by trunc or trunc(lshr)
8056 /// operations, split it in the various pieces being extracted.
8057 ///
8058 /// This sort of thing is introduced by SROA.
8059 /// This slicing takes care not to insert overlapping loads.
8060 /// \pre LI is a simple load (i.e., not an atomic or volatile load).
8061 bool DAGCombiner::SliceUpLoad(SDNode *N) {
8062 if (Level < AfterLegalizeDAG)
8063 return false;
8064
8065 LoadSDNode *LD = cast<LoadSDNode>(N);
8066 if (LD->isVolatile() || !ISD::isNormalLoad(LD) ||
8067 !LD->getValueType(0).isInteger())
8068 return false;
8069
8070 // Keep track of already used bits to detect overlapping values.
8071 // In that case, we will just abort the transformation.
8072 APInt UsedBits(LD->getValueSizeInBits(0), 0);
8073
8074 SmallVector<LoadedSlice, 4> LoadedSlices;
8075
8076 // Check if this load is used as several smaller chunks of bits.
8077 // Basically, look for uses in trunc or trunc(lshr) and record a new chain
8078 // of computation for each trunc.
8079 for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
8080 UI != UIEnd; ++UI) {
8081 // Skip the uses of the chain.
8082 if (UI.getUse().getResNo() != 0)
8083 continue;
8084
8085 SDNode *User = *UI;
8086 unsigned Shift = 0;
8087
8088 // Check if this is a trunc(lshr).
8089 if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
8090 isa<ConstantSDNode>(User->getOperand(1))) {
8091 Shift = cast<ConstantSDNode>(User->getOperand(1))->getZExtValue();
8092 User = *User->use_begin();
8093 }
8094
8095 // At this point, User is a truncate iff we encountered trunc or
8096 // trunc(lshr).
8097 if (User->getOpcode() != ISD::TRUNCATE)
8098 return false;
8099
8100 // The width of the type must be a power of 2 and at least 8 bits.
8101 // Otherwise the load cannot be represented in LLVM IR.
8102 // Moreover, if we shifted by an amount that is not a multiple of 8 bits,
8103 // the slice would not be byte-aligned. We do not support that.
8104 unsigned Width = User->getValueSizeInBits(0);
8105 if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
8106 return false;
8107
8108 // Build the slice for this chain of computations.
8109 LoadedSlice LS(User, LD, Shift, &DAG);
8110 APInt CurrentUsedBits = LS.getUsedBits();
8111
8112 // Check if this slice overlaps with another.
8113 if ((CurrentUsedBits & UsedBits) != 0)
8114 return false;
8115 // Update the bits used globally.
8116 UsedBits |= CurrentUsedBits;
8117
8118 // Check if the new slice would be legal.
8119 if (!LS.isLegal())
8120 return false;
8121
8122 // Record the slice.
8123 LoadedSlices.push_back(LS);
8124 }
8125
8126 // Abort slicing if it does not seem to be profitable.
8127 if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
8128 return false;
8129
8130 ++SlicedLoads;
8131
8132 // Rewrite each chain to use an independent load.
8133 // By construction, each chain can be represented by a unique load.
8134
8135 // Prepare the argument for the new token factor for all the slices.
8136 SmallVector<SDValue, 8> ArgChains;
8137 for (SmallVectorImpl<LoadedSlice>::const_iterator
8138 LSIt = LoadedSlices.begin(),
8139 LSItEnd = LoadedSlices.end();
8140 LSIt != LSItEnd; ++LSIt) {
8141 SDValue SliceInst = LSIt->loadSlice();
8142 CombineTo(LSIt->Inst, SliceInst, true);
8143 if (SliceInst.getNode()->getOpcode() != ISD::LOAD)
8144 SliceInst = SliceInst.getOperand(0);
8145 assert(SliceInst->getOpcode() == ISD::LOAD &&
8146 "It takes more than a zext to get to the loaded slice!!");
8147 ArgChains.push_back(SliceInst.getValue(1));
8148 }
8149
8150 SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
8151 &ArgChains[0], ArgChains.size());
8152 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
8153 return true;
75828154 }
75838155
75848156 /// CheckForMaskedLoad - Check to see if V is (and load (ptr), imm), where the
test/CodeGen/X86/load-slice.ll
0 ; RUN: llc -mtriple x86_64-apple-macosx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
1 ; RUN: llc -mtriple x86_64-apple-macosx < %s -o - | FileCheck %s --check-prefix=REGULAR
2 ;
3 ; <rdar://problem/14477220>
4
5 %class.Complex = type { float, float }
6
7
8 ; Check that independent slices lead to independent loads when the slices lead to
9 ; different register files.
10 ;
11 ; The layout is:
12 ; LSB 0 1 2 3 | 4 5 6 7 MSB
13 ; Low High
14 ; The base address points to 0 and is 8-bytes aligned.
15 ; Low slice starts at 0 (base) and is 8-bytes aligned.
16 ; High slice starts at 4 (base + 4-bytes) and is 4-bytes aligned.
17 ;
18 ; STRESS-LABEL: t1:
19 ; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
20 ; STRESS: vmovss 68([[BASE:[^)]+]]), [[OUT_Imm:%xmm[0-9]+]]
21 ; Add high slice: out[out_start].imm, this is base + 4.
22 ; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
23 ; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
24 ; STRESS-NEXT: vmovss 64([[BASE]]), [[OUT_Real:%xmm[0-9]+]]
25 ; Add low slice: out[out_start].real, this is base + 0.
26 ; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
27 ; Swap Imm and Real.
28 ; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
29 ; Put the results back into out[out_start].
30 ; STRESS-NEXT: vmovq [[RES_Vec]], ([[BASE]])
31 ;
32 ; Same for REGULAR: we eliminate a register bank copy with each slice.
33 ; REGULAR-LABEL: t1:
34 ; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
35 ; REGULAR: vmovss 68([[BASE:[^)]+]]), [[OUT_Imm:%xmm[0-9]+]]
36 ; Add high slice: out[out_start].imm, this is base + 4.
37 ; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
38 ; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
39 ; REGULAR-NEXT: vmovss 64([[BASE]]), [[OUT_Real:%xmm[0-9]+]]
40 ; Add low slice: out[out_start].real, this is base + 0.
41 ; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
42 ; Swap Imm and Real.
43 ; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
44 ; Put the results back into out[out_start].
45 ; REGULAR-NEXT: vmovq [[RES_Vec]], ([[BASE]])
46 define void @t1(%class.Complex* nocapture %out, i64 %out_start) {
47 entry:
48 %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
49 %tmp = bitcast %class.Complex* %arrayidx to i64*
50 %tmp1 = load i64* %tmp, align 8
51 %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
52 %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
53 %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
54 %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
55 %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
56 %add = add i64 %out_start, 8
57 %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add
58 %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0
59 %tmp4 = load float* %i.i, align 4
60 %add.i = fadd float %tmp4, %tmp2
61 %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
62 %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1
63 %tmp5 = load float* %r.i, align 4
64 %add5.i = fadd float %tmp5, %tmp3
65 %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
66 %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
67 store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
68 ret void
69 }
70
71 ; Function Attrs: nounwind
72 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
73
74 ; Function Attrs: nounwind
75 declare void @llvm.lifetime.start(i64, i8* nocapture)
76
77 ; Function Attrs: nounwind
78 declare void @llvm.lifetime.end(i64, i8* nocapture)
79
80 ; Check that we do not read outside of the chunk of bits of the original loads.
81 ;
82 ; The 64-bit load should have been split into one 32-bit slice and one 16-bit slice.
83 ; The 16-bit slice should be zero-extended to match the final type.
84 ;
85 ; The memory layout is:
86 ; LSB 0 1 2 3 | 4 5 | 6 7 MSB
87 ; Low High
88 ; The base address points to 0 and is 8-bytes aligned.
89 ; Low slice starts at 0 (base) and is 8-bytes aligned.
90 ; High slice starts at 6 (base + 6-bytes) and is 2-bytes aligned.
91 ;
92 ; STRESS-LABEL: t2:
93 ; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
94 ; STRESS-NEXT: addl ([[BASE]]), %eax
95 ; STRESS-NEXT: ret
96 ;
97 ; For the REGULAR heuristic, it is not profitable to slice things that are not
98 ; next to each other in memory. Here we have a hole with bytes #4-5.
99 ; REGULAR-LABEL: t2:
100 ; REGULAR: shrq $48
101 define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
102 %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
103 %bitcast = bitcast %class.Complex* %arrayidx to i64*
104 %chunk64 = load i64* %bitcast, align 8
105 %slice32_low = trunc i64 %chunk64 to i32
106 %shift48 = lshr i64 %chunk64, 48
107 %slice32_high = trunc i64 %shift48 to i32
108 %res = add i32 %slice32_high, %slice32_low
109 ret i32 %res
110 }
111
112 ; Check that we do not optimize overlapping slices.
113 ;
114 ; The 64-bit load should NOT have been split, as the slices are overlapping.
115 ; First slice uses bytes numbered 0 to 3.
116 ; Second slice uses bytes numbered 6 and 7.
117 ; Third slice uses bytes numbered 4 to 7.
118 ;
119 ; STRESS-LABEL: t3:
120 ; STRESS: shrq $48
121 ; STRESS: shrq $32
122 ;
123 ; REGULAR-LABEL: t3:
124 ; REGULAR: shrq $48
125 ; REGULAR: shrq $32
126 define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
127 %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
128 %bitcast = bitcast %class.Complex* %arrayidx to i64*
129 %chunk64 = load i64* %bitcast, align 8
130 %slice32_low = trunc i64 %chunk64 to i32
131 %shift48 = lshr i64 %chunk64, 48
132 %slice32_high = trunc i64 %shift48 to i32
133 %shift32 = lshr i64 %chunk64, 32
134 %slice32_lowhigh = trunc i64 %shift32 to i32
135 %tmpres = add i32 %slice32_high, %slice32_low
136 %res = add i32 %slice32_lowhigh, %tmpres
137 ret i32 %res
138 }
139