[SLP] Recommit: Look-ahead operand reordering heuristic.

Summary: This patch introduces a new heuristic for guiding operand reordering. The new "look-ahead" heuristic can look beyond the immediate predecessors. This helps break ties when the immediate predecessors have identical opcodes (see lit test for an example).

Reviewers: RKSimon, ABataev, dtemirbulatov, Ayal, hfinkel, rnk

Reviewed By: RKSimon, dtemirbulatov

Subscribers: hiraditya, phosek, rnk, rcorcs, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D60897

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@364964 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Vasileios Porpodas
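As a rough scalar sketch of the tie this heuristic breaks (mirroring the lookahead_basic lit test below; the array names are illustrative), both operands of each addition are subtractions, so looking only at the immediate predecessors the opcodes tie, and only the loads one level deeper can tell the two operand orders apart:

  // Lane 0 and lane 1 of the would-be vectorized pair:
  double add0 = (A[0] - B[0]) + (C[0] - D[0]);
  double add1 = (C[1] - D[1]) + (A[1] - B[1]);
  // Swapping the operands of add1 lines up {A[0],A[1]}, {B[0],B[1]},
  // {C[0],C[1]} and {D[0],D[1]} as consecutive loads across the two lanes,
  // which is exactly what the look-ahead score rewards.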
2 changed file(s) with 426 addition(s) and 82 deletion(s).
145145 static cl::opt<unsigned> MinTreeSize(
146146 "slp-min-tree-size", cl::init(3), cl::Hidden,
147147 cl::desc("Only vectorize small trees if they are fully vectorizable"));
148
149 // The maximum depth that the look-ahead score heuristic will explore.
150 // The higher this value, the higher the compilation time overhead.
151 static cl::opt<int> LookAheadMaxDepth(
152 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
153 cl::desc("The maximum look-ahead depth for operand reordering scores"));
154
155 // The Look-ahead heuristic goes through the users of the bundle to calculate
156 // the users' cost in getExternalUsesCost(). To avoid a compilation time increase,
157 // we limit the number of users visited to this value.
158 static cl::opt<unsigned> LookAheadUsersBudget(
159 "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
160 cl::desc("The maximum number of users to visit while visiting the "
161 "predecessors. This prevents compilation time increase."));
148162
149163 static cl::opt<bool>
150164 ViewSLPTree("view-slp-tree", cl::Hidden,
707721
708722 const DataLayout &DL;
709723 ScalarEvolution &SE;
724 const BoUpSLP &R;
710725
711726 /// \returns the operand data at \p OpIdx and \p Lane.
712727 OperandData &getData(unsigned OpIdx, unsigned Lane) {
732747 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
733748 }
734749
750 // The hard-coded scores listed here are not very important. When computing
751 // the scores of matching one sub-tree with another, we are basically
752 // counting the number of values that are matching. So even if all scores
753 // are set to 1, we would still get a decent matching result.
754 // However, sometimes we have to break ties. For example we may have to
755 // choose between matching loads vs matching opcodes. This is what these
756 // scores are helping us with: they provide the order of preference.
757
758 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
759 static const int ScoreConsecutiveLoads = 3;
760 /// Constants.
761 static const int ScoreConstants = 2;
762 /// Instructions with the same opcode.
763 static const int ScoreSameOpcode = 2;
764 /// Instructions with alt opcodes (e.g., add + sub).
765 static const int ScoreAltOpcodes = 1;
766 /// Identical instructions (a.k.a. splat or broadcast).
767 static const int ScoreSplat = 1;
768 /// Matching with an undef is preferable to failing.
769 static const int ScoreUndef = 1;
770 /// Score for failing to find a decent match.
771 static const int ScoreFail = 0;
772 /// User external to the vectorized code.
773 static const int ExternalUseCost = 1;
774 /// The user is internal but in a different lane.
775 static const int UserInDiffLaneCost = ExternalUseCost;
776
777 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
778 static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
779 ScalarEvolution &SE) {
780 auto *LI1 = dyn_cast<LoadInst>(V1);
781 auto *LI2 = dyn_cast<LoadInst>(V2);
782 if (LI1 && LI2)
783 return isConsecutiveAccess(LI1, LI2, DL, SE)
784 ? VLOperands::ScoreConsecutiveLoads
785 : VLOperands::ScoreFail;
786
787 auto *C1 = dyn_cast<Constant>(V1);
788 auto *C2 = dyn_cast<Constant>(V2);
789 if (C1 && C2)
790 return VLOperands::ScoreConstants;
791
792 auto *I1 = dyn_cast<Instruction>(V1);
793 auto *I2 = dyn_cast<Instruction>(V2);
794 if (I1 && I2) {
795 if (I1 == I2)
796 return VLOperands::ScoreSplat;
797 InstructionsState S = getSameOpcode({I1, I2});
798 // Note: Only consider instructions with <= 2 operands to avoid
799 // complexity explosion.
800 if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
801 return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
802 : VLOperands::ScoreSameOpcode;
803 }
804
805 if (isa<UndefValue>(V2))
806 return VLOperands::ScoreUndef;
807
808 return VLOperands::ScoreFail;
809 }
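For orientation, a few example pairs and the shallow score they would be assigned (an illustrative sketch; it assumes the two loads are provably consecutive and that the fadd/fsub pair is recognized as an alternate-opcode pair by getSameOpcode):

  //   load A[i]  , load A[i+1]   -> ScoreConsecutiveLoads (3)
  //   constant 3 , constant 7    -> ScoreConstants        (2)
  //   fadd x, y  , fadd z, w     -> ScoreSameOpcode       (2)
  //   fadd x, y  , fsub z, w     -> ScoreAltOpcodes       (1)
  //   %v         , %v            -> ScoreSplat            (1)
  //   anything   , undef         -> ScoreUndef            (1)
  //   no match                   -> ScoreFail             (0)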
810
811 /// Holds the values and their lane that are taking part in the look-ahead
812 /// score calculation. This is used in the external uses cost calculation.
813 SmallDenseMap<Value *, int> InLookAheadValues;
814
815 /// \Returns the additional cost due to uses of \p LHS and \p RHS that are
816 /// either external to the vectorized code, or require shuffling.
817 int getExternalUsesCost(const std::pair<Value *, int> &LHS,
818 const std::pair<Value *, int> &RHS) {
819 int Cost = 0;
820 SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
821 for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
822 Value *V = Values[Idx].first;
823 // Calculate the absolute lane, using the minimum relative lane of LHS
824 // and RHS as base and Idx as the offset.
825 int Ln = std::min(LHS.second, RHS.second) + Idx;
826 assert(Ln >= 0 && "Bad lane calculation");
827 unsigned UsersBudget = LookAheadUsersBudget;
828 for (User *U : V->users()) {
829 if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
830 // The user is in the VectorizableTree. Check if we need to insert.
831 auto It = llvm::find(UserTE->Scalars, U);
832 assert(It != UserTE->Scalars.end() && "U is in UserTE");
833 int UserLn = std::distance(UserTE->Scalars.begin(), It);
834 assert(UserLn >= 0 && "Bad lane");
835 if (UserLn != Ln)
836 Cost += UserInDiffLaneCost;
837 } else {
838 // Check if the user is in the look-ahead code.
839 auto It2 = InLookAheadValues.find(U);
840 if (It2 != InLookAheadValues.end()) {
841 // The user is in the look-ahead code. Check the lane.
842 if (It2->second != Ln)
843 Cost += UserInDiffLaneCost;
844 } else {
845 // The user is neither in SLP tree nor in the look-ahead code.
846 Cost += ExternalUseCost;
847 }
848 }
849 // Limit the number of visited uses to cap compilation time.
850 if (--UsersBudget == 0)
851 break;
852 }
853 }
854 return Cost;
855 }
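A rough consequence of the users budget (a sketch assuming the default slp-look-ahead-users-budget of 2 and that all visited users happen to be external): only the first two users of each value can contribute to Cost, so candidates whose user counts differ only beyond that point end up with the same cost.

  //   With UsersBudget = 2:
  //     value with 2 external users -> Cost += 2 * ExternalUseCost
  //     value with 3 external users -> Cost += 2 * ExternalUseCost (third user never visited)
  //   The tie is therefore not broken here; see the lookahead_limit_users_budget test below.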
856
857 /// Go through the operands of \p LHS and \p RHS recursively until \p
858 /// MaxLevel, and return the cumulative score. For example:
859 /// \verbatim
860 ///  A[0]  B[0]  A[1]  B[1]  C[0]  D[0]  B[1]  A[1]
861 ///    \  /        \  /        \  /        \  /
862 ///     +            +           +           +
863 ///     G1           G2          G3          G4
864 /// \endverbatim
865 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
866 /// each level recursively, accumulating the score. It starts from matching
867 /// the additions at level 0, then moves on to the loads (level 1). The
868 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
869 /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
870 /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
871 /// Please note that the order of the operands does not matter, as we
872 /// evaluate the score of all profitable combinations of operands. In
873 /// other words the score of G1 and G4 is the same as G1 and G2. This
874 /// heuristic is based on ideas described in:
875 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
876 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
877 /// Luís F. W. Góes
878 int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
879 const std::pair<Value *, int> &RHS, int CurrLevel,
880 int MaxLevel) {
881
882 Value *V1 = LHS.first;
883 Value *V2 = RHS.first;
884 // Get the shallow score of V1 and V2.
885 int ShallowScoreAtThisLevel =
886 std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
887 getExternalUsesCost(LHS, RHS));
888 int Lane1 = LHS.second;
889 int Lane2 = RHS.second;
890
891 // If reached MaxLevel,
892 // or if V1 and V2 are not instructions,
893 // or if they are SPLAT,
894 // or if they are not consecutive, early return the current cost.
895 auto *I1 = dyn_cast<Instruction>(V1);
896 auto *I2 = dyn_cast<Instruction>(V2);
897 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
898 ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
899 (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
900 return ShallowScoreAtThisLevel;
901 assert(I1 && I2 && "Should have early exited.");
902
903 // Keep track of in-tree values for determining the external-use cost.
904 InLookAheadValues[V1] = Lane1;
905 InLookAheadValues[V2] = Lane2;
906
907 // Contains the I2 operand indexes that got matched with I1 operands.
908 SmallSet<unsigned, 4> Op2Used;
909
910 // Recursion towards the operands of I1 and I2. We are trying all possible
911 // operand pairs, and keeping track of the best score.
912 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
913 OpIdx1 != NumOperands1; ++OpIdx1) {
914 // Try to pair operand OpIdx1 of I1 with the best operand of I2.
915 int MaxTmpScore = 0;
916 unsigned MaxOpIdx2 = 0;
917 bool FoundBest = false;
918 // If I2 is commutative try all combinations.
919 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
920 unsigned ToIdx = isCommutative(I2)
921 ? I2->getNumOperands()
922 : std::min(I2->getNumOperands(), OpIdx1 + 1);
923 assert(FromIdx <= ToIdx && "Bad index");
924 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
925 // Skip operands already paired with OpIdx1.
926 if (Op2Used.count(OpIdx2))
927 continue;
928 // Recursively calculate the cost at each level
929 int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
930 {I2->getOperand(OpIdx2), Lane2},
931 CurrLevel + 1, MaxLevel);
932 // Look for the best score.
933 if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
934 MaxTmpScore = TmpScore;
935 MaxOpIdx2 = OpIdx2;
936 FoundBest = true;
937 }
938 }
939 if (FoundBest) {
940 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
941 Op2Used.insert(MaxOpIdx2);
942 ShallowScoreAtThisLevel += MaxTmpScore;
943 }
944 }
945 return ShallowScoreAtThisLevel;
946 }
947
948 /// \Returns the look-ahead score, which tells us how much the sub-trees
949 /// rooted at \p LHS and \p RHS match; the more they match, the higher the
950 /// score. This helps break ties in an informed way when we cannot decide on
951 /// the order of the operands by just considering the immediate
952 /// predecessors.
953 int getLookAheadScore(const std::pair<Value *, int> &LHS,
954 const std::pair<Value *, int> &RHS) {
955 InLookAheadValues.clear();
956 return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
957 }
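As a rough worked example for the G1..G4 figure above (assuming the default scores, the default slp-max-look-ahead-depth of 2, and ignoring the getExternalUsesCost() term):

  //   getScoreAtLevelRec(G1, G2, /*CurrLevel=*/1, /*MaxLevel=*/2)
  //     level 1: '+' vs '+'                 -> ScoreSameOpcode       = 2
  //     level 2: A[0] vs A[1] (consecutive) -> ScoreConsecutiveLoads = 3
  //              B[0] vs B[1] (consecutive) -> ScoreConsecutiveLoads = 3
  //   total = 8, whereas matching G1 with G3 scores only 2, because both
  //   {A[0],C[0]} and {B[0],D[0]} return ScoreFail at level 2.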
958
735959 // Search all operands in Ops[*][Lane] for the one that matches best
736960 // Ops[OpIdx][LastLane] and return its operand index.
737961 // If no good match can be found, return None.
749973 // The linearized opcode of the operand at OpIdx, Lane.
750974 bool OpIdxAPO = getData(OpIdx, Lane).APO;
751975
752 const unsigned BestScore = 2;
753 const unsigned GoodScore = 1;
754
755976 // The best operand index and its score.
756977 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
757978 // are using the score to differentiate between the two.
7801001 // Look for an operand that matches the current mode.
7811002 switch (RMode) {
7821003 case ReorderingMode::Load:
783 if (isa<LoadInst>(Op)) {
784 // Figure out which is left and right, so that we can check for
785 // consecutive loads
786 bool LeftToRight = Lane > LastLane;
787 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
788 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
789 if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
790 cast<LoadInst>(OpRight), DL, SE))
791 BestOp.Idx = Idx;
1004 case ReorderingMode::Constant:
1005 case ReorderingMode::Opcode: {
1006 bool LeftToRight = Lane > LastLane;
1007 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1008 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1009 unsigned Score =
1010 getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
1011 if (Score > BestOp.Score) {
1012 BestOp.Idx = Idx;
1013 BestOp.Score = Score;
7921014 }
7931015 break;
794 case ReorderingMode::Opcode:
795 // We accept both Instructions and Undefs, but with different scores.
796 if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
797 cast<Instruction>(Op)->getOpcode() ==
798 cast<Instruction>(OpLastLane)->getOpcode()) ||
799 (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
800 isa<UndefValue>(Op)) {
801 // An instruction has a higher score than an undef.
802 unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
803 if (Score > BestOp.Score) {
804 BestOp.Idx = Idx;
805 BestOp.Score = Score;
806 }
807 }
808 break;
809 case ReorderingMode::Constant:
810 if (isa<Constant>(Op)) {
811 unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
812 if (Score > BestOp.Score) {
813 BestOp.Idx = Idx;
814 BestOp.Score = Score;
815 }
816 }
817 break;
1016 }
8181017 case ReorderingMode::Splat:
8191018 if (Op == OpLastLane)
8201019 BestOp.Idx = Idx;
9451144 public:
9461145 /// Initialize with all the operands of the instruction vector \p RootVL.
9471146 VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
948 ScalarEvolution &SE)
949 : DL(DL), SE(SE) {
1147 ScalarEvolution &SE, const BoUpSLP &R)
1148 : DL(DL), SE(SE), R(R) {
9501149 // Append all the operands of RootVL.
9511150 appendOperandsOfVL(RootVL);
9521151 }
11681367 SmallVectorImpl<Value *> &Left,
11691368 SmallVectorImpl<Value *> &Right,
11701369 const DataLayout &DL,
1171 ScalarEvolution &SE);
1370 ScalarEvolution &SE,
1371 const BoUpSLP &R);
11721372 struct TreeEntry {
11731373 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
11741374 TreeEntry(VecTreeTy &Container) : Container(Container) {}
23702570 // Commutative predicate - collect + sort operands of the instructions
23712571 // so that each side is more likely to have the same opcode.
23722572 assert(P0 == SwapP0 && "Commutative Predicate mismatch");
2373 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
2573 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
23742574 } else {
23752575 // Collect operands - commute if it uses the swapped predicate.
23762576 for (Value *V : VL) {
24152615 // have the same opcode.
24162616 if (isa(VL0) && VL0->isCommutative()) {
24172617 ValueList Left, Right;
2418 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
2618 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
24192619 buildTree_rec(Left, Depth + 1, {TE, 0});
24202620 buildTree_rec(Right, Depth + 1, {TE, 1});
24212621 return;
25842784 // Reorder operands if reordering would enable vectorization.
25852785 if (isa(VL0)) {
25862786 ValueList Left, Right;
2587 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
2787 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
25882788 buildTree_rec(Left, Depth + 1, {TE, 0});
25892789 buildTree_rec(Right, Depth + 1, {TE, 1});
25902790 return;
33013501
33023502 // Perform operand reordering on the instructions in VL and return the reordered
33033503 // operands in Left and Right.
3304 void BoUpSLP::reorderInputsAccordingToOpcode(
3305 ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
3306 SmallVectorImpl<Value *> &Right, const DataLayout &DL,
3307 ScalarEvolution &SE) {
3504 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
3505 SmallVectorImpl<Value *> &Left,
3506 SmallVectorImpl<Value *> &Right,
3507 const DataLayout &DL,
3508 ScalarEvolution &SE,
3509 const BoUpSLP &R) {
33083510 if (VL.empty())
33093511 return;
3310 VLOperands Ops(VL, DL, SE);
3512 VLOperands Ops(VL, DL, SE, R);
33113513 // Reorder the operands in place.
33123514 Ops.reorder();
33133515 Left = Ops.getVL(0);
2626 ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
2727 ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
2828 ; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
29 ; CHECK-NEXT: [[A_0:%.*]] = load double, double* [[IDX0]], align 8
30 ; CHECK-NEXT: [[A_1:%.*]] = load double, double* [[IDX1]], align 8
31 ; CHECK-NEXT: [[B_0:%.*]] = load double, double* [[IDX2]], align 8
32 ; CHECK-NEXT: [[B_1:%.*]] = load double, double* [[IDX3]], align 8
33 ; CHECK-NEXT: [[C_0:%.*]] = load double, double* [[IDX4]], align 8
34 ; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8
35 ; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8
36 ; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], align 8
37 ; CHECK-NEXT: [[SUBAB_0:%.*]] = fsub fast double [[A_0]], [[B_0]]
38 ; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]]
39 ; CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]]
40 ; CHECK-NEXT: [[SUBCD_1:%.*]] = fsub fast double [[C_1]], [[D_1]]
41 ; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[SUBAB_0]], [[SUBCD_0]]
42 ; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[SUBCD_1]], [[SUBAB_1]]
43 ; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8
44 ; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8
29 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
30 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
31 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
32 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
33 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
34 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
35 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
36 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
37 ; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
38 ; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
39 ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP9]]
40 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
41 ; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
4542 ; CHECK-NEXT: ret void
4643 ;
4744 entry:
163160 ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
164161 ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
165162 ; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
166 ; CHECK-NEXT: [[A_0:%.*]] = load double, double* [[IDX0]], align 8
167 ; CHECK-NEXT: [[A_1:%.*]] = load double, double* [[IDX1]], align 8
168 ; CHECK-NEXT: [[B_0:%.*]] = load double, double* [[IDX2]], align 8
169 ; CHECK-NEXT: [[B_1:%.*]] = load double, double* [[IDX3]], align 8
170 ; CHECK-NEXT: [[C_0:%.*]] = load double, double* [[IDX4]], align 8
171 ; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8
172 ; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8
173 ; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], align 8
174 ; CHECK-NEXT: [[ADDAB_0:%.*]] = fadd fast double [[A_0]], [[B_0]]
175 ; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]]
176 ; CHECK-NEXT: [[ADDCD_1:%.*]] = fadd fast double [[C_1]], [[D_1]]
177 ; CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]]
178 ; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[ADDAB_0]], [[SUBCD_0]]
179 ; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[ADDCD_1]], [[SUBAB_1]]
180 ; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8
181 ; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8
163 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
164 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
165 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
166 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
167 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
168 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
169 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
170 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
171 ; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
172 ; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]]
173 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
174 ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
175 ; CHECK-NEXT: [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
176 ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
177 ; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP10]]
178 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
179 ; CHECK-NEXT: store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8
182180 ; CHECK-NEXT: ret void
183181 ;
184182 entry:
229227
230228 define void @lookahead_external_uses(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2) {
231229 ; CHECK-LABEL: @lookahead_external_uses(
230 ; CHECK-NEXT: entry:
231 ; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
232 ; CHECK-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
233 ; CHECK-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
234 ; CHECK-NEXT: [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
235 ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
236 ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
237 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
238 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
239 ; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8
240 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8
241 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8
242 ; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8
243 ; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8
244 ; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8
245 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
246 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
247 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
248 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A1]], i32 1
249 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
250 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B2]], i32 1
251 ; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]]
252 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
253 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
254 ; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]]
255 ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
256 ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
257 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
258 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
259 ; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
260 ; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8
261 ; CHECK-NEXT: ret void
262 ;
263 entry:
264 %IdxA0 = getelementptr inbounds double, double* %A, i64 0
265 %IdxB0 = getelementptr inbounds double, double* %B, i64 0
266 %IdxC0 = getelementptr inbounds double, double* %C, i64 0
267 %IdxD0 = getelementptr inbounds double, double* %D, i64 0
268
269 %IdxA1 = getelementptr inbounds double, double* %A, i64 1
270 %IdxB2 = getelementptr inbounds double, double* %B, i64 2
271 %IdxA2 = getelementptr inbounds double, double* %A, i64 2
272 %IdxB1 = getelementptr inbounds double, double* %B, i64 1
273
274 %A0 = load double, double *%IdxA0, align 8
275 %B0 = load double, double *%IdxB0, align 8
276 %C0 = load double, double *%IdxC0, align 8
277 %D0 = load double, double *%IdxD0, align 8
278
279 %A1 = load double, double *%IdxA1, align 8
280 %B2 = load double, double *%IdxB2, align 8
281 %A2 = load double, double *%IdxA2, align 8
282 %B1 = load double, double *%IdxB1, align 8
283
284 %subA0B0 = fsub fast double %A0, %B0
285 %subC0D0 = fsub fast double %C0, %D0
286
287 %subA1B2 = fsub fast double %A1, %B2
288 %subA2B1 = fsub fast double %A2, %B1
289
290 %add0 = fadd fast double %subA0B0, %subC0D0
291 %add1 = fadd fast double %subA1B2, %subA2B1
292
293 %IdxS0 = getelementptr inbounds double, double* %S, i64 0
294 %IdxS1 = getelementptr inbounds double, double* %S, i64 1
295
296 store double %add0, double *%IdxS0, align 8
297 store double %add1, double *%IdxS1, align 8
298
299 ; External use
300 store double %A1, double *%Ext1, align 8
301 ret void
302 }
303
304 ;   A[0] B[0]  C[0] D[0]   A[1] B[2]  A[2] B[1]
305 ;     \  /       \  /      / \  /       \  / \
306 ;      -          -  U1,U2,U3 -          -  U4,U5
307 ;       \        /             \        /
308 ;          +                      +
309 ;          |                      |
310 ;         S[0]                   S[1]
311 ;
312 ;
313 ; If we limit the users budget for the look-ahead heuristic to 2, then the
314 ; look-ahead heuristic has no way of choosing B[1] (with 2 external users)
315 ; over A[1] (with 3 external users).
316 ; The result is that the operands of the Add are not reordered and the loads
317 ; from A get vectorized instead of the loads from B.
318 ;
319 define void @lookahead_limit_users_budget(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2, double *%Ext3, double *%Ext4, double *%Ext5) {
320 ; CHECK-LABEL: @lookahead_limit_users_budget(
232321 ; CHECK-NEXT: entry:
233322 ; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
234323 ; CHECK-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
261350 ; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
262351 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
263352 ; CHECK-NEXT: store double [[TMP12]], double* [[EXT1:%.*]], align 8
353 ; CHECK-NEXT: store double [[TMP12]], double* [[EXT2:%.*]], align 8
354 ; CHECK-NEXT: store double [[TMP12]], double* [[EXT3:%.*]], align 8
355 ; CHECK-NEXT: store double [[B1]], double* [[EXT4:%.*]], align 8
356 ; CHECK-NEXT: store double [[B1]], double* [[EXT5:%.*]], align 8
264357 ; CHECK-NEXT: ret void
265358 ;
266359 entry:
299392 store double %add0, double *%IdxS0, align 8
300393 store double %add1, double *%IdxS1, align 8
301394
302 ; External use
395 ; External uses of A1
303396 store double %A1, double *%Ext1, align 8
304 ret void
305 }
397 store double %A1, double *%Ext2, align 8
398 store double %A1, double *%Ext3, align 8
399
400 ; External uses of B1
401 store double %B1, double *%Ext4, align 8
402 store double %B1, double *%Ext5, align 8
403
404 ret void
405 }
406
407 ; This checks that the lookahead code does not crash when instructions with the same opcodes have different numbers of operands (in this case the calls).
408
409 %Class = type { i8 }
410 declare double @_ZN1i2ayEv(%Class*)
411 declare double @_ZN1i2axEv()
412
413 define void @lookahead_crash(double* %A, double *%S, %Class *%Arg0) {
414 ; CHECK-LABEL: @lookahead_crash(
415 ; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
416 ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
417 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
418 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
419 ; CHECK-NEXT: [[C0:%.*]] = call double @_ZN1i2ayEv(%Class* [[ARG0:%.*]])
420 ; CHECK-NEXT: [[C1:%.*]] = call double @_ZN1i2axEv()
421 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
422 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[C1]], i32 1
423 ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
424 ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
425 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
426 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
427 ; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
428 ; CHECK-NEXT: ret void
429 ;
430 %IdxA0 = getelementptr inbounds double, double* %A, i64 0
431 %IdxA1 = getelementptr inbounds double, double* %A, i64 1
432
433 %A0 = load double, double *%IdxA0, align 8
434 %A1 = load double, double *%IdxA1, align 8
435
436 %C0 = call double @_ZN1i2ayEv(%Class *%Arg0)
437 %C1 = call double @_ZN1i2axEv()
438
439 %add0 = fadd fast double %A0, %C0
440 %add1 = fadd fast double %A1, %C1
441
442 %IdxS0 = getelementptr inbounds double, double* %S, i64 0
443 %IdxS1 = getelementptr inbounds double, double* %S, i64 1
444 store double %add0, double *%IdxS0, align 8
445 store double %add1, double *%IdxS1, align 8
446 ret void
447 }