llvm.org GIT mirror llvm, commit 88564d2
Temporarily Revert "[SLP] Recommit: Look-ahead operand reordering heuristic."
There are reported miscompiles with AVX512 and performance regressions in Eigen; this was verified with the original committer, and test cases will be forthcoming. This reverts commit r364964.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@366154 91177308-0d34-0410-b5e6-96231b3b80d8
Eric Christopher, a month ago
2 changed files with 80 additions and 424 deletions.
145145 static cl::opt<unsigned> MinTreeSize(
146146 "slp-min-tree-size", cl::init(3), cl::Hidden,
147147 cl::desc("Only vectorize small trees if they are fully vectorizable"));
148
149 // The maximum depth that the look-ahead score heuristic will explore.
150 // The higher this value, the higher the compilation time overhead.
151 static cl::opt<int> LookAheadMaxDepth(
152 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
153 cl::desc("The maximum look-ahead depth for operand reordering scores"));
154
155 // The Look-ahead heuristic goes through the users of the bundle to calculate
156 // the users cost in getExternalUsesCost(). To avoid compilation time increase
157 // we limit the number of users visited to this value.
158 static cl::opt<unsigned> LookAheadUsersBudget(
159 "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
160 cl::desc("The maximum number of users to visit while visiting the "
161 "predecessors. This prevents compilation time increase."));
162148
163149 static cl::opt<bool>
164150 ViewSLPTree("view-slp-tree", cl::Hidden,
721707
722708 const DataLayout &DL;
723709 ScalarEvolution &SE;
724 const BoUpSLP &R;
725710
726711 /// \returns the operand data at \p OpIdx and \p Lane.
727712 OperandData &getData(unsigned OpIdx, unsigned Lane) {
747732 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
748733 }
749734
750 // The hard-coded scores listed here are not very important. When computing
751 // the scores of matching one sub-tree with another, we are basically
752 // counting the number of values that are matching. So even if all scores
753 // are set to 1, we would still get a decent matching result.
754 // However, sometimes we have to break ties. For example we may have to
755 // choose between matching loads vs matching opcodes. This is what these
756 // scores are helping us with: they provide the order of preference.
757
758 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
759 static const int ScoreConsecutiveLoads = 3;
760 /// Constants.
761 static const int ScoreConstants = 2;
762 /// Instructions with the same opcode.
763 static const int ScoreSameOpcode = 2;
764 /// Instructions with alt opcodes (e.g, add + sub).
765 static const int ScoreAltOpcodes = 1;
766 /// Identical instructions (a.k.a. splat or broadcast).
767 static const int ScoreSplat = 1;
768 /// Matching with an undef is preferable to failing.
769 static const int ScoreUndef = 1;
770 /// Score for failing to find a decent match.
771 static const int ScoreFail = 0;
772 /// User external to the vectorized code.
773 static const int ExternalUseCost = 1;
774 /// The user is internal but in a different lane.
775 static const int UserInDiffLaneCost = ExternalUseCost;
776
777 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
778 static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
779 ScalarEvolution &SE) {
780 auto *LI1 = dyn_cast<LoadInst>(V1);
781 auto *LI2 = dyn_cast<LoadInst>(V2);
782 if (LI1 && LI2)
783 return isConsecutiveAccess(LI1, LI2, DL, SE)
784 ? VLOperands::ScoreConsecutiveLoads
785 : VLOperands::ScoreFail;
786
787 auto *C1 = dyn_cast<Constant>(V1);
788 auto *C2 = dyn_cast<Constant>(V2);
789 if (C1 && C2)
790 return VLOperands::ScoreConstants;
791
792 auto *I1 = dyn_cast<Instruction>(V1);
793 auto *I2 = dyn_cast<Instruction>(V2);
794 if (I1 && I2) {
795 if (I1 == I2)
796 return VLOperands::ScoreSplat;
797 InstructionsState S = getSameOpcode({I1, I2});
798 // Note: Only consider instructions with <= 2 operands to avoid
799 // complexity explosion.
800 if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
801 return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
802 : VLOperands::ScoreSameOpcode;
803 }
804
805 if (isa<UndefValue>(V2))
806 return VLOperands::ScoreUndef;
807
808 return VLOperands::ScoreFail;
809 }
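Purely as an illustration of the preference order encoded by the score constants above (this is not code from the patch), the following self-contained C++ sketch ranks toy candidate kinds with the same relative scores; the Kind enum, the shallowScore helper, and the candidate list are all invented for this example.

#include <algorithm>
#include <cassert>
#include <vector>

// Toy stand-ins for the kinds of value pairs getShallowScore distinguishes.
enum class Kind { ConsecutiveLoad, Constant, SameOpcode, AltOpcode, Splat, Undef, NoMatch };

// Same preference order as the ScoreXXX constants in the patch.
static int shallowScore(Kind K) {
  switch (K) {
  case Kind::ConsecutiveLoad: return 3; // ScoreConsecutiveLoads
  case Kind::Constant:        return 2; // ScoreConstants
  case Kind::SameOpcode:      return 2; // ScoreSameOpcode
  case Kind::AltOpcode:       return 1; // ScoreAltOpcodes
  case Kind::Splat:           return 1; // ScoreSplat
  case Kind::Undef:           return 1; // ScoreUndef
  case Kind::NoMatch:         return 0; // ScoreFail
  }
  return 0;
}

int main() {
  // Even with mostly equal scores, the ordering breaks ties: a candidate that
  // pairs consecutive loads wins over one that merely matches opcodes.
  std::vector<Kind> Candidates = {Kind::SameOpcode, Kind::ConsecutiveLoad, Kind::Undef};
  auto Best = std::max_element(Candidates.begin(), Candidates.end(),
                               [](Kind A, Kind B) { return shallowScore(A) < shallowScore(B); });
  assert(*Best == Kind::ConsecutiveLoad);
  return 0;
}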
810
811 /// Holds the values and their lane that are taking part in the look-ahead
812 /// score calculation. This is used in the external uses cost calculation.
813 SmallDenseMap<Value *, int> InLookAheadValues;
814
815 /// \Returns the additional cost due to uses of \p LHS and \p RHS that are
816 /// either external to the vectorized code, or require shuffling.
817 int getExternalUsesCost(const std::pair<Value *, int> &LHS,
818 const std::pair<Value *, int> &RHS) {
819 int Cost = 0;
820 SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
821 for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
822 Value *V = Values[Idx].first;
823 // Calculate the absolute lane, using the minimum relative lane of LHS
824 // and RHS as base and Idx as the offset.
825 int Ln = std::min(LHS.second, RHS.second) + Idx;
826 assert(Ln >= 0 && "Bad lane calculation");
827 unsigned UsersBudget = LookAheadUsersBudget;
828 for (User *U : V->users()) {
829 if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
830 // The user is in the VectorizableTree. Check if we need to insert.
831 auto It = llvm::find(UserTE->Scalars, U);
832 assert(It != UserTE->Scalars.end() && "U is in UserTE");
833 int UserLn = std::distance(UserTE->Scalars.begin(), It);
834 assert(UserLn >= 0 && "Bad lane");
835 if (UserLn != Ln)
836 Cost += UserInDiffLaneCost;
837 } else {
838 // Check if the user is in the look-ahead code.
839 auto It2 = InLookAheadValues.find(U);
840 if (It2 != InLookAheadValues.end()) {
841 // The user is in the look-ahead code. Check the lane.
842 if (It2->second != Ln)
843 Cost += UserInDiffLaneCost;
844 } else {
845 // The user is neither in SLP tree nor in the look-ahead code.
846 Cost += ExternalUseCost;
847 }
848 }
849 // Limit the number of visited uses to cap compilation time.
850 if (--UsersBudget == 0)
851 break;
852 }
853 }
854 return Cost;
855 }
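To make the users-budget mechanism concrete, here is a small standalone sketch. The ToyUser/ToyValue structs, the unit costs, and the example user list are assumptions invented for this sketch; only the per-user penalty and the budget cut-off mirror getExternalUsesCost and LookAheadUsersBudget.

#include <iostream>
#include <vector>

struct ToyUser {
  bool InTree; // user is part of the vectorized tree (or look-ahead set)
  int Lane;    // lane it occupies there, if InTree
};

struct ToyValue {
  std::vector<ToyUser> Users;
};

// Charge 1 per external user or per in-tree user in a different lane, but
// never look at more than Budget users (mirrors the LookAheadUsersBudget cap).
static int externalUsesCost(const ToyValue &V, int Lane, unsigned Budget) {
  int Cost = 0;
  for (const ToyUser &U : V.Users) {
    if (!U.InTree || U.Lane != Lane)
      ++Cost;
    if (--Budget == 0)
      break;
  }
  return Cost;
}

int main() {
  // One in-tree user in the right lane, followed by three external users.
  ToyValue V{{{true, 0}, {false, 0}, {false, 0}, {false, 0}}};
  // With a generous budget all three external users count; with a budget of 2
  // only one of them is ever seen, so heavily-used values can look cheap.
  std::cout << externalUsesCost(V, /*Lane=*/0, /*Budget=*/4) << "\n"; // prints 3
  std::cout << externalUsesCost(V, /*Lane=*/0, /*Budget=*/2) << "\n"; // prints 1
  return 0;
}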
856
857 /// Go through the operands of \p LHS and \p RHS recursively until \p
858 /// MaxLevel, and return the cumulative score. For example:
859 /// \verbatim
860 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
861 /// \ / \ / \ / \ /
862 /// + + + +
863 /// G1 G2 G3 G4
864 /// \endverbatim
865 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
866 /// each level recursively, accumulating the score. It starts from matching
867 /// the additions at level 0, then moves on to the loads (level 1). The
868 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
869 /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
870 /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
871 /// Please note that the order of the operands does not matter, as we
872 /// evaluate the score of all profitable combinations of operands. In
873 /// other words the score of G1 and G4 is the same as G1 and G2. This
874 /// heuristic is based on ideas described in:
875 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
876 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
877 /// Luís F. W. Góes
878 int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
879 const std::pair<Value *, int> &RHS, int CurrLevel,
880 int MaxLevel) {
881
882 Value *V1 = LHS.first;
883 Value *V2 = RHS.first;
884 // Get the shallow score of V1 and V2.
885 int ShallowScoreAtThisLevel =
886 std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
887 getExternalUsesCost(LHS, RHS));
888 int Lane1 = LHS.second;
889 int Lane2 = RHS.second;
890
891 // If reached MaxLevel,
892 // or if V1 and V2 are not instructions,
893 // or if they are SPLAT,
894 // or if they are not consecutive, early return the current cost.
895 auto *I1 = dyn_cast<Instruction>(V1);
896 auto *I2 = dyn_cast<Instruction>(V2);
897 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
898 ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
899 (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
900 return ShallowScoreAtThisLevel;
901 assert(I1 && I2 && "Should have early exited.");
902
903 // Keep track of in-tree values for determining the external-use cost.
904 InLookAheadValues[V1] = Lane1;
905 InLookAheadValues[V2] = Lane2;
906
907 // Contains the I2 operand indexes that got matched with I1 operands.
908 SmallSet<unsigned, 4> Op2Used;
909
910 // Recursion towards the operands of I1 and I2. We are trying all possible
911 // operand pairs, and keeping track of the best score.
912 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
913 OpIdx1 != NumOperands1; ++OpIdx1) {
914 // Try to pair the operand at OpIdx1 of I1 with the best operand of I2.
915 int MaxTmpScore = 0;
916 unsigned MaxOpIdx2 = 0;
917 bool FoundBest = false;
918 // If I2 is commutative try all combinations.
919 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
920 unsigned ToIdx = isCommutative(I2)
921 ? I2->getNumOperands()
922 : std::min(I2->getNumOperands(), OpIdx1 + 1);
923 assert(FromIdx <= ToIdx && "Bad index");
924 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
925 // Skip operands already paired with OpIdx1.
926 if (Op2Used.count(OpIdx2))
927 continue;
928 // Recursively calculate the cost at each level
929 int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
930 {I2->getOperand(OpIdx2), Lane2},
931 CurrLevel + 1, MaxLevel);
932 // Look for the best score.
933 if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
934 MaxTmpScore = TmpScore;
935 MaxOpIdx2 = OpIdx2;
936 FoundBest = true;
937 }
938 }
939 if (FoundBest) {
940 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
941 Op2Used.insert(MaxOpIdx2);
942 ShallowScoreAtThisLevel += MaxTmpScore;
943 }
944 }
945 return ShallowScoreAtThisLevel;
946 }
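As a rough sanity check of the G1..G4 example in the comment above (ignoring getExternalUsesCost penalties, i.e. assuming no external uses), the default slp-max-look-ahead-depth of 2 means the two candidate roots and one further level of their operands are compared. Matching G1 with G2 then scores ScoreSameOpcode = 2 for the pair of additions, plus ScoreConsecutiveLoads = 3 for {A[0], A[1]} and another 3 for {B[0], B[1]}, for a total of 8, whereas matching G1 with G3 scores only the 2 for the additions because {A[0], C[0]} and {B[0], D[0]} both fail with ScoreFail = 0. Because operand pairs are tried in both orders for commutative instructions, G1 with G4 also scores 8, as the comment notes.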
947
948 /// \Returns the look-ahead score, which tells us how much the sub-trees
949 /// rooted at \p LHS and \p RHS match; the more they match, the higher the
950 /// score. This helps break ties in an informed way when we cannot decide on
951 /// the order of the operands by just considering the immediate
952 /// predecessors.
953 int getLookAheadScore(const std::pair<Value *, int> &LHS,
954 const std::pair<Value *, int> &RHS) {
955 InLookAheadValues.clear();
956 return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
957 }
958
959735 // Search all operands in Ops[*][Lane] for the one that matches best
960736 // Ops[OpIdx][LastLane] and return its operand index.
961737 // If no good match can be found, return None.
973749 // The linearized opcode of the operand at OpIdx, Lane.
974750 bool OpIdxAPO = getData(OpIdx, Lane).APO;
975751
752 const unsigned BestScore = 2;
753 const unsigned GoodScore = 1;
754
976755 // The best operand index and its score.
977756 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
978757 // are using the score to differentiate between the two.
1001780 // Look for an operand that matches the current mode.
1002781 switch (RMode) {
1003782 case ReorderingMode::Load:
1004 case ReorderingMode::Constant:
1005 case ReorderingMode::Opcode: {
1006 bool LeftToRight = Lane > LastLane;
1007 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1008 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1009 unsigned Score =
1010 getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
1011 if (Score > BestOp.Score) {
1012 BestOp.Idx = Idx;
1013 BestOp.Score = Score;
783 if (isa<LoadInst>(Op)) {
784 // Figure out which is left and right, so that we can check for
785 // consecutive loads.
786 bool LeftToRight = Lane > LastLane;
787 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
788 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
789 if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
790 cast<LoadInst>(OpRight), DL, SE))
791 BestOp.Idx = Idx;
1014792 }
1015793 break;
1016 }
794 case ReorderingMode::Opcode:
795 // We accept both Instructions and Undefs, but with different scores.
796 if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
797 cast<Instruction>(Op)->getOpcode() ==
798 cast<Instruction>(OpLastLane)->getOpcode()) ||
799 (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
800 isa<UndefValue>(Op)) {
801 // An instruction has a higher score than an undef.
802 unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
803 if (Score > BestOp.Score) {
804 BestOp.Idx = Idx;
805 BestOp.Score = Score;
806 }
807 }
808 break;
809 case ReorderingMode::Constant:
810 if (isa<Constant>(Op)) {
811 unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
812 if (Score > BestOp.Score) {
813 BestOp.Idx = Idx;
814 BestOp.Score = Score;
815 }
816 }
817 break;
1017818 case ReorderingMode::Splat:
1018819 if (Op == OpLastLane)
1019820 BestOp.Idx = Idx;
1144945 public:
1145946 /// Initialize with all the operands of the instruction vector \p RootVL.
1146947 VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
1147 ScalarEvolution &SE, const BoUpSLP &R)
1148 : DL(DL), SE(SE), R(R) {
948 ScalarEvolution &SE)
949 : DL(DL), SE(SE) {
1149950 // Append all the operands of RootVL.
1150951 appendOperandsOfVL(RootVL);
1151952 }
13671168 SmallVectorImpl<Value *> &Left,
13681169 SmallVectorImpl<Value *> &Right,
13691170 const DataLayout &DL,
1370 ScalarEvolution &SE,
1371 const BoUpSLP &R);
1171 ScalarEvolution &SE);
13721172 struct TreeEntry {
13731173 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
13741174 TreeEntry(VecTreeTy &Container) : Container(Container) {}
25702370 // Commutative predicate - collect + sort operands of the instructions
25712371 // so that each side is more likely to have the same opcode.
25722372 assert(P0 == SwapP0 && "Commutative Predicate mismatch");
2573 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
2373 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
25742374 } else {
25752375 // Collect operands - commute if it uses the swapped predicate.
25762376 for (Value *V : VL) {
26152415 // have the same opcode.
26162416 if (isa(VL0) && VL0->isCommutative()) {
26172417 ValueList Left, Right;
2618 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
2418 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
26192419 buildTree_rec(Left, Depth + 1, {TE, 0});
26202420 buildTree_rec(Right, Depth + 1, {TE, 1});
26212421 return;
27842584 // Reorder operands if reordering would enable vectorization.
27852585 if (isa(VL0)) {
27862586 ValueList Left, Right;
2787 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
2587 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
27882588 buildTree_rec(Left, Depth + 1, {TE, 0});
27892589 buildTree_rec(Right, Depth + 1, {TE, 1});
27902590 return;
35053305
35063306 // Perform operand reordering on the instructions in VL and return the reordered
35073307 // operands in Left and Right.
3508 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
3509 SmallVectorImpl<Value *> &Left,
3510 SmallVectorImpl<Value *> &Right,
3511 const DataLayout &DL,
3512 ScalarEvolution &SE,
3513 const BoUpSLP &R) {
3308 void BoUpSLP::reorderInputsAccordingToOpcode(
3309 ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
3310 SmallVectorImpl<Value *> &Right, const DataLayout &DL,
3311 ScalarEvolution &SE) {
35143312 if (VL.empty())
35153313 return;
3516 VLOperands Ops(VL, DL, SE, R);
3314 VLOperands Ops(VL, DL, SE);
35173315 // Reorder the operands in place.
35183316 Ops.reorder();
35193317 Left = Ops.getVL(0);
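For intuition about the Left/Right outputs, here is a minimal sketch of the kind of per-lane operand swapping this entry point performs, using strings in place of IR values. The reorderToy helper and its same-array heuristic (grouping by the first letter of the name) are invented for this example and are far simpler than the real VLOperands logic.

#include <string>
#include <utility>
#include <vector>

// Each "instruction" is a commutative pair of named operands, e.g. {"A0", "B0"}.
using ToyOperands = std::pair<std::string, std::string>;

// Tiny stand-in for reorderInputsAccordingToOpcode: keep lane 0 as-is and, for
// every other lane, swap the operands if that groups values from the same
// array (same leading letter) into the same output column.
static void reorderToy(const std::vector<ToyOperands> &VL,
                       std::vector<std::string> &Left,
                       std::vector<std::string> &Right) {
  if (VL.empty())
    return;
  Left.push_back(VL[0].first);
  Right.push_back(VL[0].second);
  for (size_t Lane = 1; Lane < VL.size(); ++Lane) {
    bool Swap = VL[Lane].first[0] != Left.back()[0];
    Left.push_back(Swap ? VL[Lane].second : VL[Lane].first);
    Right.push_back(Swap ? VL[Lane].first : VL[Lane].second);
  }
}

int main() {
  // add(A0, B0) and add(B1, A1): the second lane gets swapped so that
  // Left == {"A0", "A1"} and Right == {"B0", "B1"}, i.e. each column could be
  // loaded as one consecutive vector load, which is what the tests below check.
  std::vector<ToyOperands> VL = {{"A0", "B0"}, {"B1", "A1"}};
  std::vector<std::string> Left, Right;
  reorderToy(VL, Left, Right);
  return 0;
}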
2626 ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
2727 ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
2828 ; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
29 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
30 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
31 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
32 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
33 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
34 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
35 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
36 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
37 ; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
38 ; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
39 ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP9]]
40 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
41 ; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
29 ; CHECK-NEXT: [[A_0:%.*]] = load double, double* [[IDX0]], align 8
30 ; CHECK-NEXT: [[A_1:%.*]] = load double, double* [[IDX1]], align 8
31 ; CHECK-NEXT: [[B_0:%.*]] = load double, double* [[IDX2]], align 8
32 ; CHECK-NEXT: [[B_1:%.*]] = load double, double* [[IDX3]], align 8
33 ; CHECK-NEXT: [[C_0:%.*]] = load double, double* [[IDX4]], align 8
34 ; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8
35 ; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8
36 ; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], align 8
37 ; CHECK-NEXT: [[SUBAB_0:%.*]] = fsub fast double [[A_0]], [[B_0]]
38 ; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]]
39 ; CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]]
40 ; CHECK-NEXT: [[SUBCD_1:%.*]] = fsub fast double [[C_1]], [[D_1]]
41 ; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[SUBAB_0]], [[SUBCD_0]]
42 ; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[SUBCD_1]], [[SUBAB_1]]
43 ; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8
44 ; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8
4245 ; CHECK-NEXT: ret void
4346 ;
4447 entry:
160163 ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
161164 ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
162165 ; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
163 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
164 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
165 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
166 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
167 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
168 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
169 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
170 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
171 ; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
172 ; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]]
173 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32>
174 ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
175 ; CHECK-NEXT: [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
176 ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32>
177 ; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP10]]
178 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
179 ; CHECK-NEXT: store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8
166 ; CHECK-NEXT: [[A_0:%.*]] = load double, double* [[IDX0]], align 8
167 ; CHECK-NEXT: [[A_1:%.*]] = load double, double* [[IDX1]], align 8
168 ; CHECK-NEXT: [[B_0:%.*]] = load double, double* [[IDX2]], align 8
169 ; CHECK-NEXT: [[B_1:%.*]] = load double, double* [[IDX3]], align 8
170 ; CHECK-NEXT: [[C_0:%.*]] = load double, double* [[IDX4]], align 8
171 ; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8
172 ; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8
173 ; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], align 8
174 ; CHECK-NEXT: [[ADDAB_0:%.*]] = fadd fast double [[A_0]], [[B_0]]
175 ; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]]
176 ; CHECK-NEXT: [[ADDCD_1:%.*]] = fadd fast double [[C_1]], [[D_1]]
177 ; CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]]
178 ; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[ADDAB_0]], [[SUBCD_0]]
179 ; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[ADDCD_1]], [[SUBAB_1]]
180 ; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8
181 ; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8
180182 ; CHECK-NEXT: ret void
181183 ;
182184 entry:
227229
228230 define void @lookahead_external_uses(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2) {
229231 ; CHECK-LABEL: @lookahead_external_uses(
230 ; CHECK-NEXT: entry:
231 ; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
232 ; CHECK-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
233 ; CHECK-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
234 ; CHECK-NEXT: [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
235 ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
236 ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
237 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
238 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
239 ; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8
240 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8
241 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8
242 ; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8
243 ; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8
244 ; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8
245 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
246 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
247 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
248 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A1]], i32 1
249 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
250 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B2]], i32 1
251 ; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]]
252 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
253 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
254 ; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]]
255 ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
256 ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
257 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
258 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
259 ; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
260 ; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8
261 ; CHECK-NEXT: ret void
262 ;
263 entry:
264 %IdxA0 = getelementptr inbounds double, double* %A, i64 0
265 %IdxB0 = getelementptr inbounds double, double* %B, i64 0
266 %IdxC0 = getelementptr inbounds double, double* %C, i64 0
267 %IdxD0 = getelementptr inbounds double, double* %D, i64 0
268
269 %IdxA1 = getelementptr inbounds double, double* %A, i64 1
270 %IdxB2 = getelementptr inbounds double, double* %B, i64 2
271 %IdxA2 = getelementptr inbounds double, double* %A, i64 2
272 %IdxB1 = getelementptr inbounds double, double* %B, i64 1
273
274 %A0 = load double, double *%IdxA0, align 8
275 %B0 = load double, double *%IdxB0, align 8
276 %C0 = load double, double *%IdxC0, align 8
277 %D0 = load double, double *%IdxD0, align 8
278
279 %A1 = load double, double *%IdxA1, align 8
280 %B2 = load double, double *%IdxB2, align 8
281 %A2 = load double, double *%IdxA2, align 8
282 %B1 = load double, double *%IdxB1, align 8
283
284 %subA0B0 = fsub fast double %A0, %B0
285 %subC0D0 = fsub fast double %C0, %D0
286
287 %subA1B2 = fsub fast double %A1, %B2
288 %subA2B1 = fsub fast double %A2, %B1
289
290 %add0 = fadd fast double %subA0B0, %subC0D0
291 %add1 = fadd fast double %subA1B2, %subA2B1
292
293 %IdxS0 = getelementptr inbounds double, double* %S, i64 0
294 %IdxS1 = getelementptr inbounds double, double* %S, i64 1
295
296 store double %add0, double *%IdxS0, align 8
297 store double %add1, double *%IdxS1, align 8
298
299 ; External use
300 store double %A1, double *%Ext1, align 8
301 ret void
302 }
303
304 ; A[0] B[0] C[0] D[0] A[1] B[2] A[2] B[1]
305 ; \ / \ / / \ / \ / \
306 ; - - U1,U2,U3 - - U4,U5
307 ; \ / \ /
308 ; + +
309 ; | |
310 ; S[0] S[1]
311 ;
312 ;
313 ; If we limit the users budget for the look-ahead heuristic to 2, then the
314 ; look-ahead heuristic has no way of choosing B[1] (with 2 external users)
315 ; over A[1] (with 3 external users): with a budget of 2, at most two of each value's users are ever visited, so A[1]'s extra external uses are never seen.
316 ; The result is that the operands of the Add are not reordered and the loads
317 ; from A get vectorized instead of the loads from B.
318 ;
319 define void @lookahead_limit_users_budget(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2, double *%Ext3, double *%Ext4, double *%Ext5) {
320 ; CHECK-LABEL: @lookahead_limit_users_budget(
321232 ; CHECK-NEXT: entry:
322233 ; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
323234 ; CHECK-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
350261 ; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
351262 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
352263 ; CHECK-NEXT: store double [[TMP12]], double* [[EXT1:%.*]], align 8
353 ; CHECK-NEXT: store double [[TMP12]], double* [[EXT2:%.*]], align 8
354 ; CHECK-NEXT: store double [[TMP12]], double* [[EXT3:%.*]], align 8
355 ; CHECK-NEXT: store double [[B1]], double* [[EXT4:%.*]], align 8
356 ; CHECK-NEXT: store double [[B1]], double* [[EXT5:%.*]], align 8
357264 ; CHECK-NEXT: ret void
358265 ;
359266 entry:
392299 store double %add0, double *%IdxS0, align 8
393300 store double %add1, double *%IdxS1, align 8
394301
395 ; External uses of A1
302 ; External use
396303 store double %A1, double *%Ext1, align 8
397 store double %A1, double *%Ext2, align 8
398 store double %A1, double *%Ext3, align 8
399
400 ; External uses of B1
401 store double %B1, double *%Ext4, align 8
402 store double %B1, double *%Ext5, align 8
403
404304 ret void
405305 }
406
407 ; This checks that the lookahead code does not crash when instructions with the same opcodes have different numbers of operands (in this case the calls).
408
409 %Class = type { i8 }
410 declare double @_ZN1i2ayEv(%Class*)
411 declare double @_ZN1i2axEv()
412
413 define void @lookahead_crash(double* %A, double *%S, %Class *%Arg0) {
414 ; CHECK-LABEL: @lookahead_crash(
415 ; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
416 ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
417 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
418 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
419 ; CHECK-NEXT: [[C0:%.*]] = call double @_ZN1i2ayEv(%Class* [[ARG0:%.*]])
420 ; CHECK-NEXT: [[C1:%.*]] = call double @_ZN1i2axEv()
421 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
422 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[C1]], i32 1
423 ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
424 ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
425 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
426 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
427 ; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
428 ; CHECK-NEXT: ret void
429 ;
430 %IdxA0 = getelementptr inbounds double, double* %A, i64 0
431 %IdxA1 = getelementptr inbounds double, double* %A, i64 1
432
433 %A0 = load double, double *%IdxA0, align 8
434 %A1 = load double, double *%IdxA1, align 8
435
436 %C0 = call double @_ZN1i2ayEv(%Class *%Arg0)
437 %C1 = call double @_ZN1i2axEv()
438
439 %add0 = fadd fast double %A0, %C0
440 %add1 = fadd fast double %A1, %C1
441
442 %IdxS0 = getelementptr inbounds double, double* %S, i64 0
443 %IdxS1 = getelementptr inbounds double, double* %S, i64 1
444 store double %add0, double *%IdxS0, align 8
445 store double %add1, double *%IdxS1, align 8
446 ret void
447 }