llvm.org GIT mirror llvm / 171807f
[SLP] Fixed cost model for horizontal reduction. Currently when cost of scalar operations is evaluated the vector type is used for scalar operations. Patch fixes this issue and fixes evaluation of the vector operations cost. Several tests showed that vector cost model is too optimistic. It allowed vectorization of 8 or less add/fadd operations, though scalar code is faster. Actually, only for 16 or more operations vector code provides better performance. Differential Revision: https://reviews.llvm.org/D26277 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288398 91177308-0d34-0410-b5e6-96231b3b80d8 Alexey Bataev 3 years ago
4 changed file(s) with 70 addition(s) and 14 deletion(s). Raw diff Collapse all Expand all
926926
927927 unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) {
928928 assert(Ty->isVectorTy() && "Expect a vector type");
929 Type *ScalarTy = Ty->getVectorElementType();
929930 unsigned NumVecElts = Ty->getVectorNumElements();
930931 unsigned NumReduxLevels = Log2_32(NumVecElts);
931 unsigned ArithCost =
932 NumReduxLevels *
933 static_cast(this)->getArithmeticInstrCost(Opcode, Ty);
934 // Assume the pairwise shuffles add a cost.
935 unsigned ShuffleCost =
936 NumReduxLevels * (IsPairwise + 1) *
937 static_cast(this)
938 ->getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
932 // Try to calculate arithmetic and shuffle op costs for reduction operations.
933 // We're assuming that reduction operations are performed in the following way:
934 // 1. Non-pairwise reduction
935 // %val1 = shufflevector<n x t> %val, <n x t> %undef,
936 // <n x i32> <n/2, n/2 + 1, ..., n - 1, undef, ..., undef>
937 // \----------------v-------------/ \----------v------------/
938 // n/2 elements n/2 elements
939 // %red1 = op %val, val1
940 // After this operation we have a vector %red1 in which only the first n/2
941 // elements are meaningful; the second n/2 elements are undefined and can be
942 // dropped. All other operations are actually working with the vector of
943 // length n/2, not n, though the real vector length is still n.
944 // %val2 = shufflevector<n x t> %red1, <n x t> %undef,
945 // <n x i32> <n/4, n/4 + 1, ..., n/2 - 1, undef, ..., undef>
946 // \----------------v-------------/ \----------v------------/
947 // n/4 elements 3*n/4 elements
948 // %red2 = op %red1, val2 - working with the vector of
949 // length n/2, the resulting vector has length n/4 etc.
950 // 2. Pairwise reduction:
951 // Everything is the same except for an additional shuffle operation which
952 // is used to produce operands for pairwise kind of reductions.
953 // %val1 = shufflevector<n x t> %val, <n x t> %undef,
954 // <n x i32> <0, 2, ..., n - 2, undef, ..., undef>
955 // \-------------v----------/ \----------v------------/
956 // n/2 elements n/2 elements
957 // %val2 = shufflevector<n x t> %val, <n x t> %undef,
958 // <n x i32> <1, 3, ..., n - 1, undef, ..., undef>
959 // \-------------v----------/ \----------v------------/
960 // n/2 elements n/2 elements
961 // %red1 = op %val1, val2
962 // Again, the operation is performed on <n x type> vector, but the resulting
963 // vector %red1 is <n/2 x type> vector.
964 //
965 // The cost model should take into account that the actual length of the
966 // vector is reduced on each iteration.
967 unsigned ArithCost = 0;
968 unsigned ShuffleCost = 0;
969 auto *ConcreteTTI = static_cast(this);
970 std::pair LT =
971 ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
972 unsigned LongVectorCount = 0;
973 unsigned MVTLen =
974 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
975 while (NumVecElts > MVTLen) {
976 NumVecElts /= 2;
977 // Assume the pairwise shuffles add a cost.
978 ShuffleCost += (IsPairwise + 1) *
979 ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
980 NumVecElts, Ty);
981 ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
982 Ty = VectorType::get(ScalarTy, NumVecElts);
983 ++LongVectorCount;
984 }
985 // The minimal length of the vector is limited by the real length of vector
986 // operations performed on the current platform. That's why several final
987 // reduction operations are performed on the vectors with the same
988 // architecture-dependent length.
989 ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
990 ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
991 NumVecElts, Ty);
992 ArithCost += (NumReduxLevels - LongVectorCount) *
993 ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
939994 return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
940995 }
941996
42864286 int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
42874287
42884288 int ScalarReduxCost =
4289 ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
4289 (ReduxWidth - 1) *
4290 TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
42904291
42914292 DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
42924293 << " for reduction that starts with " << *FirstReducedVal
3232 %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
3333
3434 ; CHECK-LABEL: reduction_cost_int
35 ; CHECK: cost of 17 {{.*}} extractelement
35 ; CHECK: cost of 11 {{.*}} extractelement
3636 ; AVX-LABEL: reduction_cost_int
3737 ; AVX: cost of 5 {{.*}} extractelement
3838
99 ; return sum;
1010 ; }
1111
12 ; Vector cost is 5, Scalar cost is 32
13 ; CHECK: Adding cost -27 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
14 ; Vector cost is 17, Scalar cost is 16
15 ; SSE2: Adding cost 1 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
12 ; Vector cost is 5, Scalar cost is 7
13 ; CHECK: Adding cost -2 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
14 ; Vector cost is 11, Scalar cost is 7
15 ; SSE2: Adding cost 4 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
1616 define i32 @test(i32* nocapture readonly %p) {
1717 ; CHECK-LABEL: @test(
1818 ; CHECK: [[BC:%.*]] = bitcast i32* %p to <8 x i32>*