llvm.org GIT mirror: llvm, commit 7768f31
[AMDGPU] Tune inlining parameters for AMDGPU target

Summary:
Since the target gains no significant advantage from vectorization, the
vector-instruction threshold bonus should be optional. The default value of
the amdgpu-inline-arg-alloca-cost parameter and the target's
InliningThresholdMultiplier were then retuned accordingly.

Reviewers: arsenm, rampitec

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, eraman, hiraditya, haicheng, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D64642

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@366348 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Daniil Fukalov
9 changed files with 60 additions and 19 deletions.
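In short, the patch introduces a new TTI hook, getInlinerVectorBonusPercent(), so a target can scale or disable the inliner's vector-instruction bonus instead of always receiving the hard-coded 150%. Below is a minimal sketch of how a target opts out; the class name MyTargetTTIImpl is hypothetical and used only for illustration, while the real AMDGPU override appears in the AMDGPUTargetTransformInfo.h hunk further down.

  // Hypothetical target TTI implementation (illustration only, not part of
  // this patch).
  class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
  public:
    // Opt out of the inliner's vector-instruction bonus entirely.
    int getInlinerVectorBonusPercent() { return 0; }
    // ... remaining target hooks elided ...
  };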
@@ -261,6 +261,18 @@
   /// TODO: This is a rather blunt instrument. Perhaps altering the costs of
   /// individual classes of instructions would be better.
   unsigned getInliningThresholdMultiplier() const;
+
+  /// \returns Vector bonus in percent.
+  ///
+  /// Vector bonuses: We want to more aggressively inline vector-dense kernels
+  /// and apply this bonus based on the percentage of vector instructions. A
+  /// bonus is applied if the vector instructions exceed 50% and half that amount
+  /// is applied if it exceeds 10%. Note that these bonuses are somewhat
+  /// arbitrary and evolved over time by accident as much as because they are
+  /// principled bonuses.
+  /// FIXME: It would be nice to base the bonus values on something more
+  /// scientific. A target may have no bonus on vector instructions.
+  int getInlinerVectorBonusPercent() const;
 
   /// Estimate the cost of an intrinsic when lowered.
   ///
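Concretely, the 50%/10% rule described above is realized by applying the bonus speculatively and then withdrawing some or all of it once the callee's vector density is known. A simplified sketch of that decision, assuming CallAnalyzer-style counters (the names NumVectorInstructions and NumInstructions follow the analyzer's conventions but are condensed here for illustration):

  // How much of the speculative vector bonus survives analysis.
  int VectorBonus = Threshold * VectorBonusPercent / 100;
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= VectorBonus;     // under 10% vector: withdraw the full bonus
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= VectorBonus / 2; // 10-50% vector: withdraw half
  // above 50% vector: the full bonus stays applied

With getInlinerVectorBonusPercent() returning 0, VectorBonus is 0 and this density check has no effect.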
@@ -1127,6 +1139,7 @@
   virtual int getCallCost(const Function *F,
                           ArrayRef<const Value *> Arguments, const User *U) = 0;
   virtual unsigned getInliningThresholdMultiplier() = 0;
+  virtual int getInlinerVectorBonusPercent() = 0;
   virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                                ArrayRef<Type *> ParamTys, const User *U) = 0;
   virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
@@ -1350,6 +1363,9 @@
   unsigned getInliningThresholdMultiplier() override {
     return Impl.getInliningThresholdMultiplier();
   }
+  int getInlinerVectorBonusPercent() override {
+    return Impl.getInlinerVectorBonusPercent();
+  }
   int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                        ArrayRef<Type *> ParamTys, const User *U = nullptr) override {
     return Impl.getIntrinsicCost(IID, RetTy, ParamTys, U);
@@ -138,6 +138,8 @@
   }
 
   unsigned getInliningThresholdMultiplier() { return 1; }
+
+  int getInlinerVectorBonusPercent() { return 150; }
 
   unsigned getMemcpyCost(const Instruction *I) {
     return TTI::TCC_Expensive;
@@ -425,6 +425,8 @@
   }
 
   unsigned getInliningThresholdMultiplier() { return 1; }
+
+  int getInlinerVectorBonusPercent() { return 150; }
 
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP) {
@@ -879,23 +879,14 @@
   // basic block at the given callsite context. This is speculatively applied
   // and withdrawn if more than one basic block is seen.
   //
-  // Vector bonuses: We want to more aggressively inline vector-dense kernels
-  // and apply this bonus based on the percentage of vector instructions. A
-  // bonus is applied if the vector instructions exceed 50% and half that amount
-  // is applied if it exceeds 10%. Note that these bonuses are somewhat
-  // arbitrary and evolved over time by accident as much as because they are
-  // principled bonuses.
-  // FIXME: It would be nice to base the bonus values on something more
-  // scientific.
-  //
   // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
   // of the last call to a static function as inlining such functions is
   // guaranteed to reduce code size.
   //
   // These bonus percentages may be set to 0 based on properties of the caller
   // and the callsite.
   int SingleBBBonusPercent = 50;
-  int VectorBonusPercent = 150;
+  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
   int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
 
   // Lambda to set all the above bonus and bonus percentages to 0.
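For context, a simplified sketch of how these percentages seed the working threshold (modeled on the surrounding CallAnalyzer logic; exact control flow elided):

  // Both bonuses are added up front and withdrawn later if the callee turns
  // out to span multiple basic blocks or to be vector-sparse.
  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;
  Threshold += SingleBBBonus + VectorBonus;

Routing VectorBonusPercent through TTI lets a target return 0 here, so no vector bonus is ever speculatively added for it.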
@@ -173,6 +173,10 @@
 
 unsigned TargetTransformInfo::getInliningThresholdMultiplier() const {
   return TTIImpl->getInliningThresholdMultiplier();
+}
+
+int TargetTransformInfo::getInlinerVectorBonusPercent() const {
+  return TTIImpl->getInlinerVectorBonusPercent();
 }
 
 int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr,
@@ -38,7 +38,7 @@
 #define DEBUG_TYPE "inline"
 
 static cl::opt<int>
-ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
+ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500),
               cl::desc("Cost of alloca argument"));
 
 // If the amount of scratch memory to eliminate exceeds our ability to allocate
@@ -190,7 +190,9 @@
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
 
-  unsigned getInliningThresholdMultiplier() { return 9; }
+  unsigned getInliningThresholdMultiplier() { return 7; }
+
+  int getInlinerVectorBonusPercent() { return 0; }
 
   int getArithmeticReductionCost(unsigned Opcode,
                                  Type *Ty,
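To see what the retuning does to the effective inlining budget, assume the inliner's default threshold of 225 (the upstream -inline-threshold default; that constant is an assumption for this arithmetic, not something the patch states):

  constexpr int DefaultThreshold = 225;                 // assumed -inline-threshold default
  constexpr int OldBase = DefaultThreshold * 9;         // 2025
  constexpr int OldMax = OldBase + OldBase * 150 / 100; // 5062 with the full vector bonus
  constexpr int NewBase = DefaultThreshold * 7;         // 1575, no vector bonus on top

Under the new parameters every callee is measured against the same flat budget, and vector-dense callees no longer receive preferential treatment.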
@@ -27,15 +27,8 @@
 define coldcc void @foo_private_ptr2(float addrspace(5)* nocapture %p1, float addrspace(5)* nocapture %p2) {
 entry:
   %tmp1 = load float, float addrspace(5)* %p1, align 4
-  %cmp = fcmp ogt float %tmp1, 1.000000e+00
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then: ; preds = %entry
   %div = fdiv float 2.000000e+00, %tmp1
   store float %div, float addrspace(5)* %p2, align 4
-  br label %if.end
-
-if.end: ; preds = %if.then, %entry
   ret void
 }
 
@@ -0,0 +1,31 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-inline --inline-threshold=1 < %s | FileCheck %s
+
+define hidden <16 x i32> @div_vecbonus(<16 x i32> %x, <16 x i32> %y) {
+entry:
+  %div.1 = udiv <16 x i32> %x, %y
+  %div.2 = udiv <16 x i32> %div.1, %y
+  %div.3 = udiv <16 x i32> %div.2, %y
+  %div.4 = udiv <16 x i32> %div.3, %y
+  %div.5 = udiv <16 x i32> %div.4, %y
+  %div.6 = udiv <16 x i32> %div.5, %y
+  %div.7 = udiv <16 x i32> %div.6, %y
+  %div.8 = udiv <16 x i32> %div.7, %y
+  %div.9 = udiv <16 x i32> %div.8, %y
+  %div.10 = udiv <16 x i32> %div.9, %y
+  %div.11 = udiv <16 x i32> %div.10, %y
+  %div.12 = udiv <16 x i32> %div.11, %y
+  ret <16 x i32> %div.12
+}
+
+; CHECK-LABEL: define amdgpu_kernel void @caller_vecbonus
+; CHECK-NOT: udiv
+; CHECK: tail call <16 x i32> @div_vecbonus
+; CHECK: ret void
+define amdgpu_kernel void @caller_vecbonus(<16 x i32> addrspace(1)* nocapture %x, <16 x i32> addrspace(1)* nocapture readonly %y) {
+entry:
+  %tmp = load <16 x i32>, <16 x i32> addrspace(1)* %x
+  %tmp1 = load <16 x i32>, <16 x i32> addrspace(1)* %y
+  %div.i = tail call <16 x i32> @div_vecbonus(<16 x i32> %tmp, <16 x i32> %tmp1)
+  store <16 x i32> %div.i, <16 x i32> addrspace(1)* %x
+  ret void
+}