llvm.org GIT mirror llvm / 11ae250
[NVPTX] Improve handling of FP fusion. We now consider the FPOpFusion flag when determining whether to fuse ops. We also explicitly emit add.rn when fusion is disabled to prevent ptxas from fusing the operations on its own. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213287 91177308-0d34-0410-b5e6-96231b3b80d8 Justin Holewinski 6 years ago
10 changed file(s) with 103 addition(s) and 53 deletion(s). Raw diff Collapse all Expand all
2323
2424 #define DEBUG_TYPE "nvptx-isel"
2525
26 unsigned FMAContractLevel = 0;
27
28 static cl::opt
29 FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
30 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
31 " 1: do it 2: do it aggressively"),
32 cl::location(FMAContractLevel),
33 cl::init(2));
34
3526 static cl::opt UsePrecDivF32(
3627 "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
3728 cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
6051 CodeGenOpt::Level OptLevel)
6152 : SelectionDAGISel(tm, OptLevel),
6253 Subtarget(tm.getSubtarget()) {
63
64 doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel >= 1);
65 doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel >= 1);
66 doFMAF32AGG =
67 (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel == 2);
68 doFMAF64AGG =
69 (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel == 2);
70
71 allowFMA = (FMAContractLevel >= 1);
72
7354 doMulWide = (OptLevel > 0);
7455 }
7556
11394 else
11495 return false;
11596 }
97 }
98
99 bool NVPTXDAGToDAGISel::allowFMA() const {
100 const NVPTXTargetLowering *TL = (NVPTXTargetLowering *)getTargetLowering();
101 return TL->allowFMA(*MF, OptLevel);
116102 }
117103
118104 /// Select - Select instructions not customized! Used for
2323
2424 class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
2525
26 // If true, generate corresponding FPCONTRACT. This is
27 // language dependent (i.e. CUDA and OpenCL work differently).
28 bool doFMAF64;
29 bool doFMAF32;
30 bool doFMAF64AGG;
31 bool doFMAF32AGG;
32 bool allowFMA;
33
3426 // If true, generate mul.wide from sext and mul
3527 bool doMulWide;
3628
3729 int getDivF32Level() const;
3830 bool usePrecSqrtF32() const;
3931 bool useF32FTZ() const;
32 bool allowFMA() const;
4033
4134 public:
4235 explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
4646 static cl::opt sched4reg(
4747 "nvptx-sched4reg",
4848 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
49
50 static cl::opt
51 FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
52 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
53 " 1: do it 2: do it aggressively"),
54 cl::init(2));
4955
5056 static bool IsPTXVectorType(MVT VT) {
5157 switch (VT.SimpleTy) {
37983804 // NVPTX DAG Combining
37993805 //===----------------------------------------------------------------------===//
38003806
3801 extern unsigned FMAContractLevel;
3807 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
3808 CodeGenOpt::Level OptLevel) const {
3809 const Function *F = MF.getFunction();
3810 const TargetOptions &TO = MF.getTarget().Options;
3811
3812 // Always honor command-line argument
3813 if (FMAContractLevelOpt.getNumOccurrences() > 0) {
3814 return FMAContractLevelOpt > 0;
3815 } else if (OptLevel == 0) {
3816 // Do not contract if we're not optimizing the code
3817 return false;
3818 } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) {
3819 // Honor TargetOptions flags that explicitly say fusion is okay
3820 return true;
3821 } else if (F->hasFnAttribute("unsafe-fp-math")) {
3822 // Check for unsafe-fp-math=true coming from Clang
3823 Attribute Attr = F->getFnAttribute("unsafe-fp-math");
3824 StringRef Val = Attr.getValueAsString();
3825 if (Val == "true")
3826 return true;
3827 }
3828
3829 // We did not have a clear indication that fusion is allowed, so assume not
3830 return false;
3831 }
38023832
38033833 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
38043834 /// operands N0 and N1. This is a helper for PerformADDCombine that is
38323862 }
38333863 else if (N0.getOpcode() == ISD::FMUL) {
38343864 if (VT == MVT::f32 || VT == MVT::f64) {
3835 if (FMAContractLevel == 0)
3865 NVPTXTargetLowering *TLI =
3866 (NVPTXTargetLowering *)&DAG.getTargetLoweringInfo();
3867 if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
38363868 return SDValue();
38373869
38383870 // For floating point:
502502 TargetLoweringBase::LegalizeTypeAction
503503 getPreferredVectorAction(EVT VT) const override;
504504
505 bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const;
506
507 virtual bool isFMAFasterThanFMulAndFAdd(EVT) const {
508 return true;
509 }
510
505511 private:
506512 const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here
507513
138138 def doF32FTZ : Predicate<"useF32FTZ()">;
139139 def doNoF32FTZ : Predicate<"!useF32FTZ()">;
140140
141 def doFMAF32 : Predicate<"doFMAF32">;
142 def doFMAF32_ftz : Predicate<"(doFMAF32 && useF32FTZ())">;
143 def doFMAF32AGG : Predicate<"doFMAF32AGG">;
144 def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && useF32FTZ())">;
145 def doFMAF64 : Predicate<"doFMAF64">;
146 def doFMAF64AGG : Predicate<"doFMAF64AGG">;
147
148141 def doMulWide : Predicate<"doMulWide">;
149142
150 def allowFMA : Predicate<"allowFMA">;
151 def allowFMA_ftz : Predicate<"(allowFMA && useF32FTZ())">;
143 def allowFMA : Predicate<"allowFMA()">;
144 def noFMA : Predicate<"!allowFMA()">;
152145
153146 def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
154147 def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
221214 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
222215 [(set Float32Regs:$dst,
223216 (OpNode Float32Regs:$a, Float32Regs:$b))]>,
224 Requires<[allowFMA_ftz]>;
217 Requires<[allowFMA, doF32FTZ]>;
225218 def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
226219 (ins Float32Regs:$a, f32imm:$b),
227220 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
228221 [(set Float32Regs:$dst,
229222 (OpNode Float32Regs:$a, fpimm:$b))]>,
230 Requires<[allowFMA_ftz]>;
223 Requires<[allowFMA, doF32FTZ]>;
231224 def f32rr : NVPTXInst<(outs Float32Regs:$dst),
232225 (ins Float32Regs:$a, Float32Regs:$b),
233226 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
247240 (ins Float64Regs:$a, Float64Regs:$b),
248241 !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
249242 [(set Float64Regs:$dst,
250 (OpNode Float64Regs:$a, Float64Regs:$b))]>;
243 (OpNode Float64Regs:$a, Float64Regs:$b))]>,
244 Requires<[noFMA]>;
251245 def f64ri : NVPTXInst<(outs Float64Regs:$dst),
252246 (ins Float64Regs:$a, f64imm:$b),
253247 !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
254248 [(set Float64Regs:$dst,
255 (OpNode Float64Regs:$a, fpimm:$b))]>;
249 (OpNode Float64Regs:$a, fpimm:$b))]>,
250 Requires<[noFMA]>;
256251 def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
257252 (ins Float32Regs:$a, Float32Regs:$b),
258253 !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
259254 [(set Float32Regs:$dst,
260255 (OpNode Float32Regs:$a, Float32Regs:$b))]>,
261 Requires<[doF32FTZ]>;
256 Requires<[noFMA, doF32FTZ]>;
262257 def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
263258 (ins Float32Regs:$a, f32imm:$b),
264259 !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
265260 [(set Float32Regs:$dst,
266261 (OpNode Float32Regs:$a, fpimm:$b))]>,
267 Requires<[doF32FTZ]>;
262 Requires<[noFMA, doF32FTZ]>;
268263 def f32rr : NVPTXInst<(outs Float32Regs:$dst),
269264 (ins Float32Regs:$a, Float32Regs:$b),
270265 !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
271266 [(set Float32Regs:$dst,
272 (OpNode Float32Regs:$a, Float32Regs:$b))]>;
267 (OpNode Float32Regs:$a, Float32Regs:$b))]>,
268 Requires<[noFMA]>;
273269 def f32ri : NVPTXInst<(outs Float32Regs:$dst),
274270 (ins Float32Regs:$a, f32imm:$b),
275271 !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
276272 [(set Float32Regs:$dst,
277 (OpNode Float32Regs:$a, fpimm:$b))]>;
273 (OpNode Float32Regs:$a, fpimm:$b))]>,
274 Requires<[noFMA]>;
278275 }
279276
280277 multiclass F2 {
918915 }
919916
920917 defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>;
921 defm FMA32 : FPCONTRACT32<"fma.rn.f32", doNoF32FTZ>;
922 defm FMA64 : FPCONTRACT64<"fma.rn.f64", doNoF32FTZ>;
918 defm FMA32 : FPCONTRACT32<"fma.rn.f32", true>;
919 defm FMA64 : FPCONTRACT64<"fma.rn.f64", true>;
923920
924921 def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
925922 "sin.approx.f32 \t$dst, $src;",
None ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
1 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
0 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
1 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s
22
33 ;; These tests should run for all targets
44
None ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
0 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
11
22 define ptx_device float @t1_f32(float %x, float %y, float %z) {
33 ; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
0 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
1 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT
2
3 target triple = "nvptx64-unknown-cuda"
4
5 ;; Make sure we are generating proper instruction sequences for fused ops
6 ;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
7 ;; add.f32 otherwise. Without an explicit rounding mode on add.f32, ptxas
8 ;; is free to fuse with a multiply if it is able. If fusion is not allowed,
9 ;; we do not form fma.rn at the PTX level and explicitly generate add.rn
10 ;; for all adds to prevent ptxas from fusing the ops.
11
12 ;; FAST-LABEL: @t0
13 ;; DEFAULT-LABEL: @t0
14 define float @t0(float %a, float %b, float %c) {
15 ;; FAST: fma.rn.f32
16 ;; DEFAULT: mul.rn.f32
17 ;; DEFAULT: add.rn.f32
18 %v0 = fmul float %a, %b
19 %v1 = fadd float %v0, %c
20 ret float %v1
21 }
22
23 ;; FAST-LABEL: @t1
24 ;; DEFAULT-LABEL: @t1
25 define float @t1(float %a, float %b) {
26 ;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32
27 ;; to prevent ptxas from fusing this with anything else.
28 ;; FAST: add.f32
29 ;; DEFAULT: add.rn.f32
30 %v1 = fadd float %a, %b
31 ret float %v1
32 }
None ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
0 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
1
2 target triple = "nvptx64-unknown-cuda"
3 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
14
25 ; Make sure we can properly differentiate between single-precision and
36 ; double-precision FP literals.
0 ; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 -asm-verbose=1 | FileCheck %s
11
22 ; CHECK: // implicit-def: %f[[F0:[0-9]+]]
3 ; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
3 ; CHECK: add.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
44 define float @foo(float %a) {
55 %ret = fadd float %a, undef
66 ret float %ret