llvm.org GIT mirror llvm / e4e5cf5
make reciprocal estimate code generation more flexible by adding command-line options (3rd try) The first try (r238051) to land this was reverted due to ExecutionEngine build failure; that was hopefully addressed by r238788. The second try (r238842) to land this was reverted due to BUILD_SHARED_LIBS failure; that was hopefully addressed by r238953. This patch adds a TargetRecip class for processing many recip codegen possibilities. The class is intended to handle both command-line options to llc as well as options passed in from a front-end such as clang with the -mrecip option. The x86 backend is updated to use the new functionality. Only -mcpu=btver2 with -ffast-math should see a functional change from this patch. All other x86 CPUs continue to *not* use reciprocal estimates by default with -ffast-math. Differential Revision: http://reviews.llvm.org/D8982 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239001 91177308-0d34-0410-b5e6-96231b3b80d8 Sanjay Patel 4 years ago
12 changed file(s) with 359 addition(s) and 59 deletion(s). Raw diff Collapse all Expand all
2323 #include "llvm/Support/Host.h"
2424 #include "llvm/Target/TargetMachine.h"
2525 #include "llvm/Target/TargetOptions.h"
26 #include "llvm/Target/TargetRecip.h"
2627 #include
2728 using namespace llvm;
2829
151152 "Only fuse FP ops when the result won't be effected."),
152153 clEnumValEnd));
153154
155 cl::list
156 ReciprocalOps("recip",
157 cl::CommaSeparated,
158 cl::desc("Choose reciprocal operation types and parameters."),
159 cl::value_desc("all,none,default,divf,!vec-sqrtd,vec-divd:0,sqrt:9..."));
160
154161 cl::opt
155162 DontPlaceZerosInBSS("nozero-initialized-in-bss",
156163 cl::desc("Don't place zero-initialized symbols into bss section"),
229236 TargetOptions Options;
230237 Options.LessPreciseFPMADOption = EnableFPMAD;
231238 Options.AllowFPOpFusion = FuseFPOps;
239 Options.Reciprocals = TargetRecip(ReciprocalOps);
232240 Options.UnsafeFPMath = EnableUnsafeFPMath;
233241 Options.NoInfsFPMath = EnableNoInfsFPMath;
234242 Options.NoNaNsFPMath = EnableNoNaNsFPMath;
1414 #ifndef LLVM_TARGET_TARGETOPTIONS_H
1515 #define LLVM_TARGET_TARGETOPTIONS_H
1616
17 #include "llvm/Target/TargetRecip.h"
1718 #include "llvm/MC/MCTargetOptions.h"
1819 #include
1920
7172 CompressDebugSections(false), FunctionSections(false),
7273 DataSections(false), UniqueSectionNames(true), TrapUnreachable(false),
7374 TrapFuncName(), FloatABIType(FloatABI::Default),
74 AllowFPOpFusion(FPOpFusion::Standard), JTType(JumpTable::Single),
75 AllowFPOpFusion(FPOpFusion::Standard), Reciprocals(TargetRecip()),
76 JTType(JumpTable::Single),
7577 ThreadModel(ThreadModel::POSIX) {}
7678
7779 /// PrintMachineCode - This flag is enabled when the -print-machineinstrs
205207 /// the value of this option.
206208 FPOpFusion::FPOpFusionMode AllowFPOpFusion;
207209
210 /// This class encapsulates options for reciprocal-estimate code generation.
211 TargetRecip Reciprocals;
212
208213 /// JTType - This flag specifies the type of jump-instruction table to
209214 /// create for functions that have the jumptable attribute.
210215 JumpTable::JumpTableType JTType;
239244 ARE_EQUAL(TrapFuncName) &&
240245 ARE_EQUAL(FloatABIType) &&
241246 ARE_EQUAL(AllowFPOpFusion) &&
247 ARE_EQUAL(Reciprocals) &&
242248 ARE_EQUAL(JTType) &&
243249 ARE_EQUAL(ThreadModel) &&
244250 ARE_EQUAL(MCOptions);
0 //===--------------------- llvm/Target/TargetRecip.h ------------*- C++ -*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class is used to customize machine-specific reciprocal estimate code
10 // generation in a target-independent way.
11 // If a target does not support operations in this specification, then code
12 // generation will default to using supported operations.
13 //
14 //===----------------------------------------------------------------------===//
15
16 #ifndef LLVM_TARGET_TARGETRECIP_H
17 #define LLVM_TARGET_TARGETRECIP_H
18
19 #include "llvm/ADT/StringRef.h"
20 #include
21 #include
22 #include
23
24 namespace llvm {
25
26 struct TargetRecip {
27 public:
28 TargetRecip();
29
30 /// Initialize all or part of the operations from command-line options or
31 /// a front end.
32 TargetRecip(const std::vector &Args);
33
34 /// Set whether a particular reciprocal operation is enabled and how many
35 /// refinement steps are needed when using it. Use "all" to set enablement
36 /// and refinement steps for all operations.
37 void setDefaults(const StringRef &Key, bool Enable, unsigned RefSteps);
38
39 /// Return true if the reciprocal operation has been enabled by default or
40 /// from the command-line. Return false if the operation has been disabled
41 /// by default or from the command-line.
42 bool isEnabled(const StringRef &Key) const;
43
44 /// Return the number of iterations necessary to refine the
45 /// the result of a machine instruction for the given reciprocal operation.
46 unsigned getRefinementSteps(const StringRef &Key) const;
47
48 bool operator==(const TargetRecip &Other) const;
49
50 private:
51 enum {
52 Uninitialized = -1
53 };
54
55 struct RecipParams {
56 int8_t Enabled;
57 int8_t RefinementSteps;
58
59 RecipParams() : Enabled(Uninitialized), RefinementSteps(Uninitialized) {}
60 };
61
62 std::map RecipMap;
63 typedef std::map::iterator RecipIter;
64 typedef std::map::const_iterator ConstRecipIter;
65
66 bool parseGlobalParams(const std::string &Arg);
67 void parseIndividualParams(const std::vector &Args);
68 };
69
70 } // End llvm namespace
71
72 #endif
55 TargetLoweringObjectFile.cpp
66 TargetMachine.cpp
77 TargetMachineC.cpp
8 TargetRecip.cpp
89 TargetSubtargetInfo.cpp
910
1011 ADDITIONAL_HEADER_DIRS
0 //===-------------------------- TargetRecip.cpp ---------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class is used to customize machine-specific reciprocal estimate code
10 // generation in a target-independent way.
11 // If a target does not support operations in this specification, then code
12 // generation will default to using supported operations.
13 //
14 //===----------------------------------------------------------------------===//
15
16 #include "llvm/ADT/StringRef.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/Support/ErrorHandling.h"
19 #include "llvm/Target/TargetRecip.h"
20 #include
21
22 using namespace llvm;
23
// These are the names of the individual reciprocal operations. These are
// the key strings for queries and command-line inputs.
// In addition, the command-line interface recognizes the global parameters
// "all", "none", and "default".
// The table is never modified, so both the strings and the pointers to them
// are declared const.
static const char *const RecipOps[] = {
  "divd",
  "divf",
  "vec-divd",
  "vec-divf",
  "sqrtd",
  "sqrtf",
  "vec-sqrtd",
  "vec-sqrtf",
};
38
39 // The uninitialized state is needed for the enabled settings and refinement
40 // steps because custom settings may arrive via the command-line before target
41 // defaults are set.
42 TargetRecip::TargetRecip() {
43 unsigned NumStrings = llvm::array_lengthof(RecipOps);
44 for (unsigned i = 0; i < NumStrings; ++i)
45 RecipMap.insert(std::make_pair(RecipOps[i], RecipParams()));
46 }
47
48 static bool parseRefinementStep(const StringRef &In, size_t &Position,
49 uint8_t &Value) {
50 const char RefStepToken = ':';
51 Position = In.find(RefStepToken);
52 if (Position == StringRef::npos)
53 return false;
54
55 StringRef RefStepString = In.substr(Position + 1);
56 // Allow exactly one numeric character for the additional refinement
57 // step parameter.
58 if (RefStepString.size() == 1) {
59 char RefStepChar = RefStepString[0];
60 if (RefStepChar >= '0' && RefStepChar <= '9') {
61 Value = RefStepChar - '0';
62 return true;
63 }
64 }
65 report_fatal_error("Invalid refinement step for -recip.");
66 }
67
68 bool TargetRecip::parseGlobalParams(const std::string &Arg) {
69 StringRef ArgSub = Arg;
70
71 // Look for an optional setting of the number of refinement steps needed
72 // for this type of reciprocal operation.
73 size_t RefPos;
74 uint8_t RefSteps;
75 StringRef RefStepString;
76 if (parseRefinementStep(ArgSub, RefPos, RefSteps)) {
77 // Split the string for further processing.
78 RefStepString = ArgSub.substr(RefPos + 1);
79 ArgSub = ArgSub.substr(0, RefPos);
80 }
81 bool Enable;
82 bool UseDefaults;
83 if (ArgSub == "all") {
84 UseDefaults = false;
85 Enable = true;
86 } else if (ArgSub == "none") {
87 UseDefaults = false;
88 Enable = false;
89 } else if (ArgSub == "default") {
90 UseDefaults = true;
91 } else {
92 // Any other string is invalid or an individual setting.
93 return false;
94 }
95
96 // All enable values will be initialized to target defaults if 'default' was
97 // specified.
98 if (!UseDefaults)
99 for (auto &KV : RecipMap)
100 KV.second.Enabled = Enable;
101
102 // Custom refinement count was specified with all, none, or default.
103 if (!RefStepString.empty())
104 for (auto &KV : RecipMap)
105 KV.second.RefinementSteps = RefSteps;
106
107 return true;
108 }
109
110 void TargetRecip::parseIndividualParams(const std::vector &Args) {
111 static const char DisabledPrefix = '!';
112 unsigned NumArgs = Args.size();
113
114 for (unsigned i = 0; i != NumArgs; ++i) {
115 StringRef Val = Args[i];
116
117 bool IsDisabled = Val[0] == DisabledPrefix;
118 // Ignore the disablement token for string matching.
119 if (IsDisabled)
120 Val = Val.substr(1);
121
122 size_t RefPos;
123 uint8_t RefSteps;
124 StringRef RefStepString;
125 if (parseRefinementStep(Val, RefPos, RefSteps)) {
126 // Split the string for further processing.
127 RefStepString = Val.substr(RefPos + 1);
128 Val = Val.substr(0, RefPos);
129 }
130
131 RecipIter Iter = RecipMap.find(Val);
132 if (Iter == RecipMap.end()) {
133 // Try again specifying float suffix.
134 Iter = RecipMap.find(Val.str() + 'f');
135 if (Iter == RecipMap.end()) {
136 Iter = RecipMap.find(Val.str() + 'd');
137 assert(Iter == RecipMap.end() && "Float entry missing from map");
138 report_fatal_error("Invalid option for -recip.");
139 }
140
141 // The option was specified without a float or double suffix.
142 if (RecipMap[Val.str() + 'd'].Enabled != Uninitialized) {
143 // Make sure that the double entry was not already specified.
144 // The float entry will be checked below.
145 report_fatal_error("Duplicate option for -recip.");
146 }
147 }
148
149 if (Iter->second.Enabled != Uninitialized)
150 report_fatal_error("Duplicate option for -recip.");
151
152 // Mark the matched option as found. Do not allow duplicate specifiers.
153 Iter->second.Enabled = !IsDisabled;
154 if (!RefStepString.empty())
155 Iter->second.RefinementSteps = RefSteps;
156
157 // If the precision was not specified, the double entry is also initialized.
158 if (Val.back() != 'f' && Val.back() != 'd') {
159 RecipMap[Val.str() + 'd'].Enabled = !IsDisabled;
160 if (!RefStepString.empty())
161 RecipMap[Val.str() + 'd'].RefinementSteps = RefSteps;
162 }
163 }
164 }
165
166 TargetRecip::TargetRecip(const std::vector &Args) :
167 TargetRecip() {
168 unsigned NumArgs = Args.size();
169
170 // Check if "all", "default", or "none" was specified.
171 if (NumArgs == 1 && parseGlobalParams(Args[0]))
172 return;
173
174 parseIndividualParams(Args);
175 }
176
177 bool TargetRecip::isEnabled(const StringRef &Key) const {
178 ConstRecipIter Iter = RecipMap.find(Key);
179 assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
180 assert(Iter->second.Enabled != Uninitialized &&
181 "Enablement setting was not initialized");
182 return Iter->second.Enabled;
183 }
184
185 unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const {
186 ConstRecipIter Iter = RecipMap.find(Key);
187 assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
188 assert(Iter->second.RefinementSteps != Uninitialized &&
189 "Refinement step setting was not initialized");
190 return Iter->second.RefinementSteps;
191 }
192
193 /// Custom settings (previously initialized values) override target defaults.
194 void TargetRecip::setDefaults(const StringRef &Key, bool Enable,
195 unsigned RefSteps) {
196 if (Key == "all") {
197 for (auto &KV : RecipMap) {
198 RecipParams &RP = KV.second;
199 if (RP.Enabled == Uninitialized)
200 RP.Enabled = Enable;
201 if (RP.RefinementSteps == Uninitialized)
202 RP.RefinementSteps = RefSteps;
203 }
204 } else {
205 RecipParams &RP = RecipMap[Key];
206 if (RP.Enabled == Uninitialized)
207 RP.Enabled = Enable;
208 if (RP.RefinementSteps == Uninitialized)
209 RP.RefinementSteps = RefSteps;
210 }
211 }
212
213 bool TargetRecip::operator==(const TargetRecip &Other) const {
214 for (const auto &KV : RecipMap) {
215 const StringRef &Op = KV.first;
216 const RecipParams &RP = KV.second;
217 const RecipParams &OtherRP = Other.RecipMap.find(Op)->second;
218 if (RP.RefinementSteps != OtherRP.RefinementSteps)
219 return false;
220 if (RP.Enabled != OtherRP.Enabled)
221 return false;
222 }
223 return true;
224 }
189189 "LEA instruction with certain arguments is slow">;
190190 def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
191191 "INC and DEC instructions are slower than ADD and SUB">;
192 def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
193 "Use RSQRT* to optimize square root calculations">;
194 def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
195 "true", "Use RCP* to optimize division calculations">;
196192 def FeatureSoftFloat
197193 : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
198194 "Use software floating point features.">;
445441 FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
446442 FeatureBMI, FeatureF16C, FeatureMOVBE,
447443 FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
448 FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
444 FeatureSlowSHLD]>;
449445
450446 // TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
451447
6565 cl::desc("Enable an experimental vector type legalization through widening "
6666 "rather than promotion."),
6767 cl::Hidden);
68
69 static cl::opt ReciprocalEstimateRefinementSteps(
70 "x86-recip-refinement-steps", cl::init(1),
71 cl::desc("Specify the number of Newton-Raphson iterations applied to the "
72 "result of the hardware reciprocal estimate instruction."),
73 cl::NotHidden);
7468
7569 // Forward declarations.
7670 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
1300512999 DAGCombinerInfo &DCI,
1300613000 unsigned &RefinementSteps,
1300713001 bool &UseOneConstNR) const {
13008 // FIXME: We should use instruction latency models to calculate the cost of
13009 // each potential sequence, but this is very hard to do reliably because
13010 // at least Intel's Core* chips have variable timing based on the number of
13011 // significant digits in the divisor and/or sqrt operand.
13012 if (!Subtarget->useSqrtEst())
13013 return SDValue();
13014
1301513002 EVT VT = Op.getValueType();
13016
13017 // SSE1 has rsqrtss and rsqrtps.
13003 const char *RecipOp;
13004
13005 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
1301813006 // TODO: Add support for AVX512 (v16f32).
1301913007 // It is likely not profitable to do this for f64 because a double-precision
1302013008 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
1302113009 // instructions: convert to single, rsqrtss, convert back to double, refine
1302213010 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
1302313011 // along with FMA, this could be a throughput win.
13024 if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
13025 (Subtarget->hasAVX() && VT == MVT::v8f32)) {
13026 RefinementSteps = 1;
13027 UseOneConstNR = false;
13028 return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
13029 }
13030 return SDValue();
13012 if (VT == MVT::f32 && Subtarget->hasSSE1())
13013 RecipOp = "sqrtf";
13014 else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
13015 (VT == MVT::v8f32 && Subtarget->hasAVX()))
13016 RecipOp = "vec-sqrtf";
13017 else
13018 return SDValue();
13019
13020 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
13021 if (!Recips.isEnabled(RecipOp))
13022 return SDValue();
13023
13024 RefinementSteps = Recips.getRefinementSteps(RecipOp);
13025 UseOneConstNR = false;
13026 return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
1303113027 }
1303213028
1303313029 /// The minimum architected relative accuracy is 2^-12. We need one
1303513031 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
1303613032 DAGCombinerInfo &DCI,
1303713033 unsigned &RefinementSteps) const {
13038 // FIXME: We should use instruction latency models to calculate the cost of
13039 // each potential sequence, but this is very hard to do reliably because
13040 // at least Intel's Core* chips have variable timing based on the number of
13041 // significant digits in the divisor.
13042 if (!Subtarget->useReciprocalEst())
13043 return SDValue();
13044
1304513034 EVT VT = Op.getValueType();
13046
13035 const char *RecipOp;
13036
1304713037 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
1304813038 // TODO: Add support for AVX512 (v16f32).
1304913039 // It is likely not profitable to do this for f64 because a double-precision
1305113041 // 15 instructions: convert to single, rcpss, convert back to double, refine
1305213042 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
1305313043 // along with FMA, this could be a throughput win.
13054 if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
13055 (Subtarget->hasAVX() && VT == MVT::v8f32)) {
13056 RefinementSteps = ReciprocalEstimateRefinementSteps;
13057 return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
13058 }
13059 return SDValue();
13044 if (VT == MVT::f32 && Subtarget->hasSSE1())
13045 RecipOp = "divf";
13046 else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
13047 (VT == MVT::v8f32 && Subtarget->hasAVX()))
13048 RecipOp = "vec-divf";
13049 else
13050 return SDValue();
13051
13052 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
13053 if (!Recips.isEnabled(RecipOp))
13054 return SDValue();
13055
13056 RefinementSteps = Recips.getRefinementSteps(RecipOp);
13057 return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
1306013058 }
1306113059
1306213060 /// If we have at least two divisions that use the same divisor, convert to
273273 LEAUsesAG = false;
274274 SlowLEA = false;
275275 SlowIncDec = false;
276 UseSqrtEst = false;
277 UseReciprocalEst = false;
278276 stackAlignment = 4;
279277 // FIXME: this is a known good value for Yonah. How about others?
280278 MaxInlineSizeThreshold = 128;
188188
189189 /// True if INC and DEC instructions are slow when writing to flags
190190 bool SlowIncDec;
191
192 /// Use the RSQRT* instructions to optimize square root calculations.
193 /// For this to be profitable, the cost of FSQRT and FDIV must be
194 /// substantially higher than normal FP ops like FADD and FMUL.
195 bool UseSqrtEst;
196
197 /// Use the RCP* instructions to optimize FP division calculations.
198 /// For this to be profitable, the cost of FDIV must be
199 /// substantially higher than normal FP ops like FADD and FMUL.
200 bool UseReciprocalEst;
201191
202192 /// Processor has AVX-512 PreFetch Instructions
203193 bool HasPFI;
379369 bool LEAusesAG() const { return LEAUsesAG; }
380370 bool slowLEA() const { return SlowLEA; }
381371 bool slowIncDec() const { return SlowIncDec; }
382 bool useSqrtEst() const { return UseSqrtEst; }
383 bool useReciprocalEst() const { return UseReciprocalEst; }
384372 bool hasCDI() const { return HasCDI; }
385373 bool hasPFI() const { return HasPFI; }
386374 bool hasERI() const { return HasERI; }
104104 if (Subtarget.isTargetWin64())
105105 this->Options.TrapUnreachable = true;
106106
107 // TODO: By default, all reciprocal estimate operations are off because
108 // that matches the behavior before TargetRecip was added (except for btver2
109 // which used subtarget features to enable this type of codegen).
110 // We should change this to match GCC behavior where everything but
111 // scalar division estimates are turned on by default with -ffast-math.
112 this->Options.Reciprocals.setDefaults("all", false, 1);
113
107114 initAsmInfo();
108115 }
109116
0 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est | FileCheck %s --check-prefix=RECIP
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf,vec-divf | FileCheck %s --check-prefix=RECIP
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf:2,vec-divf:2 | FileCheck %s --check-prefix=REFINE
33
44 ; If the target's divss/divps instructions are substantially
55 ; slower than rcpss/rcpps with a Newton-Raphson refinement,
0 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-sqrt-est | FileCheck %s --check-prefix=ESTIMATE
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=sqrtf,vec-sqrtf | FileCheck %s --check-prefix=ESTIMATE
22
33 declare double @__sqrt_finite(double) #0
44 declare float @__sqrtf_finite(float) #0