llvm.org GIT mirror llvm / b60ab5d
[Target] move reciprocal estimate settings from TargetOptions to TargetLowering

The motivation for the change is that we can't have pseudo-global settings for codegen living in TargetOptions because that doesn't work with LTO.

Ideally, these reciprocal attributes will be moved to the instruction-level via FMF, metadata, or something else. But making them function attributes is at least an improvement over the current state.

The ingredients of this patch are:
- Remove the reciprocal estimate command-line debug option.
- Add TargetRecip to TargetLowering.
- Remove TargetRecip from TargetOptions.
- Clean up the TargetRecip implementation to work with this new scheme.
- Set the default reciprocal settings in TargetLoweringBase (everything is off).
- Update the PowerPC defaults, users, and tests.
- Update the x86 defaults, users, and tests.

Note that if this patch needs to be reverted, the related clang patch checked in at r283251 should be reverted too.

Differential Revision: https://reviews.llvm.org/D24816

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@283252 91177308-0d34-0410-b5e6-96231b3b80d8

Sanjay Patel 3 years ago
14 changed file(s) with 391 addition(s) and 303 deletion(s). Raw diff Collapse all Expand all
2626 #include "llvm/Support/Host.h"
2727 #include "llvm/Target/TargetMachine.h"
2828 #include "llvm/Target/TargetOptions.h"
29 #include "llvm/Target/TargetRecip.h"
3029 #include
3130 using namespace llvm;
3231
200199 "Only fuse FP ops when the result won't be affected."),
201200 clEnumValEnd));
202201
203 cl::list
204 ReciprocalOps("recip",
205 cl::CommaSeparated,
206 cl::desc("Choose reciprocal operation types and parameters."),
207 cl::value_desc("all,none,default,divf,!vec-sqrtd,vec-divd:0,sqrt:9..."));
208
209202 cl::opt
210203 DontPlaceZerosInBSS("nozero-initialized-in-bss",
211204 cl::desc("Don't place zero-initialized symbols into bss section"),
304297 TargetOptions Options;
305298 Options.LessPreciseFPMADOption = EnableFPMAD;
306299 Options.AllowFPOpFusion = FuseFPOps;
307 Options.Reciprocals = TargetRecip(ReciprocalOps);
308300 Options.UnsafeFPMath = EnableUnsafeFPMath;
309301 Options.NoInfsFPMath = EnableNoInfsFPMath;
310302 Options.NoNaNsFPMath = EnableNoNaNsFPMath;
6060 class MCSymbol;
6161 template class SmallVectorImpl;
6262 class DataLayout;
63 struct TargetRecip;
6364 class TargetRegisterClass;
6465 class TargetLibraryInfo;
6566 class TargetLoweringObjectFile;
539540 }
540541 }
541542 }
543
544 /// Return the reciprocal estimate code generation preferences for this target
545 /// after potentially overriding settings using the function's attributes.
546 /// FIXME: Like all unsafe-math target settings, this should really be an
547 /// instruction-level attribute/metadata/FMF.
548 TargetRecip getTargetRecipForFunc(MachineFunction &MF) const;
542549
543550 /// Vector types are broken down into some number of legal first class types.
544551 /// For example, EVT::v8f32 maps to 2 EVT::v4f32 with Altivec or SSE1, or 8
21732180 /// sequence of memory operands that is recognized by PrologEpilogInserter.
21742181 MachineBasicBlock *emitPatchPoint(MachineInstr &MI,
21752182 MachineBasicBlock *MBB) const;
2183 TargetRecip ReciprocalEstimates;
21762184 };
21772185
21782186 /// This class defines information used to lower LLVM code to legal SelectionDAG
110110 DataSections(false), UniqueSectionNames(true), TrapUnreachable(false),
111111 EmulatedTLS(false), EnableIPRA(false),
112112 FloatABIType(FloatABI::Default),
113 AllowFPOpFusion(FPOpFusion::Standard), Reciprocals(TargetRecip()),
113 AllowFPOpFusion(FPOpFusion::Standard),
114114 JTType(JumpTable::Single), ThreadModel(ThreadModel::POSIX),
115115 EABIVersion(EABI::Default), DebuggerTuning(DebuggerKind::Default),
116116 FPDenormalMode(FPDenormal::IEEE),
250250 /// via the llvm.fma.* intrinsic) will always be honored, regardless of
251251 /// the value of this option.
252252 FPOpFusion::FPOpFusionMode AllowFPOpFusion;
253
254 /// This class encapsulates options for reciprocal-estimate code generation.
255 TargetRecip Reciprocals;
256253
257254 /// JTType - This flag specifies the type of jump-instruction table to
258255 /// create for functions that have the jumptable attribute.
300297 ARE_EQUAL(EmulatedTLS) &&
301298 ARE_EQUAL(FloatABIType) &&
302299 ARE_EQUAL(AllowFPOpFusion) &&
303 ARE_EQUAL(Reciprocals) &&
304300 ARE_EQUAL(JTType) &&
305301 ARE_EQUAL(ThreadModel) &&
306302 ARE_EQUAL(EABIVersion) &&
1616 #ifndef LLVM_TARGET_TARGETRECIP_H
1717 #define LLVM_TARGET_TARGETRECIP_H
1818
19 #include "llvm/ADT/StringRef.h"
2019 #include
2120 #include
2221 #include
2423
2524 namespace llvm {
2625
26 class StringRef;
27
2728 struct TargetRecip {
2829 public:
2930 TargetRecip();
3031
31 /// Initialize all or part of the operations from command-line options or
32 /// a front end.
33 TargetRecip(const std::vector &Args);
32 /// Parse a comma-separated string of reciprocal settings to set values in
33 /// this struct.
34 void set(StringRef &Args);
3435
35 /// Set whether a particular reciprocal operation is enabled and how many
36 /// refinement steps are needed when using it. Use "all" to set enablement
37 /// and refinement steps for all operations.
38 void setDefaults(StringRef Key, bool Enable, unsigned RefSteps);
36 /// Set enablement and refinement steps for a particular reciprocal operation.
37 /// Use "all" to give all operations the same values.
38 void set(StringRef Key, bool Enable, unsigned RefSteps);
3939
40 /// Return true if the reciprocal operation has been enabled by default or
41 /// from the command-line. Return false if the operation has been disabled
42 /// by default or from the command-line.
40 /// Return true if the reciprocal operation has been enabled.
4341 bool isEnabled(StringRef Key) const;
4442
4543 /// Return the number of iterations necessary to refine the
4947 bool operator==(const TargetRecip &Other) const;
5048
5149 private:
52 enum {
53 Uninitialized = -1
54 };
55
50 // TODO: We should be able to use special values (enums) to simplify this into
51 // just an int, but we have to be careful because the user is allowed to
52 // specify "default" as a setting and just change the refinement step count.
5653 struct RecipParams {
57 int8_t Enabled;
54 bool Enabled;
5855 int8_t RefinementSteps;
5956
60 RecipParams() : Enabled(Uninitialized), RefinementSteps(Uninitialized) {}
57 RecipParams() : Enabled(false), RefinementSteps(0) {}
6158 };
6259
6360 std::map RecipMap;
837837 InitLibcallNames(LibcallRoutineNames, TM.getTargetTriple());
838838 InitCmpLibcallCCs(CmpLibcallCCs);
839839 InitLibcallCallingConvs(LibcallCallingConvs);
840 ReciprocalEstimates.set("all", false, 0);
840841 }
841842
842843 void TargetLoweringBase::initActions() {
14821483
14831484 MVT::SimpleValueType TargetLoweringBase::getCmpLibcallReturnType() const {
14841485 return MVT::i32; // return the default value
1486 }
1487
1488 TargetRecip
1489 TargetLoweringBase::getTargetRecipForFunc(MachineFunction &MF) const {
1490 const Function *F = MF.getFunction();
1491 StringRef RecipAttrName = "reciprocal-estimates";
1492 if (!F->hasFnAttribute(RecipAttrName))
1493 return ReciprocalEstimates;
1494
1495 // Make a copy of the target's default reciprocal codegen settings.
1496 TargetRecip Recips = ReciprocalEstimates;
1497
1498 // Override any settings that are customized for this function.
1499 StringRef RecipString = F->getFnAttribute(RecipAttrName).getValueAsString();
1500 Recips.set(RecipString);
1501 return Recips;
14851502 }
14861503
14871504 /// getVectorTypeBreakdown - Vector types are broken down into some number of
899899 setTargetDAGCombine(ISD::FDIV);
900900 setTargetDAGCombine(ISD::FSQRT);
901901 }
902
903 // For the estimates, convergence is quadratic, so we essentially double the
904 // number of digits correct after every iteration. For both FRE and FRSQRTE,
905 // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
906 // this is 2^-14. IEEE float has 23 digits and double has 52 digits.
907 unsigned RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3,
908 RefinementSteps64 = RefinementSteps + 1;
909
910 ReciprocalEstimates.set("sqrtf", true, RefinementSteps);
911 ReciprocalEstimates.set("vec-sqrtf", true, RefinementSteps);
912 ReciprocalEstimates.set("divf", true, RefinementSteps);
913 ReciprocalEstimates.set("vec-divf", true, RefinementSteps);
914
915 ReciprocalEstimates.set("sqrtd", true, RefinementSteps64);
916 ReciprocalEstimates.set("vec-sqrtd", true, RefinementSteps64);
917 ReciprocalEstimates.set("divd", true, RefinementSteps64);
918 ReciprocalEstimates.set("vec-divd", true, RefinementSteps64);
902919
903920 // Darwin long double math library functions have $LDBL128 appended.
904921 if (Subtarget.isDarwin()) {
96459662 (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
96469663 (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
96479664 (VT == MVT::v4f64 && Subtarget.hasQPX())) {
9648 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
9665 TargetRecip Recips = getTargetRecipForFunc(DCI.DAG.getMachineFunction());
96499666 std::string RecipOp = getRecipOp("sqrt", VT);
96509667 if (!Recips.isEnabled(RecipOp))
96519668 return SDValue();
96679684 (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
96689685 (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
96699686 (VT == MVT::v4f64 && Subtarget.hasQPX())) {
9670 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
9687 TargetRecip Recips = getTargetRecipForFunc(DCI.DAG.getMachineFunction());
96719688 std::string RecipOp = getRecipOp("div", VT);
96729689 if (!Recips.isEnabled(RecipOp))
96739690 return SDValue();
203203 TargetABI(computeTargetABI(TT, Options)),
204204 Subtarget(TargetTriple, CPU, computeFSAdditions(FS, OL, TT), *this) {
205205
206 // For the estimates, convergence is quadratic, so we essentially double the
207 // number of digits correct after every iteration. For both FRE and FRSQRTE,
208 // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
209 // this is 2^-14. IEEE float has 23 digits and double has 52 digits.
210 unsigned RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3,
211 RefinementSteps64 = RefinementSteps + 1;
212
213 this->Options.Reciprocals.setDefaults("sqrtf", true, RefinementSteps);
214 this->Options.Reciprocals.setDefaults("vec-sqrtf", true, RefinementSteps);
215 this->Options.Reciprocals.setDefaults("divf", true, RefinementSteps);
216 this->Options.Reciprocals.setDefaults("vec-divf", true, RefinementSteps);
217
218 this->Options.Reciprocals.setDefaults("sqrtd", true, RefinementSteps64);
219 this->Options.Reciprocals.setDefaults("vec-sqrtd", true, RefinementSteps64);
220 this->Options.Reciprocals.setDefaults("divd", true, RefinementSteps64);
221 this->Options.Reciprocals.setDefaults("vec-divd", true, RefinementSteps64);
222
223206 initAsmInfo();
224207 }
225208
1515
1616 #include "llvm/Target/TargetRecip.h"
1717 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/ADT/StringExtras.h"
1819 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/SmallVector.h"
1921 #include "llvm/Support/ErrorHandling.h"
2022
2123 using namespace llvm;
3537 "vec-sqrtf",
3638 };
3739
38 // The uninitialized state is needed for the enabled settings and refinement
39 // steps because custom settings may arrive via the command-line before target
40 // defaults are set.
40 /// All operations are disabled by default and refinement steps are set to zero.
4141 TargetRecip::TargetRecip() {
4242 unsigned NumStrings = llvm::array_lengthof(RecipOps);
4343 for (unsigned i = 0; i < NumStrings; ++i)
136136 assert(Iter == RecipMap.end() && "Float entry missing from map");
137137 report_fatal_error("Invalid option for -recip.");
138138 }
139
140 // The option was specified without a float or double suffix.
141 if (RecipMap[Val.str() + 'd'].Enabled != Uninitialized) {
142 // Make sure that the double entry was not already specified.
143 // The float entry will be checked below.
144 report_fatal_error("Duplicate option for -recip.");
145 }
146 }
147
148 if (Iter->second.Enabled != Uninitialized)
149 report_fatal_error("Duplicate option for -recip.");
139 }
150140
151141 // Mark the matched option as found. Do not allow duplicate specifiers.
152142 Iter->second.Enabled = !IsDisabled;
163153 }
164154 }
165155
166 TargetRecip::TargetRecip(const std::vector &Args) :
167 TargetRecip() {
168 unsigned NumArgs = Args.size();
156 void TargetRecip::set(StringRef &RecipString) {
157 SmallVector RecipStringVector;
158 SplitString(RecipString, RecipStringVector, ",");
159 std::vector RecipVector;
160 for (unsigned i = 0; i < RecipStringVector.size(); ++i)
161 RecipVector.push_back(RecipStringVector[i].str());
162
163 unsigned NumArgs = RecipVector.size();
169164
170165 // Check if "all", "default", or "none" was specified.
171 if (NumArgs == 1 && parseGlobalParams(Args[0]))
166 if (NumArgs == 1 && parseGlobalParams(RecipVector[0]))
172167 return;
173
174 parseIndividualParams(Args);
168
169 parseIndividualParams(RecipVector);
175170 }
176171
177172 bool TargetRecip::isEnabled(StringRef Key) const {
178173 ConstRecipIter Iter = RecipMap.find(Key);
179174 assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
180 assert(Iter->second.Enabled != Uninitialized &&
181 "Enablement setting was not initialized");
182175 return Iter->second.Enabled;
183176 }
184177
185178 unsigned TargetRecip::getRefinementSteps(StringRef Key) const {
186179 ConstRecipIter Iter = RecipMap.find(Key);
187180 assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
188 assert(Iter->second.RefinementSteps != Uninitialized &&
189 "Refinement step setting was not initialized");
190181 return Iter->second.RefinementSteps;
191182 }
192183
193 /// Custom settings (previously initialized values) override target defaults.
194 void TargetRecip::setDefaults(StringRef Key, bool Enable,
195 unsigned RefSteps) {
184 void TargetRecip::set(StringRef Key, bool Enable, unsigned RefSteps) {
196185 if (Key == "all") {
197186 for (auto &KV : RecipMap) {
198187 RecipParams &RP = KV.second;
199 if (RP.Enabled == Uninitialized)
200 RP.Enabled = Enable;
201 if (RP.RefinementSteps == Uninitialized)
202 RP.RefinementSteps = RefSteps;
188 RP.Enabled = Enable;
189 RP.RefinementSteps = RefSteps;
203190 }
204191 } else {
205192 RecipParams &RP = RecipMap[Key];
206 if (RP.Enabled == Uninitialized)
207 RP.Enabled = Enable;
208 if (RP.RefinementSteps == Uninitialized)
209 RP.RefinementSteps = RefSteps;
193 RP.Enabled = Enable;
194 RP.RefinementSteps = RefSteps;
210195 }
211196 }
212197
5252 #include "llvm/Support/ErrorHandling.h"
5353 #include "llvm/Support/MathExtras.h"
5454 #include "llvm/Target/TargetOptions.h"
55 #include "llvm/Target/TargetRecip.h"
5556 #include "X86IntrinsicsInfo.h"
5657 #include
5758 #include
8283 setBooleanContents(ZeroOrOneBooleanContent);
8384 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
8485 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
86
87 // By default (and when -ffast-math is on), enable estimate codegen with 1
88 // refinement step for floats (not doubles) except scalar division. Scalar
89 // division estimates are disabled because they break too much real-world
90 // code. These defaults are intended to match GCC behavior.
91 ReciprocalEstimates.set("sqrtf", true, 1);
92 ReciprocalEstimates.set("divf", false, 1);
93 ReciprocalEstimates.set("vec-sqrtf", true, 1);
94 ReciprocalEstimates.set("vec-divf", true, 1);
8595
8696 // For 64-bit, since we have so many registers, use the ILP scheduler.
8797 // For 32-bit, use the register pressure specific scheduling.
1520515215 else
1520615216 return SDValue();
1520715217
15208 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
15218 TargetRecip Recips = getTargetRecipForFunc(DCI.DAG.getMachineFunction());
1520915219 if (!Recips.isEnabled(RecipOp))
1521015220 return SDValue();
1521115221
1523715247 else
1523815248 return SDValue();
1523915249
15240 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
15250 TargetRecip Recips = getTargetRecipForFunc(DCI.DAG.getMachineFunction());
1524115251 if (!Recips.isEnabled(RecipOp))
1524215252 return SDValue();
1524315253
165165 if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4())
166166 this->Options.TrapUnreachable = true;
167167
168 // By default (and when -ffast-math is on), enable estimate codegen for
169 // everything except scalar division. By default, use 1 refinement step for
170 // all operations. Defaults may be overridden by using command-line options.
171 // Scalar division estimates are disabled because they break too much
172 // real-world code. These defaults match GCC behavior.
173 this->Options.Reciprocals.setDefaults("sqrtf", true, 1);
174 this->Options.Reciprocals.setDefaults("divf", false, 1);
175 this->Options.Reciprocals.setDefaults("vec-sqrtf", true, 1);
176 this->Options.Reciprocals.setDefaults("vec-divf", true, 1);
177
178168 initAsmInfo();
179169 }
180170
0 ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math -mattr=-vsx | FileCheck %s
1 ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math -mattr=-vsx -recip=sqrtf:0,sqrtd:0 | FileCheck %s -check-prefix=CHECK-NONR
21 ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck -check-prefix=CHECK-SAFE %s
2
33 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
44 target triple = "powerpc64-unknown-linux-gnu"
55
2424 ; CHECK-NEXT: fmul
2525 ; CHECK: blr
2626
27 ; CHECK-NONR: @foo
28 ; CHECK-NONR: frsqrte
29 ; CHECK-NONR-NOT: fmadd
30 ; CHECK-NONR: fmul
31 ; CHECK-NONR-NOT: fmadd
32 ; CHECK-NONR: blr
33
3427 ; CHECK-SAFE: @foo
3528 ; CHECK-SAFE: fsqrt
3629 ; CHECK-SAFE: fdiv
3730 ; CHECK-SAFE: blr
3831 }
32
33 define double @no_estimate_refinement_f64(double %a, double %b) #0 {
34 %x = call double @llvm.sqrt.f64(double %b)
35 %r = fdiv double %a, %x
36 ret double %r
37
38 ; CHECK-LABEL: @no_estimate_refinement_f64
39 ; CHECK: frsqrte
40 ; CHECK-NOT: fmadd
41 ; CHECK: fmul
42 ; CHECK-NOT: fmadd
43 ; CHECK: blr
44 }
45
3946
4047 define double @foof(double %a, float %b) nounwind {
4148 %x = call float @llvm.sqrt.f32(float %b)
97104 ; CHECK-NEXT: fmuls
98105 ; CHECK-NEXT: blr
99106
100 ; CHECK-NONR: @goo
101 ; CHECK-NONR: frsqrtes
102 ; CHECK-NONR-NOT: fmadds
103 ; CHECK-NONR: fmuls
104 ; CHECK-NONR-NOT: fmadds
105 ; CHECK-NONR: blr
106
107107 ; CHECK-SAFE: @goo
108108 ; CHECK-SAFE: fsqrts
109109 ; CHECK-SAFE: fdivs
110110 ; CHECK-SAFE: blr
111 }
112
113
114 define float @no_estimate_refinement_f32(float %a, float %b) #0 {
115 %x = call float @llvm.sqrt.f32(float %b)
116 %r = fdiv float %a, %x
117 ret float %r
118
119 ; CHECK-LABEL: @no_estimate_refinement_f32
120 ; CHECK: frsqrtes
121 ; CHECK-NOT: fmadds
122 ; CHECK: fmuls
123 ; CHECK-NOT: fmadds
124 ; CHECK: blr
111125 }
112126
113127 ; Recognize that this is rsqrt(a) * rcp(b) * c,
251265 ; CHECK-SAFE: blr
252266 }
253267
268 attributes #0 = { nounwind "reciprocal-estimates"="sqrtf:0,sqrtd:0" }
269
None ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 -recip=!divf,!vec-divf | FileCheck %s --check-prefix=NORECIP
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf,vec-divf | FileCheck %s --check-prefix=RECIP
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf:2,vec-divf:2 | FileCheck %s --check-prefix=REFINE
0 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=AVX
32
43 ; If the target's divss/divps instructions are substantially
54 ; slower than rcpss/rcpps with a Newton-Raphson refinement,
98 ; for details about the accuracy, speed, and implementation
109 ; differences of x86 reciprocal estimates.
1110
12 define float @reciprocal_estimate(float %x) #0 {
11 define float @f32_no_estimate(float %x) #0 {
12 ; AVX-LABEL: f32_no_estimate:
13 ; AVX: # BB#0:
14 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
15 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
16 ; AVX-NEXT: retq
17 ;
1318 %div = fdiv fast float 1.0, %x
1419 ret float %div
15
16 ; NORECIP-LABEL: reciprocal_estimate:
17 ; NORECIP: movss
18 ; NORECIP-NEXT: divss
19 ; NORECIP-NEXT: movaps
20 ; NORECIP-NEXT: retq
21
22 ; RECIP-LABEL: reciprocal_estimate:
23 ; RECIP: vrcpss
24 ; RECIP: vmulss
25 ; RECIP: vsubss
26 ; RECIP: vmulss
27 ; RECIP: vaddss
28 ; RECIP-NEXT: retq
29
30 ; REFINE-LABEL: reciprocal_estimate:
31 ; REFINE: vrcpss
32 ; REFINE: vmulss
33 ; REFINE: vsubss
34 ; REFINE: vmulss
35 ; REFINE: vaddss
36 ; REFINE: vmulss
37 ; REFINE: vsubss
38 ; REFINE: vmulss
39 ; REFINE: vaddss
40 ; REFINE-NEXT: retq
4120 }
4221
43 define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 {
22 define float @f32_one_step(float %x) #1 {
23 ; AVX-LABEL: f32_one_step:
24 ; AVX: # BB#0:
25 ; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm1
26 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
27 ; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
28 ; AVX-NEXT: vsubss %xmm0, %xmm2, %xmm0
29 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
30 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
31 ; AVX-NEXT: retq
32 ;
33 %div = fdiv fast float 1.0, %x
34 ret float %div
35 }
36
37 define float @f32_two_step(float %x) #2 {
38 ; AVX-LABEL: f32_two_step:
39 ; AVX: # BB#0:
40 ; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm1
41 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm2
42 ; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
43 ; AVX-NEXT: vsubss %xmm2, %xmm3, %xmm2
44 ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm2
45 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
46 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
47 ; AVX-NEXT: vsubss %xmm0, %xmm3, %xmm0
48 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
49 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
50 ; AVX-NEXT: retq
51 ;
52 %div = fdiv fast float 1.0, %x
53 ret float %div
54 }
55
56 define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
57 ; AVX-LABEL: v4f32_no_estimate:
58 ; AVX: # BB#0:
59 ; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
60 ; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
61 ; AVX-NEXT: retq
62 ;
4463 %div = fdiv fast <4 x float> , %x
4564 ret <4 x float> %div
46
47 ; NORECIP-LABEL: reciprocal_estimate_v4f32:
48 ; NORECIP: movaps
49 ; NORECIP-NEXT: divps
50 ; NORECIP-NEXT: movaps
51 ; NORECIP-NEXT: retq
52
53 ; RECIP-LABEL: reciprocal_estimate_v4f32:
54 ; RECIP: vrcpps
55 ; RECIP: vmulps
56 ; RECIP: vsubps
57 ; RECIP: vmulps
58 ; RECIP: vaddps
59 ; RECIP-NEXT: retq
60
61 ; REFINE-LABEL: reciprocal_estimate_v4f32:
62 ; REFINE: vrcpps
63 ; REFINE: vmulps
64 ; REFINE: vsubps
65 ; REFINE: vmulps
66 ; REFINE: vaddps
67 ; REFINE: vmulps
68 ; REFINE: vsubps
69 ; REFINE: vmulps
70 ; REFINE: vaddps
71 ; REFINE-NEXT: retq
7265 }
7366
74 define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 {
67 define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
68 ; AVX-LABEL: v4f32_one_step:
69 ; AVX: # BB#0:
70 ; AVX-NEXT: vrcpps %xmm0, %xmm1
71 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
72 ; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
73 ; AVX-NEXT: vsubps %xmm0, %xmm2, %xmm0
74 ; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
75 ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
76 ; AVX-NEXT: retq
77 ;
78 %div = fdiv fast <4 x float> , %x
79 ret <4 x float> %div
80 }
81
82 define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
83 ; AVX-LABEL: v4f32_two_step:
84 ; AVX: # BB#0:
85 ; AVX-NEXT: vrcpps %xmm0, %xmm1
86 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm2
87 ; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
88 ; AVX-NEXT: vsubps %xmm2, %xmm3, %xmm2
89 ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm2
90 ; AVX-NEXT: vaddps %xmm2, %xmm1, %xmm1
91 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
92 ; AVX-NEXT: vsubps %xmm0, %xmm3, %xmm0
93 ; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
94 ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
95 ; AVX-NEXT: retq
96 ;
97 %div = fdiv fast <4 x float> , %x
98 ret <4 x float> %div
99 }
100
101 define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
102 ; AVX-LABEL: v8f32_no_estimate:
103 ; AVX: # BB#0:
104 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
105 ; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
106 ; AVX-NEXT: retq
107 ;
75108 %div = fdiv fast <8 x float> , %x
76109 ret <8 x float> %div
77
78 ; NORECIP-LABEL: reciprocal_estimate_v8f32:
79 ; NORECIP: movaps
80 ; NORECIP: movaps
81 ; NORECIP-NEXT: divps
82 ; NORECIP-NEXT: divps
83 ; NORECIP-NEXT: movaps
84 ; NORECIP-NEXT: movaps
85 ; NORECIP-NEXT: retq
86
87 ; RECIP-LABEL: reciprocal_estimate_v8f32:
88 ; RECIP: vrcpps
89 ; RECIP: vmulps
90 ; RECIP: vsubps
91 ; RECIP: vmulps
92 ; RECIP: vaddps
93 ; RECIP-NEXT: retq
94
95 ; REFINE-LABEL: reciprocal_estimate_v8f32:
96 ; REFINE: vrcpps
97 ; REFINE: vmulps
98 ; REFINE: vsubps
99 ; REFINE: vmulps
100 ; REFINE: vaddps
101 ; REFINE: vmulps
102 ; REFINE: vsubps
103 ; REFINE: vmulps
104 ; REFINE: vaddps
105 ; REFINE-NEXT: retq
106110 }
107111
108 attributes #0 = { "unsafe-fp-math"="true" }
112 define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
113 ; AVX-LABEL: v8f32_one_step:
114 ; AVX: # BB#0:
115 ; AVX-NEXT: vrcpps %ymm0, %ymm1
116 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
117 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
118 ; AVX-NEXT: vsubps %ymm0, %ymm2, %ymm0
119 ; AVX-NEXT: vmulps %ymm0, %ymm1, %ymm0
120 ; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0
121 ; AVX-NEXT: retq
122 ;
123 %div = fdiv fast <8 x float> , %x
124 ret <8 x float> %div
125 }
126
127 define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
128 ; AVX-LABEL: v8f32_two_step:
129 ; AVX: # BB#0:
130 ; AVX-NEXT: vrcpps %ymm0, %ymm1
131 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm2
132 ; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
133 ; AVX-NEXT: vsubps %ymm2, %ymm3, %ymm2
134 ; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm2
135 ; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
136 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
137 ; AVX-NEXT: vsubps %ymm0, %ymm3, %ymm0
138 ; AVX-NEXT: vmulps %ymm0, %ymm1, %ymm0
139 ; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0
140 ; AVX-NEXT: retq
141 ;
142 %div = fdiv fast <8 x float> , %x
143 ret <8 x float> %div
144 }
145
146 attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
147 attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
148 attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
149
None ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -recip=sqrt:2 -stop-after=expand-isel-pseudos 2>&1 | FileCheck %s
0 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=expand-isel-pseudos 2>&1 | FileCheck %s
11
22 declare float @llvm.sqrt.f32(float) #0
33
4747 ret float %div
4848 }
4949
50 attributes #0 = { "unsafe-fp-math"="true" }
50 attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt:2" }
5151 attributes #1 = { nounwind readnone }
None ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 -recip=!sqrtf,!vec-sqrtf,!divf,!vec-divf | FileCheck %s --check-prefix=NORECIP
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=sqrtf,vec-sqrtf | FileCheck %s --check-prefix=ESTIMATE
0 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=AVX
32
4 declare double @__sqrt_finite(double) #0
5 declare float @__sqrtf_finite(float) #0
6 declare x86_fp80 @__sqrtl_finite(x86_fp80) #0
7 declare float @llvm.sqrt.f32(float) #0
8 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #0
9 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0
3 declare double @__sqrt_finite(double)
4 declare float @__sqrtf_finite(float)
5 declare x86_fp80 @__sqrtl_finite(x86_fp80)
6 declare float @llvm.sqrt.f32(float)
7 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
8 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
109
1110
12 define double @fd(double %d) #0 {
13 ; NORECIP-LABEL: fd:
14 ; NORECIP: # BB#0:
15 ; NORECIP-NEXT: sqrtsd %xmm0, %xmm0
16 ; NORECIP-NEXT: retq
11 define double @finite_f64_no_estimate(double %d) #0 {
12 ; AVX-LABEL: finite_f64_no_estimate:
13 ; AVX: # BB#0:
14 ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
15 ; AVX-NEXT: retq
1716 ;
18 ; ESTIMATE-LABEL: fd:
19 ; ESTIMATE: # BB#0:
20 ; ESTIMATE-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
21 ; ESTIMATE-NEXT: retq
22 %call = tail call double @__sqrt_finite(double %d) #1
17 %call = tail call double @__sqrt_finite(double %d) #2
2318 ret double %call
2419 }
2520
21 ; No estimates for doubles.
2622
27 define float @ff(float %f) #0 {
28 ; NORECIP-LABEL: ff:
29 ; NORECIP: # BB#0:
30 ; NORECIP-NEXT: sqrtss %xmm0, %xmm0
31 ; NORECIP-NEXT: retq
23 define double @finite_f64_estimate(double %d) #1 {
24 ; AVX-LABEL: finite_f64_estimate:
25 ; AVX: # BB#0:
26 ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
27 ; AVX-NEXT: retq
3228 ;
33 ; ESTIMATE-LABEL: ff:
34 ; ESTIMATE: # BB#0:
35 ; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
36 ; ESTIMATE-NEXT: vmulss %xmm1, %xmm0, %xmm2
37 ; ESTIMATE-NEXT: vmulss %xmm1, %xmm2, %xmm1
38 ; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
39 ; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
40 ; ESTIMATE-NEXT: vmulss %xmm1, %xmm2, %xmm1
41 ; ESTIMATE-NEXT: vxorps %xmm2, %xmm2, %xmm2
42 ; ESTIMATE-NEXT: vcmpeqss %xmm2, %xmm0, %xmm0
43 ; ESTIMATE-NEXT: vandnps %xmm1, %xmm0, %xmm0
44 ; ESTIMATE-NEXT: retq
45 %call = tail call float @__sqrtf_finite(float %f) #1
29 %call = tail call double @__sqrt_finite(double %d) #2
30 ret double %call
31 }
32
33 define float @finite_f32_no_estimate(float %f) #0 {
34 ; AVX-LABEL: finite_f32_no_estimate:
35 ; AVX: # BB#0:
36 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
37 ; AVX-NEXT: retq
38 ;
39 %call = tail call float @__sqrtf_finite(float %f) #2
4640 ret float %call
4741 }
4842
43 define float @finite_f32_estimate(float %f) #1 {
44 ; AVX-LABEL: finite_f32_estimate:
45 ; AVX: # BB#0:
46 ; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
47 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm2
48 ; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
49 ; AVX-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
50 ; AVX-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
51 ; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
52 ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
53 ; AVX-NEXT: vcmpeqss %xmm2, %xmm0, %xmm0
54 ; AVX-NEXT: vandnps %xmm1, %xmm0, %xmm0
55 ; AVX-NEXT: retq
56 ;
57 %call = tail call float @__sqrtf_finite(float %f) #2
58 ret float %call
59 }
4960
50 define x86_fp80 @fld(x86_fp80 %ld) #0 {
51 ; NORECIP-LABEL: fld:
52 ; NORECIP: # BB#0:
53 ; NORECIP-NEXT: fldt {{[0-9]+}}(%rsp)
54 ; NORECIP-NEXT: fsqrt
55 ; NORECIP-NEXT: retq
61 define x86_fp80 @finite_f80_no_estimate(x86_fp80 %ld) #0 {
62 ; AVX-LABEL: finite_f80_no_estimate:
63 ; AVX: # BB#0:
64 ; AVX-NEXT: fldt {{[0-9]+}}(%rsp)
65 ; AVX-NEXT: fsqrt
66 ; AVX-NEXT: retq
5667 ;
57 ; ESTIMATE-LABEL: fld:
58 ; ESTIMATE: # BB#0:
59 ; ESTIMATE-NEXT: fldt {{[0-9]+}}(%rsp)
60 ; ESTIMATE-NEXT: fsqrt
61 ; ESTIMATE-NEXT: retq
62 %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #1
68 %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
6369 ret x86_fp80 %call
6470 }
6571
72 ; Don't die on the impossible.
6673
74 define x86_fp80 @finite_f80_estimate_but_no(x86_fp80 %ld) #1 {
75 ; AVX-LABEL: finite_f80_estimate_but_no:
76 ; AVX: # BB#0:
77 ; AVX-NEXT: fldt {{[0-9]+}}(%rsp)
78 ; AVX-NEXT: fsqrt
79 ; AVX-NEXT: retq
80 ;
81 %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
82 ret x86_fp80 %call
83 }
6784
68 define float @reciprocal_square_root(float %x) #0 {
69 ; NORECIP-LABEL: reciprocal_square_root:
70 ; NORECIP: # BB#0:
71 ; NORECIP-NEXT: sqrtss %xmm0, %xmm1
72 ; NORECIP-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
73 ; NORECIP-NEXT: divss %xmm1, %xmm0
74 ; NORECIP-NEXT: retq
85 define float @f32_no_estimate(float %x) #0 {
86 ; AVX-LABEL: f32_no_estimate:
87 ; AVX: # BB#0:
88 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
89 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
90 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
91 ; AVX-NEXT: retq
7592 ;
76 ; ESTIMATE-LABEL: reciprocal_square_root:
77 ; ESTIMATE: # BB#0:
78 ; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
79 ; ESTIMATE-NEXT: vmulss %xmm1, %xmm1, %xmm2
80 ; ESTIMATE-NEXT: vmulss %xmm2, %xmm0, %xmm0
81 ; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
82 ; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1
83 ; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
84 ; ESTIMATE-NEXT: retq
8593 %sqrt = tail call float @llvm.sqrt.f32(float %x)
8694 %div = fdiv fast float 1.0, %sqrt
8795 ret float %div
8896 }
8997
90 define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 {
91 ; NORECIP-LABEL: reciprocal_square_root_v4f32:
92 ; NORECIP: # BB#0:
93 ; NORECIP-NEXT: sqrtps %xmm0, %xmm1
94 ; NORECIP-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
95 ; NORECIP-NEXT: divps %xmm1, %xmm0
96 ; NORECIP-NEXT: retq
98 define float @f32_estimate(float %x) #1 {
99 ; AVX-LABEL: f32_estimate:
100 ; AVX: # BB#0:
101 ; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
102 ; AVX-NEXT: vmulss %xmm1, %xmm1, %xmm2
103 ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
104 ; AVX-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
105 ; AVX-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1
106 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
107 ; AVX-NEXT: retq
97108 ;
98 ; ESTIMATE-LABEL: reciprocal_square_root_v4f32:
99 ; ESTIMATE: # BB#0:
100 ; ESTIMATE-NEXT: vrsqrtps %xmm0, %xmm1
101 ; ESTIMATE-NEXT: vmulps %xmm1, %xmm1, %xmm2
102 ; ESTIMATE-NEXT: vmulps %xmm2, %xmm0, %xmm0
103 ; ESTIMATE-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
104 ; ESTIMATE-NEXT: vmulps {{.*}}(%rip), %xmm1, %xmm1
105 ; ESTIMATE-NEXT: vmulps %xmm0, %xmm1, %xmm0
106 ; ESTIMATE-NEXT: retq
109 %sqrt = tail call float @llvm.sqrt.f32(float %x)
110 %div = fdiv fast float 1.0, %sqrt
111 ret float %div
112 }
113
114 define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
115 ; AVX-LABEL: v4f32_no_estimate:
116 ; AVX: # BB#0:
117 ; AVX-NEXT: vsqrtps %xmm0, %xmm0
118 ; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
119 ; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
120 ; AVX-NEXT: retq
121 ;
107122 %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
108123 %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
109124 ret <4 x float> %div
110125 }
111126
112 define <8 x float> @reciprocal_square_root_v8f32(<8 x float> %x) #0 {
113 ; NORECIP-LABEL: reciprocal_square_root_v8f32:
114 ; NORECIP: # BB#0:
115 ; NORECIP-NEXT: sqrtps %xmm1, %xmm2
116 ; NORECIP-NEXT: sqrtps %xmm0, %xmm3
117 ; NORECIP-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
118 ; NORECIP-NEXT: movaps %xmm1, %xmm0
119 ; NORECIP-NEXT: divps %xmm3, %xmm0
120 ; NORECIP-NEXT: divps %xmm2, %xmm1
121 ; NORECIP-NEXT: retq
127 define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
128 ; AVX-LABEL: v4f32_estimate:
129 ; AVX: # BB#0:
130 ; AVX-NEXT: vrsqrtps %xmm0, %xmm1
131 ; AVX-NEXT: vmulps %xmm1, %xmm1, %xmm2
132 ; AVX-NEXT: vmulps %xmm2, %xmm0, %xmm0
133 ; AVX-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
134 ; AVX-NEXT: vmulps {{.*}}(%rip), %xmm1, %xmm1
135 ; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
136 ; AVX-NEXT: retq
122137 ;
123 ; ESTIMATE-LABEL: reciprocal_square_root_v8f32:
124 ; ESTIMATE: # BB#0:
125 ; ESTIMATE-NEXT: vrsqrtps %ymm0, %ymm1
126 ; ESTIMATE-NEXT: vmulps %ymm1, %ymm1, %ymm2
127 ; ESTIMATE-NEXT: vmulps %ymm2, %ymm0, %ymm0
128 ; ESTIMATE-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
129 ; ESTIMATE-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
130 ; ESTIMATE-NEXT: vmulps %ymm0, %ymm1, %ymm0
131 ; ESTIMATE-NEXT: retq
138 %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
139 %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
140 ret <4 x float> %div
141 }
142
143 define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
144 ; AVX-LABEL: v8f32_no_estimate:
145 ; AVX: # BB#0:
146 ; AVX-NEXT: vsqrtps %ymm0, %ymm0
147 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
148 ; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
149 ; AVX-NEXT: retq
150 ;
151 %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
152 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
153 ret <8 x float> %div
154 }
155
156 define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
157 ; AVX-LABEL: v8f32_estimate:
158 ; AVX: # BB#0:
159 ; AVX-NEXT: vrsqrtps %ymm0, %ymm1
160 ; AVX-NEXT: vmulps %ymm1, %ymm1, %ymm2
161 ; AVX-NEXT: vmulps %ymm2, %ymm0, %ymm0
162 ; AVX-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
163 ; AVX-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
164 ; AVX-NEXT: vmulps %ymm0, %ymm1, %ymm0
165 ; AVX-NEXT: retq
166 ;
132167 %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
133168 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
134169 ret <8 x float> %div
135170 }
136171
137172
138 attributes #0 = { "unsafe-fp-math"="true" }
139 attributes #1 = { nounwind readnone }
173 attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
174 attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
175 attributes #2 = { nounwind readnone }
140176