llvm.org GIT mirror: llvm, commit 3248ce6
Merging r276051 and r276823:
------------------------------------------------------------------------
r276051 | arsenm | 2016-07-19 16:16:53 -0700 (Tue, 19 Jul 2016) | 8 lines

AMDGPU: Change fdiv lowering based on !fpmath metadata

If 2.5 ulp is acceptable, denormals are not required, and the operation is
not a reciprocal (which is already handled separately), replace the fdiv
with a faster lowering.

Simplify the lowering tests by using per-function subtarget features.
------------------------------------------------------------------------
r276823 | arsenm | 2016-07-26 16:25:44 -0700 (Tue, 26 Jul 2016) | 4 lines

AMDGPU: Use rcp for fdiv 1, x with fpmath metadata

Using rcp is usually acceptable for safe math, so this should not replace
the original fdiv.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_39@278243 91177308-0d34-0410-b5e6-96231b3b80d8

Committed by Hans Wennborg
13 changed files with 702 additions and 227 deletions.
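For context, here is a minimal sketch of how a frontend attaches the !fpmath accuracy that AMDGPUCodeGenPrepare keys on below. It uses standard IRBuilder APIs; the helper name is illustrative and not part of this commit.

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// Emit 'fdiv float %num, %den, !fpmath !{float AllowedULP}'.
Value *emitFDivWithAccuracy(IRBuilder<> &Builder, Value *Num, Value *Den,
                            float AllowedULP) {
  LLVMContext &Ctx = Builder.getContext();
  Metadata *Accuracy = ConstantAsMetadata::get(
      ConstantFP::get(Type::getFloatTy(Ctx), AllowedULP));
  MDNode *FPMath = MDNode::get(Ctx, Accuracy);
  // CreateFDiv takes an optional !fpmath node and attaches it to the result.
  return Builder.CreateFDiv(Num, Den, "div", FPMath);
}

With an allowed error of 2.5 ulp or more and no fp32 denormals, the pass below rewrites such an fdiv into a call to llvm.amdgcn.fdiv.fast, as the tests further down check.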
1919 class AMDGPUSubtarget;
2020 class AMDGPUTargetMachine;
2121 class FunctionPass;
22 class GCNTargetMachine;
2223 struct MachineSchedContext;
2324 class MCAsmInfo;
2425 class raw_ostream;
4950 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
5051 FunctionPass *createSIDebuggerInsertNopsPass();
5152 FunctionPass *createSIInsertWaitsPass();
52 FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr);
53 FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
5354
5455 ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
5556
1313 //===----------------------------------------------------------------------===//
1414
1515 #include "AMDGPU.h"
16 #include "AMDGPUIntrinsicInfo.h"
1617 #include "AMDGPUSubtarget.h"
18 #include "AMDGPUTargetMachine.h"
1719
1820 #include "llvm/Analysis/DivergenceAnalysis.h"
1921 #include "llvm/CodeGen/Passes.h"
2931 namespace {
3032
3133 class AMDGPUCodeGenPrepare : public FunctionPass,
32 public InstVisitor<AMDGPUCodeGenPrepare> {
34 public InstVisitor<AMDGPUCodeGenPrepare, bool> {
35 const GCNTargetMachine *TM;
36 const SISubtarget *ST;
3337 DivergenceAnalysis *DA;
34 const TargetMachine *TM;
38 Module *Mod;
39 bool HasUnsafeFPMath;
3540
3641 public:
3742 static char ID;
3843 AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
3944 FunctionPass(ID),
40 TM(TM) { }
45 TM(static_cast<const GCNTargetMachine *>(TM)),
46 ST(nullptr),
47 DA(nullptr),
48 Mod(nullptr),
49 HasUnsafeFPMath(false) { }
50
51 bool visitFDiv(BinaryOperator &I);
52
53 bool visitInstruction(Instruction &I) {
54 return false;
55 }
4156
4257 bool doInitialization(Module &M) override;
4358 bool runOnFunction(Function &F) override;
5469
5570 } // End anonymous namespace
5671
72 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
73 const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
74 if (!CNum)
75 return false;
76
77 // Reciprocal f32 is handled separately without denormals.
78 return UnsafeDiv || CNum->isExactlyValue(+1.0);
79 }
80
81 // Insert an intrinsic for fast fdiv for safe math situations where we can
82 // reduce precision. Leave fdiv for situations where the generic node is
83 // expected to be optimized.
84 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
85 Type *Ty = FDiv.getType();
86
87 // TODO: Handle half
88 if (!Ty->getScalarType()->isFloatTy())
89 return false;
90
91 MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
92 if (!FPMath)
93 return false;
94
95 const FPMathOperator *FPOp = cast<FPMathOperator>(&FDiv);
96 float ULP = FPOp->getFPAccuracy();
97 if (ULP < 2.5f)
98 return false;
99
100 FastMathFlags FMF = FPOp->getFastMathFlags();
101 bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
102 FMF.allowReciprocal();
103 if (ST->hasFP32Denormals() && !UnsafeDiv)
104 return false;
105
106 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
107 Builder.setFastMathFlags(FMF);
108 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
109
110 const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
111 Function *Decl
112 = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
113
114 Value *Num = FDiv.getOperand(0);
115 Value *Den = FDiv.getOperand(1);
116
117 Value *NewFDiv = nullptr;
118
119 if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
120 NewFDiv = UndefValue::get(VT);
121
122 // FIXME: Doesn't do the right thing for cases where the vector is partially
123 // constant. This works when the scalarizer pass is run first.
124 for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
125 Value *NumEltI = Builder.CreateExtractElement(Num, I);
126 Value *DenEltI = Builder.CreateExtractElement(Den, I);
127 Value *NewElt;
128
129 if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
130 NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
131 } else {
132 NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
133 }
134
135 NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
136 }
137 } else {
138 if (!shouldKeepFDivF32(Num, UnsafeDiv))
139 NewFDiv = Builder.CreateCall(Decl, { Num, Den });
140 }
141
142 if (NewFDiv) {
143 FDiv.replaceAllUsesWith(NewFDiv);
144 NewFDiv->takeName(&FDiv);
145 FDiv.eraseFromParent();
146 }
147
148 return true;
149 }
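For the scalar case, the rewrite above boils down to the following illustrative standalone helper (not code from this patch); it assumes the fdiv has already passed the ULP and denormal checks and that FdivFastDecl is the llvm.amdgcn.fdiv.fast declaration.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static void rewriteScalarFDiv(BinaryOperator &FDiv, Function *FdivFastDecl) {
  IRBuilder<> Builder(&FDiv);
  CallInst *Fast = Builder.CreateCall(
      FdivFastDecl, {FDiv.getOperand(0), FDiv.getOperand(1)});
  // Carry the !fpmath accuracy over to the replacement call, then swap it in.
  Fast->setMetadata(LLVMContext::MD_fpmath,
                    FDiv.getMetadata(LLVMContext::MD_fpmath));
  Fast->takeName(&FDiv);
  FDiv.replaceAllUsesWith(Fast);
  FDiv.eraseFromParent();
}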
150
151 static bool hasUnsafeFPMath(const Function &F) {
152 Attribute Attr = F.getFnAttribute("unsafe-fp-math");
153 return Attr.getValueAsString() == "true";
154 }
155
57156 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
157 Mod = &M;
58158 return false;
59159 }
60160
62162 if (!TM || skipFunction(F))
63163 return false;
64164
165 ST = &TM->getSubtarget<SISubtarget>(F);
65166 DA = &getAnalysis<DivergenceAnalysis>();
66 visit(F);
167 HasUnsafeFPMath = hasUnsafeFPMath(F);
67168
68 return true;
169 bool MadeChange = false;
170
171 for (BasicBlock &BB : F) {
172 BasicBlock::iterator Next;
173 for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
174 Next = std::next(I);
175 MadeChange |= visit(*I);
176 }
177 }
178
179 return MadeChange;
69180 }
70181
71182 INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
76187
77188 char AMDGPUCodeGenPrepare::ID = 0;
78189
79 FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) {
190 FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
80191 return new AMDGPUCodeGenPrepare(TM);
81192 }
2828 #undef GET_INTRINSIC_NAME_TABLE
2929 };
3030
31 std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
32 unsigned numTys) const {
33 if (IntrID < Intrinsic::num_intrinsics) {
34 return nullptr;
35 }
31 namespace {
32 #define GET_INTRINSIC_ATTRIBUTES
33 #include "AMDGPUGenIntrinsics.inc"
34 #undef GET_INTRINSIC_ATTRIBUTES
35 }
36
37 StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
38 ArrayRef<Type *> Tys) const {
39 if (IntrID < Intrinsic::num_intrinsics)
40 return StringRef();
41
3642 assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
3743 "Invalid intrinsic ID");
3844
39 std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]);
40 return Result;
45 return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
46 }
47
48 std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
49 unsigned NumTys) const {
50 return getName(IntrID, makeArrayRef(Tys, NumTys)).str();
51 }
52
53 FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
54 ArrayRef<Type *> Tys) const {
55 // FIXME: Re-use Intrinsic::getType machinery
56 switch (ID) {
57 case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
58 Type *F32Ty = Type::getFloatTy(Context);
59 return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false);
60 }
61 default:
62 llvm_unreachable("unhandled intrinsic");
63 }
4164 }
4265
4366 unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
6891 }
6992
7093 Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
94 ArrayRef<Type *> Tys) const {
95 FunctionType *FTy = getType(M->getContext(), IntrID, Tys);
96 Function *F
97 = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
98
99 AttributeSet AS = getAttributes(M->getContext(),
100 static_cast<AMDGPUIntrinsic::ID>(IntrID));
101 F->setAttributes(AS);
102 return F;
103 }
104
105 Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
71106 Type **Tys,
72 unsigned numTys) const {
73 llvm_unreachable("Not implemented");
107 unsigned NumTys) const {
108 return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys));
74109 }
3333 class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
3434 public:
3535 AMDGPUIntrinsicInfo();
36
37 StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const;
38
3639 std::string getName(unsigned IntrId, Type **Tys = nullptr,
37 unsigned numTys = 0) const override;
40 unsigned NumTys = 0) const override;
41
3842 unsigned lookupName(const char *Name, unsigned Len) const override;
3943 bool isOverloaded(unsigned IID) const override;
4044 Function *getDeclaration(Module *M, unsigned ID,
4145 Type **Tys = nullptr,
42 unsigned numTys = 0) const override;
46 unsigned NumTys = 0) const override;
47
48 Function *getDeclaration(Module *M, unsigned ID,
49 ArrayRef<Type *> = None) const;
50
51 FunctionType *getType(LLVMContext &Context, unsigned ID,
52 ArrayRef<Type *> Tys = None) const;
4353 };
4454
4555 } // end namespace llvm
308308 ScheduleDAGInstrs *
309309 createMachineScheduler(MachineSchedContext *C) const override;
310310
311 void addIRPasses() override;
311312 bool addPreISel() override;
312313 void addMachineSSAOptimization() override;
313314 bool addInstSelector() override;
498499 addPass(&DeadMachineInstructionElimID);
499500 }
500501
502 void GCNPassConfig::addIRPasses() {
503 // TODO: May want to move later or split into an early and late one.
504 addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));
505
506 AMDGPUPassConfig::addIRPasses();
507 }
508
501509 bool GCNPassConfig::addInstSelector() {
502510 AMDGPUPassConfig::addInstSelector();
503511 addPass(createSILowerI1CopiesPass());
17911791 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
17921792 Op->getVTList(), Ops, VT, MMO);
17931793 }
1794 case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
1795 return lowerFDIV_FAST(Op, DAG);
1796 }
17941797 case AMDGPUIntrinsic::SI_vs_load_input:
17951798 return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
17961799 Op.getOperand(1),
20972100
20982101 // Catch division cases where we can use shortcuts with rcp and rsq
20992102 // instructions.
2100 SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
2103 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
2104 SelectionDAG &DAG) const {
21012105 SDLoc SL(Op);
21022106 SDValue LHS = Op.getOperand(0);
21032107 SDValue RHS = Op.getOperand(1);
21382142 return SDValue();
21392143 }
21402144
2145 // Faster 2.5 ULP division that does not support denormals.
2146 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
2147 SDLoc SL(Op);
2148 SDValue LHS = Op.getOperand(1);
2149 SDValue RHS = Op.getOperand(2);
2150
2151 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
2152
2153 const APFloat K0Val(BitsToFloat(0x6f800000));
2154 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
2155
2156 const APFloat K1Val(BitsToFloat(0x2f800000));
2157 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
2158
2159 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
2160
2161 EVT SetCCVT =
2162 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
2163
2164 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
2165
2166 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
2167
2168 // TODO: Should this propagate fast-math-flags?
2169 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
2170
2171 // rcp does not support denormals.
2172 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
2173
2174 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
2175
2176 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
2177 }
2178
21412179 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
2142 if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
2180 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
21432181 return FastLowered;
21442182
21452183 SDLoc SL(Op);
21462184 SDValue LHS = Op.getOperand(0);
21472185 SDValue RHS = Op.getOperand(1);
21482186
2149 // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag
2150 if (EnableAMDGPUFastFDIV) {
2151 // This does not support denormals.
2152 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
2153
2154 const APFloat K0Val(BitsToFloat(0x6f800000));
2155 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
2156
2157 const APFloat K1Val(BitsToFloat(0x2f800000));
2158 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
2159
2160 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
2161
2162 EVT SetCCVT =
2163 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
2164
2165 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
2166
2167 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
2168
2169 // TODO: Should this propagate fast-math-flags?
2170
2171 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
2172
2173 // rcp does not support denormals.
2174 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
2175
2176 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
2177
2178 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
2179 }
2180
2181 // Generates more precise fpdiv32.
21822187 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
21832188
21842189 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
22082213
22092214 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
22102215 if (DAG.getTarget().Options.UnsafeFPMath)
2211 return LowerFastFDIV(Op, DAG);
2216 return lowerFastUnsafeFDIV(Op, DAG);
22122217
22132218 SDLoc SL(Op);
22142219 SDValue X = Op.getOperand(0);
3535 SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
3636 SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
3737 SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
38 SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
38 SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
39 SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
3940 SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
4041 SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
4142 SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
66 //
77 //===----------------------------------------------------------------------===//
88 //
9 // SI Intrinsic Definitions
9 // Backend internal SI Intrinsic Definitions. User code should not
10 // directly use these.
1011 //
1112 //===----------------------------------------------------------------------===//
1213
176177 } // End TargetPrefix = "SI", isTarget = 1
177178
178179 let TargetPrefix = "amdgcn", isTarget = 1 in {
180 // Emit 2.5 ulp, no denormal division. Should only be inserted by
181 // pass based on !fpmath metadata.
182 def int_amdgcn_fdiv_fast : Intrinsic<
183 [llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]
184 >;
185
179186 /* Control flow Intrinsics */
180187
181188 def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
None ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck %s
1 ; RUN: opt -S -amdgpu-codegenprepare < %s
0 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s
1 ; RUN: opt -S -amdgpu-codegenprepare %s | FileCheck -check-prefix=NOOP %s
22 ; Make sure this doesn't crash with no triple
33
4 ; CHECK-LABEL: @foo(
5 define void @foo() {
6 ret void
7 }
4 ; NOOP-LABEL: @noop_fdiv_fpmath(
5 ; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
6 define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
7 %md.25ulp = fdiv float %a, %b, !fpmath !0
8 store volatile float %md.25ulp, float addrspace(1)* %out
9 ret void
10 }
11
12 ; CHECK-LABEL: @fdiv_fpmath(
13 ; CHECK: %no.md = fdiv float %a, %b{{$}}
14 ; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
15 ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
16 ; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
17 ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
18 ; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
19 ; CHECK: %arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
20 define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
21 %no.md = fdiv float %a, %b
22 store volatile float %no.md, float addrspace(1)* %out
23
24 %md.half.ulp = fdiv float %a, %b, !fpmath !1
25 store volatile float %md.half.ulp, float addrspace(1)* %out
26
27 %md.1ulp = fdiv float %a, %b, !fpmath !2
28 store volatile float %md.1ulp, float addrspace(1)* %out
29
30 %md.25ulp = fdiv float %a, %b, !fpmath !0
31 store volatile float %md.25ulp, float addrspace(1)* %out
32
33 %md.3ulp = fdiv float %a, %b, !fpmath !3
34 store volatile float %md.3ulp, float addrspace(1)* %out
35
36 %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
37 store volatile float %fast.md.25ulp, float addrspace(1)* %out
38
39 %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
40 store volatile float %arcp.md.25ulp, float addrspace(1)* %out
41
42 ret void
43 }
44
45 ; CHECK-LABEL: @rcp_fdiv_fpmath(
46 ; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}
47 ; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0
48 ; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1
49 ; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}}
50 ; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
51 ; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
52 ; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
53 define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
54 %no.md = fdiv float 1.0, %x
55 store volatile float %no.md, float addrspace(1)* %out
56
57 %md.25ulp = fdiv float 1.0, %x, !fpmath !0
58 store volatile float %md.25ulp, float addrspace(1)* %out
59
60 %md.half.ulp = fdiv float 1.0, %x, !fpmath !1
61 store volatile float %md.half.ulp, float addrspace(1)* %out
62
63 %arcp.no.md = fdiv arcp float 1.0, %x
64 store volatile float %arcp.no.md, float addrspace(1)* %out
65
66 %arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0
67 store volatile float %arcp.25ulp, float addrspace(1)* %out
68
69 %fast.no.md = fdiv fast float 1.0, %x
70 store volatile float %fast.no.md, float addrspace(1)* %out
71
72 %fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0
73 store volatile float %fast.25ulp, float addrspace(1)* %out
74
75 ret void
76 }
77
78 ; CHECK-LABEL: @fdiv_fpmath_vector(
79 ; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}}
80 ; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
81 ; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
82
83 ; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
84 ; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
85 ; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0
86 ; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0
87 ; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
88 ; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
89 ; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
90 ; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
91 define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
92 %no.md = fdiv <2 x float> %a, %b
93 store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
94
95 %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
96 store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
97
98 %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
99 store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out
100
101 %md.25ulp = fdiv <2 x float> %a, %b, !fpmath !0
102 store volatile <2 x float> %md.25ulp, <2 x float> addrspace(1)* %out
103
104 ret void
105 }
106
107 ; CHECK-LABEL: @rcp_fdiv_fpmath_vector(
108 ; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
109 ; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
110 ; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
111 ; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
112
113 ; CHECK: extractelement <2 x float> %x
114 ; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
115 ; CHECK: extractelement <2 x float> %x
116 ; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
117 ; CHECK: store volatile <2 x float> %arcp.25ulp
118
119 ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
120 ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
121 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
122 define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
123 %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
124 store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
125
126 %md.half.ulp = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !1
127 store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
128
129 %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
130 store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
131
132 %fast.no.md = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
133 store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
134
135 %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
136 store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
137
138 %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
139 store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
140
141 ret void
142 }
143
144 ; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat(
145 ; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
146 ; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
147 ; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}
148
149 ; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
150 ; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0
151 ; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
152 ; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0
153 ; CHECK: store volatile <2 x float> %arcp.25ulp
154
155 ; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
156 ; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
157 ; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
158 ; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
159 ; CHECK: store volatile <2 x float> %fast.25ulp
160 define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
161 %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
162 store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
163
164 %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x
165 store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
166
167 %fast.no.md = fdiv fast <2 x float> <float 1.0, float 2.0>, %x
168 store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
169
170 %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
171 store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
172
173 %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
174 store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
175
176 ret void
177 }
178
179 ; FIXME: Should be able to get fdiv for 1.0 component
180 ; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
181 ; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
182 ; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
183 ; CHECK: store volatile <2 x float> %arcp.25ulp
184
185 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
186 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
187 ; CHECK: store volatile <2 x float> %fast.25ulp
188 define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
189 %x.insert = insertelement <2 x float> %x, float 1.0, i32 0
190
191 %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
192 store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
193
194 %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
195 store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
196
197 ret void
198 }
199
200 ; CHECK-LABEL: @fdiv_fpmath_f32_denormals(
201 ; CHECK: %no.md = fdiv float %a, %b{{$}}
202 ; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
203 ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
204 ; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
205 ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
206 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
207 ; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
208 define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
209 %no.md = fdiv float %a, %b
210 store volatile float %no.md, float addrspace(1)* %out
211
212 %md.half.ulp = fdiv float %a, %b, !fpmath !1
213 store volatile float %md.half.ulp, float addrspace(1)* %out
214
215 %md.1ulp = fdiv float %a, %b, !fpmath !2
216 store volatile float %md.1ulp, float addrspace(1)* %out
217
218 %md.25ulp = fdiv float %a, %b, !fpmath !0
219 store volatile float %md.25ulp, float addrspace(1)* %out
220
221 %md.3ulp = fdiv float %a, %b, !fpmath !3
222 store volatile float %md.3ulp, float addrspace(1)* %out
223
224 %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
225 store volatile float %fast.md.25ulp, float addrspace(1)* %out
226
227 %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
228 store volatile float %arcp.md.25ulp, float addrspace(1)* %out
229
230 ret void
231 }
232
233 attributes #0 = { nounwind optnone noinline }
234 attributes #1 = { nounwind }
235 attributes #2 = { nounwind "target-features"="+fp32-denormals" }
236
237 ; CHECK: !0 = !{float 2.500000e+00}
238 ; CHECK: !1 = !{float 5.000000e-01}
239 ; CHECK: !2 = !{float 1.000000e+00}
240 ; CHECK: !3 = !{float 3.000000e+00}
241
242 !0 = !{float 2.500000e+00}
243 !1 = !{float 5.000000e-01}
244 !2 = !{float 1.000000e+00}
245 !3 = !{float 3.000000e+00}
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -amdgpu-fast-fdiv < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
3 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
4 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=UNSAFE-FP -check-prefix=FUNC %s
51 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
62
73 ; These tests check that fdiv is expanded correctly and also test that the
1410 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
1511 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
1612
17 ; UNSAFE-FP: v_rcp_f32
18 ; UNSAFE-FP: v_mul_f32_e32
13 ; SI: v_div_scale_f32
14 ; SI-DAG: v_div_scale_f32
1915
2016 ; SI-DAG: v_rcp_f32
21 ; SI-DAG: v_mul_f32
22
23 ; I754-DAG: v_div_scale_f32
24 ; I754-DAG: v_rcp_f32
25 ; I754-DAG: v_fma_f32
26 ; I754-DAG: v_mul_f32
27 ; I754-DAG: v_fma_f32
28 ; I754-DAG: v_div_fixup_f32
29 define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) {
30 entry:
31 %0 = fdiv float %a, %b
32 store float %0, float addrspace(1)* %out
17 ; SI: v_fma_f32
18 ; SI: v_fma_f32
19 ; SI: v_mul_f32
20 ; SI: v_fma_f32
21 ; SI: v_fma_f32
22 ; SI: v_fma_f32
23 ; SI: v_div_fmas_f32
24 ; SI: v_div_fixup_f32
25 define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
26 entry:
27 %fdiv = fdiv float %a, %b
28 store float %fdiv, float addrspace(1)* %out
29 ret void
30 }
31
32 ; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
33 ; SI: v_cndmask_b32
34 ; SI: v_mul_f32
35 ; SI: v_rcp_f32
36 ; SI: v_mul_f32
37 ; SI: v_mul_f32
38 define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
39 entry:
40 %fdiv = fdiv float %a, %b, !fpmath !0
41 store float %fdiv, float addrspace(1)* %out
42 ret void
43 }
44
45 ; Use correct fdiv
46 ; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
47 ; SI: v_fma_f32
48 ; SI: v_div_fmas_f32
49 ; SI: v_div_fixup_f32
50 define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
51 entry:
52 %fdiv = fdiv float %a, %b, !fpmath !0
53 store float %fdiv, float addrspace(1)* %out
54 ret void
55 }
56
57 ; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
58 ; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
59 ; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
60 ; SI-NOT: [[RESULT]]
61 ; SI: buffer_store_dword [[RESULT]]
62 define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
63 entry:
64 %fdiv = fdiv fast float %a, %b
65 store float %fdiv, float addrspace(1)* %out
3366 ret void
3467 }
3568
3770 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
3871 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
3972
40 ; UNSAFE-FP: v_rcp_f32
41 ; UNSAFE-FP: v_mul_f32_e32
42
43 ; SI-DAG: v_rcp_f32
44 ; SI-DAG: v_mul_f32
45 define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) {
46 entry:
47 %0 = fdiv fast float %a, %b
48 store float %0, float addrspace(1)* %out
73 ; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
74 ; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
75 ; SI-NOT: [[RESULT]]
76 ; SI: buffer_store_dword [[RESULT]]
77 define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
78 entry:
79 %fdiv = fdiv fast float %a, %b
80 store float %fdiv, float addrspace(1)* %out
4981 ret void
5082 }
5183
5385 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
5486 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
5587
56 ; UNSAFE-FP: v_rcp_f32
57 ; UNSAFE-FP: v_mul_f32_e32
58
59 ; SI-DAG: v_rcp_f32
60 ; SI-DAG: v_mul_f32
61 define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) {
62 entry:
63 %0 = fdiv arcp float %a, %b
64 store float %0, float addrspace(1)* %out
88 ; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
89 ; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
90 ; SI-NOT: [[RESULT]]
91 ; SI: buffer_store_dword [[RESULT]]
92 define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
93 entry:
94 %fdiv = fdiv arcp float %a, %b
95 store float %fdiv, float addrspace(1)* %out
6596 ret void
6697 }
6798
71102 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
72103 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
73104
74 ; UNSAFE-FP: v_rcp_f32
75 ; UNSAFE-FP: v_rcp_f32
76 ; UNSAFE-FP: v_mul_f32_e32
77 ; UNSAFE-FP: v_mul_f32_e32
78
79 ; SI-DAG: v_rcp_f32
80 ; SI-DAG: v_mul_f32
81 ; SI-DAG: v_rcp_f32
82 ; SI-DAG: v_mul_f32
83
84 ; I754: v_div_scale_f32
85 ; I754: v_div_scale_f32
86 ; I754: v_div_scale_f32
87 ; I754: v_div_scale_f32
88 ; I754: v_div_fixup_f32
89 ; I754: v_div_fixup_f32
90 define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
91 entry:
92 %0 = fdiv <2 x float> %a, %b
93 store <2 x float> %0, <2 x float> addrspace(1)* %out
105 ; SI: v_div_scale_f32
106 ; SI: v_div_scale_f32
107 ; SI: v_div_scale_f32
108 ; SI: v_div_scale_f32
109 define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
110 entry:
111 %fdiv = fdiv <2 x float> %a, %b
112 store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
113 ret void
114 }
115
116 ; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
117 ; SI: v_cmp_gt_f32
118 ; SI: v_cmp_gt_f32
119 define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
120 entry:
121 %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
122 store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
94123 ret void
95124 }
96125
100129 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
101130 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
102131
103 ; UNSAFE-FP: v_rcp_f32
104 ; UNSAFE-FP: v_rcp_f32
105 ; UNSAFE-FP: v_mul_f32_e32
106 ; UNSAFE-FP: v_mul_f32_e32
107
108 ; SI-DAG: v_rcp_f32
109 ; SI-DAG: v_mul_f32
110 ; SI-DAG: v_rcp_f32
111 ; SI-DAG: v_mul_f32
112 define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
113 entry:
114 %0 = fdiv fast <2 x float> %a, %b
115 store <2 x float> %0, <2 x float> addrspace(1)* %out
132 ; SI: v_rcp_f32
133 ; SI: v_rcp_f32
134 define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
135 entry:
136 %fdiv = fdiv fast <2 x float> %a, %b
137 store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
116138 ret void
117139 }
118140
122144 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
123145 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
124146
125 ; UNSAFE-FP: v_rcp_f32
126 ; UNSAFE-FP: v_rcp_f32
127 ; UNSAFE-FP: v_mul_f32_e32
128 ; UNSAFE-FP: v_mul_f32_e32
129
130 ; SI-DAG: v_rcp_f32
131 ; SI-DAG: v_mul_f32
132 ; SI-DAG: v_rcp_f32
133 ; SI-DAG: v_mul_f32
134 define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
135 entry:
136 %0 = fdiv arcp <2 x float> %a, %b
137 store <2 x float> %0, <2 x float> addrspace(1)* %out
147 ; SI: v_rcp_f32
148 ; SI: v_rcp_f32
149 define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
150 entry:
151 %fdiv = fdiv arcp <2 x float> %a, %b
152 store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
138153 ret void
139154 }
140155
148163 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
149164 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
150165
151 ; UNSAFE-FP: v_rcp_f32_e32
152 ; UNSAFE-FP: v_rcp_f32_e32
153 ; UNSAFE-FP: v_rcp_f32_e32
154 ; UNSAFE-FP: v_rcp_f32_e32
155 ; UNSAFE-FP: v_mul_f32_e32
156 ; UNSAFE-FP: v_mul_f32_e32
157 ; UNSAFE-FP: v_mul_f32_e32
158 ; UNSAFE-FP: v_mul_f32_e32
159
160 ; SI-DAG: v_rcp_f32
161 ; SI-DAG: v_mul_f32
162 ; SI-DAG: v_rcp_f32
163 ; SI-DAG: v_mul_f32
164 ; SI-DAG: v_rcp_f32
165 ; SI-DAG: v_mul_f32
166 ; SI-DAG: v_rcp_f32
167 ; SI-DAG: v_mul_f32
168
169 ; I754: v_div_scale_f32
170 ; I754: v_div_scale_f32
171 ; I754: v_div_scale_f32
172 ; I754: v_div_scale_f32
173 ; I754: v_div_scale_f32
174 ; I754: v_div_scale_f32
175 ; I754: v_div_scale_f32
176 ; I754: v_div_scale_f32
177 ; I754: v_div_fixup_f32
178 ; I754: v_div_fixup_f32
179 ; I754: v_div_fixup_f32
180 ; I754: v_div_fixup_f32
181 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
166 ; SI: v_div_fixup_f32
167 ; SI: v_div_fixup_f32
168 ; SI: v_div_fixup_f32
169 ; SI: v_div_fixup_f32
170 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
182171 %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
183172 %a = load <4 x float>, <4 x float> addrspace(1) * %in
184173 %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
197186 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
198187 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
199188
200 ; UNSAFE-FP: v_rcp_f32_e32
201 ; UNSAFE-FP: v_rcp_f32_e32
202 ; UNSAFE-FP: v_rcp_f32_e32
203 ; UNSAFE-FP: v_rcp_f32_e32
204 ; UNSAFE-FP: v_mul_f32_e32
205 ; UNSAFE-FP: v_mul_f32_e32
206 ; UNSAFE-FP: v_mul_f32_e32
207 ; UNSAFE-FP: v_mul_f32_e32
208
209 ; SI-DAG: v_rcp_f32
210 ; SI-DAG: v_mul_f32
211 ; SI-DAG: v_rcp_f32
212 ; SI-DAG: v_mul_f32
213 ; SI-DAG: v_rcp_f32
214 ; SI-DAG: v_mul_f32
215 ; SI-DAG: v_rcp_f32
216 ; SI-DAG: v_mul_f32
217 define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
189 ; SI: v_rcp_f32
190 ; SI: v_rcp_f32
191 ; SI: v_rcp_f32
192 ; SI: v_rcp_f32
193 define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
218194 %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
219195 %a = load <4 x float>, <4 x float> addrspace(1) * %in
220196 %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
233209 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
234210 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
235211
236 ; UNSAFE-FP: v_rcp_f32_e32
237 ; UNSAFE-FP: v_rcp_f32_e32
238 ; UNSAFE-FP: v_rcp_f32_e32
239 ; UNSAFE-FP: v_rcp_f32_e32
240 ; UNSAFE-FP: v_mul_f32_e32
241 ; UNSAFE-FP: v_mul_f32_e32
242 ; UNSAFE-FP: v_mul_f32_e32
243 ; UNSAFE-FP: v_mul_f32_e32
244
245 ; SI-DAG: v_rcp_f32
246 ; SI-DAG: v_mul_f32
247 ; SI-DAG: v_rcp_f32
248 ; SI-DAG: v_mul_f32
249 ; SI-DAG: v_rcp_f32
250 ; SI-DAG: v_mul_f32
251 ; SI-DAG: v_rcp_f32
252 ; SI-DAG: v_mul_f32
253 define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
212 ; SI: v_rcp_f32
213 ; SI: v_rcp_f32
214 ; SI: v_rcp_f32
215 ; SI: v_rcp_f32
216 define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
254217 %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
255218 %a = load <4 x float>, <4 x float> addrspace(1) * %in
256219 %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
258221 store <4 x float> %result, <4 x float> addrspace(1)* %out
259222 ret void
260223 }
224
225 attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals" }
226 attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals" }
227 attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals" }
228
229 !0 = !{float 2.500000e+00}
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
1
2 declare float @llvm.amdgcn.fdiv.fast(float, float) #0
3
4 ; CHECK-LABEL: {{^}}test_fdiv_fast:
5 ; CHECK: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
6 ; CHECK: v_mul_f32_e32
7 ; CHECK: v_rcp_f32_e32
8 ; CHECK: v_mul_f32_e32
9 ; CHECK: v_mul_f32_e32
10 define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 {
11 %fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
12 store float %fdiv, float addrspace(1)* %out
13 ret void
14 }
15
16 attributes #0 = { nounwind readnone }
17 attributes #1 = { nounwind }
None ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
13 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
24
3 ; FIXME: Evergreen only ever does unsafe fp math.
45 ; FUNC-LABEL: {{^}}rcp_pat_f32:
6 ; GCN: s_load_dword [[SRC:s[0-9]+]]
7 ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
8 ; GCN: buffer_store_dword [[RCP]]
9
510 ; EG: RECIP_IEEE
6 define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
11 define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
712 %rcp = fdiv float 1.0, %src
813 store float %rcp, float addrspace(1)* %out, align 4
914 ret void
1015 }
16
17 ; FUNC-LABEL: {{^}}rcp_ulp25_pat_f32:
18 ; GCN: s_load_dword [[SRC:s[0-9]+]]
19 ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
20 ; GCN: buffer_store_dword [[RCP]]
21
22 ; EG: RECIP_IEEE
23 define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
24 %rcp = fdiv float 1.0, %src, !fpmath !0
25 store float %rcp, float addrspace(1)* %out, align 4
26 ret void
27 }
28
29 ; FUNC-LABEL: {{^}}rcp_fast_ulp25_pat_f32:
30 ; GCN: s_load_dword [[SRC:s[0-9]+]]
31 ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
32 ; GCN: buffer_store_dword [[RCP]]
33
34 ; EG: RECIP_IEEE
35 define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
36 %rcp = fdiv fast float 1.0, %src, !fpmath !0
37 store float %rcp, float addrspace(1)* %out, align 4
38 ret void
39 }
40
41 ; FUNC-LABEL: {{^}}rcp_arcp_ulp25_pat_f32:
42 ; GCN: s_load_dword [[SRC:s[0-9]+]]
43 ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
44 ; GCN: buffer_store_dword [[RCP]]
45
46 ; EG: RECIP_IEEE
47 define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
48 %rcp = fdiv arcp float 1.0, %src, !fpmath !0
49 store float %rcp, float addrspace(1)* %out, align 4
50 ret void
51 }
52
53 ; FUNC-LABEL: {{^}}rcp_global_fast_ulp25_pat_f32:
54 ; GCN: s_load_dword [[SRC:s[0-9]+]]
55 ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
56 ; GCN: buffer_store_dword [[RCP]]
57
58 ; EG: RECIP_IEEE
59 define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 {
60 %rcp = fdiv float 1.0, %src, !fpmath !0
61 store float %rcp, float addrspace(1)* %out, align 4
62 ret void
63 }
64
65 ; FUNC-LABEL: {{^}}rcp_fabs_pat_f32:
66 ; GCN: s_load_dword [[SRC:s[0-9]+]]
67 ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], |[[SRC]]|
68 ; GCN: buffer_store_dword [[RCP]]
69
70 ; EG: RECIP_IEEE
71 define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 {
72 %src.fabs = call float @llvm.fabs.f32(float %src)
73 %rcp = fdiv float 1.0, %src.fabs
74 store float %rcp, float addrspace(1)* %out, align 4
75 ret void
76 }
77
78 ; FIXME: fneg folded into constant 1
79 ; FUNC-LABEL: {{^}}rcp_fabs_fneg_pat_f32:
80 define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 {
81 %src.fabs = call float @llvm.fabs.f32(float %src)
82 %src.fabs.fneg = fsub float -0.0, %src.fabs
83 %rcp = fdiv float 1.0, %src.fabs.fneg
84 store float %rcp, float addrspace(1)* %out, align 4
85 ret void
86 }
87
88
89 declare float @llvm.fabs.f32(float) #1
90
91 attributes #0 = { nounwind "unsafe-fp-math"="false" }
92 attributes #1 = { nounwind readnone }
93 attributes #2 = { nounwind "unsafe-fp-math"="true" }
94
95 !0 = !{float 2.500000e+00}
test/CodeGen/AMDGPU/reciprocal.ll (deleted: +0, -13)
None ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
1
2 ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
3
4 define amdgpu_ps void @test(<4 x float> inreg %reg0) {
5 %r0 = extractelement <4 x float> %reg0, i32 0
6 %r1 = fdiv float 1.0, %r0
7 %vec = insertelement <4 x float> undef, float %r1, i32 0
8 call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
9 ret void
10 }
11
12 declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)