llvm.org GIT mirror llvm / f0fa788
AMDGPU: Fix implementation of isCanonicalized. If denormals are enabled, denormals are canonical. Also fix a few other issues. minnum/maxnum are supposed to canonicalize. Temporarily improve the workaround for the instruction behavior change in gfx9. Handle selects and fcopysign. The tests were also largely broken, since they were checking for a flush used on some targets after the store of the result. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@339061 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 1 year, 6 months ago
3 changed file(s) with 338 addition(s) and 117 deletion(s). Raw diff Collapse all Expand all
67446744 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
67456745 }
67466746
6747 static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
6748 const GCNSubtarget *ST, unsigned MaxDepth=5) {
6747 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
6748 unsigned MaxDepth) const {
6749 unsigned Opcode = Op.getOpcode();
6750 if (Opcode == ISD::FCANONICALIZE)
6751 return true;
6752
6753 if (auto *CFP = dyn_cast(Op)) {
6754 auto F = CFP->getValueAPF();
6755 if (F.isNaN() && F.isSignaling())
6756 return false;
6757 return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
6758 }
6759
67496760 // If source is a result of another standard FP operation it is already in
67506761 // canonical form.
6751
6752 switch (Op.getOpcode()) {
6753 default:
6754 break;
6755
6762 if (MaxDepth == 0)
6763 return false;
6764
6765 switch (Opcode) {
67566766 // These will flush denorms if required.
67576767 case ISD::FADD:
67586768 case ISD::FSUB:
67596769 case ISD::FMUL:
6760 case ISD::FSQRT:
67616770 case ISD::FCEIL:
67626771 case ISD::FFLOOR:
67636772 case ISD::FMA:
67646773 case ISD::FMAD:
6765
6766 case ISD::FCANONICALIZE:
6774 case ISD::FSQRT:
6775 case ISD::FDIV:
6776 case ISD::FREM:
6777 case AMDGPUISD::FMUL_LEGACY:
6778 case AMDGPUISD::FMAD_FTZ:
67676779 return true;
6768
67696780 case ISD::FP_ROUND:
67706781 return Op.getValueType().getScalarType() != MVT::f16 ||
6771 ST->hasFP16Denormals();
6782 Subtarget->hasFP16Denormals();
67726783
67736784 case ISD::FP_EXTEND:
67746785 return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
6775 ST->hasFP16Denormals();
6786 Subtarget->hasFP16Denormals();
67766787
67776788 // It can/will be lowered or combined as a bit operation.
67786789 // Need to check their input recursively to handle.
67796790 case ISD::FNEG:
67806791 case ISD::FABS:
6781 return (MaxDepth > 0) &&
6782 isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
6792 case ISD::FCOPYSIGN:
6793 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
67836794
67846795 case ISD::FSIN:
67856796 case ISD::FCOS:
67866797 case ISD::FSINCOS:
67876798 return Op.getValueType().getScalarType() != MVT::f16;
67886799
6789 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
6790 // For such targets need to check their input recursively.
67916800 case ISD::FMINNUM:
6792 case ISD::FMAXNUM:
6793 case ISD::FMINNAN:
6794 case ISD::FMAXNAN:
6795
6796 if (ST->supportsMinMaxDenormModes() &&
6797 DAG.isKnownNeverNaN(Op.getOperand(0)) &&
6798 DAG.isKnownNeverNaN(Op.getOperand(1)))
6799 return true;
6800
6801 return (MaxDepth > 0) &&
6802 isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
6803 isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
6804
6805 case ISD::ConstantFP: {
6806 auto F = cast(Op)->getValueAPF();
6807 return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
6808 }
6809 }
6810 return false;
6801 case ISD::FMAXNUM: {
6802 // FIXME: Shouldn't treat the generic operations differently based on these.
6803 bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
6804 if (IsIEEEMode) {
6805 // snans will be quieted, so we only need to worry about denormals.
6806 if (Subtarget->supportsMinMaxDenormModes() ||
6807 denormalsEnabledForType(Op.getValueType()))
6808 return true;
6809
6810 // Flushing may be required.
6811 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
6812 // targets need to check their input recursively.
6813 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
6814 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
6815 }
6816
6817 if (Subtarget->supportsMinMaxDenormModes() ||
6818 denormalsEnabledForType(Op.getValueType())) {
6819 // Only quieting may be necessary.
6820 return DAG.isKnownNeverSNaN(Op.getOperand(0)) &&
6821 DAG.isKnownNeverSNaN(Op.getOperand(1));
6822 }
6823
6824 // Flushing and quieting may be necessary
6825 // With ieee_mode off, the nan is returned as-is, so if it is an sNaN it
6826 // needs to be quieted.
6827 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
6828 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
6829 }
6830 case ISD::SELECT: {
6831 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
6832 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
6833 }
6834 default:
6835 return denormalsEnabledForType(Op.getValueType()) &&
6836 DAG.isKnownNeverSNaN(Op);
6837 }
6838
6839 llvm_unreachable("invalid operation");
68116840 }
68126841
68136842 // Constant fold canonicalize.
68276856 ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0);
68286857 if (!CFP) {
68296858 SDValue N0 = N->getOperand(0);
6830 EVT VT = N0.getValueType().getScalarType();
6831 auto ST = getSubtarget();
6832
6833 if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
6834 (VT == MVT::f64 && ST->hasFP64Denormals()) ||
6835 (VT == MVT::f16 && ST->hasFP16Denormals())) &&
6836 DAG.isKnownNeverNaN(N0))
6837 return N0;
6838
6839 bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
6840
6841 if ((IsIEEEMode || DAG.isKnownNeverSNaN(N0)) &&
6842 isCanonicalized(DAG, N0, ST))
6843 return N0;
6844
6845 return SDValue();
6859 return isCanonicalized(DAG, N0) ? N0 : SDValue();
68466860 }
68476861
68486862 const APFloat &C = CFP->getValueAPF();
84618475 }
84628476 return false;
84638477 }
8478
8479 bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
8480 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
8481 case MVT::f32:
8482 return Subtarget->hasFP32Denormals();
8483 case MVT::f64:
8484 return Subtarget->hasFP64Denormals();
8485 case MVT::f16:
8486 return Subtarget->hasFP16Denormals();
8487 default:
8488 return false;
8489 }
8490 }
322322
323323 bool isSDNodeSourceOfDivergence(const SDNode *N,
324324 FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override;
325
326 bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
327 unsigned MaxDepth = 5) const;
328 bool denormalsEnabledForType(EVT VT) const;
325329 };
326330
327331 } // End namespace llvm
None ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-FLUSH %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,GCN-FLUSH %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH %s
0 ; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,VI-FLUSH,GCN-FLUSH %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM,GCN-NOEXCEPT %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM,GCN-NOEXCEPT %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s
45
56 ; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
67 ; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
2829 ret void
2930 }
3031
32 ; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_legacy_value_f32:
33 ; GCN: v_mul_legacy_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
34 ; GCN-NOT: v_mul
35 ; GCN-NOT: v_max
36 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
37 define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(float addrspace(1)* %arg) {
38 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
39 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
40 %load = load float, float addrspace(1)* %gep, align 4
41 %v = call float @llvm.amdgcn.fmul.legacy(float %load, float 15.0)
42 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
43 store float %canonicalized, float addrspace(1)* %gep, align 4
44 ret void
45 }
46
3147 ; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
3248 ; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
33 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
34 ; GCN-NOT: 1.0
49 ; GCN-NOT: v_mul
50 ; GCN-NOT: v_max
51 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
3552 define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) {
3653 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
3754 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
4461
4562 ; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
4663 ; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
47 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
48 ; GCN-NOT: 1.0
64 ; GCN-NOT: v_mul
65 ; GCN-NOT: v_max
66 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
4967 define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) {
5068 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
5169 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
5876
5977 ; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
6078 ; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
61 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
62 ; GCN-NOT: 1.0
79 ; GCN-NOT: v_mul
80 ; GCN-NOT: v_max
81 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
6382 define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) {
6483 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
6584 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
7291
7392 ; GCN-LABEL: test_fold_canonicalize_fceil_value_f32:
7493 ; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
75 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
76 ; GCN-NOT: 1.0
94 ; GCN-NOT: v_mul
95 ; GCN-NOT: v_max
96 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
7797 define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) {
7898 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
7999 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
86106
87107 ; GCN-LABEL: test_fold_canonicalize_floor_value_f32:
88108 ; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
89 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
90 ; GCN-NOT: 1.0
109 ; GCN-NOT: v_mul
110 ; GCN-NOT: v_max
111 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
91112 define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) {
92113 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
93114 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
100121
101122 ; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
102123 ; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
103 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
104 ; GCN-NOT: 1.0
124 ; GCN-NOT: v_mul
125 ; GCN-NOT: v_max
126 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
105127 define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) {
106128 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
107129 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
108130 %load = load float, float addrspace(1)* %gep, align 4
109131 %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0)
132 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
133 store float %canonicalized, float addrspace(1)* %gep, align 4
134 ret void
135 }
136
137 ; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
138 ; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
139 ; GCN-NOT: v_mul
140 ; GCN-NOT: v_max
141 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
142 define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(float addrspace(1)* %arg) {
143 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
144 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
145 %load = load float, float addrspace(1)* %gep, align 4
146 %v = call float @llvm.amdgcn.fmad.ftz.f32(float %load, float 15.0, float 15.0)
110147 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
111148 store float %canonicalized, float addrspace(1)* %gep, align 4
112149 ret void
115152 ; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
116153 ; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
117154 ; GFX9-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
118 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
119 ; GCN-NOT: 1.0
155 ; GCN-NOT: v_mul
156 ; GCN-NOT: v_max
157 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}],
120158 define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
121159 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
122160 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
131169 ; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]],
132170 ; GCN-FLUSH: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
133171 ; GCN-DENORM: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]]
134 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
135 ; GCN-NOT: 1.0
172 ; GCN-NOT: v_mul
173 ; GCN-NOT: v_max
174 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
136175 define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
137176 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
138177 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
145184
146185 ; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
147186 ; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
187 ; GCN-NOT: v_mul
188 ; GCN-NOT: v_max
148189 ; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
149 ; GCN-NOT: 1.0
150190 define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) {
151191 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
152192 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
160200
161201 ; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
162202 ; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
163 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
164 ; GCN-NOT: 1.0
203 ; GCN-NOT: v_mul
204 ; GCN-NOT: v_max
205 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
165206 define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) {
166207 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
167208 %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
175216
176217 ; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
177218 ; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
178 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
179 ; GCN-NOT: 1.0
219 ; GCN-NOT: v_mul
220 ; GCN-NOT: v_max
221 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
180222 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) {
181223 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
182224 %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
210252 ; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
211253 ; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
212254 ; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
213 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
214 ; GCN-NOT: 1.0
255 ; GCN-NOT: v_mul
256 ; GCN-NOT: v_max
257 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
215258 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
216259 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
217260 %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
238281
239282 ; GCN-LABEL: test_fold_canonicalize_fneg_value_f32:
240283 ; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
241 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
242 ; GCN-NOT: 1.0
284 ; GCN-NOT: v_mul
285 ; GCN-NOT: v_max
286 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
243287 define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
244288 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
245289 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
264308 ret void
265309 }
266310
311 ; GCN-LABEL: test_no_fold_canonicalize_fcopysign_value_f32:
312 ; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
313 ; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
314 ; GCN-NOT: v_mul_
315 ; GCN-NOT: v_max_
316 define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(float addrspace(1)* %arg, float %sign) {
317 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
318 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
319 %load = load float, float addrspace(1)* %gep, align 4
320 %canon.load = tail call float @llvm.canonicalize.f32(float %load)
321 %copysign = call float @llvm.copysign.f32(float %canon.load, float %sign)
322 %v = tail call float @llvm.fabs.f32(float %load)
323 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
324 store float %canonicalized, float addrspace(1)* %gep, align 4
325 ret void
326 }
327
267328 ; GCN-LABEL: test_fold_canonicalize_fabs_value_f32:
268329 ; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
269 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
270 ; GCN-NOT: 1.0
330 ; GCN-NOT: v_mul
331 ; GCN-NOT: v_max
332 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
271333 define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
272334 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
273335 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
281343
282344 ; GCN-LABEL: test_fold_canonicalize_sin_value_f32:
283345 ; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
284 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
285 ; GCN-NOT: 1.0
346 ; GCN-NOT: v_mul
347 ; GCN-NOT: v_max
348 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
286349 define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) {
287350 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
288351 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
295358
296359 ; GCN-LABEL: test_fold_canonicalize_cos_value_f32:
297360 ; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
298 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
299 ; GCN-NOT: 1.0
361 ; GCN-NOT: v_mul
362 ; GCN-NOT: v_max
363 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
300364 define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) {
301365 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
302366 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
310374 ; GCN-LABEL: test_fold_canonicalize_sin_value_f16:
311375 ; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
312376 ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
377 ; GCN-NOT: v_mul
378 ; GCN-NOT: v_max
313379 ; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
314 ; GCN-NOT: 1.0
315380 define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) {
316381 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
317382 %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
325390 ; GCN-LABEL: test_fold_canonicalize_cos_value_f16:
326391 ; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
327392 ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
393 ; GCN-NOT: v_mul
394 ; GCN-NOT: v_max
328395 ; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
329 ; GCN-NOT: 1.0
330396 define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) {
331397 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
332398 %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
339405
340406 ; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32:
341407 ; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
342 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
343 ; GCN-NOT: 1.0
408 ; GCN-NOT: v_mul
409 ; GCN-NOT: v_max
410 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
344411 define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) {
345412 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
346413 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
349416 ret void
350417 }
351418
352 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32:
353 ; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
354 ; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
419 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode:
420 ; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
421 ; GFX9-NOT: v_max
422 ; GFX9-NOT: v_mul
423
424 ; VI-DENORM-NOT: v_max_f32
425 ; VI-DENORM-NOT: v_mul_f32
426
427 ; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
428
355429 ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
356 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) {
430 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
431 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
432 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
433 %load = load float, float addrspace(1)* %gep, align 4
434 %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
435 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
436 store float %canonicalized, float addrspace(1)* %gep, align 4
437 ret void
438 }
439
440 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
441 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
442
443 ; GFX9-NOT: v_max
444 ; GFX9-NOT: v_mul
445
446
447 ; VI-DENORM-NOT: v_max
448 ; VI-DENORM-NOT: v_mul
449 ; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
450
451 ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}]
452 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(float addrspace(1)* %arg) #1 {
357453 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
358454 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
359455 %load = load float, float addrspace(1)* %gep, align 4
365461
366462 ; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
367463 ; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
368 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
369 ; GCN-NOT: 1.0
464 ; GCN-NOT: v_mul
465 ; GCN-NOT: v_max
466 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
370467 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) {
371468 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
372469 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
381478 ; FIXME: Should there be more checks here? minnum with NaN operand is simplified away.
382479
383480 ; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
384 ; VI: v_add_u32_e32 v{{[0-9]+}}
385 ; GFX9: v_add_co_u32_e32 v{{[0-9]+}}
386 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}]
481 ; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]]
482 ; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]]
483 ; GCN-DENORM: v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]]
387484 define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
388485 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
389486 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
396493
397494 ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
398495 ; GFX9: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
399 ; VI: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
400 ; VI: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
496
497 ; VI-FLUSH: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
498 ; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
499
500 ; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
501
502
503 ; GCN-NOT: v_mul
504 ; GCN-NOT: v_max
401505 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
402 ; GFX9-NOT: 1.0
403506 define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
404507 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
405508 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
410513 ret void
411514 }
412515
413 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
516 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
414517 ; GFX9: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
415 ; VI: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
416 ; VI: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
518 ; VI-FLUSH: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
519 ; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
520
521 ; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
522
523 ; GCN-NOT: v_mul
524 ; GCN-NOT: v_max
417525 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
418 ; GFX9-NOT: 1.0
419 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) {
526 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
420527 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
421528 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
422529 %load = load float, float addrspace(1)* %gep, align 4
428535
429536 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32:
430537 ; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
431 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
432 ; GCN-NOT: 1.0
538 ; GCN-NOT: v_max
539 ; GCN-NOT: v_mul
540 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
433541 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) {
434542 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
435543 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
443551
444552 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64:
445553 ; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
554 ; GCN-NOT: v_mul
555 ; GCN-NOT: v_max
446556 ; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
447 ; GCN-NOT: 1.0
448557 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) {
449558 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
450559 %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
456565 ret void
457566 }
458567
459 ; GCN-LABEL: test_no_fold_canonicalize_fmul_value_f32_no_ieee:
568 ; GCN-LABEL: test_fold_canonicalize_fmul_value_f32_no_ieee:
460569 ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
461570 ; GCN-NOT: v_mul
462571 ; GCN-NOT: v_max
463572 ; GCN-NEXT: ; return
464 define amdgpu_ps float @test_no_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
573 define amdgpu_ps float @test_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
465574 entry:
466575 %v = fmul float %arg, 15.0
467576 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
470579
471580 ; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
472581 ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
582 ; GCN-NOT: v_mul
583 ; GCN-NOT: v_max
473584 ; GCN-NEXT: ; return
474 ; GCN-NOT: 1.0
475585 define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) {
476586 entry:
477587 %v = fmul nnan float %arg, 15.0
588 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
589 ret float %canonicalized
590 }
591
592 ; GCN-LABEL: {{^}}test_fold_canonicalize_fdiv_value_f32_no_ieee:
593 ; GCN: v_div_fixup_f32
594 ; GCN-NOT: v_max
595 ; GCN-NOT: v_mul
596 ; GCN: ; return
597 define amdgpu_ps float @test_fold_canonicalize_fdiv_value_f32_no_ieee(float %arg0) {
598 entry:
599 %v = fdiv float 15.0, %arg0
478600 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
479601 ret float %canonicalized
480602 }
497619 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64
498620 ; GCN: {{flat|global}}_load_dwordx2 [[V:v\[[0-9:]+\]]],
499621 ; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
500 ; GCN-NOT: 1.0
622 ; GCN-NOT: v_mul_
623 ; GCN-NOT: v_max_
501624 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 {
502625 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
503626 %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
510633
511634 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
512635 ; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]],
513 ; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
514 ; GCN-NOT: 1.0
636 ; GCN-NOT: v_mul
637 ; GCN-NOT: v_max
638 ; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V]]
515639 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 {
516640 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
517641 %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
522646 ret void
523647 }
524648
649 ; GCN-LABEL: {{^}}test_fold_canonicalize_select_value_f32:
650 ; GCN: v_add_f32
651 ; GCN: v_add_f32
652 ; GCN: v_cndmask_b32
653 ; GCN-NOT: v_mul_
654 ; GCN-NOT: v_max_
655 define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(float addrspace(1)* %arg) {
656 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
657 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
658 %load0 = load volatile float, float addrspace(1)* %gep, align 4
659 %load1 = load volatile float, float addrspace(1)* %gep, align 4
660 %load2 = load volatile i32, i32 addrspace(1)* undef, align 4
661 %v0 = fadd float %load0, 15.0
662 %v1 = fadd float %load1, 32.0
663 %cond = icmp eq i32 %load2, 0
664 %select = select i1 %cond, float %v0, float %v1
665 %canonicalized = tail call float @llvm.canonicalize.f32(float %select)
666 store float %canonicalized, float addrspace(1)* %gep, align 4
667 ret void
668 }
669
670 ; Need to quiet the nan with a separate instruction since it will be
671 ; passed through the minnum.
672
673 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode:
674 ; GFX9: v_min_f32_e32 v0, v0, v1
675 ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
676 ; GFX9-DENORM-NEXT: v_max_f32_e32 v0, v0, v0
677 ; GFX9-NEXT: ; return to shader
678
679 ; VI: v_min_f32_e32 v0, v0, v1
680 ; VI-FLUSH: v_mul_f32_e32 v0, 1.0, v0
681 ; VI-DENORM: v_max_f32_e32 v0, v0, v0
682 define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) {
683 %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
684 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
685 ret float %canonicalized
686 }
687
688 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_ieee_mode:
689 ; GFX9: v_min_f32_e32 v0, v0, v1
690 ; GFX9-NEXT: s_setpc_b64
691
692 ; VI: v_min_f32_e32 v0, v0, v1
693 ; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
694 ; VI-NEXT: s_setpc_b64
695 define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) {
696 %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
697 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
698 ret float %canonicalized
699 }
700
701 ; Canonicalizing flush necessary pre-gfx9
702 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode_nnan:
703 ; GCN: v_min_f32_e32 v0, v0, v1
704 ; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
705 ; GCN-NEXT: ; return
706 define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode_nnan(float %arg0, float %arg1) #1 {
707 %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
708 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
709 ret float %canonicalized
710 }
711
525712 ; Avoid failing the test on FreeBSD11.0 which will match the GCN-NOT: 1.0
526713 ; in the .amd_amdgpu_isa "amdgcn-unknown-freebsd11.0--gfx802" directive
527714 ; CHECK: .amd_amdgpu_isa
528715
529716 declare float @llvm.canonicalize.f32(float) #0
717 declare float @llvm.copysign.f32(float, float) #0
718 declare float @llvm.amdgcn.fmul.legacy(float, float) #0
719 declare float @llvm.amdgcn.fmad.ftz.f32(float, float, float) #0
530720 declare double @llvm.canonicalize.f64(double) #0
531721 declare half @llvm.canonicalize.f16(half) #0
532722 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0