llvm.org GIT mirror: llvm / 16be511

[AMDGPU] fcanonicalize elimination optimization

We are using multiplication by 1.0 to flush denormals and quiet sNaNs. It is
possible to omit this multiplication if the source of the fcanonicalize
instruction is known to be already flushed/quieted, i.e. if it comes from
another instruction known to do the normalization and we are using IEEE mode
to quiet sNaNs.

Differential Revision: https://reviews.llvm.org/D35218

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@307848 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Stanislav Mekhanoshin

2 changed files with 574 additions and 10 deletions.
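For illustration, a minimal sketch of the pattern this combine eliminates (hypothetical kernel name, not part of the patch): fadd already flushes denormals and, in IEEE mode, quiets sNaNs, so the canonicalize of its result is a no-op and no multiply by 1.0 has to be emitted for it.

; Compiled with e.g. llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals.
; Before this patch the canonicalize lowered to v_mul_f32 ..., 1.0, ...;
; after it, the call folds to %v and only the v_add_f32 remains.
define amdgpu_kernel void @sketch_fold_canonicalize_fadd(float addrspace(1)* %p) {
  %x = load float, float addrspace(1)* %p, align 4
  %v = fadd float %x, 2.0
  %canonicalized = call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %p, align 4
  ret void
}
declare float @llvm.canonicalize.f32(float)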
Modified: lib/Target/AMDGPU/SIISelLowering.cpp

@@ ... @@
   return SDValue();
 }
 
+static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
+  if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
+    return true;
+
+  return DAG.isKnownNeverNaN(Op);
+}
+
+static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
+                            unsigned MaxDepth = 5) {
+  // If the source is a result of another standard FP operation it is already
+  // in canonical form.
+
+  switch (Op.getOpcode()) {
+  default:
+    break;
+
+  // These will flush denorms if required.
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FSQRT:
+  case ISD::FCEIL:
+  case ISD::FFLOOR:
+  case ISD::FMA:
+  case ISD::FMAD:
+
+  case ISD::FCANONICALIZE:
+    return true;
+
+  case ISD::FP_ROUND:
+    return Op.getValueType().getScalarType() != MVT::f16 ||
+           ST->hasFP16Denormals();
+
+  case ISD::FP_EXTEND:
+    return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
+           ST->hasFP16Denormals();
+
+  case ISD::FP16_TO_FP:
+  case ISD::FP_TO_FP16:
+    return ST->hasFP16Denormals();
+
+  // These can/will be lowered or combined as a bit operation, so we need to
+  // check their input recursively.
+  case ISD::FNEG:
+  case ISD::FABS:
+    return (MaxDepth > 0) &&
+           isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1);
+
+  case ISD::FSIN:
+  case ISD::FCOS:
+  case ISD::FSINCOS:
+    return Op.getValueType().getScalarType() != MVT::f16;
+
+  // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms, so for
+  // such targets we need to check their inputs recursively.
+  // TODO: on GFX9+ we could return true without checking the inputs, provided
+  // no-nan mode is enabled, since canonicalization is also used to quiet
+  // sNaNs.
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+  case ISD::FMINNAN:
+  case ISD::FMAXNAN:
+    return (MaxDepth > 0) &&
+           isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) &&
+           isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1);
+
+  case ISD::ConstantFP: {
+    auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
+    return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
+  }
+  }
+  return false;
+}
+
 // Constant fold canonicalize.
 SDValue SITargetLowering::performFCanonicalizeCombine(
   SDNode *N,
   DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
   ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
-  if (!CFP)
+
+  if (!CFP) {
+    SDValue N0 = N->getOperand(0);
+
+    bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+
+    if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
+        isCanonicalized(N0, getSubtarget()))
+      return N0;
+
     return SDValue();
-
-  SelectionDAG &DAG = DCI.DAG;
+  }
+
   const APFloat &C = CFP->getValueAPF();
 
   // Flush denormals to 0 if not enabled.
@@ ... @@
 
   SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
   return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
-}
-
-static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
-  if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
-    return true;
-
-  return DAG.isKnownNeverNaN(Op);
 }
 
 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
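A subtlety worth spelling out before the tests: for non-IEEE calling conventions such as amdgpu_ps the IsIEEEMode check fails, but isKnownNeverSNan is trivially true whenever FP exceptions are disabled, so the combine can still fire there. This is why test_no_fold_canonicalize_fmul_value_f32_no_ieee below checks for the 1.0 multiply only under the GCN-EXCEPT (+fp-exceptions) run. A hedged sketch (hypothetical function name, assuming FP exceptions are disabled):

; With FP exceptions off, the canonicalize below folds away even though
; amdgpu_ps does not enable the IEEE bit; with +fp-exceptions it survives
; as v_mul_f32 ..., 1.0, ... unless the fmul carries the nnan flag.
define amdgpu_ps float @sketch_canonicalize_non_ieee(float %arg) {
  %v = fmul float %arg, 2.0
  %canonicalized = call float @llvm.canonicalize.f32(float %v)
  ret float %canonicalized
}
declare float @llvm.canonicalize.f32(float)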
New test file (all lines added):

; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GCN-FLUSH %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-EXCEPT -check-prefix=VI -check-prefix=GCN-FLUSH %s
; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s
; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GCN-FLUSH %s

; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %v = load float, float addrspace(1)* %gep, align 4
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32:
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fmul float %load, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fsub float 15.0, %load
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fadd float %load, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.sqrt.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fceil_value_f32:
; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.ceil.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_floor_value_f32:
; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.floor.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX9-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32:
; GCN: flat_load_dword [[LOAD:v[0-9]+]],
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.canonicalize.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fpext float %load to double
  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
  %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
  store double %canonicalized, double addrspace(1)* %gep2, align 8
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %load = load half, half addrspace(1)* %gep, align 2
  %v = fpext half %load to float
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
  store float %canonicalized, float addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
  %load = load double, double addrspace(1)* %gep, align 8
  %v = fptrunc double %load to float
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
  store float %canonicalized, float addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fptrunc float %load to half
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
  store half %canonicalized, half addrspace(1)* %gep2, align 2
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %v = fptrunc <2 x float> %load to <2 x half>
  %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v)
  %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id
  store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fsub float -0.0, %load
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fneg_value_f32:
; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = fsub float -0.0, %v0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.fabs.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fabs_value_f32:
; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = tail call float @llvm.fabs.f32(float %v0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_sin_value_f32:
; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.sin.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_cos_value_f32:
; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.cos.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_sin_value_f16:
; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %load = load half, half addrspace(1)* %gep, align 2
  %v = tail call half @llvm.sin.f16(half %load)
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  store half %canonicalized, half addrspace(1)* %gep, align 2
  ret void
}

; GCN-LABEL: test_fold_canonicalize_cos_value_f16:
; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %load = load half, half addrspace(1)* %gep, align 2
  %v = tail call half @llvm.cos.f16(half %load)
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  store half %canonicalized, half addrspace(1)* %gep, align 2
  ret void
}

; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32:
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32:
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = tail call float @llvm.minnum.f32(float %v0, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7f800001, v{{[0-9]+}}
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float))
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.maxnum.f32(float %load, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32:
; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = tail call float @llvm.maxnum.f32(float %v0, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64:
; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
  %load = load double, double addrspace(1)* %gep, align 8
  %v0 = fadd double %load, 0.0
  %v = tail call double @llvm.maxnum.f64(double %v0, double 0.0)
  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
  store double %canonicalized, double addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: test_no_fold_canonicalize_fmul_value_f32_no_ieee:
; GCN-EXCEPT: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
define amdgpu_ps float @test_no_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
entry:
  %v = fmul float %arg, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  ret float %canonicalized
}

; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-NEXT: ; return
; GCN-NOT: 1.0
define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) {
entry:
  %v = fmul nnan float %arg, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  ret float %canonicalized
}

declare float @llvm.canonicalize.f32(float) #0
declare double @llvm.canonicalize.f64(double) #0
declare half @llvm.canonicalize.f16(half) #0
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.sqrt.f32(float) #0
declare float @llvm.ceil.f32(float) #0
declare float @llvm.floor.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.sin.f32(float) #0
declare float @llvm.cos.f32(float) #0
declare half @llvm.sin.f16(half) #0
declare half @llvm.cos.f16(half) #0
declare float @llvm.minnum.f32(float, float) #0
declare float @llvm.maxnum.f32(float, float) #0
declare double @llvm.maxnum.f64(double, double) #0

attributes #0 = { nounwind readnone }