llvm.org GIT mirror llvm / 0a4c4f2
[AMDGPU] Use v_max_f* for fcanonicalize. If denormals are not flushed, we can use max instead of multiplication by 1. For double that is simply faster, while for float and half it is shorter, because the mul uses the constant bus and the VOP3 encoding. Differential Revision: https://reviews.llvm.org/D36856 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@312095 91177308-0d34-0410-b5e6-96231b3b80d8 Stanislav Mekhanoshin 2 years ago
5 changed file(s) with 144 addition(s) and 37 deletion(s). Raw diff Collapse all Expand all
4141 field bits<32> Inst = 0xffffffff;
4242 }
4343
44 def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">;
45 def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">;
46 def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">;
44 def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">;
45 def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">;
46 def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">;
47 def NoFP16Denormals : Predicate<"!Subtarget->hasFP16Denormals()">;
48 def NoFP32Denormals : Predicate<"!Subtarget->hasFP32Denormals()">;
49 def NoFP64Denormals : Predicate<"!Subtarget->hasFP64Denormals()">;
4750 def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
4851
4952 def InstFlag : OperandWithDefaultOps ;
12771277 // FIXME: defm : BFMPatterns ;
12781278 defm : BFEPattern ;
12791279
1280 let Predicates = [NoFP16Denormals] in {
12801281 def : Pat<
12811282 (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
12821283 (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0)
12831284 >;
1284
1285 }
1286
1287 let Predicates = [FP16Denormals] in {
1288 def : Pat<
1289 (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
1290 (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
1291 >;
1292 }
1293
1294 let Predicates = [NoFP32Denormals] in {
12851295 def : Pat<
12861296 (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
12871297 (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0)
12881298 >;
1289
1299 }
1300
1301 let Predicates = [FP32Denormals] in {
1302 def : Pat<
1303 (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
1304 (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src, 0, 0)
1305 >;
1306 }
1307
1308 let Predicates = [NoFP64Denormals] in {
12901309 def : Pat<
12911310 (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
12921311 (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
12931312 >;
1313 }
1314
1315 let Predicates = [FP64Denormals] in {
1316 def : Pat<
1317 (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
1318 (V_MAX_F64 $src_mods, $src, $src_mods, $src, 0, 0)
1319 >;
1320 }
12941321
12951322 def : Pat<
12961323 (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
33 ; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GCN-FLUSH %s
44
55 ; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
6 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
6 ; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
7 ; GFX9-DENORM: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
78 define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) {
89 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
910 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
128129
129130 ; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32:
130131 ; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]],
131 ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
132 ; GCN-FLUSH: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
133 ; GCN-DENORM: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]]
132134 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
133135 ; GCN-NOT: 1.0
134136 define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
222224 }
223225
224226 ; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
225 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}}
227 ; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}}
228 ; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
226229 define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
227230 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
228231 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
249252 }
250253
251254 ; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
252 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
255 ; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
256 ; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
253257 define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
254258 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
255259 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
376380
377381 ; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
378382 ; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7f800001, v{{[0-9]+}}
379 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
383 ; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
384 ; GCN-DENORM: v_max_f32_e32 v{{[0-9]+}}, [[V0]], [[V0]]
380385 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
381386 define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
382387 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
88
99
1010 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
11 ; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
11 ; GCN: v_max_f16_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
1212 ; GCN: buffer_store_short [[REG]]
1313 define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
1414 %val = load half, half addrspace(1)* %out
1818 }
1919
2020 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f16:
21 ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
21 ; GCN: v_max_f16_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
2222 ; GCN: buffer_store_short [[REG]]
2323 define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
2424 %val = bitcast i16 %val.arg to half
2828 }
2929
3030 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
31 ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}|
31 ; GCN: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}|
3232 ; GCN: buffer_store_short [[REG]]
3333 define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 {
3434 %val = load half, half addrspace(1)* %out
3939 }
4040
4141 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16:
42 ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}|
42 ; GCN: v_max_f16_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}|
4343 ; GCN: buffer_store_short [[REG]]
4444 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 {
4545 %val = load half, half addrspace(1)* %out
5151 }
5252
5353 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16:
54 ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}}
54 ; GCN: v_max_f16_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}}
5555 ; GCN: buffer_store_short [[REG]]
5656 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 {
5757 %val = load half, half addrspace(1)* %out
206206 }
207207
208208 ; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16:
209 ; VI: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
210 ; VI-DAG: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
211 ; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}}
209 ; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
210 ; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
212211 ; VI-NOT: v_and_b32
213212
214213 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
226225 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
227226 ; VI-DAG: v_bfe_u32
228227 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
229 ; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
230 ; VI: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
231 ; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
228 ; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
229 ; VI: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
232230 ; VI-NOT: 0xffff
233231 ; VI: v_or_b32
234232
246244 }
247245
248246 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
249 ; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
250247 ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
251 ; VI-DAG: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
252 ; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
248 ; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
249 ; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
253250 ; VI: v_or_b32
254251
255252 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
268265
269266 ; FIXME: Fold modifier
270267 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
271 ; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
272 ; VI-DAG: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
273 ; VI-DAG: v_mul_f16_sdwa [[REG1:v[0-9]+]], v[[CONST1]], [[FNEG]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
274 ; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]]
268 ; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
269 ; VI: v_lshrrev_b32_e32 [[FNEGHI:v[0-9]+]], 16, [[FNEG]]
270 ; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], [[FNEG]], [[FNEGHI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
271 ; VI-DAG: v_max_f16_e32 [[REG0:v[0-9]+]], [[FNEG]], [[FNEG]]
275272 ; VI-NOT: 0xffff
276273
277274 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
287284 }
288285
289286 ; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16:
290 ; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 0x3c00
291 ; VI: v_mul_f16_sdwa [[REG0:v[0-9]+]], [[ONE]], {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
292 ; VI: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
287 ; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
288 ; VI: v_max_f16_e64 [[REG1:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
293289 ; VI-NOT: v_and_b32
294290
295291 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}}
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11
22 declare float @llvm.fabs.f32(float) #0
33 declare float @llvm.canonicalize.f32(float) #0
44 declare double @llvm.fabs.f64(double) #0
55 declare double @llvm.canonicalize.f64(double) #0
6 declare half @llvm.canonicalize.f16(half) #0
7 declare i32 @llvm.amdgcn.workitem.id.x() #0
68
79 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f32:
810 ; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
202204 }
203205
204206 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f64:
205 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{v\[[0-9]+:[0-9]+\]}}
207 ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
206208 ; GCN: buffer_store_dwordx2 [[REG]]
207209 define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
208210 %val = load double, double addrspace(1)* %out
212214 }
213215
214216 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f64:
215 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{s\[[0-9]+:[0-9]+\]}}
217 ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
216218 ; GCN: buffer_store_dwordx2 [[REG]]
217219 define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
218220 %canonicalized = call double @llvm.canonicalize.f64(double %val)
221223 }
222224
223225 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64:
224 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, |{{v\[[0-9]+:[0-9]+\]}}|
226 ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], |{{v\[[0-9]+:[0-9]+\]}}|, |{{v\[[0-9]+:[0-9]+\]}}|
225227 ; GCN: buffer_store_dwordx2 [[REG]]
226228 define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 {
227229 %val = load double, double addrspace(1)* %out
232234 }
233235
234236 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64:
235 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]\]]], 1.0, -|{{v\[[0-9]+:[0-9]+\]}}|
237 ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]\]]], -|{{v\[[0-9]+:[0-9]+\]}}|, -|{{v\[[0-9]+:[0-9]+\]}}|
236238 ; GCN: buffer_store_dwordx2 [[REG]]
237239 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 {
238240 %val = load double, double addrspace(1)* %out
244246 }
245247
246248 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64:
247 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, -{{v\[[0-9]+:[0-9]+\]}}
249 ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -{{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}
248250 ; GCN: buffer_store_dwordx2 [[REG]]
249251 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 {
250252 %val = load double, double addrspace(1)* %out
414416 ret void
415417 }
416418
419 ; GCN-LABEL: {{^}}test_canonicalize_value_f64_flush:
420 ; GCN: v_mul_f64 v[{{[0-9:]+}}], 1.0, v[{{[0-9:]+}}]
421 define amdgpu_kernel void @test_canonicalize_value_f64_flush(double addrspace(1)* %arg, double addrspace(1)* %out) #4 {
422 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
423 %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
424 %v = load double, double addrspace(1)* %gep, align 8
425 %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
426 %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
427 store double %canonicalized, double addrspace(1)* %gep2, align 8
428 ret void
429 }
430
431 ; GCN-LABEL: {{^}}test_canonicalize_value_f32_flush:
432 ; GCN: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
433 define amdgpu_kernel void @test_canonicalize_value_f32_flush(float addrspace(1)* %arg, float addrspace(1)* %out) #4 {
434 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
435 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
436 %v = load float, float addrspace(1)* %gep, align 4
437 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
438 %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
439 store float %canonicalized, float addrspace(1)* %gep2, align 4
440 ret void
441 }
442
443 ; GCN-LABEL: {{^}}test_canonicalize_value_f16_flush:
444 ; GCN: v_mul_f16_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
445 define amdgpu_kernel void @test_canonicalize_value_f16_flush(half addrspace(1)* %arg, half addrspace(1)* %out) #4 {
446 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
447 %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
448 %v = load half, half addrspace(1)* %gep, align 2
449 %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
450 %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
451 store half %canonicalized, half addrspace(1)* %gep2, align 2
452 ret void
453 }
454
455 ; GCN-LABEL: {{^}}test_canonicalize_value_f64_denorm:
456 ; GCN: v_max_f64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
457 define amdgpu_kernel void @test_canonicalize_value_f64_denorm(double addrspace(1)* %arg, double addrspace(1)* %out) #5 {
458 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
459 %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
460 %v = load double, double addrspace(1)* %gep, align 8
461 %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
462 %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
463 store double %canonicalized, double addrspace(1)* %gep2, align 8
464 ret void
465 }
466
467 ; GCN-LABEL: {{^}}test_canonicalize_value_f32_denorm:
468 ; GCN: v_max_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
469 define amdgpu_kernel void @test_canonicalize_value_f32_denorm(float addrspace(1)* %arg, float addrspace(1)* %out) #5 {
470 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
471 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
472 %v = load float, float addrspace(1)* %gep, align 4
473 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
474 %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
475 store float %canonicalized, float addrspace(1)* %gep2, align 4
476 ret void
477 }
478
479 ; GCN-LABEL: {{^}}test_canonicalize_value_f16_denorm:
480 ; GCN: v_max_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
481 define amdgpu_kernel void @test_canonicalize_value_f16_denorm(half addrspace(1)* %arg, half addrspace(1)* %out) #5 {
482 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
483 %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
484 %v = load half, half addrspace(1)* %gep, align 2
485 %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
486 %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
487 store half %canonicalized, half addrspace(1)* %gep2, align 2
488 ret void
489 }
490
417491 attributes #0 = { nounwind readnone }
418492 attributes #1 = { nounwind }
419493 attributes #2 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" }
420494 attributes #3 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" }
495 attributes #4 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" "target-cpu"="tonga" }
496 attributes #5 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" "target-cpu"="gfx900" }