llvm.org GIT mirror llvm / 9fc15af
[AMDGPU] fcanonicalize optimization for GFX9+ Since GFX9 supports denorm modes for v_min_f32/v_max_f32, it is possible to further optimize fcanonicalize and remove it when it is applied to min/max, provided their operands are known not to be sNaNs or sNaNs are not supported. Additionally, we can remove fcanonicalize if denorms are supported for the VT and we know that its argument is never a NaN. Differential Revision: https://reviews.llvm.org/D35335 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@307976 91177308-0d34-0410-b5e6-96231b3b80d8 Stanislav Mekhanoshin 2 years ago
3 changed file(s) with 90 addition(s) and 25 deletion(s). Raw diff Collapse all Expand all
358358 return FP64FP16Denormals;
359359 }
360360
361 bool supportsMinMaxDenormModes() const {
362 return getGeneration() >= AMDGPUSubtarget::GFX9;
363 }
364
361365 bool hasFPExceptions() const {
362366 return FPExceptions;
363367 }
46234623 return DAG.isKnownNeverNaN(Op);
46244624 }
46254625
4626 static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
4627 unsigned MaxDepth=5) {
4626 static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
4627 const SISubtarget *ST, unsigned MaxDepth=5) {
46284628 // If source is a result of another standard FP operation it is already in
46294629 // canonical form.
46304630
46624662 case ISD::FNEG:
46634663 case ISD::FABS:
46644664 return (MaxDepth > 0) &&
4665 isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1);
4665 isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
46664666
46674667 case ISD::FSIN:
46684668 case ISD::FCOS:
46714671
46724672 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
46734673 // For such targets need to check their input recursively.
4674 // TODO: on GFX9+ we could return true without checking provided no-nan
4675 // mode, since canonicalization is also used to quiet sNaNs.
46764674 case ISD::FMINNUM:
46774675 case ISD::FMAXNUM:
46784676 case ISD::FMINNAN:
46794677 case ISD::FMAXNAN:
46804678
4679 if (ST->supportsMinMaxDenormModes() &&
4680 DAG.isKnownNeverNaN(Op.getOperand(0)) &&
4681 DAG.isKnownNeverNaN(Op.getOperand(1)))
4682 return true;
4683
46814684 return (MaxDepth > 0) &&
4682 isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) &&
4683 isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1);
4685 isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
4686 isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
46844687
46854688 case ISD::ConstantFP: {
46864689 auto F = cast(Op)->getValueAPF();
46994702
47004703 if (!CFP) {
47014704 SDValue N0 = N->getOperand(0);
4705 EVT VT = N0.getValueType().getScalarType();
4706 auto ST = getSubtarget();
4707
4708 if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
4709 (VT == MVT::f64 && ST->hasFP64Denormals()) ||
4710 (VT == MVT::f16 && ST->hasFP16Denormals())) &&
4711 DAG.isKnownNeverNaN(N0))
4712 return N0;
47024713
47034714 bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
47044715
47054716 if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
4706 isCanonicalized(N0, getSubtarget()))
4717 isCanonicalized(DAG, N0, ST))
47074718 return N0;
47084719
47094720 return SDValue();
346346 }
347347
348348 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32:
349 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
349 ; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
350 ; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
351 ; GFX9: flat_store_dword v[{{[0-9:]+}}], [[V]]
350352 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) {
351353 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
352354 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
387389 }
388390
389391 ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
390 ; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
391 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
392 ; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
393 ; VI: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
394 ; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
395 ; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
396 ; GFX9-NOT: 1.0
397 define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
398 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
399 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
400 %load = load float, float addrspace(1)* %gep, align 4
401 %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
402 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
403 store float %canonicalized, float addrspace(1)* %gep, align 4
404 ret void
405 }
406
407 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
408 ; GFX9: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
409 ; VI: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
410 ; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
392411 ; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
393 define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
394 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
395 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
396 %load = load float, float addrspace(1)* %gep, align 4
397 %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
398 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
399 store float %canonicalized, float addrspace(1)* %gep, align 4
400 ret void
401 }
402
403 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
404 ; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
405 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
406 ; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
412 ; GFX9-NOT: 1.0
407413 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) {
408414 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
409415 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
462468 %v = fmul nnan float %arg, 15.0
463469 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
464470 ret float %canonicalized
471 }
472
473 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32
474 ; GFX9-DENORM: flat_load_dword [[V:v[0-9]+]],
475 ; GFX9-DENORM: flat_store_dword v[{{[0-9:]+}}], [[V]]
476 ; GFX9-DENORM-NOT: 1.0
477 ; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
478 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 {
479 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
480 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
481 %v = load float, float addrspace(1)* %gep, align 4
482 %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
483 %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
484 store float %canonicalized, float addrspace(1)* %gep2, align 4
485 ret void
486 }
487
488 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64
489 ; GCN: flat_load_dwordx2 [[V:v\[[0-9:]+\]]],
490 ; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
491 ; GCN-NOT: 1.0
492 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 {
493 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
494 %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
495 %v = load double, double addrspace(1)* %gep, align 8
496 %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
497 %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
498 store double %canonicalized, double addrspace(1)* %gep2, align 8
499 ret void
500 }
501
502 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
503 ; GCN: flat_load_ushort [[V:v[0-9]+]],
504 ; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
505 ; GCN-NOT: 1.0
506 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 {
507 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
508 %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
509 %v = load half, half addrspace(1)* %gep, align 2
510 %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
511 %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
512 store half %canonicalized, half addrspace(1)* %gep2, align 2
513 ret void
465514 }
466515
467516 declare float @llvm.canonicalize.f32(float) #0
484533 declare double @llvm.maxnum.f64(double, double) #0
485534
486535 attributes #0 = { nounwind readnone }
536 attributes #1 = { "no-nans-fp-math"="true" }