llvm.org GIT mirror llvm / 2bbb56f
AMDGPU: Pull fneg out of extract_vector_elt This allows folding source modifiers in more f16 cases. Makes it easier to select per-component packed neg modifiers. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@302813 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 3 years ago
6 changed file(s) with 72 addition(s) and 8 deletion(s). Raw diff Collapse all Expand all
566566 case AMDGPUISD::INTERP_P1:
567567 case AMDGPUISD::INTERP_P2:
568568 case AMDGPUISD::DIV_SCALE:
569
570 // TODO: Should really be looking at the users of the bitcast. These are
571 // problematic because bitcasts are used to legalize all stores to integer
572 // types.
573 case ISD::BITCAST:
569574 return false;
570575 default:
571576 return true;
572577 }
573578 }
574579
575 static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) {
580 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
581 unsigned CostThreshold) {
576582 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
577583 // it is truly free to use a source modifier in all cases. If there are
578584 // multiple users but for each one will necessitate using VOP3, there will be
131131 return false;
132132 }
133133
134 static bool allUsesHaveSourceMods(const SDNode *N,
135 unsigned CostThreshold = 4);
134136 bool isFAbsFree(EVT VT) const override;
135137 bool isFNegFree(EVT VT) const override;
136138 bool isTruncateFree(EVT Src, EVT Dest) const override;
487487 setTargetDAGCombine(ISD::FCANONICALIZE);
488488 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
489489 setTargetDAGCombine(ISD::ZERO_EXTEND);
490 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
490491
491492 // All memory operations. Some folding on the pointer operand is done to help
492493 // matching the constant offsets in the addressing modes.
46034604 return SDValue();
46044605 }
46054606
// DAG combine for an EXTRACT_VECTOR_ELT node N. Rewrites
//   (extract_vector_elt (fneg Vec), Idx)
// into
//   (fneg (extract_vector_elt Vec, Idx))
// so the negation applies to the extracted scalar rather than the whole
// vector. Per the commit description, this allows folding source modifiers
// in more f16 cases.
//
// Returns the newly built FNEG node when the pattern matches and
// allUsesHaveSourceMods(N) holds; otherwise returns an empty SDValue
// (presumably meaning "no change" to the caller -- confirm against
// PerformDAGCombine's contract).
4607 SDValue SITargetLowering::performExtractVectorEltCombine(
4608 SDNode *N, DAGCombinerInfo &DCI) const {
// Operand 0 of the extract is the source vector.
4609 SDValue Vec = N->getOperand(0);
4610
4611 SelectionDAG &DAG= DCI.DAG;
// Only pull the fneg through when allUsesHaveSourceMods(N) reports true for
// the extract; otherwise leave the node as it is.
4612 if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
4613 SDLoc SL(N);
4614 EVT EltVT = N->getValueType(0);
// Operand 1 of the extract is the element index; it is reused unchanged.
4615 SDValue Idx = N->getOperand(1);
// Extract from the fneg's own input, i.e. the un-negated vector.
4616 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
4617 Vec.getOperand(0), Idx);
// Re-apply the negation on the scalar element.
4618 return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
4619 }
4620
// Pattern did not match: nothing to combine here.
4621 return SDValue();
4622 }
4623
4624
46064625 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
46074626 const SDNode *N0,
46084627 const SDNode *N1) const {
48904909
48914910 break;
48924911 }
4912 case ISD::EXTRACT_VECTOR_ELT:
4913 return performExtractVectorEltCombine(N, DCI);
48934914 }
48944915 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
48954916 }
9999 SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
100100 SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
101101 SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
102 SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
102103
103104 unsigned getFusedOpcode(const SelectionDAG &DAG,
104105 const SDNode *N0, const SDNode *N1) const;
14701470 ; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
14711471 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
14721472 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1473 ; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
1474 ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
1475 ; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
1476 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
1477 ; GCN: buffer_store_dword [[MUL]]
1473 ; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
1474 ; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0
1475 ; GCN-NEXT: buffer_store_dword [[ADD]]
1476 ; GCN-NEXT: buffer_store_dword [[MUL]]
14781477 define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
14791478 %tid = call i32 @llvm.amdgcn.workitem.id.x()
14801479 %tid.ext = sext i32 %tid to i64
0 ; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
33
44 ; FIXME: Should be able to do scalar op
55 ; GCN-LABEL: {{^}}s_fneg_f16:
128128 ret void
129129 }
130130
131 ; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16:
132 ; GCN: flat_load_dword [[VAL:v[0-9]+]]
133 ; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}
134 ; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
135
136 ; GFX89: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
137 ; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
138 ; GFX89-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
139 define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
140 %val = load <2 x half>, <2 x half> addrspace(1)* %in
141 %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
142 %elt0 = extractelement <2 x half> %fneg, i32 0
143 %elt1 = extractelement <2 x half> %fneg, i32 1
144
145 %fmul0 = fmul half %elt0, 4.0
146 %fadd1 = fadd half %elt1, 2.0
147 store volatile half %fmul0, half addrspace(1)* undef
148 store volatile half %fadd1, half addrspace(1)* undef
149 ret void
150 }
151
152 ; GCN-LABEL: {{^}}v_extract_fneg_no_fold_v2f16:
153 ; GCN: flat_load_dword [[VAL:v[0-9]+]]
154 ; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]]
155 ; GCN: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]]
156 define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
157 %val = load <2 x half>, <2 x half> addrspace(1)* %in
158 %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
159 %elt0 = extractelement <2 x half> %fneg, i32 0
160 %elt1 = extractelement <2 x half> %fneg, i32 1
161 store volatile half %elt0, half addrspace(1)* undef
162 store volatile half %elt1, half addrspace(1)* undef
163 ret void
164 }
165
131166 declare i32 @llvm.amdgcn.workitem.id.x() #1
132167
133168 attributes #0 = { nounwind }