llvm.org GIT mirror llvm / 802910f
Merging r292472:
------------------------------------------------------------------------
r292472 | arsenm | 2017-01-18 22:04:12 -0800 (Wed, 18 Jan 2017) | 5 lines

AMDGPU: Remove modifiers from v_div_scale_*

They seem to produce nonsense results when used.

This should be applied to the release branch.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_40@293317 91177308-0d34-0410-b5e6-96231b3b80d8

Matt Arsenault 2 years ago
5 changed file(s) with 14 addition(s) and 18 deletion(s). Raw diff Collapse all Expand all
726726 unsigned Opc
727727 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
728728
729 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
730 // omod
731 SDValue Ops[8];
732
733 SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
734 SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
735 SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
736 CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
729 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
730 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
737731 }
738732
739733 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
6969 }
7070
7171 class VOP3b_Profile : VOPProfile<[vt, vt, vt, vt]> {
72 // v_div_scale_{f32|f64} do not support input modifiers.
73 let HasModifiers = 0;
7274 let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
73 let Asm64 = " $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod";
75 let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
7476 }
7577
7678 def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile {
167169 def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
168170 let SchedRW = [WriteFloatFMA, WriteSALU];
169171 let hasExtraSrcRegAllocReq = 1;
172 let AsmMatchConverter = "";
170173 }
171174
172175 // Double precision division pre-scale.
173176 def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
174177 let SchedRW = [WriteDouble, WriteSALU];
175178 let hasExtraSrcRegAllocReq = 1;
179 let AsmMatchConverter = "";
176180 }
177181
178182 def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile, int_amdgcn_msad_u8>;
6262 S_BRANCH %bb.3
6363
6464 bb.3:
65 %vgpr4, %vcc = V_DIV_SCALE_F32 0, %vgpr1, 0, %vgpr1, 0, %vgpr3, 0, 0, implicit %exec
65 %vgpr4, %vcc = V_DIV_SCALE_F32 %vgpr1, %vgpr1, %vgpr3, implicit %exec
6666 %vgpr0 = V_DIV_FMAS_F32 0, %vgpr1, 0, %vgpr2, 0, %vgpr3, 0, 0, implicit %vcc, implicit %exec
6767 S_ENDPGM
6868
321321 ; SI-LABEL: {{^}}test_div_scale_f32_fabs_num:
322322 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
323323 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
324 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]|
324 ; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[A]]
325 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[ABS_A]]
325326 ; SI: buffer_store_dword [[RESULT0]]
326327 ; SI: s_endpgm
327328 define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
343344 ; SI-LABEL: {{^}}test_div_scale_f32_fabs_den:
344345 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
345346 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
346 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]]
347 ; SI: v_and_b32_e32 [[ABS_B:v[0-9]+]], 0x7fffffff, [[B]]
348 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[ABS_B]], [[ABS_B]], [[A]]
347349 ; SI: buffer_store_dword [[RESULT0]]
348350 ; SI: s_endpgm
349351 define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
351351 // SICI: v_div_scale_f32 v24, vcc, v22, v22, v20 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0x52,0x04]
352352 // VI: v_div_scale_f32 v24, vcc, v22, v22, v20 ; encoding: [0x18,0x6a,0xe0,0xd1,0x16,0x2d,0x52,0x04]
353353
354 v_div_scale_f32 v24, vcc, s[10:11], v22, v20
355 // SICI: v_div_scale_f32 v24, vcc, s[10:11], v22, v20 ; encoding: [0x18,0x6a,0xda,0xd2,0x0a,0x2c,0x52,0x04]
356 // VI: v_div_scale_f32 v24, vcc, s[10:11], v22, v20 ; encoding: [0x18,0x6a,0xe0,0xd1,0x0a,0x2c,0x52,0x04]
357
358354 v_div_scale_f32 v24, s[10:11], v22, v22, v20
359355 // SICI: v_div_scale_f32 v24, s[10:11], v22, v22, v20 ; encoding: [0x18,0x0a,0xda,0xd2,0x16,0x2d,0x52,0x04]
360356 // VI: v_div_scale_f32 v24, s[10:11], v22, v22, v20 ; encoding: [0x18,0x0a,0xe0,0xd1,0x16,0x2d,0x52,0x04]
364360 // VI: v_div_scale_f32 v24, vcc, v22, 1.0, v22 ; encoding: [0x18,0x6a,0xe0,0xd1,0x16,0xe5,0x59,0x04]
365361
366362 v_div_scale_f32 v24, vcc, v22, v22, -2.0
367 // SICI: v_div_scale_f32 v24, vcc, v22, v22, -2.0 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0xd2,0x83]
368 // VI: v_div_scale_f32 v24, vcc, v22, v22, -2.0 ; encoding: [0x18,0x6a,0xe0,0xd1,0x16,0x2d,0xd2,0x83]
363 // SICI: v_div_scale_f32 v24, vcc, v22, v22, -2.0 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0xd6,0x03]
364 // VI: v_div_scale_f32 v24, vcc, v22, v22, -2.0 ; encoding: [0x18,0x6a,0xe0,0xd1,0x16,0x2d,0xd6,0x03]
369365
370366 v_div_scale_f32 v24, vcc, v22, v22, 0xc0000000
371367 // SICI: v_div_scale_f32 v24, vcc, v22, v22, -2.0 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0xd6,0x03]