llvm.org GIT mirror llvm / 4fc498f
AMDGPU: Improve accuracy of instruction rates for VOPC. These were all using the default 32-bit VALU write class, but the i64/f64 compares are half rate. I'm not sure this is really correct, because they are still using a VALU write class, even though they really write to the SALU. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@248582 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 4 years ago
4 changed file(s) with 85 addition(s) and 58 deletion(s). Raw diff Collapse all Expand all
14891489
14901490 multiclass VOP3_C_m
14911491 list pattern, string opName,
1492 bit HasMods, bit defExec, string revOp> {
1492 bit HasMods, bit defExec,
1493 string revOp, list sched> {
14931494
14941495 def "" : VOP3_Pseudo ,
1495 VOP2_REV;
1496 VOP2_REV {
1497 let SchedRW = sched;
1498 }
14961499
14971500 def _si : VOP3_Real_si ,
14981501 VOP3DisableFields<1, 0, HasMods> {
14991502 let Defs = !if(defExec, [EXEC], []);
1503 let SchedRW = sched;
15001504 }
15011505
15021506 def _vi : VOP3_Real_vi ,
15031507 VOP3DisableFields<1, 0, HasMods> {
15041508 let Defs = !if(defExec, [EXEC], []);
1509 let SchedRW = sched;
15051510 }
15061511 }
15071512
16891694
16901695 multiclass VOPC_m pattern,
16911696 string opName, bit DefExec, VOPProfile p,
1697 list sched,
16921698 string revOpName = "", string asm = opName#"_e32 "#op_asm,
16931699 string alias_asm = opName#" "#op_asm> {
1694 def "" : VOPC_Pseudo ;
1700 def "" : VOPC_Pseudo {
1701 let SchedRW = sched;
1702 }
16951703
16961704 let AssemblerPredicates = [isSICI] in {
1697
1698 def _si : VOPC,
1699 SIMCInstr {
1700 let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
1701 let hasSideEffects = DefExec;
1702 }
1703
1704 def : SIInstAlias <
1705 alias_asm,
1706 (!cast(NAME#"_e32_si") p.Src0RC32:$src0, p.Src1RC32:$src1)
1707 >;
1705 def _si : VOPC,
1706 SIMCInstr {
1707 let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
1708 let hasSideEffects = DefExec;
1709 let SchedRW = sched;
1710 }
1711
1712 def : SIInstAlias <
1713 alias_asm,
1714 (!cast(NAME#"_e32_si") p.Src0RC32:$src0, p.Src1RC32:$src1)
1715 >;
17081716
17091717 } // End AssemblerPredicates = [isSICI]
17101718
1711
17121719 let AssemblerPredicates = [isVI] in {
1713
1714 def _vi : VOPC,
1715 SIMCInstr {
1716 let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
1717 let hasSideEffects = DefExec;
1718 }
1719
1720 def : SIInstAlias <
1721 alias_asm,
1722 (!cast(NAME#"_e32_vi") p.Src0RC32:$src0, p.Src1RC32:$src1)
1723 >;
1724
1720 def _vi : VOPC,
1721 SIMCInstr {
1722 let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
1723 let hasSideEffects = DefExec;
1724 let SchedRW = sched;
1725 }
1726
1727 def : SIInstAlias <
1728 alias_asm,
1729 (!cast(NAME#"_e32_vi") p.Src0RC32:$src0, p.Src1RC32:$src1)
1730 >;
17251731 } // End AssemblerPredicates = [isVI]
17261732 }
17271733
17291735 dag ins32, string asm32, list pat32,
17301736 dag out64, dag ins64, string asm64, list pat64,
17311737 bit HasMods, bit DefExec, string revOp,
1732 VOPProfile p> {
1733 defm _e32 : VOPC_m ;
1738 VOPProfile p,
1739 list sched> {
1740 defm _e32 : VOPC_m ;
17341741
17351742 defm _e64 : VOP3_C_m
1736 opName, HasMods, DefExec, revOp>;
1743 opName, HasMods, DefExec, revOp,
1744 sched>;
17371745 }
17381746
17391747 // Special case for class instructions which only have modifiers on
17421750 dag ins32, string asm32, list pat32,
17431751 dag out64, dag ins64, string asm64, list pat64,
17441752 bit HasMods, bit DefExec, string revOp,
1745 VOPProfile p> {
1746 defm _e32 : VOPC_m ;
1753 VOPProfile p,
1754 list sched> {
1755 defm _e32 : VOPC_m ;
17471756
17481757 defm _e64 : VOP3_C_m
1749 opName, HasMods, DefExec, revOp>,
1758 opName, HasMods, DefExec, revOp, sched>,
17501759 VOP3DisableModFields<1, 0, 0>;
17511760 }
17521761
17531762 multiclass VOPCInst
17541763 VOPProfile P, PatLeaf cond = COND_NULL,
17551764 string revOp = opName,
1756 bit DefExec = 0> : VOPC_Helper <
1765 bit DefExec = 0,
1766 list sched = [Write32Bit]> :
1767 VOPC_Helper <
17571768 op, opName,
17581769 P.Ins32, P.Asm32, [],
17591770 (outs VOPDstS64:$dst), P.Ins64, P.Asm64,
17641775 (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
17651776 cond))],
17661777 [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]),
1767 P.HasModifiers, DefExec, revOp, P
1778 P.HasModifiers, DefExec, revOp, P, sched
17681779 >;
17691780
17701781 multiclass VOPCClassInst
1771 bit DefExec = 0> : VOPC_Class_Helper <
1782 bit DefExec = 0,
1783 list sched> : VOPC_Class_Helper <
17721784 op, opName,
17731785 P.Ins32, P.Asm32, [],
17741786 (outs VOPDstS64:$dst), P.Ins64, P.Asm64,
17761788 [(set i1:$dst,
17771789 (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))],
17781790 [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]),
1779 P.HasModifiers, DefExec, opName, P
1791 P.HasModifiers, DefExec, opName, P, sched
17801792 >;
17811793
17821794
17841796 VOPCInst ;
17851797
17861798 multiclass VOPC_F64 :
1787 VOPCInst >;
1799 VOPCInst , 0, [WriteDoubleAdd]>;
17881800
17891801 multiclass VOPC_I32 :
17901802 VOPCInst ;
17911803
17921804 multiclass VOPC_I64 :
1793 VOPCInst >;
1805 VOPCInst , 0, [Write64Bit]>;
17941806
17951807
17961808 multiclass VOPCX
17971809 PatLeaf cond = COND_NULL,
1810 list sched,
17981811 string revOp = "">
1799 : VOPCInst >;
1812 : VOPCInst , sched>;
18001813
18011814 multiclass VOPCX_F32 :
1802 VOPCX revOp>;
1815 VOPCX [Write32Bit], revOp>;
18031816
18041817 multiclass VOPCX_F64 :
1805 VOPCX revOp>;
1818 VOPCX [WriteDoubleAdd], revOp>;
18061819
18071820 multiclass VOPCX_I32 :
1808 VOPCX revOp>;
1821 VOPCX [Write32Bit], revOp>;
18091822
18101823 multiclass VOPCX_I64 :
1811 VOPCX revOp>;
1824 VOPCX [Write64Bit], revOp>;
18121825
18131826 multiclass VOP3_Helper
18141827 list pat, int NumSrcArgs, bit HasMods> : VOP3_m <
18161829 >;
18171830
18181831 multiclass VOPC_CLASS_F32 :
1819 VOPCClassInst >;
1832 VOPCClassInst , [Write32Bit]>;
18201833
18211834 multiclass VOPCX_CLASS_F32 :
1822 VOPCClassInst >;
1835 VOPCClassInst , [Write32Bit]>;
18231836
18241837 multiclass VOPC_CLASS_F64 :
1825 VOPCClassInst >;
1838 VOPCClassInst , [WriteDoubleAdd]>;
18261839
18271840 multiclass VOPCX_CLASS_F64 :
1828 VOPCClassInst >;
1841 VOPCClassInst , [WriteDoubleAdd]>;
18291842
18301843 multiclass VOP3Inst
18311844 SDPatternOperator node = null_frag> : VOP3_Helper <
2121 // Vector ALU instructions
2222 def Write32Bit : SchedWrite;
2323 def WriteQuarterRate32 : SchedWrite;
24 def WriteFullOrQuarterRate32 : SchedWrite;
2425
2526 def WriteFloatFMA : SchedWrite;
2627
27 def WriteDouble : SchedWrite;
28 // Slow quarter rate f64 instruction.
29 def WriteDouble : SchedWrite;
30
31 // half rate f64 instruction (same as v_add_f64)
2832 def WriteDoubleAdd : SchedWrite;
33
34 // Half rate 64-bit instructions.
35 def Write64Bit : SchedWrite;
36
37 // FIXME: Should there be a class for instructions which are VALU
38 // instructions and have VALU rates, but write to the SALU (i.e. VOPC
39 // instructions)
2940
3041 def SIFullSpeedModel : SchedMachineModel;
3142 def SIQuarterSpeedModel : SchedMachineModel;
5364
5465
5566 // The latency numbers are taken from AMD Accelerated Parallel Processing
56 // guide. They may not be acurate.
67 // guide. They may not be accurate.
5768
5869 // The latency values are 1 / (operations / cycle) / 4.
5970 multiclass SICommonWriteRes {
6778 def : HWWriteRes; // XXX: Guessed ???
6879
6980 def : HWVALUWriteRes;
81 def : HWVALUWriteRes;
7082 def : HWVALUWriteRes;
7183 }
7284
270270 ; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
271271 ; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
272272 ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]]
273 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
273 ; SI-NOT: vcc
274 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
274275 ; SI-NEXT: buffer_store_dword [[RESULT]]
275276 ; SI: s_endpgm
276277 define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
284285 ; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]]
285286 ; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
286287 ; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]]
287 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
288 ; SI-NOT: vcc
289 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
288290 ; SI: buffer_store_dword [[RESULT]]
289291 ; SI: s_endpgm
290292 define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
127127 ; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
128128 ; SI-DAG: v_cmp_ne_i32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
129129 ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
130 ; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]]
131 ; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]]
130 ; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
131 ; SI: s_xor_b64 [[ORNEG2]], exec, [[ORNEG2]]
132132 ; SI: s_cbranch_execz BB3_5
133133
134134 ; SI: BB#4:
135135 ; SI: buffer_store_dword
136 ; SI: v_cmp_ge_i64_e32 vcc
137 ; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]
136 ; SI: v_cmp_ge_i64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]]
137 ; SI: s_or_b64 [[COND_STATE]], [[CMP]], [[COND_STATE]]
138138
139139 ; SI: BB3_5:
140 ; SI: s_or_b64 exec, exec, [[ORNEG1]]
141 ; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]]
140 ; SI: s_or_b64 exec, exec, [[ORNEG2]]
141 ; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[COND_STATE]]
142142 ; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
143143 ; SI: s_cbranch_execnz BB3_3
144144