llvm.org GIT mirror llvm / 03ca6fb
AMDGPU: Define priorities for register classes

Allocating larger register classes first should give better allocation results (and more importantly for myself, make the lit tests more stable with respect to scheduler changes).

Patch by Matthias Braun

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@270312 91177308-0d34-0410-b5e6-96231b3b80d8

Matt Arsenault, 3 years ago
10 changed file(s) with 75 addition(s) and 52 deletion(s). Raw diff Collapse all Expand all
123123
124124 // SGPR 32-bit registers
125125 def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
126 (add (sequence "SGPR%u", 0, 103))>;
126 (add (sequence "SGPR%u", 0, 103))> {
127 let AllocationPriority = 1;
128 }
127129
128130 // SGPR 64-bit registers
129131 def SGPR_64Regs : RegisterTuples<[sub0, sub1],
188190
189191 // VGPR 32-bit registers
190192 def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
191 (add (sequence "VGPR%u", 0, 255))>;
193 (add (sequence "VGPR%u", 0, 255))> {
194 let AllocationPriority = 1;
195 }
192196
193197 // VGPR 64-bit registers
194198 def VGPR_64 : RegisterTuples<[sub0, sub1],
252256 // See comments in SIInstructions.td for more info.
253257 def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32,
254258 (add SGPR_32, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
255 TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)
256 >;
259 TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> {
260 let AllocationPriority = 1;
261 }
257262
258263 // Register class for all scalar registers (SGPRs + Special Registers)
259264 def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
260 (add SReg_32_XM0, M0)
261 >;
262
263 def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>;
265 (add SReg_32_XM0, M0)> {
266 let AllocationPriority = 1;
267 }
268
269 def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
270 let AllocationPriority = 2;
271 }
264272
265273 def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> {
266274 let isAllocatable = 0;
267275 }
268276
269277 def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
270 (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64, TBA, TMA)
271 >;
278 (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64, TBA, TMA)> {
279 let AllocationPriority = 2;
280 }
272281
273282 // Requires 2 s_mov_b64 to copy
274283 let CopyCost = 2 in {
275284
276 def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)>;
285 def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> {
286 let AllocationPriority = 4;
287 }
277288
278289 def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> {
279290 let isAllocatable = 0;
280291 }
281292
282 def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)>;
293 def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> {
294 let AllocationPriority = 4;
295 }
283296
284297 } // End CopyCost = 2
285298
286299 def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> {
287300 // Requires 4 s_mov_b64 to copy
288301 let CopyCost = 4;
302 let AllocationPriority = 5;
289303 }
290304
291305 def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
292306 // Requires 8 s_mov_b64 to copy
293307 let CopyCost = 8;
308 let AllocationPriority = 6;
294309 }
295310
296311 // Register class for all vector registers (VGPRs + Interpolation Registers)
297312 def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
298313 // Requires 2 v_mov_b32 to copy
299314 let CopyCost = 2;
315 let AllocationPriority = 2;
300316 }
301317
302318 def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
304320
305321 // Requires 3 v_mov_b32 to copy
306322 let CopyCost = 3;
323 let AllocationPriority = 3;
307324 }
308325
309326 def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
310327 // Requires 4 v_mov_b32 to copy
311328 let CopyCost = 4;
329 let AllocationPriority = 4;
312330 }
313331
314332 def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {
315333 let CopyCost = 8;
334 let AllocationPriority = 5;
316335 }
317336
318337 def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
319338 let CopyCost = 16;
339 let AllocationPriority = 6;
320340 }
321341
322342 def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
None ; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
1 ; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
2 ; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
3 ; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
4 ; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
5 ; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
0 ; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE %s
1 ; RUN: llc -mattr=+promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE -check-prefix=HSA %s
2 ; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA %s
3 ; RUN: llc -mattr=-promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA -check-prefix=HSA %s
4 ; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE %s
5 ; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA %s
66
77
88 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
1010
1111 ; Make sure we don't overwrite workitem information with private memory
1212
13 ; FUNC-LABEL: {{^}}work_item_info:
14
15 ; SI-NOT: v_mov_b32_e{{(32|64)}} v0
13 ; GCN-LABEL: {{^}}work_item_info:
14 ; GCN-NOT: v0
15 ; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}}
16 ; GCN: buffer_store_dword [[RESULT]]
1617 define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
1718 entry:
1819 %0 = alloca [2 x i32]
397397 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
398398 ; GCN-DAG: v_cvt_f32_f16_e32
399399 ; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
400 ; GCN: v_cvt_f32_f16_e32
401 ; GCN: v_cvt_f32_f16_e32
402 ; GCN-NOT: v_cvt_f32_f16
400 ; GCN-DAG: v_cvt_f32_f16_e32
401 ; GCN-DAG: v_cvt_f32_f16_e32
403402
404403 ; GCN: v_cvt_f64_f32_e32
405404 ; GCN: v_cvt_f64_f32_e32
724724 ; an immediate.
725725 ; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
726726 ; SI: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
727 ; SI: ds_read_b32 v0, v[[ZERO]] offset:4
727 ; SI: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
728728 ; R600: LDS_READ_RET
729729 define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
730730 entry:
55 ; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32
66
77 ; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add:
8 ; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
9 ; GCN-DAG: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
8 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
109 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
1110
1211 ; GCN-NOT: v_mov_b32
13 ; GCN-NEXT: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
12 ; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
13 ; GCN-NOT: v_mov_b32
14 ; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
1415 ; GCN-NOT: v_mov_b32
1516
1617 ; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
5050 }
5151
5252 ; Test moving an SMRD instruction to the VALU
53 ; FIXME: movs can be moved before nop to reduce count
5354
5455 ; GCN-LABEL: {{^}}smrd_valu:
5556 ; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
5657 ; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
5758 ; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
59 ; SI: s_nop 3
60 ; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]
5861 ; SI: s_mov_b32
59 ; SI: s_nop 2
60 ; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]
62
6163 ; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8
6264 ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
6365 ; GCN-NOHSA: buffer_store_dword [[V_OUT]]
None ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
22
33 ; FUNC-LABEL: {{^}}cluster_arg_loads:
44 ; FIXME: Due to changes in the load clustering heuristics. We no longer
88 ; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
99 ; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
1010 ; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
11 ; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
12 ; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
13 ; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
11 ; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
12 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
13 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
1414 define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
1515 store i32 %x, i32 addrspace(1)* %out0, align 4
1616 store i32 %y, i32 addrspace(1)* %out1, align 4
1111
1212 ; GCN-LABEL: {{^}}main:
1313
14 ; GCN-DAG: s_mov_b32 s6, s12
14 ; GCN-DAG: s_mov_b32 s13, s12
1515 ; GCN-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
1616 ; GCN-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
1717 ; GCN-DAG: s_mov_b32 s18, -1
1818 ; SI-DAG: s_mov_b32 s19, 0x88f000
1919 ; VI-DAG: s_mov_b32 s19, 0x880000
2020
21 ; s6 is offset system SGPR
22 ; GCN: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s6 offset:{{[0-9]+}} ; 16-byte Folded Spill
23 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[16:19], s6 offset:{{[0-9]+}} ; 16-byte Folded Reload
21 ; s13 is offset system SGPR
22 ; GCN: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Spill
23 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Reload
2424
2525 ; GCN: NumVgprs: 256
2626 ; GCN: ScratchSize: 1024
55 ; for the original bug.
66
77 ; GCN: {{^}}test:
8 ; GCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
9 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
10 ; GCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
8 ; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
9 ; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0)
10 ; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
1111 define void @test(i32 addrspace(1)* %out, i32 %in) {
1212 store volatile i32 0, i32 addrspace(1)* %out
1313 %val = load volatile i32, i32 addrspace(1)* %out
309309
310310 ; ... but only if WQM is necessary.
311311 ;
312 ;CHECK-LABEL: {{^}}test_kill_1:
313 ;CHECK-NEXT: ; %main_body
314 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
315 ;CHECK-NEXT: s_wqm_b64 exec, exec
316 ;CHECK: image_sample
317 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
318 ;SI: buffer_store_dword
319 ;VI: flat_store_dword
320 ;CHECK-NOT: wqm
321 ;CHECK: v_cmpx_
312 ; CHECK-LABEL: {{^}}test_kill_1:
313 ; CHECK-NEXT: ; %main_body
314 ; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
315 ; CHECK: s_wqm_b64 exec, exec
316 ; CHECK: image_sample
317 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
318 ; SI: buffer_store_dword
319 ; VI: flat_store_dword
320 ; CHECK-NOT: wqm
321 ; CHECK: v_cmpx_
322322 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
323323 main_body:
324324 %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)