llvm.org GIT mirror: llvm / commit 808f964

AMDGPU/SI: Use flat for global load/store when targeting HSA

Summary:
For some reason, executing an MUBUF instruction with the addr64 bit set and a zero base pointer in the resource descriptor causes the memory operation to be dropped when the shader is executed using the HSA runtime.

This kind of MUBUF instruction is commonly used when the pointer is stored in VGPRs: the base pointer field in the resource descriptor is set to zero and the pointer is stored in the vaddr field.

This patch resolves the issue by only using flat instructions for global memory operations when targeting HSA. This is an overly conservative fix, as all other configurations of MUBUF instructions appear to work.

Reviewers: tstellarAMD

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D15543

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256273 91177308-0d34-0410-b5e6-96231b3b80d8

Changpeng Fang, 4 years ago
13 changed files with 131 additions and 82 deletions.
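The user-visible effect of the change can be reproduced with a minimal global store and the llc invocations used by the test added in this patch. The sketch below simply mirrors that test (the kernel name @store_zero is illustrative; the RUN and CHECK lines follow the new test further down): with an HSA triple the global store is selected as a flat store by default, while the non-HSA configuration keeps using MUBUF.

; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefix=HSA-DEFAULT %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=NOHSA-DEFAULT %s

; HSA-DEFAULT: flat_store_dword
; NOHSA-DEFAULT: buffer_store_dword
define void @store_zero(i32 addrspace(1)* %out) {
entry:
  store i32 0, i32 addrspace(1)* %out
  ret void
}

The default can also be overridden explicitly with -mattr=+flat-for-global or -mattr=-flat-for-global, which the remaining RUN lines of the new test exercise.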
106106 "EnableUnsafeDSOffsetFolding",
107107 "true",
108108 "Force using DS instruction immediate offsets on SI">;
109
110 def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
111 "FlatForGlobal",
112 "true",
113 "Force to generate flat instruction for global">;
109114
110115 def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
111116 "FlatAddressSpace",
@@ ... @@
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                  SDValue &Offset1) const;
-  void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+  bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                    SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                    SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
                    SDValue &TFE) const;
@@ ... @@
   return isUInt<12>(Imm->getZExtValue());
 }

-void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
+bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                      SDValue &VAddr, SDValue &SOffset,
                                      SDValue &Offset, SDValue &Offen,
                                      SDValue &Idxen, SDValue &Addr64,
                                      SDValue &GLC, SDValue &SLC,
                                      SDValue &TFE) const {
+  // Subtarget prefers to use flat instruction
+  if (Subtarget->useFlatForGlobal())
+    return false;
+
   SDLoc DL(Addr);

   GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ ... @@

     if (isLegalMUBUFImmOffset(C1)) {
       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
-      return;
+      return true;
     } else if (isUInt<32>(C1->getZExtValue())) {
       // Illegal offset, store it in soffset.
       Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
       SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                         CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
                         0);
-      return;
+      return true;
     }
   }

@@ ... @@
     Ptr = N0;
     VAddr = N1;
     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
-    return;
+    return true;
   }

   // default case -> offset
   VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
   Ptr = Addr;
   Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+
+  return true;
 }

 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
@@ ... @@
   if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
     return false;

-  SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
-              GLC, SLC, TFE);
+  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+                   GLC, SLC, TFE))
+    return false;

   ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
   if (C->getSExtValue()) {
@@ ... @@
   const SIInstrInfo *TII =
       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

-  SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
-              GLC, SLC, TFE);
+  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+                   GLC, SLC, TFE))
+    return false;

   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
       !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
@@ ... @@
   // disable it.

   SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
+  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
+    FullFS += "+flat-for-global,";
   FullFS += FS;

   if (GPU == "" && TT.getArch() == Triple::amdgcn)
@@ ... @@
     DumpCode(false), R600ALUInst(false), HasVertexCache(false),
     TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
     FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
-    CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),
-    EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false),
-    EnableUnsafeDSOffsetFolding(false),
+    CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false),
+    EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true),
+    EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false),
     WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
     EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
     GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
@@ ... @@
   bool FastFMAF32;
   bool CaymanISA;
   bool FlatAddressSpace;
+  bool FlatForGlobal;
   bool EnableIRStructurizer;
   bool EnablePromoteAlloca;
   bool EnableIfCvt;
@@ ... @@
     return FlatAddressSpace;
   }

+  bool useFlatForGlobal() const {
+    return FlatForGlobal;
+  }
+
   bool hasBFE() const {
     return (getGeneration() >= EVERGREEN);
   }
@@ ... @@
 >;

 } // End Predicates = [isCI]
+
+
+//===----------------------------------------------------------------------===//
+// Patterns to generate flat for global
+//===----------------------------------------------------------------------===//
+
+def useFlatForGlobal : Predicate <
+  "Subtarget->useFlatForGlobal() || "
+  "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">;
+
+let Predicates = [useFlatForGlobal] in {
+
+// 1. Offset as 20bit DWORD immediate
+def : Pat <
+  (SIload_constant v4i32:$sbase, IMM20bit:$offset),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
+>;
+
+// Patterns for global loads with no offset
+class FlatLoadPat : Pat <
+  (vt (node i64:$addr)),
+  (inst $addr, 0, 0, 0)
+>;
+
+def : FlatLoadPat ;
+def : FlatLoadPat ;
+def : FlatLoadPat ;
+def : FlatLoadPat ;
+def : FlatLoadPat ;
+def : FlatLoadPat ;
+def : FlatLoadPat ;
+
+class FlatStorePat : Pat <
+  (node vt:$data, i64:$addr),
+  (inst $data, $addr, 0, 0, 0)
+>;
+
+def : FlatStorePat ;
+def : FlatStorePat ;
+def : FlatStorePat ;
+def : FlatStorePat ;
+def : FlatStorePat ;
+
+class FlatAtomicPat : Pat <
+  (vt (node i64:$addr, vt:$data)),
+  (inst $addr, $data, 0, 0)
+>;
+
+def : FlatAtomicPat ;
+def : FlatAtomicPat ;
+def : FlatAtomicPat ;
+def : FlatAtomicPat ;
+def : FlatAtomicPat ;
+def : FlatAtomicPat ;
+def : FlatAtomicPat ;
+def : FlatAtomicPat ;
+def : FlatAtomicPat ;
+def : FlatAtomicPat ;
+
+} // End Predicates = [useFlatForGlobal]
@@ ... @@

 } // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI

-//===----------------------------------------------------------------------===//
-// SMEM Patterns
-//===----------------------------------------------------------------------===//
-
-let Predicates = [isVI] in {
-
-// 1. Offset as 20bit DWORD immediate
-def : Pat <
-  (SIload_constant v4i32:$sbase, IMM20bit:$offset),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
->;
-
-// Patterns for global loads with no offset
-class FlatLoadPat : Pat <
-  (vt (node i64:$addr)),
-  (inst $addr, 0, 0, 0)
->;
-
-def : FlatLoadPat ;
-def : FlatLoadPat ;
-def : FlatLoadPat ;
-def : FlatLoadPat ;
-def : FlatLoadPat ;
-def : FlatLoadPat ;
-def : FlatLoadPat ;
-
-class FlatStorePat : Pat <
-  (node vt:$data, i64:$addr),
-  (inst $data, $addr, 0, 0, 0)
->;
-
-def : FlatStorePat ;
-def : FlatStorePat ;
-def : FlatStorePat ;
-def : FlatStorePat ;
-def : FlatStorePat ;
-
-class FlatAtomicPat : Pat <
-  (vt (node i64:$addr, vt:$data)),
-  (inst $addr, $data, 0, 0)
->;
-
-def : FlatAtomicPat ;
-def : FlatAtomicPat ;
-def : FlatAtomicPat ;
-def : FlatAtomicPat ;
-def : FlatAtomicPat ;
-def : FlatAtomicPat ;
-def : FlatAtomicPat ;
-def : FlatAtomicPat ;
-def : FlatAtomicPat ;
-def : FlatAtomicPat ;
-
-
-} // End Predicates = [isVI]
@@ ... @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefix=HSA-DEFAULT %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck -check-prefix=HSA-NODEFAULT %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=NOHSA-DEFAULT %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -mattr=+flat-for-global | FileCheck -check-prefix=NOHSA-NODEFAULT %s
+
+
+; HSA-DEFAULT: flat_store_dword
+; HSA-NODEFAULT: buffer_store_dword
+; NOHSA-DEFAULT: buffer_store_dword
+; NOHSA-NODEFAULT: flat_store_dword
+define void @test(i32 addrspace(1)* %out) {
+entry:
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
@@ ... @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA-CI --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA-VI --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck --check-prefix=HSA-CI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF

 ; The SHT_NOTE section contains the output from the .hsa_code_object_*
@@ ... @@
 ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000
 ; On VI+ we also need to set MTYPE = 2
 ; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
-; HSA: buffer_store_dword v{{[0-9]+}}, s[0:[[HI]]], 0
+; Make sure we generate flat store for HSA
+; HSA: flat_store_dword v{{[0-9]+}}

 define void @simple(i32 addrspace(1)* %out) {
 entry:
@@ ... @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s

 ; FIXME: align on alloca seems to be ignored for private_segment_alignment

@@ ... @@
 ; CHECK-LABEL: {{^}}test_debug_value:
 ; CHECK: s_load_dwordx2 s[4:5]
 ; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR4_SGPR5
-; CHECK: buffer_store_dword
+; CHECK: flat_store_dword
 ; CHECK: s_endpgm
 define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 {
 entry:
@@ ... @@
-; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s

 ; Check that when mubuf addr64 instruction is handled in moveToVALU
 ; from the pointer, dead register writes are not emitted.
@@ ... @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -asm-verbose -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s

 declare i32 @llvm.SI.tid() nounwind readnone

@@ ... @@

 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}}
 ; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}}
-; GCN: buffer_store_dword [[VVAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+; HSA: flat_store_dword [[VVAL]]

 ; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
 ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
@@ ... @@
 ; HSA: enable_sgpr_grid_workgroup_count_z = 0
 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3
 ; GCN-HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7
-; GCN: buffer_store_dword [[VVAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+; HSA: flat_store_dword [[VVAL]]

 ; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
 ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
@@ ... @@

 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}}
 ; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}}
-; GCN: buffer_store_dword [[VVAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+; HSA: flat_store_dword [[VVAL]]

 ; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
 ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
@@ ... @@

 ; FUNC-LABEL: {{^}}tidig_x:
 ; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
-; GCN: buffer_store_dword v0
+; GCN-NOHSA: buffer_store_dword v0
+; HSA: flat_store_dword v0
 define void @tidig_x(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #0
@@ ... @@
 ; FUNC-LABEL: {{^}}tidig_y:

 ; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1
-; GCN: buffer_store_dword v1
+; GCN-NOHSA: buffer_store_dword v1
+; HSA: flat_store_dword v1
 define void @tidig_y(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.y() #0
@@ ... @@

 ; FUNC-LABEL: {{^}}tidig_z:
 ; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2
-; GCN: buffer_store_dword v2
+; GCN-NOHSA: buffer_store_dword v2
+; HSA: flat_store_dword v2
 define void @tidig_z(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.z() #0