llvm.org GIT mirror llvm / a00544a
Revert "AMDGPU/SI: Use flat for global load/store when targeting HSA"

This reverts commit r256273. It broke CodeGen/AMDGPU/llvm.dbg.value.ll

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256275 91177308-0d34-0410-b5e6-96231b3b80d8

Rafael Espindola, 4 years ago
13 changed file(s) with 82 addition(s) and 131 deletion(s). Raw diff Collapse all Expand all
106106 "EnableUnsafeDSOffsetFolding",
107107 "true",
108108 "Force using DS instruction immediate offsets on SI">;
109
110 def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
111 "FlatForGlobal",
112 "true",
113 "Force to generate flat instruction for global">;
114109
115110 def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
116111 "FlatAddressSpace",
9494 bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
9595 bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
9696 SDValue &Offset1) const;
97 bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
97 void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
9898 SDValue &SOffset, SDValue &Offset, SDValue &Offen,
9999 SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
100100 SDValue &TFE) const;
919919 return isUInt<12>(Imm->getZExtValue());
920920 }
921921
922 bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
922 void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
923923 SDValue &VAddr, SDValue &SOffset,
924924 SDValue &Offset, SDValue &Offen,
925925 SDValue &Idxen, SDValue &Addr64,
926926 SDValue &GLC, SDValue &SLC,
927927 SDValue &TFE) const {
928 // Subtarget prefers to use flat instruction
929 if (Subtarget->useFlatForGlobal())
930 return false;
931
932928 SDLoc DL(Addr);
933929
934930 GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
961957
962958 if (isLegalMUBUFImmOffset(C1)) {
963959 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
964 return true;
960 return;
965961 } else if (isUInt<32>(C1->getZExtValue())) {
966962 // Illegal offset, store it in soffset.
967963 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
968964 SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
969965 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
970966 0);
971 return true;
967 return;
972968 }
973969 }
974970
980976 Ptr = N0;
981977 VAddr = N1;
982978 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
983 return true;
979 return;
984980 }
985981
986982 // default case -> offset
987983 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
988984 Ptr = Addr;
989985 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
990
991 return true;
992986 }
993987
994988 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1001995 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1002996 return false;
1003997
1004 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
1005 GLC, SLC, TFE))
1006 return false;
998 SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
999 GLC, SLC, TFE);
10071000
10081001 ConstantSDNode *C = cast(Addr64);
10091002 if (C->getSExtValue()) {
10691062 const SIInstrInfo *TII =
10701063 static_cast(Subtarget->getInstrInfo());
10711064
1072 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
1073 GLC, SLC, TFE))
1074 return false;
1065 SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
1066 GLC, SLC, TFE);
10751067
10761068 if (!cast(Offen)->getSExtValue() &&
10771069 !cast(Idxen)->getSExtValue() &&
4444 // disable it.
4545
4646 SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
47 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
48 FullFS += "+flat-for-global,";
4947 FullFS += FS;
5048
5149 if (GPU == "" && TT.getArch() == Triple::amdgcn)
6967 DumpCode(false), R600ALUInst(false), HasVertexCache(false),
7068 TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
7169 FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
72 CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false),
73 EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true),
74 EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false),
70 CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),
71 EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false),
72 EnableUnsafeDSOffsetFolding(false),
7573 WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
7674 EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
7775 GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
6969 bool FastFMAF32;
7070 bool CaymanISA;
7171 bool FlatAddressSpace;
72 bool FlatForGlobal;
7372 bool EnableIRStructurizer;
7473 bool EnablePromoteAlloca;
7574 bool EnableIfCvt;
159158 return FlatAddressSpace;
160159 }
161160
162 bool useFlatForGlobal() const {
163 return FlatForGlobal;
164 }
165
166161 bool hasBFE() const {
167162 return (getGeneration() >= EVERGREEN);
168163 }
233233 >;
234234
235235 } // End Predicates = [isCI]
236
237
238 //===----------------------------------------------------------------------===//
239 // Patterns to generate flat for global
240 //===----------------------------------------------------------------------===//
241
242 def useFlatForGlobal : Predicate <
243 "Subtarget->useFlatForGlobal() || "
244 "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">;
245
246 let Predicates = [useFlatForGlobal] in {
247
248 // 1. Offset as 20bit DWORD immediate
249 def : Pat <
250 (SIload_constant v4i32:$sbase, IMM20bit:$offset),
251 (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
252 >;
253
254 // Patterns for global loads with no offset
255 class FlatLoadPat : Pat <
256 (vt (node i64:$addr)),
257 (inst $addr, 0, 0, 0)
258 >;
259
260 def : FlatLoadPat ;
261 def : FlatLoadPat ;
262 def : FlatLoadPat ;
263 def : FlatLoadPat ;
264 def : FlatLoadPat ;
265 def : FlatLoadPat ;
266 def : FlatLoadPat ;
267
268 class FlatStorePat : Pat <
269 (node vt:$data, i64:$addr),
270 (inst $data, $addr, 0, 0, 0)
271 >;
272
273 def : FlatStorePat ;
274 def : FlatStorePat ;
275 def : FlatStorePat ;
276 def : FlatStorePat ;
277 def : FlatStorePat ;
278
279 class FlatAtomicPat : Pat <
280 (vt (node i64:$addr, vt:$data)),
281 (inst $addr, $data, 0, 0)
282 >;
283
284 def : FlatAtomicPat ;
285 def : FlatAtomicPat ;
286 def : FlatAtomicPat ;
287 def : FlatAtomicPat ;
288 def : FlatAtomicPat ;
289 def : FlatAtomicPat ;
290 def : FlatAtomicPat ;
291 def : FlatAtomicPat ;
292 def : FlatAtomicPat ;
293 def : FlatAtomicPat ;
294
295 } // End Predicates = [useFlatForGlobal]
100100
101101 } // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
102102
103 //===----------------------------------------------------------------------===//
104 // SMEM Patterns
105 //===----------------------------------------------------------------------===//
106
107 let Predicates = [isVI] in {
108
109 // 1. Offset as 20bit DWORD immediate
110 def : Pat <
111 (SIload_constant v4i32:$sbase, IMM20bit:$offset),
112 (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
113 >;
114
115 // Patterns for global loads with no offset
116 class FlatLoadPat : Pat <
117 (vt (node i64:$addr)),
118 (inst $addr, 0, 0, 0)
119 >;
120
121 def : FlatLoadPat ;
122 def : FlatLoadPat ;
123 def : FlatLoadPat ;
124 def : FlatLoadPat ;
125 def : FlatLoadPat ;
126 def : FlatLoadPat ;
127 def : FlatLoadPat ;
128
129 class FlatStorePat : Pat <
130 (node vt:$data, i64:$addr),
131 (inst $data, $addr, 0, 0, 0)
132 >;
133
134 def : FlatStorePat ;
135 def : FlatStorePat ;
136 def : FlatStorePat ;
137 def : FlatStorePat ;
138 def : FlatStorePat ;
139
140 class FlatAtomicPat : Pat <
141 (vt (node i64:$addr, vt:$data)),
142 (inst $addr, $data, 0, 0)
143 >;
144
145 def : FlatAtomicPat ;
146 def : FlatAtomicPat ;
147 def : FlatAtomicPat ;
148 def : FlatAtomicPat ;
149 def : FlatAtomicPat ;
150 def : FlatAtomicPat ;
151 def : FlatAtomicPat ;
152 def : FlatAtomicPat ;
153 def : FlatAtomicPat ;
154 def : FlatAtomicPat ;
155
156
157 } // End Predicates = [isVI]
+0
-15
test/CodeGen/AMDGPU/ci-use-flat-for-global.ll less more
None ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefix=HSA-DEFAULT %s
1 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck -check-prefix=HSA-NODEFAULT %s
2 ; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=NOHSA-DEFAULT %s
3 ; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -mattr=+flat-for-global | FileCheck -check-prefix=NOHSA-NODEFAULT %s
4
5
6 ; HSA-DEFAULT: flat_store_dword
7 ; HSA-NODEFAULT: buffer_store_dword
8 ; NOHSA-DEFAULT: buffer_store_dword
9 ; NOHSA-NODEFAULT: flat_store_dword
10 define void @test(i32 addrspace(1)* %out) {
11 entry:
12 store i32 0, i32 addrspace(1)* %out
13 ret void
14 }
None ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
1 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck --check-prefix=HSA-CI %s
2 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA %s
3 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s
4 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
0 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA-CI --check-prefix=HSA %s
1 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA-VI --check-prefix=HSA %s
2 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
53 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
64
75 ; The SHT_NOTE section contains the output from the .hsa_code_object_*
4846 ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000
4947 ; On VI+ we also need to set MTYPE = 2
5048 ; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
51 ; Make sure we generate flat store for HSA
52 ; HSA: flat_store_dword v{{[0-9]+}}
49 ; HSA: buffer_store_dword v{{[0-9]+}}, s[0:[[HI]]], 0
5350
5451 define void @simple(i32 addrspace(1)* %out) {
5552 entry:
0 ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
11 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
44
55 ; FIXME: align on alloca seems to be ignored for private_segment_alignment
66
22 ; CHECK-LABEL: {{^}}test_debug_value:
33 ; CHECK: s_load_dwordx2 s[4:5]
44 ; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR4_SGPR5
5 ; CHECK: flat_store_dword
5 ; CHECK: buffer_store_dword
66 ; CHECK: s_endpgm
77 define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 {
88 entry:
None ; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCN %s
11
22 ; Check that when mubuf addr64 instruction is handled in moveToVALU
33 ; from the pointer, dead register writes are not emitted.
0 ; RUN: llc -march=amdgcn -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -asm-verbose -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s
22
33 declare i32 @llvm.SI.tid() nounwind readnone
44
128128
129129 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}}
130130 ; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}}
131 ; GCN-NOHSA: buffer_store_dword [[VVAL]]
132 ; HSA: flat_store_dword [[VVAL]]
131 ; GCN: buffer_store_dword [[VVAL]]
133132
134133 ; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
135134 ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
155154 ; HSA: enable_sgpr_grid_workgroup_count_z = 0
156155 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3
157156 ; GCN-HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7
158 ; GCN-NOHSA: buffer_store_dword [[VVAL]]
159 ; HSA: flat_store_dword [[VVAL]]
157 ; GCN: buffer_store_dword [[VVAL]]
160158
161159 ; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
162160 ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
191189
192190 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}}
193191 ; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}}
194 ; GCN-NOHSA: buffer_store_dword [[VVAL]]
195 ; HSA: flat_store_dword [[VVAL]]
192 ; GCN: buffer_store_dword [[VVAL]]
196193
197194 ; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
198195 ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
213210
214211 ; FUNC-LABEL: {{^}}tidig_x:
215212 ; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
216 ; GCN-NOHSA: buffer_store_dword v0
217 ; HSA: flat_store_dword v0
213 ; GCN: buffer_store_dword v0
218214 define void @tidig_x(i32 addrspace(1)* %out) {
219215 entry:
220216 %0 = call i32 @llvm.r600.read.tidig.x() #0
229225 ; FUNC-LABEL: {{^}}tidig_y:
230226
231227 ; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1
232 ; GCN-NOHSA: buffer_store_dword v1
233 ; HSA: flat_store_dword v1
228 ; GCN: buffer_store_dword v1
234229 define void @tidig_y(i32 addrspace(1)* %out) {
235230 entry:
236231 %0 = call i32 @llvm.r600.read.tidig.y() #0
244239
245240 ; FUNC-LABEL: {{^}}tidig_z:
246241 ; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2
247 ; GCN-NOHSA: buffer_store_dword v2
248 ; HSA: flat_store_dword v2
242 ; GCN: buffer_store_dword v2
249243 define void @tidig_z(i32 addrspace(1)* %out) {
250244 entry:
251245 %0 = call i32 @llvm.r600.read.tidig.z() #0