llvm.org GIT mirror llvm / 7710002
Merging r277504:
------------------------------------------------------------------------
r277504 | nha | 2016-08-02 12:31:14 -0700 (Tue, 02 Aug 2016) | 20 lines

AMDGPU: Stay in WQM for non-intrinsic stores

Summary:
Two types of stores are possible in pixel shaders: stores to memory that are
explicitly requested at the API level, and stores that are an implementation
detail of register spilling or lowering of arrays.

For the first kind of store, we must ensure that helper pixels have no effect
and hence WQM must be disabled. The second kind of store must always be
executed, because the written value may be loaded again in a way that is
relevant for helper pixels as well -- and there are no externally visible
effects anyway.

This is a candidate for the 3.9 release branch.

Reviewers: arsenm, tstellarAMD, mareko

Subscribers: arsenm, kzhuravl, llvm-commits

Differential Revision: https://reviews.llvm.org/D22675
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_39@277620 91177308-0d34-0410-b5e6-96231b3b80d8

Hans Wennborg 4 years ago
8 changed file(s) with 107 addition(s) and 54 deletion(s). Raw diff Collapse all Expand all
4040 WQM = 1 << 22,
4141 VGPRSpill = 1 << 23,
4242 VOPAsmPrefer32Bit = 1 << 24,
43 Gather4 = 1 << 25
43 Gather4 = 1 << 25,
44 DisableWQM = 1 << 26
4445 };
4546 }
4647
4040 field bits<1> DS = 0;
4141 field bits<1> MIMG = 0;
4242 field bits<1> FLAT = 0;
43
44 // Whether WQM _must_ be enabled for this instruction.
4345 field bits<1> WQM = 0;
4446 field bits<1> VGPRSpill = 0;
4547
4850 field bits<1> VOPAsmPrefer32Bit = 0;
4951
5052 field bits<1> Gather4 = 0;
53
54 // Whether WQM _must_ be disabled for this instruction.
55 field bits<1> DisableWQM = 0;
5156
5257 // These need to be kept in sync with the enum in SIInstrFlags.
5358 let TSFlags{0} = VM_CNT;
8085 let TSFlags{23} = VGPRSpill;
8186 let TSFlags{24} = VOPAsmPrefer32Bit;
8287 let TSFlags{25} = Gather4;
88 let TSFlags{26} = DisableWQM;
8389
8490 let SchedRW = [Write32Bit];
8591
339339 return get(Opcode).TSFlags & SIInstrFlags::WQM;
340340 }
341341
342 static bool isDisableWQM(const MachineInstr &MI) {
343 return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM;
344 }
345
346 bool isDisableWQM(uint16_t Opcode) const {
347 return get(Opcode).TSFlags & SIInstrFlags::DisableWQM;
348 }
349
342350 static bool isVGPRSpill(const MachineInstr &MI) {
343351 return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill;
344352 }
29482948 def "" : MUBUF_Pseudo ,
29492949 MUBUFAddr64Table <0>;
29502950
2951 let DisableWQM = 1 in {
2952 def "_exact" : MUBUF_Pseudo ;
2953 }
2954
29512955 let addr64 = 0, isCodeGenOnly = 0 in {
29522956 def _si : MUBUF_Real_si ;
29532957 }
30183022 multiclass MUBUF_Atomic
30193023 ValueType vt, SDPatternOperator atomic> {
30203024
3021 let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in {
3025 let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1,
3026 DisableWQM = 1 in {
30223027
30233028 // No return variants
30243029 let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in {
34223427 let mayStore = 1;
34233428 let hasSideEffects = 1;
34243429 let hasPostISelHook = 0;
3430 let DisableWQM = 1;
34253431 }
34263432
34273433 multiclass MIMG_Store_Addr_Helper op, string asm,
34533459 let mayStore = 1;
34543460 let hasSideEffects = 1;
34553461 let hasPostISelHook = 0;
3462 let DisableWQM = 1;
34563463 let Constraints = "$vdst = $vdata";
34573464 let AsmMatchConverter = "cvtMIMGAtomic";
34583465 }
21992199 (name vt:$vdata, v4i32:$rsrc, 0,
22002200 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
22012201 imm:$glc, imm:$slc),
2202 (!cast(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset),
2202 (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
22032203 (as_i1imm $glc), (as_i1imm $slc), 0)
22042204 >;
22052205
22072207 (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
22082208 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
22092209 imm:$glc, imm:$slc),
2210 (!cast(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset,
2210 (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
22112211 (as_i16imm $offset), (as_i1imm $glc),
22122212 (as_i1imm $slc), 0)
22132213 >;
22162216 (name vt:$vdata, v4i32:$rsrc, 0,
22172217 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
22182218 imm:$glc, imm:$slc),
2219 (!cast(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset,
2219 (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
22202220 (as_i16imm $offset), (as_i1imm $glc),
22212221 (as_i1imm $slc), 0)
22222222 >;
22252225 (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
22262226 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
22272227 imm:$glc, imm:$slc),
2228 (!cast(opcode # _BOTHEN)
2228 (!cast(opcode # _BOTHEN_exact)
22292229 $vdata,
22302230 (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
22312231 $rsrc, $soffset, (as_i16imm $offset),
184184
185185 if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
186186 Flags = StateWQM;
187 } else if (MI.mayStore() && TII->usesVM_CNT(MI)) {
187 } else if (TII->isDisableWQM(MI)) {
188188 Flags = StateExact;
189189 } else {
190190 // Handle export instructions with the exec mask valid flag set
236236 InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
237237 BlockInfo &BI = Blocks[MBB];
238238
239 // Control flow-type instructions that are followed by WQM computations
240 // must themselves be in WQM.
241 if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) {
239 // Control flow-type instructions and stores to temporary memory that are
240 // followed by WQM computations must themselves be in WQM.
241 if ((II.OutNeeds & StateWQM) && !II.Needs &&
242 (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
242243 Instructions[&MI].Needs = StateWQM;
243244 II.Needs = StateWQM;
244245 }
347347 ; CHECK: image_sample_c
348348
349349 ; CHECK: v_cmp_neq_f32_e32 vcc, 0,
350 ; CHECK: s_and_b64 exec, exec,
351350 ; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
352351 ; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
353352 ; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
384383
385384 declare void @llvm.AMDGPU.kill(float) #0
386385 declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
386 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind
387387
388388 attributes #0 = { nounwind }
389389 attributes #1 = { nounwind readnone }
4040 ;CHECK: store
4141 ;CHECK-NOT: exec
4242 ;CHECK: .size test3
43 define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
43 define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) {
4444 main_body:
4545 %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
4646 %tex.1 = bitcast <4 x float> %tex to <4 x i32>
4747 %tex.2 = extractelement <4 x i32> %tex.1, i32 0
48 %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2
49 %wr = extractelement <4 x float> %tex, i32 1
50 store float %wr, float addrspace(1)* %gep
48
49 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0)
50
5151 ret <4 x float> %tex
5252 }
5353
6565 define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
6666 main_body:
6767 %c.1 = mul i32 %c, %d
68 %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
69 store float %data, float addrspace(1)* %gep
68
69 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)
70
7071 %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
7172 ret <4 x float> %tex
7273 }
8889 ;CHECK: s_mov_b64 exec, [[SAVED]]
8990 ;CHECK: %IF
9091 ;CHECK: image_sample
91 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
92 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
9293 main_body:
9394 %cmp = icmp eq i32 %z, 0
9495 br i1 %cmp, label %IF, label %ELSE
99100 br label %END
100101
101102 ELSE:
102 %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
103 store float %data, float addrspace(1)* %gep
103 call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
104104 br label %END
105105
106106 END:
128128 ;CHECK: s_or_b64 exec, exec,
129129 ;CHECK: v_mov_b32_e32 v0
130130 ;CHECK: ; return
131 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
131 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
132132 main_body:
133133 %cmp = icmp eq i32 %z, 0
134134 br i1 %cmp, label %ELSE, label %IF
139139 br label %END
140140
141141 ELSE:
142 %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
143 store float %data, float addrspace(1)* %gep
142 call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
144143 br label %END
145144
146145 END:
162161 ;CHECK: store
163162 ;CHECK: s_wqm_b64 exec, exec
164163 ;CHECK: v_cmp
165 define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
164 define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
166165 main_body:
167166 %idx.1 = extractelement <3 x i32> %idx, i32 0
168 %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
169167 %data.1 = extractelement <2 x float> %data, i32 0
170 store float %data.1, float addrspace(1)* %gep.1
168 call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
171169
172170 ; The load that determines the branch (and should therefore be WQM) is
173171 ; surrounded by stores that require disabled WQM.
174172 %idx.2 = extractelement <3 x i32> %idx, i32 1
175 %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
176 %z = load float, float addrspace(1)* %gep.2
173 %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)
177174
178175 %idx.3 = extractelement <3 x i32> %idx, i32 2
179 %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
180176 %data.3 = extractelement <2 x float> %data, i32 1
181 store float %data.3, float addrspace(1)* %gep.3
177 call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)
182178
183179 %cc = fcmp ogt float %z, 0.0
184180 br i1 %cc, label %IF, label %ELSE
209205 ;CHECK: load
210206 ;CHECK: store
211207 ;CHECK: v_cmp
212 define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
208 define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
213209 main_body:
214210 %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
215211 %tex.1 = extractelement <4 x float> %tex, i32 0
216212
217213 %idx.1 = extractelement <3 x i32> %idx, i32 0
218 %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
219214 %data.1 = extractelement <2 x float> %data, i32 0
220 store float %data.1, float addrspace(1)* %gep.1
215 call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
221216
222217 %idx.2 = extractelement <3 x i32> %idx, i32 1
223 %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
224 %z = load float, float addrspace(1)* %gep.2
218 %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)
225219
226220 %idx.3 = extractelement <3 x i32> %idx, i32 2
227 %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
228221 %data.3 = extractelement <2 x float> %data, i32 1
229 store float %data.3, float addrspace(1)* %gep.3
222 call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)
230223
231224 %cc = fcmp ogt float %z, 0.0
232225 br i1 %cc, label %IF, label %ELSE
257250 ;CHECK: s_mov_b64 exec, [[SAVE]]
258251 ;CHECK: %END
259252 ;CHECK: image_sample
260 define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) {
253 define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) {
261254 main_body:
262255 %cond = icmp eq i32 %y, 0
263256 br i1 %cond, label %IF, label %END
264257
265258 IF:
266 %data = load float, float addrspace(1)* %ptr
267 %gep = getelementptr float, float addrspace(1)* %ptr, i32 1
268 store float %data, float addrspace(1)* %gep
259 %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
260 call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)
269261 br label %END
270262
271263 END:
281273 ;CHECK-NEXT: s_wqm_b64 exec, exec
282274 ;CHECK: image_sample
283275 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
284 ;SI: buffer_store_dword
285 ;VI: flat_store_dword
276 ;CHECK: buffer_store_dword
286277 ;CHECK: s_wqm_b64 exec, exec
287278 ;CHECK: v_cmpx_
288279 ;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
289 ;SI: buffer_store_dword
290 ;VI: flat_store_dword
280 ;CHECK: buffer_store_dword
291281 ;CHECK: s_mov_b64 exec, [[SAVE]]
292282 ;CHECK: image_sample
293283 define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
295285 %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
296286
297287 %idx.0 = extractelement <2 x i32> %idx, i32 0
298 %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0
299288 %data.0 = extractelement <2 x float> %data, i32 0
300 store float %data.0, float addrspace(1)* %gep.0
289 call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0)
301290
302291 call void @llvm.AMDGPU.kill(float %z)
303292
304293 %idx.1 = extractelement <2 x i32> %idx, i32 1
305 %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
306294 %data.1 = extractelement <2 x float> %data, i32 1
307 store float %data.1, float addrspace(1)* %gep.1
295 call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
308296
309297 %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
310298 %out = fadd <4 x float> %tex, %tex2
320308 ; CHECK: s_wqm_b64 exec, exec
321309 ; CHECK: image_sample
322310 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
323 ; SI: buffer_store_dword
324 ; VI: flat_store_dword
311 ; CHECK: buffer_store_dword
325312 ; CHECK-NOT: wqm
326313 ; CHECK: v_cmpx_
327 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
314 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
328315 main_body:
329316 %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
330317
331 %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
332 store float %data, float addrspace(1)* %gep
318 call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
333319
334320 call void @llvm.AMDGPU.kill(float %z)
335321
387373 ret <4 x float> %c.iv
388374 }
389375
376 ; Only intrinsic stores need exact execution -- other stores do not have
377 ; externally visible effects and may require WQM for correctness.
378 ;
379 ; CHECK-LABEL: {{^}}test_alloca:
380 ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
381 ; CHECK: s_wqm_b64 exec, exec
382
383 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
384 ; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
385 ; CHECK: s_wqm_b64 exec, exec
386 ; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
387 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
388 ; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
389 ; CHECK: s_wqm_b64 exec, exec
390 ; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
391
392 ; CHECK: image_sample
393 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
394 ; CHECK: buffer_store_dwordx4
395 define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
396 entry:
397 %array = alloca [32 x i32], align 4
398
399 call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
400
401 %s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0
402 store volatile i32 %a, i32* %s.gep, align 4
403
404 call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)
405
406 %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx
407 %c = load i32, i32* %c.gep, align 4
408
409 %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
410
411 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
412
413 ret void
414 }
415
416
390417 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
418 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
419 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
391420
392421 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
422 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2
393423
394424 declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
395425 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3