llvm.org GIT mirror llvm / 756309c
AMDGPU: add llvm.amdgcn.buffer.load/store intrinsics Summary: They correspond to BUFFER_LOAD/STORE_DWORD[_X2,X3,X4] and mostly behave like llvm.amdgcn.buffer.load/store.format. They will be used by Mesa for SSBO and atomic counters at least when robust buffer access behavior is desired. (These instructions perform no format conversion and do buffer range checking per component.) As a side effect of sharing patterns with llvm.amdgcn.buffer.store.format, it has become trivial to add support for the f32 and v2f32 variants of that intrinsic, so the patch does so. Also DAG-ify (and fix) some tests that I noticed intermittent failures in while developing this patch. Some tests were (temporarily) adjusted for the required mayLoad/hasSideEffects changes to the BUFFER_STORE_DWORD* instructions. See also http://reviews.llvm.org/D18291. Reviewers: arsenm, tstellarAMD, mareko Subscribers: arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D18292 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@266126 91177308-0d34-0410-b5e6-96231b3b80d8 Nicolai Haehnle 3 years ago
10 changed file(s) with 323 addition(s) and 89 deletion(s). Raw diff Collapse all Expand all
223223 llvm_i1_ty], // slc(imm)
224224 []>;
225225
226 def int_amdgcn_buffer_load_format : Intrinsic <
226 class AMDGPUBufferLoad : Intrinsic <
227227 [llvm_anyfloat_ty],
228228 [llvm_v4i32_ty, // rsrc(SGPR)
229229 llvm_i32_ty, // vindex(VGPR)
231231 llvm_i1_ty, // glc(imm)
232232 llvm_i1_ty], // slc(imm)
233233 [IntrReadMem]>;
234
235 def int_amdgcn_buffer_store_format : Intrinsic <
234 def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
235 def int_amdgcn_buffer_load : AMDGPUBufferLoad;
236
237 class AMDGPUBufferStore : Intrinsic <
236238 [],
237 [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select v4f32
239 [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
238240 llvm_v4i32_ty, // rsrc(SGPR)
239241 llvm_i32_ty, // vindex(VGPR)
240242 llvm_i32_ty, // offset(SGPR/VGPR/imm)
241243 llvm_i1_ty, // glc(imm)
242244 llvm_i1_ty], // slc(imm)
243245 []>;
246 def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
247 def int_amdgcn_buffer_store : AMDGPUBufferStore;
244248
245249 class AMDGPUBufferAtomic : Intrinsic <
246250 [llvm_i32_ty],
995995 mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global
996996 >;
997997
998 // Without mayLoad and hasSideEffects, TableGen complains about the pattern
999 // matching llvm.amdgcn.buffer.store. Eventually, we'll want a WriteOnly
1000 // property to express the effects of this intrinsic more precisely, see
1001 // http://reviews.llvm.org/D18291
1002 let mayLoad = 1, hasSideEffects = 1 in {
9981003 defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
9991004 mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store
10001005 >;
10061011 defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
10071012 mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store
10081013 >;
1014 }
10091015
10101016 defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
10111017 mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
21392145 // buffer_load/store_format patterns
21402146 //===----------------------------------------------------------------------===//
21412147
2142 multiclass MUBUF_LoadIntrinsicPat<ValueType vt, string opcode> {
2148 multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
2149 string opcode> {
21432150 def : Pat<
2144 (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, 0,
2145 (MUBUFIntrinsicOffset i32:$soffset,
2146 i16:$offset),
2147 imm:$glc, imm:$slc)),
2151 (vt (name v4i32:$rsrc, 0,
2152 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
2153 imm:$glc, imm:$slc)),
21482154 (!cast<MUBUF>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
21492155 (as_i1imm $glc), (as_i1imm $slc), 0)
21502156 >;
21512157
21522158 def : Pat<
2153 (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex,
2154 (MUBUFIntrinsicOffset i32:$soffset,
2155 i16:$offset),
2156 imm:$glc, imm:$slc)),
2159 (vt (name v4i32:$rsrc, i32:$vindex,
2160 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
2161 imm:$glc, imm:$slc)),
21572162 (!cast<MUBUF>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
21582163 (as_i1imm $glc), (as_i1imm $slc), 0)
21592164 >;
21602165
21612166 def : Pat<
2162 (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, 0,
2163 (MUBUFIntrinsicVOffset i32:$soffset,
2164 i16:$offset,
2165 i32:$voffset),
2166 imm:$glc, imm:$slc)),
2167 (vt (name v4i32:$rsrc, 0,
2168 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
2169 imm:$glc, imm:$slc)),
21672170 (!cast<MUBUF>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
21682171 (as_i1imm $glc), (as_i1imm $slc), 0)
21692172 >;
21702173
21712174 def : Pat<
2172 (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex,
2173 (MUBUFIntrinsicVOffset i32:$soffset,
2174 i16:$offset,
2175 i32:$voffset),
2176 imm:$glc, imm:$slc)),
2175 (vt (name v4i32:$rsrc, i32:$vindex,
2176 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
2177 imm:$glc, imm:$slc)),
21772178 (!cast<MUBUF>(opcode # _BOTHEN)
21782179 (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
21792180 $rsrc, $soffset, (as_i16imm $offset),
21812182 >;
21822183 }
21832184
2184 defm : MUBUF_LoadIntrinsicPat<f32, "BUFFER_LOAD_FORMAT_X">;
2185 defm : MUBUF_LoadIntrinsicPat<v2f32, "BUFFER_LOAD_FORMAT_XY">;
2186 defm : MUBUF_LoadIntrinsicPat<v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
2187
2188 def : Pat<
2189 (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, 0,
2190 (MUBUFIntrinsicOffset i32:$soffset,
2191 i16:$offset),
2192 imm:$glc, imm:$slc),
2193 (BUFFER_STORE_FORMAT_XYZW_OFFSET $vdata, $rsrc, $soffset, (as_i16imm $offset),
2194 (as_i1imm $glc), (as_i1imm $slc), 0)
2195 >;
2196
2197 def : Pat<
2198 (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, i32:$vindex,
2199 (MUBUFIntrinsicOffset i32:$soffset,
2200 i16:$offset),
2201 imm:$glc, imm:$slc),
2202 (BUFFER_STORE_FORMAT_XYZW_IDXEN $vdata, $vindex, $rsrc, $soffset,
2203 (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0)
2204 >;
2205
2206 def : Pat<
2207 (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, 0,
2208 (MUBUFIntrinsicVOffset i32:$soffset,
2209 i16:$offset,
2210 i32:$voffset),
2211 imm:$glc, imm:$slc),
2212 (BUFFER_STORE_FORMAT_XYZW_OFFEN $vdata, $voffset, $rsrc, $soffset,
2213 (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0)
2214 >;
2215
2216 def : Pat<
2217 (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, i32:$vindex,
2218 (MUBUFIntrinsicVOffset i32:$soffset,
2219 i16:$offset,
2220 i32:$voffset),
2221 imm:$glc, imm:$slc),
2222 (BUFFER_STORE_FORMAT_XYZW_BOTHEN
2223 $vdata,
2224 (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
2225 $rsrc, $soffset, (as_i16imm $offset),
2226 (as_i1imm $glc), (as_i1imm $slc), 0)
2227 >;
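2185 defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
2186 defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
2187 defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
2188 defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, f32, "BUFFER_LOAD_DWORD">;
2189 defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
2190 defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;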
2191
2192 multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
2193 string opcode> {
2194 def : Pat<
2195 (name vt:$vdata, v4i32:$rsrc, 0,
2196 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
2197 imm:$glc, imm:$slc),
2198 (!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset),
2199 (as_i1imm $glc), (as_i1imm $slc), 0)
2200 >;
2201
2202 def : Pat<
2203 (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
2204 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
2205 imm:$glc, imm:$slc),
2206 (!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset,
2207 (as_i16imm $offset), (as_i1imm $glc),
2208 (as_i1imm $slc), 0)
2209 >;
2210
2211 def : Pat<
2212 (name vt:$vdata, v4i32:$rsrc, 0,
2213 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
2214 imm:$glc, imm:$slc),
2215 (!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset,
2216 (as_i16imm $offset), (as_i1imm $glc),
2217 (as_i1imm $slc), 0)
2218 >;
2219
2220 def : Pat<
2221 (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
2222 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
2223 imm:$glc, imm:$slc),
2224 (!cast<MUBUF>(opcode # _BOTHEN)
2225 $vdata,
2226 (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
2227 $rsrc, $soffset, (as_i16imm $offset),
2228 (as_i1imm $glc), (as_i1imm $slc), 0)
2229 >;
2230 }
2231
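2232 defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
2233 defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
2234 defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
2235 defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">;
2236 defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
2237 defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">;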
22282238
22292239 //===----------------------------------------------------------------------===//
22302240 // buffer_atomic patterns
0 ; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11
22 ; GCN-LABEL: {{^}}stored_fi_to_lds:
3 ; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
4 ; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
5 ; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
3 ; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]
4 ; GCN-DAG: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
5 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
66 ; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
77 ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
88 ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
3232 ; SI-NOT: bfe
3333 ; SI-NOT: v_cvt_f32_ubyte3_e32
3434 ; SI-DAG: v_cvt_f32_ubyte2_e32
35 ; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
36 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
35 ; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]],
36 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
3737 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
3838 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
3939 %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
3131 ; GCN-LABEL: {{^}}bitcast_int_to_fpvector_extract_0:
3232 ; GCN: buffer_load_dwordx2
3333 ; GCN: v_add_i32
34 ; GCN: v_addc_u32
3534 ; GCN: buffer_store_dword
3635 define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
3736 %a = load i64, i64 addrspace(1)* %in
0 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
1 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
2
3 ;CHECK-LABEL: {{^}}buffer_load:
4 ;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], 0
5 ;CHECK: buffer_load_dwordx4 v[4:7], s[0:3], 0 glc
6 ;CHECK: buffer_load_dwordx4 v[8:11], s[0:3], 0 slc
7 ;CHECK: s_waitcnt
8 define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
9 main_body:
10 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
11 %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
12 %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
13 %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
14 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
15 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
16 ret {<4 x float>, <4 x float>, <4 x float>} %r2
17 }
18
19 ;CHECK-LABEL: {{^}}buffer_load_immoffs:
20 ;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], 0 offset:42
21 ;CHECK: s_waitcnt
22 define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
23 main_body:
24 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
25 ret <4 x float> %data
26 }
27
28 ;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
29 ;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff
30 ;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], [[OFFSET]] offset:1
31 ;CHECK: s_waitcnt
32 define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
33 main_body:
34 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0)
35 ret <4 x float> %data
36 }
37
38 ;CHECK-LABEL: {{^}}buffer_load_idx:
39 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
40 ;CHECK: s_waitcnt
41 define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
42 main_body:
43 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
44 ret <4 x float> %data
45 }
46
47 ;CHECK-LABEL: {{^}}buffer_load_ofs:
48 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
49 ;CHECK: s_waitcnt
50 define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
51 main_body:
52 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
53 ret <4 x float> %data
54 }
55
56 ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
57 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58
58 ;CHECK: s_waitcnt
59 define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
60 main_body:
61 %ofs = add i32 %1, 58
62 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
63 ret <4 x float> %data
64 }
65
66 ;CHECK-LABEL: {{^}}buffer_load_both:
67 ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
68 ;CHECK: s_waitcnt
69 define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
70 main_body:
71 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
72 ret <4 x float> %data
73 }
74
75 ;CHECK-LABEL: {{^}}buffer_load_both_reversed:
76 ;CHECK: v_mov_b32_e32 v2, v0
77 ;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
78 ;CHECK: s_waitcnt
79 define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
80 main_body:
81 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
82 ret <4 x float> %data
83 }
84
85 ;CHECK-LABEL: {{^}}buffer_load_x1:
86 ;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
87 ;CHECK: s_waitcnt
88 define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
89 main_body:
90 %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
91 ret float %data
92 }
93
94 ;CHECK-LABEL: {{^}}buffer_load_x2:
95 ;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
96 ;CHECK: s_waitcnt
97 define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
98 main_body:
99 %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
100 ret <2 x float> %data
101 }
102
103 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
104 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
105 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
106
107 attributes #0 = { nounwind readonly }
6969 ret void
7070 }
7171
72 ;CHECK-LABEL: {{^}}buffer_store_x1:
73 ;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen
74 define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
75 main_body:
76 call void @llvm.amdgcn.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
77 ret void
78 }
79
80 ;CHECK-LABEL: {{^}}buffer_store_x2:
81 ;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen
82 define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) {
83 main_body:
84 call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
85 ret void
86 }
87
88 declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) #0
89 declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
7290 declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
7391 declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
7492
0 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
1 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
2
3 ;CHECK-LABEL: {{^}}buffer_store:
4 ;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0
5 ;CHECK: buffer_store_dwordx4 v[4:7], s[0:3], 0 glc
6 ;CHECK: buffer_store_dwordx4 v[8:11], s[0:3], 0 slc
7 define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
8 main_body:
9 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
10 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
11 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
12 ret void
13 }
14
15 ;CHECK-LABEL: {{^}}buffer_store_immoffs:
16 ;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0 offset:42
17 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
18 main_body:
19 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
20 ret void
21 }
22
23 ;CHECK-LABEL: {{^}}buffer_store_idx:
24 ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
25 define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
26 main_body:
27 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
28 ret void
29 }
30
31 ;CHECK-LABEL: {{^}}buffer_store_ofs:
32 ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
33 define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
34 main_body:
35 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
36 ret void
37 }
38
39 ;CHECK-LABEL: {{^}}buffer_store_both:
40 ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
41 define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
42 main_body:
43 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
44 ret void
45 }
46
47 ;CHECK-LABEL: {{^}}buffer_store_both_reversed:
48 ;CHECK: v_mov_b32_e32 v6, v4
49 ;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
50 define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
51 main_body:
52 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
53 ret void
54 }
55
56 ; Ideally, the register allocator would avoid the wait here
57 ;
58 ;CHECK-LABEL: {{^}}buffer_store_wait:
59 ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
60 ;CHECK: s_waitcnt vmcnt(0) expcnt(0)
61 ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
62 ;CHECK: s_waitcnt vmcnt(0)
63 ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
64 define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
65 main_body:
66 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
67 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
68 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
69 ret void
70 }
71
72 ;CHECK-LABEL: {{^}}buffer_store_x1:
73 ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
74 define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
75 main_body:
76 call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
77 ret void
78 }
79
80 ;CHECK-LABEL: {{^}}buffer_store_x2:
81 ;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
82 define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {
83 main_body:
84 call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
85 ret void
86 }
87
88 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
89 declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
90 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
91 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
92
93 attributes #0 = { nounwind }
94 attributes #1 = { nounwind readonly }
1010
1111 ; FUNC-LABEL: @reorder_local_load_global_store_local_load
1212 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
13 ; CI-NEXT: buffer_store_dword
1314 ; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
14 ; CI: buffer_store_dword
1515 define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
1616 %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
1717
7070 }
7171
7272 ; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
73 ; CI: buffer_store_dword
7473 ; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
7574 ; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
75 ; CI-DAG: buffer_store_dword
7676 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
7777 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
7878 ; CI: buffer_store_dword
183183 }
184184
185185 ; FUNC-LABEL: @reorder_global_offsets
186 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
186187 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
187188 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
188 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
189 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
189190 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
190 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
191191 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
192192 ; CI: buffer_store_dword
193193 ; CI: s_endpgm
4545 }
4646
4747 ; FUNC-LABEL: {{^}}v_abs_v2i32:
48 ; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
49 ; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
48 ; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
49 ; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
5050
51 ; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
52 ; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
51 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
52 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
5353
5454 ; GCN: v_add_i32
5555 ; GCN: v_add_i32
9696 }
9797
9898 ; FUNC-LABEL: {{^}}v_abs_v4i32:
99 ; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
100 ; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
101 ; GCN: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
102 ; GCN: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
99 ; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
100 ; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
101 ; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
102 ; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
103103
104 ; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
105 ; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
106 ; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
107 ; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
104 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
105 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
106 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
107 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
108108
109109 ; GCN: v_add_i32
110110 ; GCN: v_add_i32