llvm.org GIT mirror: llvm / 3e3d015
AMDGPU: Fix multi-use shl/add combine

This was using a custom function that didn't handle the addressing modes properly for private. Use isLegalAddressingMode to avoid duplicating this.

Additionally, skip the combine if there is only one use, since the standard combine will handle it.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@318013 91177308-0d34-0410-b5e6-96231b3b80d8
Matt Arsenault, 2 years ago
3 changed files with 188 additions and 100 deletions.
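For context, the pattern the combine targets looks like the following IR (a minimal sketch, adapted from the shl_add_ptr_combine_2use_lds test added below). The add has two shift users, so folding the scaled constant into each DS instruction's immediate offset lets both pointers share a single shifted base register:

; (shl (add x, c1), c2) where the add has multiple uses:
define void @shl_add_ptr_combine_2use_lds(i32 %idx) {
  %idx.add = add nuw i32 %idx, 4
  %shl0 = shl i32 %idx.add, 3                      ; (idx + 4) << 3
  %shl1 = shl i32 %idx.add, 4                      ; (idx + 4) << 4
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
  store volatile i32 9, i32 addrspace(3)* %ptr0    ; selects to ds_write_b32 ... offset:32
  store volatile i32 10, i32 addrspace(3)* %ptr1   ; selects to ds_write_b32 ... offset:64
  ret void
}

The test below checks exactly this: each store selects to a ds_write_b32 whose immediate offset carries the scaled constant (4 << 3 = 32 and 4 << 4 = 64), leaving only the two shifts of %idx.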
51755175 return SDValue();
51765176 }
51775177
5178 /// \brief Return true if the given offset Size in bytes can be folded into
5179 /// the immediate offsets of a memory instruction for the given address space.
5180 static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
5181 const SISubtarget &STI) {
5182 auto AMDGPUASI = STI.getAMDGPUAS();
5183 if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
5184 // MUBUF instructions have a 12-bit offset in bytes.
5185 return isUInt<12>(OffsetSize);
5186 }
5187 if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
5188 // SMRD instructions have an 8-bit offset in dwords on SI and
5189 // a 20-bit offset in bytes on VI.
5190 if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
5191 return isUInt<20>(OffsetSize);
5192 else
5193 return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
5194 }
5195 if (AS == AMDGPUASI.LOCAL_ADDRESS ||
5196 AS == AMDGPUASI.REGION_ADDRESS) {
5197 // The single offset versions have a 16-bit offset in bytes.
5198 return isUInt<16>(OffsetSize);
5199 }
5200 // Indirect register addressing does not use any offsets.
5201 return false;
5202 }
5203
52045178 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
52055179
52065180 // This is a variant of
52175191 //
52185192 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
52195193 unsigned AddrSpace,
5194 EVT MemVT,
52205195 DAGCombinerInfo &DCI) const {
52215196 SDValue N0 = N->getOperand(0);
52225197 SDValue N1 = N->getOperand(1);
52235198
5224 if (N0.getOpcode() != ISD::ADD)
5199 // We only do this to handle cases where it's profitable when there are
5200 // multiple uses of the add, so defer to the standard combine.
5201 // TODO: Support or
5202 if (N0.getOpcode() != ISD::ADD || N0->hasOneUse())
52255203 return SDValue();
52265204
52275205 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
52355213 // If the resulting offset is too large, we can't fold it into the addressing
52365214 // mode offset.
52375215 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
5238 if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
5216 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
5217
5218 AddrMode AM;
5219 AM.HasBaseReg = true;
5220 AM.BaseOffs = Offset.getSExtValue();
5221 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
52395222 return SDValue();
52405223
52415224 SelectionDAG &DAG = DCI.DAG;
52555238 SDLoc SL(N);
52565239
52575240 // TODO: We could also do this for multiplies.
5258 unsigned AS = N->getAddressSpace();
5259 if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUASI.PRIVATE_ADDRESS) {
5260 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
5241 if (Ptr.getOpcode() == ISD::SHL) {
5242 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
5243 N->getMemoryVT(), DCI);
52615244 if (NewPtr) {
52625245 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
52635246
8787 DAGCombinerInfo &DCI) const;
8888 SDValue performSHLPtrCombine(SDNode *N,
8989 unsigned AS,
90 EVT MemVT,
9091 DAGCombinerInfo &DCI) const;
9192
9293 SDValue performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const;
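The max-offset LDS tests below probe the legality boundary the new isLegalAddressingMode check enforces. Per the removed canFoldOffset comment, the single-offset DS forms have a 16-bit offset in bytes, so with a shared (add x, 8191) only the smaller scale still folds:

; 8191 << 3 = 65528            ; fits the 16-bit DS offset field -> folded as offset:65528
; 8191 << 4 = 131056 (0x1fff0) ; too large -> kept as an explicit v_add_i32_e32

This is exactly what shl_add_ptr_combine_2use_max_lds_offset expects: offset:65528 on one ds_write_b32, and an add of 0x1fff0 feeding the other.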
0 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
22
33 ; Test that doing a shift of a pointer with a constant add will be
44 ; folded into the constant offset addressing mode even if the add has
1414
1515 ; Make sure the (add tid, 2) << 2 gets folded into the ds's offset as (tid << 2) + 8
1616
17 ; SI-LABEL: {{^}}load_shl_base_lds_0:
18 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
19 ; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
20 ; SI: s_endpgm
17 ; GCN-LABEL: {{^}}load_shl_base_lds_0:
18 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
19 ; GCN: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
20 ; GCN: s_endpgm
2121 define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
2222 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
2323 %idx.0 = add nsw i32 %tid.x, 2
3131 ; Make sure once the first use is folded into the addressing mode, the
3232 ; remaining add use goes through the normal shl + add constant fold.
3333
34 ; SI-LABEL: {{^}}load_shl_base_lds_1:
35 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
36 ; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
37 ; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
38 ; SI-DAG: buffer_store_dword [[RESULT]]
39 ; SI-DAG: buffer_store_dword [[ADDUSE]]
40 ; SI: s_endpgm
34 ; GCN-LABEL: {{^}}load_shl_base_lds_1:
35 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
36 ; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
37 ; GCN: v_add_i32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
38 ; GCN-DAG: buffer_store_dword [[RESULT]]
39 ; GCN-DAG: buffer_store_dword [[ADDUSE]]
40 ; GCN: s_endpgm
4141 define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
4242 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
4343 %idx.0 = add nsw i32 %tid.x, 2
5151
5252 @maxlds = addrspace(3) global [65536 x i8] undef, align 4
5353
54 ; SI-LABEL: {{^}}load_shl_base_lds_max_offset
55 ; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
56 ; SI: s_endpgm
54 ; GCN-LABEL: {{^}}load_shl_base_lds_max_offset
55 ; GCN: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
56 ; GCN: s_endpgm
5757 define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
5858 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
5959 %idx.0 = add nsw i32 %tid.x, 65535
6767 ; The two globals are placed adjacent in memory, so the same base
6868 ; pointer can be used with an offset into the second one.
6969
70 ; SI-LABEL: {{^}}load_shl_base_lds_2:
71 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
72 ; SI: s_mov_b32 m0, -1
73 ; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
74 ; SI: s_endpgm
70 ; GCN-LABEL: {{^}}load_shl_base_lds_2:
71 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
72 ; GCN: s_mov_b32 m0, -1
73 ; GCN-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
74 ; GCN: s_endpgm
7575 define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
7676 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
7777 %idx.0 = add nsw i32 %tid.x, 64
8484 ret void
8585 }
8686
87 ; SI-LABEL: {{^}}store_shl_base_lds_0:
88 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
89 ; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8
90 ; SI: s_endpgm
87 ; GCN-LABEL: {{^}}store_shl_base_lds_0:
88 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
89 ; GCN: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8
90 ; GCN: s_endpgm
9191 define amdgpu_kernel void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
9292 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
9393 %idx.0 = add nsw i32 %tid.x, 2
114114 ; }
115115
116116
117 ; SI-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0:
118 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
119 ; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
120 ; SI: s_endpgm
117 ; GCN-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0:
118 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
119 ; GCN: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
120 ; GCN: s_endpgm
121121 define amdgpu_kernel void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
122122 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
123123 %idx.0 = add nsw i32 %tid.x, 2
129129 ret void
130130 }
131131
132 ; SI-LABEL: {{^}}atomic_swap_shl_base_lds_0:
133 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
134 ; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
135 ; SI: s_endpgm
132 ; GCN-LABEL: {{^}}atomic_swap_shl_base_lds_0:
133 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
134 ; GCN: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
135 ; GCN: s_endpgm
136136 define amdgpu_kernel void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
137137 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
138138 %idx.0 = add nsw i32 %tid.x, 2
143143 ret void
144144 }
145145
146 ; SI-LABEL: {{^}}atomic_add_shl_base_lds_0:
147 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
148 ; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
149 ; SI: s_endpgm
146 ; GCN-LABEL: {{^}}atomic_add_shl_base_lds_0:
147 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
148 ; GCN: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
149 ; GCN: s_endpgm
150150 define amdgpu_kernel void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
151151 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
152152 %idx.0 = add nsw i32 %tid.x, 2
157157 ret void
158158 }
159159
160 ; SI-LABEL: {{^}}atomic_sub_shl_base_lds_0:
161 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
162 ; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
163 ; SI: s_endpgm
160 ; GCN-LABEL: {{^}}atomic_sub_shl_base_lds_0:
161 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
162 ; GCN: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
163 ; GCN: s_endpgm
164164 define amdgpu_kernel void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
165165 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
166166 %idx.0 = add nsw i32 %tid.x, 2
171171 ret void
172172 }
173173
174 ; SI-LABEL: {{^}}atomic_and_shl_base_lds_0:
175 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
176 ; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
177 ; SI: s_endpgm
174 ; GCN-LABEL: {{^}}atomic_and_shl_base_lds_0:
175 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
176 ; GCN: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
177 ; GCN: s_endpgm
178178 define amdgpu_kernel void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
179179 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
180180 %idx.0 = add nsw i32 %tid.x, 2
185185 ret void
186186 }
187187
188 ; SI-LABEL: {{^}}atomic_or_shl_base_lds_0:
189 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
190 ; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
191 ; SI: s_endpgm
188 ; GCN-LABEL: {{^}}atomic_or_shl_base_lds_0:
189 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
190 ; GCN: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
191 ; GCN: s_endpgm
192192 define amdgpu_kernel void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
193193 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
194194 %idx.0 = add nsw i32 %tid.x, 2
199199 ret void
200200 }
201201
202 ; SI-LABEL: {{^}}atomic_xor_shl_base_lds_0:
203 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
204 ; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
205 ; SI: s_endpgm
202 ; GCN-LABEL: {{^}}atomic_xor_shl_base_lds_0:
203 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
204 ; GCN: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
205 ; GCN: s_endpgm
206206 define amdgpu_kernel void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
207207 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
208208 %idx.0 = add nsw i32 %tid.x, 2
223223 ; ret void
224224 ; }
225225
226 ; SI-LABEL: {{^}}atomic_min_shl_base_lds_0:
227 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
228 ; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
229 ; SI: s_endpgm
226 ; GCN-LABEL: {{^}}atomic_min_shl_base_lds_0:
227 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
228 ; GCN: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
229 ; GCN: s_endpgm
230230 define amdgpu_kernel void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
231231 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
232232 %idx.0 = add nsw i32 %tid.x, 2
237237 ret void
238238 }
239239
240 ; SI-LABEL: {{^}}atomic_max_shl_base_lds_0:
241 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
242 ; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
243 ; SI: s_endpgm
240 ; GCN-LABEL: {{^}}atomic_max_shl_base_lds_0:
241 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
242 ; GCN: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
243 ; GCN: s_endpgm
244244 define amdgpu_kernel void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
245245 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
246246 %idx.0 = add nsw i32 %tid.x, 2
251251 ret void
252252 }
253253
254 ; SI-LABEL: {{^}}atomic_umin_shl_base_lds_0:
255 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
256 ; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
257 ; SI: s_endpgm
254 ; GCN-LABEL: {{^}}atomic_umin_shl_base_lds_0:
255 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
256 ; GCN: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
257 ; GCN: s_endpgm
258258 define amdgpu_kernel void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
259259 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
260260 %idx.0 = add nsw i32 %tid.x, 2
265265 ret void
266266 }
267267
268 ; SI-LABEL: {{^}}atomic_umax_shl_base_lds_0:
269 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
270 ; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
271 ; SI: s_endpgm
268 ; GCN-LABEL: {{^}}atomic_umax_shl_base_lds_0:
269 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
270 ; GCN: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
271 ; GCN: s_endpgm
272272 define amdgpu_kernel void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
273273 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
274274 %idx.0 = add nsw i32 %tid.x, 2
276276 %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
277277 store i32 %val, i32 addrspace(1)* %out, align 4
278278 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
279 ret void
280 }
281
282 ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_lds:
283 ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
284 ; GCN: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:32
285
286 ; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
287 ; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64
288 define void @shl_add_ptr_combine_2use_lds(i32 %idx) #0 {
289 %idx.add = add nuw i32 %idx, 4
290 %shl0 = shl i32 %idx.add, 3
291 %shl1 = shl i32 %idx.add, 4
292 %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
293 %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
294 store volatile i32 9, i32 addrspace(3)* %ptr0
295 store volatile i32 10, i32 addrspace(3)* %ptr1
296 ret void
297 }
298
299 ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_max_lds_offset:
300 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
301 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
302 ; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:65528
303 ; GCN-DAG: v_add_i32_e32 [[ADD1:v[0-9]+]], vcc, 0x1fff0, [[SCALE1]]
304 ; GCN: ds_write_b32 [[ADD1]], v{{[0-9]+$}}
305 define void @shl_add_ptr_combine_2use_max_lds_offset(i32 %idx) #0 {
306 %idx.add = add nuw i32 %idx, 8191
307 %shl0 = shl i32 %idx.add, 3
308 %shl1 = shl i32 %idx.add, 4
309 %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
310 %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
311 store volatile i32 9, i32 addrspace(3)* %ptr0
312 store volatile i32 10, i32 addrspace(3)* %ptr1
313 ret void
314 }
315
316 ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_both_max_lds_offset:
317 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 0x1000, v0
318 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 4, [[ADD]]
319 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 5, [[ADD]]
320 ; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+$}}
321 ; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+$}}
322 define void @shl_add_ptr_combine_2use_both_max_lds_offset(i32 %idx) #0 {
323 %idx.add = add nuw i32 %idx, 4096
324 %shl0 = shl i32 %idx.add, 4
325 %shl1 = shl i32 %idx.add, 5
326 %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
327 %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
328 store volatile i32 9, i32 addrspace(3)* %ptr0
329 store volatile i32 10, i32 addrspace(3)* %ptr1
330 ret void
331 }
332
333 ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_private:
334 ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 2, v0
335 ; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s4 offen offset:16
336
337 ; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 3, v0
338 ; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s4 offen offset:32
339 define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 {
340 %idx = zext i16 %idx.arg to i32
341 %idx.add = add nuw i32 %idx, 4
342 %shl0 = shl i32 %idx.add, 2
343 %shl1 = shl i32 %idx.add, 3
344 %ptr0 = inttoptr i32 %shl0 to i32*
345 %ptr1 = inttoptr i32 %shl1 to i32*
346 store volatile i32 9, i32* %ptr0
347 store volatile i32 10, i32* %ptr1
348 ret void
349 }
350
351 ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_max_private_offset:
352 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
353 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
354 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s4 offen offset:4088
355 ; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 0x1ff0, [[SCALE1]]
356 ; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[0:3], s4 offen{{$}}
357 define void @shl_add_ptr_combine_2use_max_private_offset(i16 zeroext %idx.arg) #0 {
358 %idx = zext i16 %idx.arg to i32
359 %idx.add = add nuw i32 %idx, 511
360 %shl0 = shl i32 %idx.add, 3
361 %shl1 = shl i32 %idx.add, 4
362 %ptr0 = inttoptr i32 %shl0 to i32*
363 %ptr1 = inttoptr i32 %shl1 to i32*
364 store volatile i32 9, i32* %ptr0
365 store volatile i32 10, i32* %ptr1
366 ret void
367 }
368 ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_both_max_private_offset:
369 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 0x100, v0
370 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 4, [[ADD]]
371 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 5, [[ADD]]
372 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s4 offen{{$}}
373 ; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s4 offen{{$}}
374 define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.arg) #0 {
375 %idx = zext i16 %idx.arg to i32
376 %idx.add = add nuw i32 %idx, 256
377 %shl0 = shl i32 %idx.add, 4
378 %shl1 = shl i32 %idx.add, 5
379 %ptr0 = inttoptr i32 %shl0 to i32*
380 %ptr1 = inttoptr i32 %shl1 to i32*
381 store volatile i32 9, i32* %ptr0
382 store volatile i32 10, i32* %ptr1
279383 ret void
280384 }
281385