llvm.org GIT mirror llvm / 34ef4cd
R600/SI: Fix offset folding in some cases with shifted pointers.

Ordinarily (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) is only done if the add has one use. If the resulting constant add can be folded into an addressing mode, force this to happen for the pointer operand.

This ends up happening a lot because of how LDS objects are allocated: since the globals are allocated next to each other, accessing the first element of the second object is directly indexed by a shifted pointer.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@215739 91177308-0d34-0410-b5e6-96231b3b80d8

Matt Arsenault, 5 years ago
5 changed file(s) with 424 addition(s) and 2 deletion(s).
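For orientation, here is a minimal sketch that condenses the first of the new test cases (load_shl_base_lds_0 in shl_add_ptr.ll below); the function and value names are illustrative, not from the patch. Because %idx is also stored out, the (add %tid, 2) node has a second use, so the generic (shl (add x, c1), c2) combine declines to fire; with this change the pointer operand is still rewritten, and the load selects as DS_READ_B32 with base (%tid << 2) and an immediate offset of 8 bytes.

; Illustrative module only; uses the same era IR syntax as the tests below.
@lds = addrspace(3) global [512 x float] zeroinitializer, align 4

declare i32 @llvm.r600.read.tidig.x() nounwind readnone

define void @shifted_ptr_example(float addrspace(1)* %out, i32 addrspace(1)* %idx_out) nounwind {
  %tid = tail call i32 @llvm.r600.read.tidig.x()
  %idx = add nsw i32 %tid, 2                          ; this add also has a non-memory use
  %gep = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx
  %val = load float addrspace(3)* %gep, align 4       ; address is (shl (add %tid, 2), 2)
  store i32 %idx, i32 addrspace(1)* %idx_out, align 4 ; second use of the add
  store float %val, float addrspace(1)* %out
  ret void
}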
233233 setTargetDAGCombine(ISD::SETCC);
234234
235235 setTargetDAGCombine(ISD::UINT_TO_FP);
236
237 // All memory operations. Some folding on the pointer operand is done to help
238 // match the constant offsets in the addressing modes.
239 setTargetDAGCombine(ISD::LOAD);
240 setTargetDAGCombine(ISD::STORE);
241 setTargetDAGCombine(ISD::ATOMIC_LOAD);
242 setTargetDAGCombine(ISD::ATOMIC_STORE);
243 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
244 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
245 setTargetDAGCombine(ISD::ATOMIC_SWAP);
246 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
247 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
248 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
249 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
250 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
251 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
252 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
253 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
254 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
255 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
236256
237257 setSchedulingPreference(Sched::RegPressure);
238258 }
12951315 return SDValue();
12961316 }
12971317
1318 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
1319
1320 // This is a variant of
1321 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
1322 //
1323 // The normal DAG combiner will do this, but only if the add has one use, since
1324 // doing it when the add has other uses would increase the number of instructions.
1325 //
1326 // This prevents us from seeing a constant offset that can be folded into a
1327 // memory instruction's addressing mode. If we know the resulting add offset of
1328 // a pointer can be folded into an addressing offset, we can replace the pointer
1329 // operand with the add of the new constant offset. This eliminates one of the
1330 // uses, and may allow the remaining use to also be simplified.
1331 //
1332 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
1333 unsigned AddrSpace,
1334 DAGCombinerInfo &DCI) const {
1335 SDValue N0 = N->getOperand(0);
1336 SDValue N1 = N->getOperand(1);
1337
1338 if (N0.getOpcode() != ISD::ADD)
1339 return SDValue();
1340
1341 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
1342 if (!CN1)
1343 return SDValue();
1344
1345 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
1346 if (!CAdd)
1347 return SDValue();
1348
1349 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
1350 getTargetMachine().getSubtargetImpl()->getInstrInfo());
1351
1352 // If the resulting offset is too large, we can't fold it into the addressing
1353 // mode offset.
1354 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
1355 if (!TII->canFoldOffset(Offset.getZExtValue(), AddrSpace))
1356 return SDValue();
1357
1358 SelectionDAG &DAG = DCI.DAG;
1359 SDLoc SL(N);
1360 EVT VT = N->getValueType(0);
1361
1362 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
1363 SDValue COffset = DAG.getConstant(Offset, MVT::i32);
1364
1365 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
1366 }
1367
12981368 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
12991369 DAGCombinerInfo &DCI) const {
13001370 SelectionDAG &DAG = DCI.DAG;
13471417 case ISD::UINT_TO_FP: {
13481418 return performUCharToFloatCombine(N, DCI);
13491419 }
1350 }
1351
1420 case ISD::LOAD:
1421 case ISD::STORE:
1422 case ISD::ATOMIC_LOAD:
1423 case ISD::ATOMIC_STORE:
1424 case ISD::ATOMIC_CMP_SWAP:
1425 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
1426 case ISD::ATOMIC_SWAP:
1427 case ISD::ATOMIC_LOAD_ADD:
1428 case ISD::ATOMIC_LOAD_SUB:
1429 case ISD::ATOMIC_LOAD_AND:
1430 case ISD::ATOMIC_LOAD_OR:
1431 case ISD::ATOMIC_LOAD_XOR:
1432 case ISD::ATOMIC_LOAD_NAND:
1433 case ISD::ATOMIC_LOAD_MIN:
1434 case ISD::ATOMIC_LOAD_MAX:
1435 case ISD::ATOMIC_LOAD_UMIN:
1436 case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics.
1437 if (DCI.isBeforeLegalize())
1438 break;
1439
1440 MemSDNode *MemNode = cast<MemSDNode>(N);
1441 SDValue Ptr = MemNode->getBasePtr();
1442
1443 // TODO: We could also do this for multiplies.
1444 unsigned AS = MemNode->getAddressSpace();
1445 if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
1446 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
1447 if (NewPtr) {
1448 SmallVector<SDValue, 8> NewOps;
1449 for (unsigned I = 0, N = MemNode->getNumOperands(); I != N; ++I)
1450 NewOps.push_back(MemNode->getOperand(I));
1451
1452 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
1453 return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
1454 }
1455 }
1456 break;
1457 }
1458 }
13521459 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
13531460 }
13541461
5555
5656 static SDValue performUCharToFloatCombine(SDNode *N,
5757 DAGCombinerInfo &DCI);
58 SDValue performSHLPtrCombine(SDNode *N,
59 unsigned AS,
60 DAGCombinerInfo &DCI) const;
5861
5962 public:
6063 SITargetLowering(TargetMachine &tm);
800800 return false;
801801
802802 return RI.regClassCanUseImmediate(OpInfo.RegClass);
803 }
804
805 bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) {
806 switch (AS) {
807 case AMDGPUAS::GLOBAL_ADDRESS: {
808 // MUBUF instructions have a 12-bit offset in bytes.
809 return isUInt<12>(OffsetSize);
810 }
811 case AMDGPUAS::CONSTANT_ADDRESS: {
812 // SMRD instructions have an 8-bit offset in dwords.
813 return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
814 }
815 case AMDGPUAS::LOCAL_ADDRESS:
816 case AMDGPUAS::REGION_ADDRESS: {
817 // The single offset versions have a 16-bit offset in bytes.
818 return isUInt<16>(OffsetSize);
819 }
820 case AMDGPUAS::PRIVATE_ADDRESS:
821 // Indirect register addressing does not use any offsets.
822 default:
823 return false;
824 }
803825 }
804826
805827 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
118118 bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
119119 const MachineOperand &MO) const;
120120
121 /// \brief Return true if the given offset size in bytes can be folded into
122 /// the immediate offsets of a memory instruction for the given address space.
123 static bool canFoldOffset(unsigned OffsetSize, unsigned AS) LLVM_READNONE;
124
121125 /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
122126 /// This function will return false if you pass it a 32-bit instruction.
123127 bool hasVALU32BitEncoding(unsigned Opcode) const;
0 ; XFAIL: *
1 ; Enable when the patch to perform the shl + add constant fold in the generic DAG combiner lands.
2
3 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
4
5 ; Test that a shift of a pointer with a constant add is folded into the
6 ; constant offset addressing mode even if the add has multiple uses.
7 ; This is relevant to accessing 2 separate, adjacent
8 ; LDS globals.
9
10
11 declare i32 @llvm.r600.read.tidig.x() #1
12
13 @lds0 = addrspace(3) global [512 x float] zeroinitializer, align 4
14 @lds1 = addrspace(3) global [512 x float] zeroinitializer, align 4
15
16
17 ; Make sure the (add tid, 2) << 2 gets folded into the DS instruction's offset as (tid << 2) + 8.
18
19 ; SI-LABEL: @load_shl_base_lds_0
20 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
21 ; SI: DS_READ_B32 {{v[0-9]+}}, [[PTR]], 0x8, [M0]
22 ; SI: S_ENDPGM
23 define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
24 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
25 %idx.0 = add nsw i32 %tid.x, 2
26 %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
27 %val0 = load float addrspace(3)* %arrayidx0, align 4
28 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
29 store float %val0, float addrspace(1)* %out
30 ret void
31 }
32
33 ; Make sure once the first use is folded into the addressing mode, the
34 ; remaining add use goes through the normal shl + add constant fold.
35
36 ; SI-LABEL: @load_shl_base_lds_1
37 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
38 ; SI: DS_READ_B32 [[RESULT:v[0-9]+]], [[PTR]], 0x8, [M0]
39 ; SI: V_ADD_I32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}}
40 ; SI-DAG: BUFFER_STORE_DWORD [[RESULT]]
41 ; SI-DAG: BUFFER_STORE_DWORD [[ADDUSE]]
42 ; SI: S_ENDPGM
43 define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
44 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
45 %idx.0 = add nsw i32 %tid.x, 2
46 %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
47 %val0 = load float addrspace(3)* %arrayidx0, align 4
48 %shl_add_use = shl i32 %idx.0, 2
49 store i32 %shl_add_use, i32 addrspace(1)* %add_use, align 4
50 store float %val0, float addrspace(1)* %out
51 ret void
52 }
53
54 @maxlds = addrspace(3) global [65536 x i8] zeroinitializer, align 4
55
56 ; SI-LABEL: @load_shl_base_lds_max_offset
57 ; SI: DS_READ_U8 v{{[0-9]+}}, v{{[0-9]+}}, 0xffff
58 ; SI: S_ENDPGM
59 define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
60 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
61 %idx.0 = add nsw i32 %tid.x, 65535
62 %arrayidx0 = getelementptr inbounds [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0
63 %val0 = load i8 addrspace(3)* %arrayidx0
64 store i32 %idx.0, i32 addrspace(1)* %add_use
65 store i8 %val0, i8 addrspace(1)* %out
66 ret void
67 }
68
69 ; The two globals are placed adjacent in memory, so the same base
70 ; pointer can be used with an offset into the second one.
71
72 ; SI-LABEL: @load_shl_base_lds_2
73 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
74 ; SI-NEXT: DS_READ_B32 {{v[0-9]+}}, [[PTR]], 0x100, [M0]
75 ; SI-NEXT: DS_READ_B32 {{v[0-9]+}}, [[PTR]], 0x900, [M0]
76 ; SI: S_ENDPGM
77 define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
78 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
79 %idx.0 = add nsw i32 %tid.x, 64
80 %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
81 %val0 = load float addrspace(3)* %arrayidx0, align 4
82 %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
83 %val1 = load float addrspace(3)* %arrayidx1, align 4
84 %sum = fadd float %val0, %val1
85 store float %sum, float addrspace(1)* %out, align 4
86 ret void
87 }
88
89 ; SI-LABEL: @store_shl_base_lds_0
90 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
91 ; SI: DS_WRITE_B32 [[PTR]], {{v[0-9]+}}, 0x8 [M0]
92 ; SI: S_ENDPGM
93 define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
94 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
95 %idx.0 = add nsw i32 %tid.x, 2
96 %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
97 store float 1.0, float addrspace(3)* %arrayidx0, align 4
98 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
99 ret void
100 }
101
102
103 ; --------------------------------------------------------------------------------
104 ; Atomics.
105
106 @lds2 = addrspace(3) global [512 x i32] zeroinitializer, align 4
107
108 ; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
109 ; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
110 ; %idx.0 = add nsw i32 %tid.x, 2
111 ; %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
112 ; %val = load atomic i32 addrspace(3)* %arrayidx0 seq_cst, align 4
113 ; store i32 %val, i32 addrspace(1)* %out, align 4
114 ; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
115 ; ret void
116 ; }
117
118
119 ; SI-LABEL: @atomic_cmpxchg_shl_base_lds_0
120 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
121 ; SI: DS_CMPST_RTN_B32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}}, 0x8
122 ; SI: S_ENDPGM
123 define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
124 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
125 %idx.0 = add nsw i32 %tid.x, 2
126 %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
127 %pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic
128 %result = extractvalue { i32, i1 } %pair, 0
129 store i32 %result, i32 addrspace(1)* %out, align 4
130 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
131 ret void
132 }
133
134 ; SI-LABEL: @atomic_swap_shl_base_lds_0
135 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
136 ; SI: DS_WRXCHG_RTN_B32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, 0x8
137 ; SI: S_ENDPGM
138 define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
139 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
140 %idx.0 = add nsw i32 %tid.x, 2
141 %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
142 %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
143 store i32 %val, i32 addrspace(1)* %out, align 4
144 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
145 ret void
146 }
147
148 ; SI-LABEL: @atomic_add_shl_base_lds_0
149 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
150 ; SI: DS_ADD_RTN_U32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, 0x8
151 ; SI: S_ENDPGM
152 define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
153 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
154 %idx.0 = add nsw i32 %tid.x, 2
155 %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
156 %val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
157 store i32 %val, i32 addrspace(1)* %out, align 4
158 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
159 ret void
160 }
161
162 ; SI-LABEL: @atomic_sub_shl_base_lds_0
163 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
164 ; SI: DS_SUB_RTN_U32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, 0x8
165 ; SI: S_ENDPGM
166 define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
167 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
168 %idx.0 = add nsw i32 %tid.x, 2
169 %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
170 %val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
171 store i32 %val, i32 addrspace(1)* %out, align 4
172 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
173 ret void
174 }
175
176 ; SI-LABEL: @atomic_and_shl_base_lds_0
177 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
178 ; SI: DS_AND_RTN_B32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, 0x8
179 ; SI: S_ENDPGM
180 define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
181 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
182 %idx.0 = add nsw i32 %tid.x, 2
183 %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
184 %val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
185 store i32 %val, i32 addrspace(1)* %out, align 4
186 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
187 ret void
188 }
189
190 ; SI-LABEL: @atomic_or_shl_base_lds_0
191 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
192 ; SI: DS_OR_RTN_B32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, 0x8
193 ; SI: S_ENDPGM
194 define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
195 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
196 %idx.0 = add nsw i32 %tid.x, 2
197 %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
198 %val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
199 store i32 %val, i32 addrspace(1)* %out, align 4
200 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
201 ret void
202 }
203
204 ; SI-LABEL: @atomic_xor_shl_base_lds_0
205 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
206 ; SI: DS_XOR_RTN_B32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, 0x8
207 ; SI: S_ENDPGM
208 define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
209 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
210 %idx.0 = add nsw i32 %tid.x, 2
211 %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
212 %val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
213 store i32 %val, i32 addrspace(1)* %out, align 4
214 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
215 ret void
216 }
217
218 ; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
219 ; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
220 ; %idx.0 = add nsw i32 %tid.x, 2
221 ; %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
222 ; %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
223 ; store i32 %val, i32 addrspace(1)* %out, align 4
224 ; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
225 ; ret void
226 ; }
227
228 ; SI-LABEL: @atomic_min_shl_base_lds_0
229 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
230 ; SI: DS_MIN_RTN_I32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, 0x8
231 ; SI: S_ENDPGM
232 define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
233 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
234 %idx.0 = add nsw i32 %tid.x, 2
235 %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
236 %val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
237 store i32 %val, i32 addrspace(1)* %out, align 4
238 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
239 ret void
240 }
241
242 ; SI-LABEL: @atomic_max_shl_base_lds_0
243 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
244 ; SI: DS_MAX_RTN_I32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, 0x8
245 ; SI: S_ENDPGM
246 define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
247 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
248 %idx.0 = add nsw i32 %tid.x, 2
249 %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
250 %val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
251 store i32 %val, i32 addrspace(1)* %out, align 4
252 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
253 ret void
254 }
255
256 ; SI-LABEL: @atomic_umin_shl_base_lds_0
257 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
258 ; SI: DS_MIN_RTN_U32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, 0x8
259 ; SI: S_ENDPGM
260 define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
261 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
262 %idx.0 = add nsw i32 %tid.x, 2
263 %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
264 %val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
265 store i32 %val, i32 addrspace(1)* %out, align 4
266 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
267 ret void
268 }
269
270 ; SI-LABEL: @atomic_umax_shl_base_lds_0
271 ; SI: V_LSHLREV_B32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
272 ; SI: DS_MAX_RTN_U32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, 0x8
273 ; SI: S_ENDPGM
274 define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
275 %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
276 %idx.0 = add nsw i32 %tid.x, 2
277 %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
278 %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
279 store i32 %val, i32 addrspace(1)* %out, align 4
280 store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
281 ret void
282 }
283
284 attributes #0 = { nounwind }
285 attributes #1 = { nounwind readnone }