llvm.org GIT mirror llvm / e3d4cbc
R600: Add local memory support via LDS Reviewed-by: Vincent Lejeune<vljn at ovi.com> git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185162 91177308-0d34-0410-b5e6-96231b3b80d8 Tom Stellard 7 years ago
18 changed file(s) with 337 addition(s) and 25 deletion(s). Raw diff Collapse all Expand all
2828 #include "llvm/MC/MCSectionELF.h"
2929 #include "llvm/MC/MCStreamer.h"
3030 #include "llvm/Support/ELF.h"
31 #include "llvm/Support/MathExtras.h"
3132 #include "llvm/Support/TargetRegistry.h"
3233 #include "llvm/Target/TargetLoweringObjectFile.h"
3334
129130 S_STACK_SIZE(MFI->StackSize), 4);
130131 OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
131132 OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
133
134 if (MFI->ShaderType == ShaderType::COMPUTE) {
135 OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
136 OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
137 }
132138 }
133139
134140 void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
2323 #include "llvm/CodeGen/MachineRegisterInfo.h"
2424 #include "llvm/CodeGen/SelectionDAG.h"
2525 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
26 #include "llvm/IR/DataLayout.h"
2627
2728 using namespace llvm;
2829
6970 setOperationAction(ISD::UDIV, MVT::i32, Expand);
7071 setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
7172 setOperationAction(ISD::UREM, MVT::i32, Expand);
73
74 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
7275
7376 int types[] = {
7477 (int)MVT::v2i32,
135138 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
136139 }
137140 return Op;
141 }
142
143 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
144 SDValue Op,
145 SelectionDAG &DAG) const {
146
147 const DataLayout *TD = getTargetMachine().getDataLayout();
148 GlobalAddressSDNode *G = cast(Op);
149 // XXX: What does the value of G->getOffset() mean?
150 assert(G->getOffset() == 0 &&
151 "Do not know what to do with an non-zero offset");
152
153 unsigned Offset = MFI->LDSSize;
154 const GlobalValue *GV = G->getGlobal();
155 uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
156
157 // XXX: Account for alignment?
158 MFI->LDSSize += Size;
159
160 return DAG.getConstant(Offset, MVT::i32);
138161 }
139162
140163 SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1919
2020 namespace llvm {
2121
22 class AMDGPUMachineFunction;
2223 class MachineRegisterInfo;
2324
2425 class AMDGPUTargetLowering : public TargetLowering {
3536 virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
3637 const TargetRegisterClass *RC,
3738 unsigned Reg, EVT VT) const;
39 SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
40 SelectionDAG &DAG) const;
3841
3942 bool isHWTrueValue(SDValue Op) const;
4043 bool isHWFalseValue(SDValue Op) const;
9191
9292 def zextloadi8_constant : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{
9393 return isGlobalLoad(dyn_cast(N));
94 }]>;
95
// Selects loads from the local (LDS) address space.
def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
  return isLocalLoad(dyn_cast<LoadSDNode>(N));
}]>;
99
// Selects stores to the local (LDS) address space.
def local_store : PatFrag<(ops node:$val, node:$ptr),
                          (store node:$val, node:$ptr), [{
  return isLocalStore(dyn_cast<StoreSDNode>(N));
}]>;
95104
96105 class Constants {
99 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
1010 MachineFunctionInfo() {
1111 ShaderType = ShaderType::COMPUTE;
12 LDSSize = 0;
1213 AttributeSet Set = MF.getFunction()->getAttributes();
1314 Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
1415 ShaderTypeAttribute);
2222 public:
2323 AMDGPUMachineFunction(const MachineFunction &MF);
2424 unsigned ShaderType;
25 /// Number of bytes in the LDS that are being used.
26 unsigned LDSSize;
2527 };
2628
2729 }
281281
282282 int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(),
283283 AMDGPU::OpName::literal);
284 assert(ImmIdx != -1);
285
286 // subtract one from ImmIdx, because the DST operand is usually index
287 // 0 for MachineInstrs, but we have no DST in the Ops vector.
288 ImmIdx--;
284 if (ImmIdx == -1) {
285 continue;
286 }
287
288 if (TII->getOperandIdx(Use->getMachineOpcode(),
289 AMDGPU::OpName::dst) != -1) {
290 // subtract one from ImmIdx, because the DST operand is usually index
291 // 0 for MachineInstrs, but we have no DST in the Ops vector.
292 ImmIdx--;
293 }
289294
290295 // Check that we aren't already using an immediate.
291296 // XXX: It's possible for an instruction to have more than one
335340 }
336341 if (Result && Result->isMachineOpcode() &&
337342 !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR)
338 && TII->isALUInstr(Result->getMachineOpcode())) {
343 && TII->hasInstrModifiers(Result->getMachineOpcode())) {
339344 // Fold FNEG/FABS/CONST_ADDRESS
340345 // TODO: Isel can generate multiple MachineInst, we need to recursively
341346 // parse Result
4141 OP2 = (1 << 11),
4242 VTX_INST = (1 << 12),
4343 TEX_INST = (1 << 13),
44 ALU_INST = (1 << 14)
44 ALU_INST = (1 << 14),
45 LDS_1A = (1 << 15),
46 LDS_1A1D = (1 << 16)
4547 };
4648 }
4749
161163 #define R_028878_SQ_PGM_RESOURCES_GS 0x028878
162164 #define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4
163165
166 #define R_0288E8_SQ_LDS_ALLOC 0x0288E8
167
164168 #endif // R600DEFINES_H_
134134 assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
135135 MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
136136 TII->addFlag(defInstr, 0, MO_FLAG_MASK);
137 break;
138 }
139
140 case AMDGPU::LDS_READ_RET: {
141 MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
142 TII->get(MI->getOpcode()),
143 AMDGPU::OQAP);
144 for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
145 NewMI.addOperand(MI->getOperand(i));
146 }
147 TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
148 MI->getOperand(0).getReg(),
149 AMDGPU::OQAP);
137150 break;
138151 }
139152
455468 //===----------------------------------------------------------------------===//
456469
457470 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
471 MachineFunction &MF = DAG.getMachineFunction();
472 R600MachineFunctionInfo *MFI = MF.getInfo();
458473 switch (Op.getOpcode()) {
459474 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
460475 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
462477 case ISD::STORE: return LowerSTORE(Op, DAG);
463478 case ISD::LOAD: return LowerLOAD(Op, DAG);
464479 case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
480 case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
465481 case ISD::INTRINSIC_VOID: {
466482 SDValue Chain = Op.getOperand(0);
467483 unsigned IntrinsicID =
468484 cast(Op.getOperand(1))->getZExtValue();
469485 switch (IntrinsicID) {
470486 case AMDGPUIntrinsic::AMDGPU_store_output: {
471 MachineFunction &MF = DAG.getMachineFunction();
472 R600MachineFunctionInfo *MFI = MF.getInfo();
473487 int64_t RegIndex = cast(Op.getOperand(3))->getZExtValue();
474488 unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
475489 MFI->LiveOuts.push_back(Reg);
2222 bits<2> FlagOperandIdx = 0;
2323 bit Op1 = 0;
2424 bit Op2 = 0;
25 bit LDS_1A = 0;
26 bit LDS_1A1D = 0;
2527 bit HasNativeOperands = 0;
2628 bit VTXInst = 0;
2729 bit TEXInst = 0;
4850 let TSFlags{12} = VTXInst;
4951 let TSFlags{13} = TEXInst;
5052 let TSFlags{14} = ALUInst;
53 let TSFlags{15} = LDS_1A;
54 let TSFlags{16} = LDS_1A1D;
5155 }
5256
5357 //===----------------------------------------------------------------------===//
5458 // ALU instructions
5559 //===----------------------------------------------------------------------===//
5660
57 class R600ALU_Word0 {
61 class R600_ALU_LDS_Word0 {
5862 field bits<32> Word0;
5963
6064 bits<11> src0;
61 bits<1> src0_neg;
6265 bits<1> src0_rel;
6366 bits<11> src1;
6467 bits<1> src1_rel;
65 bits<1> src1_neg;
6668 bits<3> index_mode = 0;
6769 bits<2> pred_sel;
6870 bits<1> last;
7577 let Word0{8-0} = src0_sel;
7678 let Word0{9} = src0_rel;
7779 let Word0{11-10} = src0_chan;
78 let Word0{12} = src0_neg;
7980 let Word0{21-13} = src1_sel;
8081 let Word0{22} = src1_rel;
8182 let Word0{24-23} = src1_chan;
82 let Word0{25} = src1_neg;
8383 let Word0{28-26} = index_mode;
8484 let Word0{30-29} = pred_sel;
8585 let Word0{31} = last;
86 }
87
// Word0 encoding for ordinary ALU instructions: extends the shared
// ALU/LDS Word0 layout with the src0/src1 negate bits (bits 12 and 25),
// which LDS instructions reuse for part of the offset field instead.
class R600ALU_Word0 : R600_ALU_LDS_Word0 {

  bits<1> src0_neg;
  bits<1> src1_neg;

  let Word0{12} = src0_neg;
  let Word0{25} = src1_neg;
}
8796
8897 class R600ALU_Word1 {
136145 let Word1{12} = src2_neg;
137146 let Word1{17-13} = alu_inst;
138147 }
148
// Word1 encoding for LDS instructions (OP3_INST_LDS_IDX_OP form):
// src2 selector/channel/rel bits, the fixed alu_inst opcode, bank swizzle,
// the 6-bit LDS sub-opcode, and the destination channel.
class R600LDS_Word1 {
  field bits<32> Word1;

  bits<11> src2;
  bits<9>  src2_sel  = src2{8-0};
  bits<2>  src2_chan = src2{10-9};
  bits<1>  src2_rel;
  // offset specifies the stride offset to the second set of data to be read
  // from.  This is a dword offset.
  bits<5>  alu_inst = 17;  // OP3_INST_LDS_IDX_OP
  bits<3>  bank_swizzle;
  bits<6>  lds_op;
  bits<2>  dst_chan = 0;

  let Word1{8-0}   = src2_sel;
  let Word1{9}     = src2_rel;
  let Word1{11-10} = src2_chan;
  let Word1{17-13} = alu_inst;
  let Word1{20-18} = bank_swizzle;
  let Word1{26-21} = lds_op;
  let Word1{30-29} = dst_chan;
}
171
139172
140173 /*
141174 XXX: R600 subtarget uses a slightly different encoding than the other
135135 return (TargetFlags & R600_InstFlag::ALU_INST);
136136 }
137137
138 bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const {
139 unsigned TargetFlags = get(Opcode).TSFlags;
140
141 return ((TargetFlags & R600_InstFlag::OP1) |
142 (TargetFlags & R600_InstFlag::OP2) |
143 (TargetFlags & R600_InstFlag::OP3));
144 }
145
146 bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
147 unsigned TargetFlags = get(Opcode).TSFlags;
148
149 return ((TargetFlags & R600_InstFlag::LDS_1A) |
150 (TargetFlags & R600_InstFlag::LDS_1A1D));
151 }
152
138153 bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
139154 return (get(Opcode).TSFlags & R600_InstFlag::TRANS_ONLY);
140155 }
244259 unsigned Reg = Srcs[i].first->getReg();
245260 unsigned Index = RI.getEncodingValue(Reg) & 0xff;
246261 unsigned Chan = RI.getHWRegChan(Reg);
262 if (Reg == AMDGPU::OQAP) {
263 Result.push_back(std::pair(Index, 0));
264 }
247265 if (Index > 127) {
248266 Result.push_back(DummyPair);
249267 continue;
286304 return Src;
287305 }
288306
289 static bool
290 isLegal(const std::vector > > &IGSrcs,
291 const std::vector &Swz,
292 unsigned CheckedSize) {
307 bool
308 R600InstrInfo::isLegal(
309 const std::vector > > &IGSrcs,
310 const std::vector &Swz,
311 unsigned CheckedSize) const {
293312 int Vector[4][3];
294313 memset(Vector, -1, sizeof(Vector));
295314 for (unsigned i = 0; i < CheckedSize; i++) {
299318 const std::pair &Src = Srcs[j];
300319 if (Src.first < 0)
301320 continue;
321 if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) {
322 if (Swz[i] != R600InstrInfo::ALU_VEC_012 &&
323 Swz[i] != R600InstrInfo::ALU_VEC_021) {
324 // The value from output queue A (denoted by register OQAP) can
325 // only be fetched during the first cycle.
326 return false;
327 }
328 // OQAP does not count towards the normal read port restrictions
329 continue;
330 }
302331 if (Vector[Src.second][j] < 0)
303332 Vector[Src.second][j] = Src.first;
304333 if (Vector[Src.second][j] != Src.first)
308337 return true;
309338 }
310339
311 static bool recursiveFitsFPLimitation(
312 const std::vector > > &IGSrcs,
313 std::vector &SwzCandidate,
314 unsigned Depth = 0) {
340 bool
341 R600InstrInfo::recursiveFitsFPLimitation(
342 const std::vector > > &IGSrcs,
343 std::vector &SwzCandidate,
344 unsigned Depth) const {
315345 if (!isLegal(IGSrcs, SwzCandidate, Depth))
316346 return false;
317347 if (IGSrcs.size() == Depth)
6262
6363 /// \returns true if this \p Opcode represents an ALU instruction.
6464 bool isALUInstr(unsigned Opcode) const;
65 bool hasInstrModifiers(unsigned Opcode) const;
66 bool isLDSInstr(unsigned Opcode) const;
6567
6668 bool isTransOnly(unsigned Opcode) const;
6769 bool isTransOnly(const MachineInstr *MI) const;
8082 /// Otherwise, second member value is undefined.
8183 SmallVector, 3>
8284 getSrcs(MachineInstr *MI) const;
85
86 bool isLegal(
87 const std::vector > > &IGSrcs,
88 const std::vector &Swz,
89 unsigned CheckedSize) const;
90 bool recursiveFitsFPLimitation(
91 const std::vector > > &IGSrcs,
92 std::vector &SwzCandidate,
93 unsigned Depth = 0) const;
8394
8495 /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210
8596 /// returns true and the first (in lexical order) BankSwizzle affectation
15271527
15281528 let ALUInst = 1;
15291529 }
1530
1531 //===----------------------------------------------------------------------===//
1532 // LDS Instructions
1533 //===----------------------------------------------------------------------===//
// Common base class for LDS instructions.  The 6-bit offset field is
// scattered across bits of Word0/Word1 that ordinary ALU instructions use
// for the neg bits and other flags.
// NOTE(review): the template header and InstR600 base arguments below were
// reconstructed from the surrounding definitions (angle-bracket arguments
// were lost in extraction) — confirm against the original source.
class R600_LDS <bits<6> op, dag outs, dag ins, string asm,
                list<dag> pattern = []> :

    InstR600 <outs, ins, asm, pattern, XALU>,
    R600_ALU_LDS_Word0,
    R600LDS_Word1 {

  bits<6> offset = 0;
  let lds_op = op;

  let Word1{27} = offset{0};
  let Word1{12} = offset{1};
  let Word1{28} = offset{2};
  let Word1{31} = offset{3};
  let Word0{12} = offset{4};
  let Word0{25} = offset{5};

  let Inst{31-0}  = Word0;
  let Inst{63-32} = Word1;

  let ALUInst = 1;
  let HasNativeOperands = 1;
  let UseNamedOperandTable = 1;
}
1559
// LDS instruction with one address operand; the result is returned through
// the OQAP output queue register, so a custom inserter copies OQAP into the
// pattern's $dst after selection.
class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
  lds_op,
  (outs R600_Reg32:$dst),
  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
       LAST:$last, R600_Pred:$pred_sel,
       BANK_SWIZZLE:$bank_swizzle),
  " "#name#" $last OQAP, $src0$src0_rel $pred_sel",
  pattern
> {

  let src1 = 0;
  let src1_rel = 0;
  let src2 = 0;
  let src2_rel = 0;

  let Defs = [OQAP];
  let usesCustomInserter = 1;
  let LDS_1A = 1;
  let DisableEncoding = "$dst";
}
1580
// LDS instruction with one address ($src0) and one data ($src1) operand and
// no result (e.g. LDS_WRITE).
class R600_LDS_1A1D <bits<6> lds_op, string name, list<dag> pattern> :
    R600_LDS <
  lds_op,
  (outs),
  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
       R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
       LAST:$last, R600_Pred:$pred_sel,
       BANK_SWIZZLE:$bank_swizzle),
  " "#name#" $last $src0$src0_rel, $src1$src1_rel, $pred_sel",
  pattern
> {

  let src2 = 0;
  let src2_rel = 0;
  let LDS_1A1D = 1;
}
1597
// Local-memory load; the value comes back through OQAP (see R600_LDS_1A).
def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
  [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
>;
1601
// Local-memory store: writes $src1 to the LDS address in $src0.
def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE",
  [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
>;
15301605
15311606 // TRUNC is used for the FLT_TO_INT instructions to work around a
15321607 // perceived problem where the rounding modes are applied differently
277277 return AluT_XYZW;
278278 }
279279
280 if (TII->isLDSInstr(MI->getOpcode())) {
281 return AluT_X;
282 }
283
280284 // Is the result already assigned to a channel ?
281285 unsigned DestSubReg = MI->getOperand(0).getSubReg();
282286 switch (DestSubReg) {
370374 }
371375
372376 void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
373 unsigned DestReg = MI->getOperand(0).getReg();
377 int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
378 if (DstIndex == -1) {
379 return;
380 }
381 unsigned DestReg = MI->getOperand(DstIndex).getReg();
374382 // PressureRegister crashes if an operand is def and used in the same inst
375383 // and we try to constraint its regclass
376384 for (MachineInstr::mop_iterator It = MI->operands_begin(),
377385 E = MI->operands_end(); It != E; ++It) {
378386 MachineOperand &MO = *It;
379387 if (MO.isReg() && !MO.isDef() &&
380 MO.getReg() == MI->getOperand(0).getReg())
388 MO.getReg() == DestReg)
381389 return;
382390 }
383391 // Constrains the regclass of DestReg to assign it to Slot
8989 if (BI->getOpcode() == AMDGPU::DOT4_r600 ||
9090 BI->getOpcode() == AMDGPU::DOT4_eg) {
9191 Result[Dst] = AMDGPU::PV_X;
92 continue;
93 }
94 if (Dst == AMDGPU::OQAP) {
9295 continue;
9396 }
9497 unsigned PVReg = 0;
100100 def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
101101 def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
102102 def AR_X : R600Reg<"AR.x", 0>;
103 def OQAP : R600Reg<"OQAP", 221>;
103104
104105 def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
105106 (add (sequence "ArrayBase%u", 448, 480))>;
169170 R600_ArrayBase,
170171 R600_Addr,
171172 ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
172 ALU_CONST, ALU_PARAM
173 ALU_CONST, ALU_PARAM, OQAP
173174 )>;
174175
175176 def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
2222 def AnyALU : InstrItinClass;
2323 def VecALU : InstrItinClass;
2424 def TransALU : InstrItinClass;
25 def XALU : InstrItinClass;
2526
2627 def R600_VLIW5_Itin : ProcessorItineraries <
2728 [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL],
3031 InstrItinData]>,
3132 InstrItinData]>,
3233 InstrItinData]>,
34 InstrItinData]>,
3335 InstrItinData]>
3436 ]
3537 >;
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s


@local_memory.local_mem = internal addrspace(3) unnamed_addr global [16 x i32] zeroinitializer, align 4

; CHECK: @local_memory

; Check that the LDS size emitted correctly
; CHECK: .long 166120
; CHECK-NEXT: .long 16

; CHECK: LDS_WRITE

; GROUP_BARRIER must be the last instruction in a clause
; CHECK: GROUP_BARRIER
; CHECK-NEXT: ALU clause

; CHECK: LDS_READ_RET

define void @local_memory(i32 addrspace(1)* %out) {
entry:
  %y.i = call i32 @llvm.r600.read.tidig.x() #0
  %arrayidx = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
  store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
  %add = add nsw i32 %y.i, 1
  %cmp = icmp eq i32 %add, 16
  %.add = select i1 %cmp, i32 0, i32 %add
  call void @llvm.AMDGPU.barrier.local()
  %arrayidx1 = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
  %0 = load i32 addrspace(3)* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %out, i32 %y.i
  store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
  ret void
}

@local_memory_two_objects.local_mem0 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4
@local_memory_two_objects.local_mem1 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4

; CHECK: @local_memory_two_objects

; Check that the LDS size emitted correctly
; CHECK: .long 166120
; CHECK-NEXT: .long 8

; Make sure the lds writes are using different addresses.
; CHECK: LDS_WRITE {{[*]*}} {{PV|T}}[[ADDRW:[0-9]*\.[XYZW]]]
; CHECK-NOT: LDS_WRITE {{[*]*}} T[[ADDRW]]

; GROUP_BARRIER must be the last instruction in a clause
; CHECK: GROUP_BARRIER
; CHECK-NEXT: ALU clause

; Make sure the lds reads are using different addresses.
; CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
; CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]

define void @local_memory_two_objects(i32 addrspace(1)* %out) {
entry:
  %x.i = call i32 @llvm.r600.read.tidig.x() #0
  %arrayidx = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
  store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
  %mul = shl nsw i32 %x.i, 1
  %arrayidx1 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
  store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
  %sub = sub nsw i32 3, %x.i
  call void @llvm.AMDGPU.barrier.local()
  %arrayidx2 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
  %0 = load i32 addrspace(3)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds i32 addrspace(1)* %out, i32 %x.i
  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
  %arrayidx4 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
  %1 = load i32 addrspace(3)* %arrayidx4, align 4
  %add = add nsw i32 %x.i, 4
  %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %out, i32 %add
  store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
  ret void
}

declare i32 @llvm.r600.read.tidig.x() #0
declare void @llvm.AMDGPU.barrier.local()

attributes #0 = { readnone }