llvm.org GIT mirror llvm / 90f5eff

AMDGPU: Write LDS objects out as global symbols in code generation

Summary:
The symbols use the processor-specific SHN_AMDGPU_LDS section index introduced with a previous change. The linker is then expected to resolve relocations, which are also emitted. Initially disabled for HSA and PAL environments until they have caught up in terms of linker and runtime loader.

Some notes:

- The llvm.amdgcn.groupstaticsize intrinsic can no longer be lowered to a constant at compile time, which means some tests can no longer be applied. The current "solution" is a terrible hack, but the intrinsic isn't used by Mesa, so we can keep it for now.

- We no longer know the full LDS size per kernel at compile time, which means that we can no longer generate a relevant error message at compile time. It would be possible to add a check for the size of individual variables, but ultimately the linker will have to perform the final check.

Change-Id: If66dbf33fccfbf3609aefefa2558ac0850d42275

Reviewers: arsenm, rampitec, t-tye, b-sumner, jsjodin

Subscribers: qcolombet, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D61494

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@364297 91177308-0d34-0410-b5e6-96231b3b80d8

Nicolai Haehnle, 9 months ago
29 changed files with 283 additions and 103 deletions.
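To illustrate the end-to-end effect before diving into the diff, here is a minimal, hypothetical module (the variable and kernel names are illustrative; the expected output is a sketch based on the lds-relocs.ll test added in this change):

@lds.example = addrspace(3) global [64 x i32] undef, align 4

define amdgpu_kernel void @use_lds(i32 %idx) {
  %gep = getelementptr [64 x i32], [64 x i32] addrspace(3)* @lds.example, i32 0, i32 %idx
  store i32 1, i32 addrspace(3)* %gep
  ret void
}

; On a non-HSA/PAL triple such as amdgcn-mesa-mesa3d, the address of
; @lds.example is now materialized as an absolute symbol with an
; R_AMDGPU_ABS32 relocation instead of a compile-time constant, roughly:
;   v_mov_b32_e32 v0, lds.example@abs32@lo
; and the object itself is described to the linker via:
;   .globl lds.example
;   .amdgpu_lds lds.example, 256, 4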
297297 }
298298
299299 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
300
301 // Group segment variables aren't emitted in HSA.
302 if (AMDGPU::isGroupSegment(GV))
300 if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
301 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
302 OutContext.reportError({},
303 Twine(GV->getName()) +
304 ": unsupported initializer for address space");
305 return;
306 }
307
308 // LDS variables aren't emitted in HSA or PAL yet.
309 const Triple::OSType OS = TM.getTargetTriple().getOS();
310 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
311 return;
312
313 MCSymbol *GVSym = getSymbol(GV);
314
315 GVSym->redefineIfPossible();
316 if (GVSym->isDefined() || GVSym->isVariable())
317 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
318 "' is already defined");
319
320 const DataLayout &DL = GV->getParent()->getDataLayout();
321 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
322 unsigned Align = GV->getAlignment();
323 if (!Align)
324 Align = 4;
325
326 EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
327 EmitLinkage(GV, GVSym);
328 getTargetStreamer()->emitAMDGPULDS(GVSym, Size, Align);
303329 return;
330 }
304331
305332 AsmPrinter::EmitGlobalVariable(GV);
306333 }
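The initializer check added above rejects anything other than undef for LDS objects, since LDS cannot be initialized from the object file. A minimal module that trips it (hypothetical name; the diagnostic format follows the updated error tests further down):

@lds.bad = addrspace(3) global i32 7

; llc now reports, roughly:
;   error: lds.bad: unsupported initializer for address space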
43564356 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
43574357 NODE_NAME_CASE(CONST_DATA_PTR)
43584358 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
4359 NODE_NAME_CASE(LDS)
43594360 NODE_NAME_CASE(KILL)
43604361 NODE_NAME_CASE(DUMMY_CHAIN)
43614362 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
45704571 Known.Zero.setHighBits(16);
45714572 break;
45724573 }
4574 case AMDGPUISD::LDS: {
4575 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4576 unsigned Align = GA->getGlobal()->getAlignment();
4577
4578 Known.Zero.setHighBits(16);
4579 if (Align)
4580 Known.Zero.setLowBits(Log2_32(Align));
4581 break;
4582 }
45734583 case ISD::INTRINSIC_WO_CHAIN: {
45744584 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
45754585 switch (IID) {
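The new AMDGPUISD::LDS case feeds the variable's alignment into known-bits: LDS addresses fit in 16 bits, so the high 16 bits are zero, and an alignment of A clears the low Log2_32(A) bits. For example (illustrative):

@lds.aligned = addrspace(3) global [8 x i64] undef, align 8

; The address of @lds.aligned has its low 3 bits (Log2_32(8)) and its high
; 16 bits known zero, even though the actual value is only fixed at link
; time, so later combines can still reason about the unresolved symbol.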
484484 INTERP_P1LV_F16,
485485 INTERP_P2_F16,
486486 PC_ADD_REL_OFFSET,
487 LDS,
487488 KILL,
488489 DUMMY_CHAIN,
489490 FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
4949 } else if (FoldOp->isFI()) {
5050 FrameIndexToFold = FoldOp->getIndex();
5151 } else {
52 assert(FoldOp->isReg());
52 assert(FoldOp->isReg() || FoldOp->isGlobal());
5353 OpToFold = FoldOp;
5454 }
5555 }
6565 bool isReg() const {
6666 return Kind == MachineOperand::MO_Register;
6767 }
68
69 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
6870
6971 bool isCommuted() const {
7072 return Commuted;
229231 }
230232 }
231233
232 if ((Fold.isImm() || Fold.isFI()) && Fold.needsShrink()) {
234 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
233235 MachineBasicBlock *MBB = MI->getParent();
234236 auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
235237 if (Liveness != MachineBasicBlock::LQR_Dead)
273275
274276 if (Fold.isImm()) {
275277 Old.ChangeToImmediate(Fold.ImmToFold);
278 return true;
279 }
280
281 if (Fold.isGlobal()) {
282 Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
283 Fold.OpToFold->getTargetFlags());
276284 return true;
277285 }
278286
367375 if ((Opc == AMDGPU::V_ADD_I32_e64 ||
368376 Opc == AMDGPU::V_SUB_I32_e64 ||
369377 Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
370 (OpToFold->isImm() || OpToFold->isFI())) {
378 (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
371379 MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
372380
373381 // Verify the other operand is a VGPR, otherwise we would violate the
482490 return;
483491 }
484492
485 bool FoldingImmLike = OpToFold.isImm() || OpToFold.isFI();
493 bool FoldingImmLike =
494 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
486495
487496 if (FoldingImmLike && UseMI->isCopy()) {
488497 unsigned DestReg = UseMI->getOperand(0).getReg();
883892 SmallVector<FoldCandidate, 4> FoldList;
884893 MachineOperand &Dst = MI.getOperand(0);
885894
886 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
895 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
887896 if (FoldingImm) {
888897 unsigned NumLiteralUses = 0;
889898 MachineOperand *NonInlineUse = nullptr;
12311240 }
12321241
12331242 MachineOperand &OpToFold = MI.getOperand(1);
1234 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
1243 bool FoldingImm =
1244 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
12351245
12361246 // FIXME: We could also be folding things like TargetIndexes.
12371247 if (!FoldingImm && !OpToFold.isReg())
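Taken together, the SIFoldOperands changes treat a global-address operand like an immediate, so the @abs32@lo literal can be folded straight into a user instruction rather than surviving as a separate v_mov plus copy. A sketch of the effect (hypothetical module; the expected CI output mirrors the updated ds-* tests below):

@lds0 = addrspace(3) global [128 x i32] undef, align 4

define amdgpu_kernel void @fold_ga(i32 %idx) {
  %gep = getelementptr [128 x i32], [128 x i32] addrspace(3)* @lds0, i32 0, i32 %idx
  store i32 0, i32 addrspace(3)* %gep
  ret void
}

; Roughly, on CI the folded global appears directly as a source of the add:
;   v_lshlrev_b32_e32 v0, 2, v0
;   v_add_i32_e32 v0, vcc, lds0@abs32@lo, v0
;   ds_write_b32 v0, v1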
35873587 }
35883588
35893589 case AMDGPU::GET_GROUPSTATICSIZE: {
3590 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
3591 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
35903592 DebugLoc DL = MI.getDebugLoc();
35913593 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
35923594 .add(MI.getOperand(0))
47754777 SelectionDAG &DAG) const {
47764778 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
47774779 const GlobalValue *GV = GSD->getGlobal();
4778 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
4780 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
4781 (!GV->hasExternalLinkage() ||
4782 getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
4783 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
47794784 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
47804785 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
47814786 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
47834788 SDLoc DL(GSD);
47844789 EVT PtrVT = Op.getValueType();
47854790
4786 // FIXME: Should not make address space based decisions here.
4791 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
4792 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
4793 SIInstrInfo::MO_ABS32_LO);
4794 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
4795 }
4796
47874797 if (shouldEmitFixup(GV))
47884798 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
47894799 else if (shouldEmitPCReloc(GV))
57725782 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
57735783 Op->getOperand(1), Op->getOperand(2)), 0);
57745784
5785 case Intrinsic::amdgcn_groupstaticsize: {
5786 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
5787 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
5788 return Op;
5789
5790 const Module *M = MF.getFunction().getParent();
5791 const GlobalValue *GV =
5792 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
5793 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
5794 SIInstrInfo::MO_ABS32_LO);
5795 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
5796 }
57755797 default:
57765798 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
57775799 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
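The Mesa path above is the "terrible hack" from the commit message: llvm.amdgcn.groupstaticsize is lowered to an absolute 32-bit symbol named after the intrinsic itself, and the linker is expected to resolve it to the total LDS size. A sketch mirroring the updated llvm.amdgcn.groupstaticsize.ll test below:

@lds = addrspace(3) global [512 x float] undef, align 4

declare i32 @llvm.amdgcn.groupstaticsize()

define amdgpu_kernel void @static_size(i32 addrspace(1)* %out, i32 %idx) {
  %gep = getelementptr [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx
  store float 0.0, float addrspace(3)* %gep
  %size = call i32 @llvm.amdgcn.groupstaticsize()
  store i32 %size, i32 addrspace(1)* %out
  ret void
}

; On HSA/PAL this still folds to the constant 0x800 (512 * 4 bytes) at
; compile time; on Mesa it now becomes, roughly:
;   v_mov_b32_e32 v0, llvm.amdgcn.groupstaticsize@abs32@lo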
27022702 const MCInstrDesc &InstDesc = MI.getDesc();
27032703 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
27042704
2705 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2705 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
27062706
27072707 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
27082708 return true;
30113011
30123012 switch (Desc.OpInfo[i].OperandType) {
30133013 case MCOI::OPERAND_REGISTER:
3014 if (MI.getOperand(i).isImm()) {
3014 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
30153015 ErrInfo = "Illegal immediate value for operand.";
30163016 return false;
30173017 }
36813681 return isLegalRegOperand(MRI, OpInfo, MO);
36823682
36833683 // Handle non-register types that are treated like immediates.
3684 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
3684 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
36853685 return true;
36863686 }
36873687
37383738 }
37393739
37403740 // Handle non-register types that are treated like immediates.
3741 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
3741 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
37423742
37433743 if (!DefinedRC) {
37443744 // This operand expects an immediate.
202202
203203 def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
204204 SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
205 >;
206
207 def SIlds : SDNode<"AMDGPUISD::LDS",
208 SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
205209 >;
206210
207211 def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO",
11411141 (S_MOV_B32 imm:$imm)
11421142 >;
11431143
1144 def : GCNPat <
1145 (VGPRImm<(SIlds tglobaladdr:$ga)>),
1146 (V_MOV_B32_e32 $ga)
1147 >;
1148
1149 def : GCNPat <
1150 (SIlds tglobaladdr:$ga),
1151 (S_MOV_B32 $ga)
1152 >;
1153
11441154 // FIXME: Workaround for ordering issue with peephole optimizer where
11451155 // a register class copy interferes with immediate folding. Should
11461156 // use s_mov_b32, which can be shrunk to s_movk_i32
9393 } else if (MovSrc.isFI()) {
9494 Src0.setSubReg(0);
9595 Src0.ChangeToFrameIndex(MovSrc.getIndex());
96 ConstantFolded = true;
97 } else if (MovSrc.isGlobal()) {
98 Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
99 MovSrc.getTargetFlags());
96100 ConstantFolded = true;
97101 }
98102
8080 @g_lds = addrspace(3) global float undef, align 4
8181
8282 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
83 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
84 ; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
83 ; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], g_lds@abs32@lo
84 ; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
8585 define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
8686 %val = load float, float addrspace(3)* @g_lds
8787 store float %val, float addrspace(1)* %out
66
77 ; GCN-LABEL: {{^}}write_ds_sub0_offset0_global:
88 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0
9 ; CI: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]]
10 ; GFX9: v_sub_u32_e32 [[BASEPTR:v[0-9]+]], 0, [[SHL]]
9 ; GCN: v_sub_{{[iu]}}32_e32 [[BASEPTR:v[0-9]+]], {{(vcc, )?}}lds.obj@abs32@lo, [[SHL]]
1110 ; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b
1211 ; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12
1312 define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
354354 ; CI-DAG: s_mov_b32 m0
355355 ; GFX9-NOT: m0
356356
357 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
357 ; GCN-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 3, {{v[0-9]+}}
358 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VOFS]]
358359 ; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
359360 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
360361
440441 ; CI-DAG: s_mov_b32 m0
441442 ; GFX9-NOT: m0
442443
443 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
444 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
444 ; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
445 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
445446 define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
446447 %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
447448 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
454455 ; CI-DAG: s_mov_b32 m0
455456 ; GFX9-NOT: m0
456457
457 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
458 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
458 ; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
459 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:2
459460 define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
460461 %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
461462 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
470471 ; CI-DAG: s_mov_b32 m0
471472 ; GFX9-NOT: m0
472473
473 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
474 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
475 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
474 ; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
475 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
476 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
476477 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
477478 %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
478479 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
487488 ; CI-DAG: s_mov_b32 m0
488489 ; GFX9-NOT: m0
489490
490 ; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
491 ; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
492 ; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
493 ; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
491 ; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
492 ; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
493 ; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
494 ; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]
495 ; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]
496 ; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE0]] offset1:1
497 ; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE1]] offset1:1
494498 ; GCN: s_endpgm
495499 define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
496500 %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
102102 ; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
103103 ; CI-DAG: s_mov_b32 m0
104104
105 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
106
107 ; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
108 ; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
105 ; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
106 ; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
107 ;
108 ; TODO: This should be an s_mov_b32. The v_mov_b32 gets introduced by an
109 ; early legalization of the constant bus constraint on the v_lshl_add_u32,
110 ; and then SIFoldOperands folds in an unlucky order.
111 ; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
112 ; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], {{v[0-9]+}}, 2, [[VBASE]]
113
114 ; GFX9-DAG: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
115 ; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
109116
110117 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
111118 ; GCN: s_endpgm
130137 ; GFX9-NOT: m0
131138
132139 ; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
133 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
140
141 ; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
142 ; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
143 ; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
144 ; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]
145
134146 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
135147 ; GCN: s_endpgm
136148 define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
152164 ; GFX9-NOT: m0
153165
154166 ; GCN-DAG: {{buffer|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
155 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
167
168 ; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
169 ; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
170 ; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
171 ; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]
172
156173 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
157174 ; GCN: s_endpgm
158175 define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
388405 ; CI-DAG: s_mov_b32 m0
389406 ; GFX9-NOT: m0
390407
391 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
392 ; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
408 ; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
409 ; GCN: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
393410 define amdgpu_kernel void @store_constant_adjacent_offsets() {
394411 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
395412 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
401418 ; GFX9-NOT: m0
402419
403420 ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
404 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
405 ; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
421 ; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
422 ; GCN: ds_write2_b32 [[PTR]], [[VAL]], [[VAL]] offset1:2
406423 define amdgpu_kernel void @store_constant_disjoint_offsets() {
407424 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
408425 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
415432 ; CI-DAG: s_mov_b32 m0
416433 ; GFX9-NOT: m0
417434
418 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
419 ; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
420 ; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
435 ; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
436 ; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
437 ; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
421438 ; GCN: s_endpgm
422439 define amdgpu_kernel void @store_misaligned64_constant_offsets() {
423440 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
431448 ; CI-DAG: s_mov_b32 m0
432449 ; GFX9-NOT: m0
433450
434 ; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
435 ; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
436 ; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
437 ; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
451 ; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
452 ; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
453 ; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
454 ; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]{{$}}
455 ; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]{{$}}
456 ; GCN-DAG: ds_write2_b32 [[VBASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
457 ; GCN-DAG: ds_write2_b32 [[VBASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
438458 ; GCN: s_endpgm
439459 define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
440460 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
0 ; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s
11 ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
22
3 ; CHECK: in function load_init_lds_global{{.*}}: unsupported initializer for address space
3 ; CHECK: lds: unsupported initializer for address space
44
55 @lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
66
0 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
1 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s | llvm-readobj -r -t | FileCheck -check-prefixes=ELF %s
2
3 @lds.external = external unnamed_addr addrspace(3) global [0 x i32]
4 @lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8
5
6 ; ELF: Relocations [
7 ; ELF-NEXT: Section (3) .rel.text {
8 ; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.external 0x0
9 ; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.defined 0x0
10 ; ELF-NEXT: }
11 ; ELF-NEXT: ]
12
13 ; ELF: Symbol {
14 ; ELF: Name: lds.defined
15 ; ELF-NEXT: Value: 0x8
16 ; ELF-NEXT: Size: 32
17 ; ELF-NEXT: Binding: Global (0x1)
18 ; ELF-NEXT: Type: Object (0x1)
19 ; ELF-NEXT: Other: 0
20 ; ELF-NEXT: Section: Processor Specific (0xFF00)
21 ; ELF-NEXT: }
22
23 ; ELF: Symbol {
24 ; ELF: Name: lds.external
25 ; ELF-NEXT: Value: 0x4
26 ; ELF-NEXT: Size: 0
27 ; ELF-NEXT: Binding: Global (0x1)
28 ; ELF-NEXT: Type: Object (0x1)
29 ; ELF-NEXT: Other: 0
30 ; ELF-NEXT: Section: Processor Specific (0xFF00)
31 ; ELF-NEXT: }
32
33 ; GCN-LABEL: {{^}}test_basic:
34 ; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A]
35 ; GCN-NEXT: ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}}
36 ;
37 ; GCN: s_add_i32 s0, lds.defined@abs32@lo, s0 ; encoding: [0xff,0x00,0x00,0x81,A,A,A,A]
38 ; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined@abs32@lo, kind: FK_Data_4{{$}}
39 ;
40 ; GCN: .globl lds.external
41 ; GCN: .amdgpu_lds lds.external, 0, 4
42 ; GCN: .globl lds.defined
43 ; GCN: .amdgpu_lds lds.defined, 32, 8
44 define amdgpu_gs float @test_basic(i32 inreg %wave, i32 %arg1) #0 {
45 main_body:
46 %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1
47 %tmp = load i32, i32 addrspace(3)* %gep0
48
49 %mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %tmp, i32 0, i32 0)
50 %mask.32 = trunc i64 %mask to i32
51 %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave
52 store i32 %mask.32, i32 addrspace(3)* %gep1
53
54 %r = bitcast i32 %tmp to float
55 ret float %r
56 }
57
58 ; Function Attrs: convergent nounwind readnone
59 declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) #4
60
61 attributes #0 = { "no-signed-zeros-fp-math"="true" }
62 attributes #4 = { convergent nounwind readnone }
0 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=ALL -check-prefix=GCN %s
1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s
21 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=ALL -check-prefix=EG %s
32
0 ; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s
11 ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
22
3 ; CHECK: in function load_zeroinit_lds_global{{.*}}: unsupported initializer for address space
3 ; CHECK: lds: unsupported initializer for address space
44
55 @lds = addrspace(3) global [256 x i32] zeroinitializer
66
267267 ; CIVI-DAG: s_mov_b32 m0
268268 ; GFX9-NOT: m0
269269
270 ; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
270 ; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
271 ; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
272 ; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
273 ; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
274
271275 ; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
272276 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
273277 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
411415 ; CIVI-DAG: s_mov_b32 m0
412416 ; GFX9-NOT: m0
413417
414 ; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
418 ; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
419 ; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
420 ; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
421 ; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
422
415423 ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
416424 define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
417425 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
130130 @lds0 = addrspace(3) global [512 x i32] undef, align 4
131131
132132 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
133 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
133 ; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
134 ; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
135 ; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
136 ; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
134137 ; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
135138 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
136139 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
324327 @lds1 = addrspace(3) global [512 x i64] undef, align 8
325328
326329 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
327 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
330 ; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
331 ; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
332 ; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
333 ; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
328334 ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
329335 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
330336 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,NOHSA %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
33
44 @lds0 = addrspace(3) global [512 x float] undef, align 4
55 @lds1 = addrspace(3) global [256 x float] undef, align 4
77 @large = addrspace(3) global [4096 x i32] undef, align 4
88
99 ; CHECK-LABEL: {{^}}groupstaticsize_test0:
10 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
10 ; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
11 ; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
1112 define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
1213 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
1314 %idx.0 = add nsw i32 %tid.x, 64
2122 }
2223
2324 ; CHECK-LABEL: {{^}}groupstaticsize_test1:
24 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
25 ; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
26 ; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
2527 define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
2628 entry:
2729 %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
4951
5052 ; Exceeds 16-bit simm limit of s_movk_i32
5153 ; CHECK-LABEL: {{^}}large_groupstaticsize:
52 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
54 ; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
55 ; HSA: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
5356 define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
5457 %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx
5558 store volatile i32 0, i32 addrspace(3)* %gep
11 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
22
33 @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
4
5 ; Check that the LDS size emitted correctly
6 ; SI: .long 47180
7 ; SI-NEXT: .long 65668
8 ; CI: .long 47180
9 ; CI-NEXT: .long 32900
104
115 ; GCN-LABEL: {{^}}local_memory:
126
5650
5751 ; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
5852 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
53
5954 define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
6055 entry:
6156 %x.i = call i32 @llvm.amdgcn.workitem.id.x()
99 ; not an immediate.
1010
1111 ; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
12 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
13 ; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
12 ; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], lds@abs32@lo
13 ; GCN: ds_read_b32 v{{[0-9]+}}, v[[PTR]] offset:4
1414
1515 ; R600: LDS_READ_RET
1616 define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
66 @tess_lds = external addrspace(3) global [8192 x i32]
77
88 ; CHECK-LABEL: {{^}}main:
9 ; CHECK: ds_write2_b32
9 ; CHECK: ds_write_b32
10 ; CHECK: ds_write_b32
1011 ; CHECK: v_mov_b32_e32 v1, v0
1112 ; CHECK: tbuffer_store_format_xyzw v[0:3],
1213 define amdgpu_vs void @main(i32 inreg %arg) {
test/CodeGen/AMDGPU/over-max-lds-size.ll (deleted file, +0 -14)
0 ; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck -check-prefix=ERROR %s
1 ; RUN: not llc -march=amdgcn -mcpu=hawaii < %s 2>&1 | FileCheck -check-prefix=ERROR %s
2 ; RUN: not llc -march=amdgcn -mcpu=fiji < %s 2>&1 | FileCheck -check-prefix=ERROR %s
3
4 ; ERROR: error: local memory limit exceeded (400000) in use_huge_lds
5
6 @huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4
7
8 define amdgpu_kernel void @use_huge_lds() {
9 entry:
10 %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0
11 store i32 0, i32 addrspace(3)* %v0
12 ret void
13 }
77 ; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
88 ; IR: alloca [10 x i32]
99 ; ASM-LABEL: {{^}}promote_alloca_size_256:
10 ; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only)
10 ; ASM: .amdgpu_lds global_array0, 30000, 4
11 ; ASM: .amdgpu_lds global_array1, 30000, 4
1112
1213 define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
1314 entry:
3232 ; remaining add use goes through the normal shl + add constant fold.
3333
3434 ; GCN-LABEL: {{^}}load_shl_base_lds_1:
35 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
35 ; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
36
37 ; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation
38 ; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
39
3640 ; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
3741 ; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
3842 ; GCN-DAG: buffer_store_dword [[RESULT]]
6771 ; The two globals are placed adjacent in memory, so the same base
6872 ; pointer can be used with an offset into the second one.
6973
74 ; TODO: Recover the optimization of using ds_read2st64_b32 using alignment hints
75
7076 ; GCN-LABEL: {{^}}load_shl_base_lds_2:
71 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
77 ; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
78 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
79 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
7280 ; GCN: s_mov_b32 m0, -1
73 ; GCN-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
81
82 ; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256
83 ; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256
84 ; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
85
7486 ; GCN: s_endpgm
7587 define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
7688 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
33 ; These tests check that the compiler won't crash when it needs to spill
44 ; SGPRs.
55
6 @ddxy_lds = external addrspace(3) global [64 x i32]
7
86 ; GCN-LABEL: {{^}}main:
97 ; GCN: s_wqm
108
119 ; Make sure not emitting unused scratch resource descriptor setup
12 ; GCN-NOT: s_mov_b32
13 ; GCN-NOT: s_mov_b32
14 ; GCN-NOT: s_mov_b32
1510 ; GCN-NOT: s_mov_b32
1611
1712 ; GCN: s_mov_b32 m0
2520 ; TOVGPR: ScratchSize: 0{{$}}
2621 define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
2722 main_body:
23 %lds = inttoptr i32 0 to [64 x i32] addrspace(3)*
2824 %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
2925 %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
3026 %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0)
202198 %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0
203199 %mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
204200 %tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)
205 %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109
201 %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp109
206202 %tmp111 = bitcast float %p2.i to i32
207203 store i32 %tmp111, i32 addrspace(3)* %tmp110
208204 %tmp112 = bitcast float %p2.i96 to i32
209205 store i32 %tmp112, i32 addrspace(3)* %tmp110
210206 %mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
211207 %tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)
212 %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp113
208 %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp113
213209 %tmp115 = and i32 %tmp113, -4
214 %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115
210 %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp115
215211 %tmp117 = add i32 %tmp115, 1
216 %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117
212 %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp117
217213 %tmp119 = bitcast float %p2.i to i32
218214 store i32 %tmp119, i32 addrspace(3)* %tmp114
219215 %tmp120 = load i32, i32 addrspace(3)* %tmp116
240236 %tmp140 = fmul float %tmp59, %p2.i96
241237 %mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
242238 %tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)
243 %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141
239 %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp141
244240 %tmp143 = bitcast float %tmp137 to i32
245241 store i32 %tmp143, i32 addrspace(3)* %tmp142
246242 %tmp144 = bitcast float %tmp138 to i32
251247 store i32 %tmp146, i32 addrspace(3)* %tmp142
252248 %mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
253249 %tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3)
254 %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp147
250 %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp147
255251 %tmp149 = and i32 %tmp147, -4
256 %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp149
252 %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp149
257253 %tmp151 = add i32 %tmp149, 2
258 %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp151
254 %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp151
259255 %tmp153 = bitcast float %tmp137 to i32
260256 store i32 %tmp153, i32 addrspace(3)* %tmp148
261257 %tmp154 = load i32, i32 addrspace(3)* %tmp150
7777
7878 ; CHECK-LABEL: {{^}}promote_alloca_enabled:
7979 ; CHECK: ds_read_b32
80 ; CHECK: ; LDSByteSize: 5120
8180 define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
8281 entry:
8382 %stack = alloca [5 x i32], align 4, addrspace(5)
99 ; CHECK: machineFunctionInfo:
1010 ; CHECK-NEXT: explicitKernArgSize: 128
1111 ; CHECK-NEXT: maxKernArgAlign: 64
12 ; CHECK-NEXT: ldsSize: 2048
12 ; CHECK-NEXT: ldsSize: 0
1313 ; CHECK-NEXT: isEntryFunction: true
1414 ; CHECK-NEXT: noSignedZerosFPMath: false
1515 ; CHECK-NEXT: memoryBound: false