llvm.org GIT mirror llvm / b664d47
R600/SI: Store constant initializer data in constant memory This implements a solution for constant initializers suggested by Vadim Girlin, where we store the data after the shader code and then use the S_GETPC instruction to compute its address. This saves us the trouble of creating a new buffer for constant data and then having to pass the pointer to the kernel via user SGPRs or the input buffer. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213530 91177308-0d34-0410-b5e6-96231b3b80d8 Tom Stellard 5 years ago
18 changed file(s) with 194 addition(s) and 31 deletion(s). Raw diff Collapse all Expand all
6262
6363 extern Target TheAMDGPUTarget;
6464
65 namespace AMDGPU {
66 enum TargetIndex {
67 TI_CONSTDATA_START
68 };
69 }
70
71 #define END_OF_TEXT_LABEL_NAME "EndOfTextLabel"
72
6573 } // End namespace llvm
6674
6775 namespace ShaderType {
8484 DisasmEnabled = TM.getSubtarget().dumpCode();
8585 }
8686
// Emits a label at the very end of the .text section.  The constant
// initializer data is placed after the shader code, so this label marks the
// point from which SI_CONSTDATA_PTR / TI_CONSTDATA_START offsets are
// computed (the MC lowering resolves TI_CONSTDATA_START to this symbol).
// \param M unused; this is the AsmPrinter end-of-module hook.
87 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
88
89 // This label is used to mark the end of the .text section.
90 const TargetLoweringObjectFile &TLOF = getObjFileLowering();
91 OutStreamer.SwitchSection(TLOF.getTextSection());
92 MCSymbol *EndOfTextLabel =
93 OutContext.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
94 OutStreamer.EmitLabel(EndOfTextLabel);
95 }
96
8797 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
8898 SetupMachineFunction(MF);
8999
6969 /// Implemented in AMDGPUMCInstLower.cpp
7070 void EmitInstruction(const MachineInstr *MI) override;
7171
72 void EmitEndOfAsmFile(Module &M) override;
73
7274 protected:
7375 bool DisasmEnabled;
7476 std::vector DisasmLines, HexLines;
2020 #include "AMDGPUSubtarget.h"
2121 #include "R600MachineFunctionInfo.h"
2222 #include "SIMachineFunctionInfo.h"
23 #include "llvm/Analysis/ValueTracking.h"
2423 #include "llvm/CodeGen/CallingConvLower.h"
2524 #include "llvm/CodeGen/MachineFunction.h"
2625 #include "llvm/CodeGen/MachineRegisterInfo.h"
11741173 };
11751174
11761175 return DAG.getMergeValues(Ops, DL);
1177 }
1178
1179 // Lower loads from constant address space global variables
1180 if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1181 isa(
1182 GetUnderlyingObject(Load->getMemOperand()->getValue()))) {
1183
1184
1185 SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
1186 getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1187 Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1188 DAG.getConstant(2, MVT::i32));
1189 return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1190 Load->getChain(), Ptr,
1191 DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
11921176 }
11931177
11941178 if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
22212205 NODE_NAME_CASE(CVT_F32_UBYTE2)
22222206 NODE_NAME_CASE(CVT_F32_UBYTE3)
22232207 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
2208 NODE_NAME_CASE(CONST_DATA_PTR)
22242209 NODE_NAME_CASE(STORE_MSKOR)
22252210 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
22262211 }
7777 virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
7878 const TargetRegisterClass *RC,
7979 unsigned Reg, EVT VT) const;
80 SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
81 SelectionDAG &DAG) const;
80 virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
81 SelectionDAG &DAG) const;
8282 /// \brief Split a vector load into multiple scalar loads.
8383 SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const;
8484 SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
232232 /// T2|v.z| | | |
233233 /// T3|v.w| | | |
234234 BUILD_VERTICAL_VECTOR,
235 /// Pointer to the start of the shader's constant data.
236 CONST_DATA_PTR,
235237 FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
236238 STORE_MSKOR,
237239 LOAD_CONSTANT,
2121 #include "llvm/CodeGen/MachineBasicBlock.h"
2222 #include "llvm/CodeGen/MachineInstr.h"
2323 #include "llvm/IR/Constants.h"
24 #include "llvm/IR/GlobalVariable.h"
2425 #include "llvm/MC/MCCodeEmitter.h"
26 #include "llvm/MC/MCContext.h"
2527 #include "llvm/MC/MCExpr.h"
2628 #include "llvm/MC/MCInst.h"
2729 #include "llvm/MC/MCObjectStreamer.h"
7678 case MachineOperand::MO_MachineBasicBlock:
7779 MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
7880 MO.getMBB()->getSymbol(), Ctx));
81 break;
82 case MachineOperand::MO_GlobalAddress: {
83 const GlobalValue *GV = MO.getGlobal();
84 MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(GV->getName()));
85 MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(Sym, Ctx));
86 break;
87 }
88 case MachineOperand::MO_TargetIndex: {
89 assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START);
90 MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
91 const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
92 MCOp = MCOperand::CreateExpr(Expr);
93 break;
94 }
7995 }
8096 OutMI.addOperand(MCOp);
8197 }
4444 AMDGPUAsmBackend(const Target &T)
4545 : MCAsmBackend() {}
4646
47 unsigned getNumFixupKinds() const override { return 0; };
47 unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; };
4848 void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
4949 uint64_t Value, bool IsPCRel) const override;
5050 bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
7676 unsigned DataSize, uint64_t Value,
7777 bool IsPCRel) const {
7878
79 uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
80 assert(Fixup.getKind() == FK_PCRel_4);
81 *Dst = (Value - 4) / 4;
79 switch ((unsigned)Fixup.getKind()) {
80 default: llvm_unreachable("Unknown fixup kind");
81 case AMDGPU::fixup_si_sopp_br: {
82 uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
83 *Dst = (Value - 4) / 4;
84 break;
85 }
86
87 case AMDGPU::fixup_si_rodata: {
88 uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset());
89 *Dst = Value;
90 break;
91 }
92
93 case AMDGPU::fixup_si_end_of_text: {
94 uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset());
95 // The value points to the last instruction in the text section, so we
96 // need to add 4 bytes to get to the start of the constants.
97 *Dst = Value + 4;
98 break;
99 }
100 }
82101 }
83102
84103 const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
85104 MCFixupKind Kind) const {
86105 const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = {
87106 // name offset bits flags
88 { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }
107 { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
108 { "fixup_si_rodata", 0, 32, 0 },
109 { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel }
89110 };
90111
91112 if (Kind < FirstTargetFixupKind)
99
1010 #include "AMDGPUMCTargetDesc.h"
1111 #include "llvm/MC/MCELFObjectWriter.h"
12 #include "llvm/MC/MCFixup.h"
1213
1314 using namespace llvm;
1415
2021 protected:
2122 unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
2223 bool IsPCRel) const override {
23 llvm_unreachable("Not implemented");
24 return Fixup.getKind();
2425 }
2526
2627 };
1717 /// 16-bit PC relative fixup for SOPP branch instructions.
1818 fixup_si_sopp_br = FirstTargetFixupKind,
1919
20 /// fixup for global addresses with constant initializers
21 fixup_si_rodata,
22
23 /// fixup for offset from instruction to end of text section
24 fixup_si_end_of_text,
25
2026 // Marker
2127 LastTargetFixupKind,
2228 NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
1212 //
1313 //===----------------------------------------------------------------------===//
1414
15 #include "AMDGPU.h"
1516 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1617 #include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
1718 #include "MCTargetDesc/AMDGPUFixupKinds.h"
3940 void operator=(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION;
4041 const MCInstrInfo &MCII;
4142 const MCRegisterInfo &MRI;
43 MCContext &Ctx;
4244
4345 /// \brief Can this operand also contain immediate values?
4446 bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const;
4951 public:
5052 SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
5153 MCContext &ctx)
52 : MCII(mcii), MRI(mri) { }
54 : MCII(mcii), MRI(mri), Ctx(ctx) { }
5355
5456 ~SIMCCodeEmitter() { }
5557
9698 Imm.I = MO.getImm();
9799 else if (MO.isFPImm())
98100 Imm.F = MO.getFPImm();
101 else if (MO.isExpr())
102 return 255;
99103 else
100104 return ~0;
101105
163167 IntFloatUnion Imm;
164168 if (Op.isImm())
165169 Imm.I = Op.getImm();
166 else
170 else if (Op.isFPImm())
167171 Imm.F = Op.getFPImm();
172 else {
173 assert(Op.isExpr());
174 // This will be replaced with a fixup value.
175 Imm.I = 0;
176 }
168177
169178 for (unsigned j = 0; j < 4; j++) {
170179 OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff));
197206 if (MO.isReg())
198207 return MRI.getEncodingValue(MO.getReg());
199208
209 if (MO.isExpr()) {
210 const MCSymbolRefExpr *Expr = cast(MO.getExpr());
211 MCFixupKind Kind;
212 const MCSymbol *Sym =
213 Ctx.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
214
215 if (&Expr->getSymbol() == Sym) {
216 // Add the offset to the beginning of the constant values.
217 Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text;
218 } else {
219 // This is used for constant data stored in .rodata.
220 Kind = (MCFixupKind)AMDGPU::fixup_si_rodata;
221 }
222 Fixups.push_back(MCFixup::Create(4, Expr, Kind, MI.getLoc()));
223 }
224
200225 // Figure out the operand number, needed for isSrcOperand check
201226 unsigned OpNo = 0;
202227 for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) {
1818 #include "R600Defines.h"
1919 #include "R600InstrInfo.h"
2020 #include "R600MachineFunctionInfo.h"
21 #include "llvm/Analysis/ValueTracking.h"
2122 #include "llvm/CodeGen/CallingConvLower.h"
2223 #include "llvm/CodeGen/MachineFrameInfo.h"
2324 #include "llvm/CodeGen/MachineInstrBuilder.h"
15251526 return DAG.getMergeValues(Ops, DL);
15261527 }
15271528
1529 // Lower loads from constant address space global variables
1530 if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1531 isa(
1532 GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {
1533
1534 SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
1535 getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1536 Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1537 DAG.getConstant(2, MVT::i32));
1538 return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1539 LoadNode->getChain(), Ptr,
1540 DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
1541 }
15281542
15291543 if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
15301544 SDValue MergedValues[2] = {
859859 return Chain;
860860 }
861861
// Lowers a GlobalAddress node.  Addresses in the constant address space are
// computed as CONST_DATA_PTR (the 64-bit pointer to the start of the
// shader's constant data, see AMDGPUISD::CONST_DATA_PTR) plus the symbol's
// 32-bit offset; all other address spaces defer to the generic AMDGPU
// lowering.
862 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
863 SDValue Op,
864 SelectionDAG &DAG) const {
865 GlobalAddressSDNode *GSD = cast(Op);
866
867 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
868 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
869
870 SDLoc DL(GSD);
871 const GlobalValue *GV = GSD->getGlobal();
872 MVT PtrVT = getPointerTy(GSD->getAddressSpace());
873
// The global's address becomes a 32-bit offset relative to the constant
// data base pointer.
874 SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
875 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
876
// Split the 64-bit base pointer into its low/high 32-bit halves.
877 SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
878 DAG.getConstant(0, MVT::i32));
879 SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
880 DAG.getConstant(1, MVT::i32));
881
// 64-bit add built from 32-bit ADDC/ADDE, propagating the carry through
// the glue operand of the low half.
882 SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue),
883 PtrLo, GA);
884 SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue),
885 PtrHi, DAG.getConstant(0, MVT::i32),
886 SDValue(Lo.getNode(), 1));
887 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
888 }
889
862890 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
863891 SDLoc DL(Op);
864892 LoadSDNode *Load = cast(Op);
2424 SDValue Chain, unsigned Offset, bool Signed) const;
2525 SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
2626 SelectionDAG &DAG) const;
27 SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
28 SelectionDAG &DAG) const override;
2729 SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
2830 SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
2931 SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
357357 .addImm(Spill.Lane + i);
358358 }
359359 insertNOPs(MI, 3);
360 MI->eraseFromParent();
361 break;
362 }
363 case AMDGPU::SI_CONSTDATA_PTR: {
364 unsigned Reg = MI->getOperand(0).getReg();
365 unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
366 unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
367
368 BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg);
369
370 // Add 32-bit offset from this instruction to the start of the constant data.
371 BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_I32), RegLo)
372 .addReg(RegLo)
373 .addTargetIndex(AMDGPU::TI_CONSTDATA_START)
374 .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit);
375 BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi)
376 .addReg(RegHi)
377 .addImm(0)
378 .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit)
379 .addReg(AMDGPU::SCC, RegState::Implicit);
360380 MI->eraseFromParent();
361381 break;
362382 }
5656 def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
5757 def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
5858
// SelectionDAG node producing the i64 pointer to the start of the shader's
// constant data (AMDGPUISD::CONST_DATA_PTR).  One i64 result, no operands.
59 def SIconstdata_ptr : SDNode<
60 "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]>
61 >;
62
5963 // Transformation function, extract the lower 32bit of a 64bit immediate
6064 def LO32 : SDNodeXForm
6165 return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32);
138138 ////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>;
139139 ////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>;
140140 ////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>;
141 def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>;
// S_GETPC_B64 takes no source operand (empty ins list), so the SSRC0
// encoding field is hard-wired to 0 rather than derived from an operand.
141 def S_GETPC_B64 : SOP1 <
142 0x0000001f, (outs SReg_64:$dst), (ins), "S_GETPC_B64 $dst", []
143 > {
144 let SSRC0 = 0;
145 }
142146 def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>;
143147 def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>;
144148 def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>;
16931697 defm SI_SPILL_S256 : SI_SPILL_SGPR ;
16941698 defm SI_SPILL_S512 : SI_SPILL_SGPR ;
16951699
// Pseudo instruction that materializes the 64-bit pointer to the shader's
// constant data.  Expanded in SIInstrInfo into S_GETPC_B64 followed by a
// 32-bit add-with-carry of TI_CONSTDATA_START (S_ADD_I32 / S_ADDC_U32),
// which writes SCC — hence Defs = [SCC].
1700 let Defs = [SCC] in {
1701
1702 def SI_CONSTDATA_PTR : InstSI <
1703 (outs SReg_64:$dst),
1704 (ins),
1705 "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))]
1706 >;
1707
1708 } // End Defs = [SCC]
1709
16961710 } // end IsCodeGenOnly, isPseudo
16971711
16981712 } // end SubtargetPredicate = SI
33
44 @b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
55
6 ; XXX: Test on SI once 64-bit adds are supported.
7
86 @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4
97
108 ; FUNC-LABEL: @float
9 ; FIXME: We should be using S_LOAD_DWORD here.
10 ; SI: BUFFER_LOAD_DWORD
1111
1212 ; EG-DAG: MOV {{\** *}}T2.X
1313 ; EG-DAG: MOV {{\** *}}T3.X
2727 @i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4
2828
2929 ; FUNC-LABEL: @i32
30
31 ; FIXME: We should be using S_LOAD_DWORD here.
32 ; SI: BUFFER_LOAD_DWORD
3033
3134 ; EG-DAG: MOV {{\** *}}T2.X
3235 ; EG-DAG: MOV {{\** *}}T3.X
4952 @struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
5053
5154 ; FUNC-LABEL: @struct_foo_gv_load
55 ; SI: S_LOAD_DWORD
5256
5357 define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
5458 %gep = getelementptr inbounds [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
6367 <1 x i32> ]
6468
6569 ; FUNC-LABEL: @array_v1_gv_load
70 ; FIXME: We should be using S_LOAD_DWORD here.
71 ; SI: BUFFER_LOAD_DWORD
6672 define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
6773 %gep = getelementptr inbounds [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
6874 %load = load <1 x i32> addrspace(2)* %gep, align 4
None ; XFAIL: *
1 ; REQUIRES: asserts
2 ; RUN: llc -march=r600 -mcpu=SI < %s
1 ; CHECK: S_ENDPGM
32
43 @gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4
54