Commit 3280804: R600/SI: Use scratch memory for large private arrays
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213551 91177308-0d34-0410-b5e6-96231b3b80d8
Tom Stellard, 5 years ago
23 changed file(s) with 511 addition(s) and 108 deletion(s).
2424 #include "SIDefines.h"
2525 #include "SIMachineFunctionInfo.h"
2626 #include "SIRegisterInfo.h"
27 #include "llvm/CodeGen/MachineFrameInfo.h"
2728 #include "llvm/MC/MCContext.h"
2829 #include "llvm/MC/MCSectionELF.h"
2930 #include "llvm/MC/MCStreamer.h"
139140 OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
140141 false);
141142 OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
143 false);
144 OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
142145 false);
143146 } else {
144147 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
331334 // Do not clamp NAN to 0.
332335 ProgInfo.DX10Clamp = 0;
333336
337 const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
338 ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
339
334340 ProgInfo.CodeLen = CodeSize;
335341 }
336342
359365
360366 unsigned LDSBlocks =
361367 RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
368
369 // Scratch is allocated in 256 dword blocks.
370 unsigned ScratchAlignShift = 10;
371 // We need to program the hardware with the amount of scratch memory that
372 // is used by the entire wave. KernelInfo.ScratchSize is the amount of
373 // scratch memory used per thread.
374 unsigned ScratchBlocks =
375 RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(),
376 1 << ScratchAlignShift) >> ScratchAlignShift;
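To make the wave-level rounding concrete, here is a self-contained C++ sketch of the same block arithmetic, assuming a 64-lane SI wavefront and a made-up per-thread scratch size; roundUpToAlignment stands in for LLVM's RoundUpToAlignment helper:

  #include <cstdint>
  #include <cstdio>

  // Round Value up to the next multiple of Align (Align must be a power of two).
  static uint64_t roundUpToAlignment(uint64_t Value, uint64_t Align) {
    return (Value + Align - 1) / Align * Align;
  }

  int main() {
    const unsigned ScratchAlignShift = 10;    // scratch granularity: 256 dwords = 1024 bytes
    const unsigned WavefrontSize = 64;        // assumed SI wavefront width
    const unsigned ScratchSizePerThread = 12; // hypothetical per-thread scratch, in bytes

    unsigned ScratchBlocks =
        roundUpToAlignment(ScratchSizePerThread * WavefrontSize,
                           1 << ScratchAlignShift) >> ScratchAlignShift;
    printf("ScratchBlocks = %u\n", ScratchBlocks); // 12 * 64 = 768 bytes -> 1024 -> 1 block
    return 0;
  }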
362377
363378 if (MFI->getShaderType() == ShaderType::COMPUTE) {
364379 OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
376391 OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
377392
378393 OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
379 OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
394 const uint32_t ComputePGMRSrc2 =
395 S_00B84C_LDS_SIZE(LDSBlocks) |
396 S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
397
398 OutStreamer.EmitIntValue(ComputePGMRSrc2, 4);
399
400 OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
401 OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
380402 } else {
381403 OutStreamer.EmitIntValue(RsrcReg, 4);
382404 OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
3131 DX10Clamp(0),
3232 DebugMode(0),
3333 IEEEMode(0),
34 ScratchSize(0),
3435 CodeLen(0) {}
3536
3637 // Fields set in PGM_RSRC1 pm4 packet.
4243 uint32_t DX10Clamp;
4344 uint32_t DebugMode;
4445 uint32_t IEEEMode;
46 uint32_t ScratchSize;
4547
4648 // Bonus information for debugging.
4749 uint64_t CodeLen;
1515 #include "AMDGPURegisterInfo.h"
1616 #include "AMDGPUSubtarget.h"
1717 #include "R600InstrInfo.h"
18 #include "SIDefines.h"
1819 #include "SIISelLowering.h"
20 #include "SIMachineFunctionInfo.h"
1921 #include "llvm/CodeGen/FunctionLoweringInfo.h"
2022 #include "llvm/CodeGen/PseudoSourceValue.h"
23 #include "llvm/CodeGen/MachineFrameInfo.h"
24 #include "llvm/CodeGen/MachineRegisterInfo.h"
2125 #include "llvm/CodeGen/SelectionDAG.h"
2226 #include "llvm/CodeGen/SelectionDAGISel.h"
2327 #include "llvm/IR/Function.h"
8488 bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
8589 bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
8690 bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset,
87 SDValue &ImmOffset) const;
91 SDValue &ImmOffset) const;
92 bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
93 SDValue &SOffset, SDValue &ImmOffset) const;
94 bool SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
95 SDValue &SOffset, SDValue &Offset, SDValue &Offen,
96 SDValue &Idxen, SDValue &GLC, SDValue &SLC,
97 SDValue &TFE) const;
8898
8999 SDNode *SelectADD_SUB_I64(SDNode *N);
90100 SDNode *SelectDIV_SCALE(SDNode *N);
729739 Ptr), 0);
730740 }
731741
742 static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
743 return isUInt<12>(Imm->getZExtValue());
744 }
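The offset accepted here must fit the MUBUF instruction's 12-bit immediate field, i.e. byte offsets 0 through 4095. A tiny standalone check of that boundary; the range test below stands in for llvm::isUInt<12>:

  #include <cstdint>
  #include <cstdio>

  // Mirrors isLegalMUBUFImmOffset: a MUBUF immediate offset is an unsigned 12-bit field.
  static bool fitsMUBUFImmOffset(uint64_t Imm) { return Imm < (1u << 12); }

  int main() {
    printf("%d %d\n", fitsMUBUFImmOffset(4095), fitsMUBUFImmOffset(4096)); // prints: 1 0
    return 0;
  }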
745
732746 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
733747 SDValue &Offset,
734748 SDValue &ImmOffset) const {
739753 SDValue N1 = Addr.getOperand(1);
740754 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
741755
742 if (isUInt<12>(C1->getZExtValue())) {
756 if (isLegalMUBUFImmOffset(C1)) {
743757
744758 if (N0.getOpcode() == ISD::ADD) {
745759 // (add (add N2, N3), C1)
775789 return true;
776790 }
777791
792 /// \brief Return a resource descriptor with the 'Add TID' bit enabled
793 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
794 /// of the resource descriptor) to create an offset, which is added to the
795 /// resource pointer.
796 static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
797
798 uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
799 0xffffffff;
800
801 SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
802 SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
803 SDValue DataLo = DAG->getTargetConstant(
804 Rsrc & APInt::getAllOnesValue(32).getZExtValue(), MVT::i32);
805 SDValue DataHi = DAG->getTargetConstant(Rsrc >> 32, MVT::i32);
806
807 const SDValue Ops[] = { PtrLo, PtrHi, DataLo, DataHi };
808 return SDValue(DAG->getMachineNode(AMDGPU::SI_BUFFER_RSRC, DL,
809 MVT::v4i32, Ops), 0);
810 }
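For orientation, a rough standalone sketch of how the four 32-bit words of this scratch resource descriptor end up laid out; the two RSRC constants are the ones this patch defines, while the base pointer value is hypothetical:

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint64_t RSRC_DATA_FORMAT = 0xf00000000000ULL; // default buffer data format bits
    const uint64_t RSRC_TID_ENABLE  = 1ULL << 55;        // the 'Add TID' bit
    // Upper 64 bits of the descriptor: format/flag bits plus an all-ones size field.
    uint64_t Rsrc = RSRC_DATA_FORMAT | RSRC_TID_ENABLE | 0xffffffff;

    uint64_t ScratchPtr = 0x12345678abcdULL; // hypothetical scratch base address
    uint32_t Word[4] = {
      (uint32_t)ScratchPtr,          // sub0: base address, low 32 bits
      (uint32_t)(ScratchPtr >> 32),  // sub1: base address high bits; the stride field
                                     //       (descriptor bits [61:48]) also lives here
      (uint32_t)Rsrc,                // sub2: num_records (size), 0xffffffff here
      (uint32_t)(Rsrc >> 32),        // sub3: flags, incl. TID_ENABLE (bit 23 of this word)
    };
    for (int i = 0; i < 4; ++i)
      printf("word%d = 0x%08x\n", i, (unsigned)Word[i]);
    return 0;
  }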
811
812 bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
813 SDValue &VAddr, SDValue &SOffset,
814 SDValue &ImmOffset) const {
815
816 SDLoc DL(Addr);
817 MachineFunction &MF = CurDAG->getMachineFunction();
818 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
819 MachineRegisterInfo &MRI = MF.getRegInfo();
820
821
822 unsigned ScratchPtrReg =
823 TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
824 unsigned ScratchOffsetReg =
825 TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
826
827 Rsrc = buildScratchRSRC(CurDAG, DL, CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64));
828 SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
829 MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
830
831 // (add n0, c1)
832 if (CurDAG->isBaseWithConstantOffset(Addr)) {
833 SDValue N1 = Addr.getOperand(1);
834 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
835
836 if (isLegalMUBUFImmOffset(C1)) {
837 VAddr = Addr.getOperand(0);
838 ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
839 return true;
840 }
841 }
842
843 // (add FI, n0)
844 if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
845 isa<FrameIndexSDNode>(Addr.getOperand(0))) {
846 VAddr = Addr.getOperand(1);
847 ImmOffset = Addr.getOperand(0);
848 return true;
849 }
850
851 // (FI)
852 if (isa<FrameIndexSDNode>(Addr)) {
853 VAddr = SDValue(CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
854 CurDAG->getConstant(0, MVT::i32)), 0);
855 ImmOffset = Addr;
856 return true;
857 }
858
859 // (node)
860 VAddr = Addr;
861 ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
862 return true;
863 }
864
865 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc,
866 SDValue &VAddr, SDValue &SOffset,
867 SDValue &Offset, SDValue &Offen,
868 SDValue &Idxen, SDValue &GLC,
869 SDValue &SLC, SDValue &TFE) const {
870
871 GLC = CurDAG->getTargetConstant(0, MVT::i1);
872 SLC = CurDAG->getTargetConstant(0, MVT::i1);
873 TFE = CurDAG->getTargetConstant(0, MVT::i1);
874
875 Idxen = CurDAG->getTargetConstant(0, MVT::i1);
876 Offen = CurDAG->getTargetConstant(1, MVT::i1);
877
878 return SelectMUBUFScratch(Addr, SRsrc, VAddr, SOffset, Offset);
879 }
880
778881 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
779882 const AMDGPUTargetLowering& Lowering =
780883 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
7070 static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
7171 static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
7272
73 /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
74 /// MachineFunction.
75 ///
76 /// \returns a RegisterSDNode representing Reg.
77 virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
78 const TargetRegisterClass *RC,
79 unsigned Reg, EVT VT) const;
8073 virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
8174 SelectionDAG &DAG) const;
8275 /// \brief Split a vector load into multiple scalar loads.
159152 SDValue Op,
160153 const SelectionDAG &DAG,
161154 unsigned Depth = 0) const override;
155
156 /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
157 /// MachineFunction.
158 ///
159 /// \returns a RegisterSDNode representing Reg.
160 virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
161 const TargetRegisterClass *RC,
162 unsigned Reg, EVT VT) const;
162163 };
163164
164165 namespace AMDGPUISD {
4040 def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
4141 def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
4242
43 let OperandType = "OPERAND_IMMEDIATE" in {
44
4345 def u32imm : Operand <i32> {
4446 let PrintMethod = "printU32ImmOperand";
4547 }
5153 def u8imm : Operand <i8> {
5254 let PrintMethod = "printU8ImmOperand";
5355 }
56
57 } // End OperandType = "OPERAND_IMMEDIATE"
5458
5559 //===--------------------------------------------------------------------===//
5660 // Custom Operands
134138 //===----------------------------------------------------------------------===//
135139 // Load/Store Pattern Fragments
136140 //===----------------------------------------------------------------------===//
141
142 class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
143 return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
144 }]>;
145
146 class PrivateLoad <SDPatternOperator op> : PrivateMemOp <
147 (ops node:$ptr), (op node:$ptr)
148 >;
149
150 class PrivateStore <SDPatternOperator op> : PrivateMemOp <
151 (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
152 >;
153
154 def extloadi8_private : PrivateLoad <az_extloadi8>;
155 def sextloadi8_private : PrivateLoad <sextloadi8>;
156 def extloadi16_private : PrivateLoad <az_extloadi16>;
157 def sextloadi16_private : PrivateLoad <sextloadi16>;
158 def load_private : PrivateLoad <load>;
159
160 def truncstorei8_private : PrivateStore <truncstorei8>;
161 def truncstorei16_private : PrivateStore <truncstorei16>;
162 def store_private : PrivateStore <store>;
137163
138164 def global_store : PatFrag<(ops node:$val, node:$ptr),
139165 (store node:$val, node:$ptr), [{
5050 unsigned getSubRegFromChannel(unsigned Channel) const;
5151
5252 const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
53 void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
53 virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
5454 unsigned FIOperandNum,
5555 RegScavenger *RS) const override;
5656 unsigned getFrameRegister(const MachineFunction &MF) const override;
5151 std::string Ret = "e-p:32:32";
5252
5353 if (ST.is64bit()) {
54 // 32-bit private, local, and region pointers. 64-bit global and constant.
54 // 32-bit local and region pointers. 64-bit private, global, and constant.
5555 Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
5656 }
5757
3131 #define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
3232 #define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
3333 #define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C
34 #define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0)
3435 #define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15)
3536 #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
3637
8485 #define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4)
8586 #define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
8687
88 #define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
89 #define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12)
90
8791 #endif // SIDEFINES_H_
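As a quick illustration of how these field macros combine, a minimal sketch that packs hypothetical block counts into the two compute registers written by the AsmPrinter change above; the macro definitions are copied from this header:

  #include <cstdint>
  #include <cstdio>

  #define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0)
  #define S_00B84C_LDS_SIZE(x)   (((x) & 0x1FF) << 15)
  #define S_00B860_WAVESIZE(x)   (((x) & 0x1FFF) << 12)

  int main() {
    unsigned LDSBlocks = 2, ScratchBlocks = 1; // hypothetical block counts
    uint32_t Rsrc2 = S_00B84C_LDS_SIZE(LDSBlocks) | S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
    uint32_t TmpRing = S_00B860_WAVESIZE(ScratchBlocks);
    printf("COMPUTE_PGM_RSRC2    = 0x%08x\n", (unsigned)Rsrc2);   // 0x00010001
    printf("COMPUTE_TMPRING_SIZE = 0x%08x\n", (unsigned)TmpRing); // 0x00001000
    return 0;
  }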
390390 }
391391
392392 // The pointer to the list of arguments is stored in SGPR0, SGPR1
393 // The pointer to the scratch buffer is stored in SGPR2, SGPR3
393394 if (Info->getShaderType() == ShaderType::COMPUTE) {
395 Info->NumUserSGPRs = 4;
394396 CCInfo.AllocateReg(AMDGPU::SGPR0);
395397 CCInfo.AllocateReg(AMDGPU::SGPR1);
398 CCInfo.AllocateReg(AMDGPU::SGPR2);
399 CCInfo.AllocateReg(AMDGPU::SGPR3);
396400 MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
401 MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass);
397402 }
398403
399404 if (Info->getShaderType() == ShaderType::COMPUTE) {
508513 MI->eraseFromParent();
509514 break;
510515 }
516 case AMDGPU::SI_BUFFER_RSRC: {
517 unsigned SuperReg = MI->getOperand(0).getReg();
518 unsigned Args[4];
519 for (unsigned i = 0, e = 4; i < e; ++i) {
520 MachineOperand &Arg = MI->getOperand(i + 1);
521
522 if (Arg.isReg()) {
523 Args[i] = Arg.getReg();
524 continue;
525 }
526
527 assert(Arg.isImm());
528 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
529 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg)
530 .addImm(Arg.getImm());
531 Args[i] = Reg;
532 }
533 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
534 SuperReg)
535 .addReg(Args[0])
536 .addImm(AMDGPU::sub0)
537 .addReg(Args[1])
538 .addImm(AMDGPU::sub1)
539 .addReg(Args[2])
540 .addImm(AMDGPU::sub2)
541 .addReg(Args[3])
542 .addImm(AMDGPU::sub3);
543 MI->eraseFromParent();
544 break;
545 }
511546 case AMDGPU::V_SUB_F64: {
512547 unsigned DestReg = MI->getOperand(0).getReg();
513548 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
619654 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
620655 switch (Op.getOpcode()) {
621656 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
657 case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
622658 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
623659 case ISD::LOAD: {
624660 LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
657693 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
658694 EVT VT = Op.getValueType();
659695 SDLoc DL(Op);
660 //XXX: Hardcoded we only use two to store the pointer to the parameters.
661 unsigned NumUserSGPRs = 2;
662696 switch (IntrinsicID) {
663697 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
664698 case Intrinsic::r600_read_ngroups_x:
681715 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
682716 case Intrinsic::r600_read_tgid_x:
683717 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
684 AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
718 AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT);
685719 case Intrinsic::r600_read_tgid_y:
686720 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
687 AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
721 AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT);
688722 case Intrinsic::r600_read_tgid_z:
689723 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
690 AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
724 AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT);
691725 case Intrinsic::r600_read_tidig_x:
692726 return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
693727 AMDGPU::VGPR0, VT);
781815 return nullptr;
782816 }
783817
818 SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
819
820 MachineFunction &MF = DAG.getMachineFunction();
821 const SIInstrInfo *TII =
822 static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
823 const SIRegisterInfo &TRI = TII->getRegisterInfo();
824 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
825 unsigned FrameIndex = FINode->getIndex();
826
827 CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
828 TRI.getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET), MVT::i32);
829
830 return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
831 }
832
784833 /// This transforms the control flow intrinsics to get the branch destination as
785834 /// last parameter, also switches branch target with BR if the need arise
786835 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
890939 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
891940 SDLoc DL(Op);
892941 LoadSDNode *Load = cast<LoadSDNode>(Op);
942 // Vector private memory loads have already been split, and
943 // all the rest of private memory loads are legal.
944 if (Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
945 return SDValue();
946 }
893947 SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
894948 if (Lowered.getNode())
895949 return Lowered;
10791133 VT.isVector() && VT.getVectorNumElements() == 2 &&
10801134 VT.getVectorElementType() == MVT::i32)
10811135 return SDValue();
1136
1137 if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
1138 if (VT.isVector() && VT.getVectorNumElements() > 4)
1139 return SplitVectorStore(Op, DAG);
1140 return SDValue();
1141 }
10821142
10831143 SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
10841144 if (Ret.getNode())
14941554
14951555 // This is a conservative approach. It is possible that we can't determine the
14961556 // correct register class and copy too often, but better safe than sorry.
1497 SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
1498 SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
1499 Operand.getValueType(), Operand, RC);
1557
1558 SDNode *Node;
1559 // We can't use COPY_TO_REGCLASS with FrameIndex arguments.
1560 if (isa<FrameIndexSDNode>(Operand)) {
1561 unsigned Opcode = Operand.getValueType() == MVT::i32 ?
1562 AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1563 Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(),
1564 Operand);
1565 } else {
1566 SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
1567 Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
1568 Operand.getValueType(), Operand, RC);
1569 }
15001570 Operand = SDValue(Node, 0);
15011571 }
15021572
15901660 ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
15911661 }
15921662 continue;
1663 } else {
1664 // If it's not a VSrc or SSrc operand check if we have a GlobalAddress.
1665 // These will be lowered to immediates, so we will need to insert a MOV.
1666 if (isa<GlobalAddressSDNode>(Ops[i])) {
1667 SDNode *Node = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(),
1668 Operand.getValueType(), Operand);
1669 Ops[i] = SDValue(Node, 0);
1670 }
15931671 }
15941672
15951673 if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {
2626 SelectionDAG &DAG) const;
2727 SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
2828 SelectionDAG &DAG) const override;
29 SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
2930 SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
3031 SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
3132 SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
560560 }
561561 }
562562
563 bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
564 const MachineOperand &MO) const {
565 const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
566
567 assert(MO.isImm() || MO.isFPImm());
568
569 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
570 return true;
571
572 if (OpInfo.RegClass < 0)
573 return false;
574
575 return RI.regClassCanUseImmediate(OpInfo.RegClass);
576 }
577
563578 bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
564579 StringRef &ErrInfo) const {
565580 uint16_t Opcode = MI->getOpcode();
588603 }
589604 break;
590605 case MCOI::OPERAND_IMMEDIATE:
591 if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm()) {
606 // Check if this operand is an immediate.
607 // FrameIndex operands will be replaced by immediates, so they are
608 // allowed.
609 if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() &&
610 !MI->getOperand(i).isFI()) {
592611 ErrInfo = "Expected immediate, but got non-immediate";
593612 return false;
594613 }
105105 bool isInlineConstant(const MachineOperand &MO) const;
106106 bool isLiteralConstant(const MachineOperand &MO) const;
107107
108 bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
109 const MachineOperand &MO) const;
110
108111 bool verifyInstruction(const MachineInstr *MI,
109112 StringRef &ErrInfo) const override;
110113
180183 int getMCOpcode(uint16_t Opcode, unsigned Gen);
181184
182185 const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
183
186 const uint64_t RSRC_TID_ENABLE = 1LL << 55;
184187
185188 } // End namespace AMDGPU
186189
162162 // Complex patterns
163163 //===----------------------------------------------------------------------===//
164164
165 def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
165166 def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
167 def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
166168
167169 //===----------------------------------------------------------------------===//
168170 // SI assembler operands
604606 asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
605607 }
606608
607 let offen = 1, idxen = 0, offset = 0 in {
609 let offen = 1, idxen = 0 in {
608610 def _OFFEN : MUBUF <op, (outs regClass:$vdata),
609611 (ins SReg_128:$srsrc, VReg_32:$vaddr,
610 SSrc_32:$soffset, i1imm:$glc, i1imm:$slc,
612 SSrc_32:$soffset, u16imm:$offset, i1imm:$glc, i1imm:$slc,
611613 i1imm:$tfe),
612 asm#" $vdata, $srsrc + $vaddr + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
614 asm#" $vdata, $srsrc + $vaddr + $soffset + $offset, glc=$glc, slc=$slc, tfe=$tfe", []>;
613615 }
614616
615617 let offen = 0, idxen = 1 in {
639641 }
640642 }
641643
642 class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
643 ValueType store_vt, SDPatternOperator st> :
644 MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
645 u16imm:$offset),
646 name#" $vdata, $srsrc + $vaddr + $offset",
647 [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {
648
649 let mayLoad = 0;
650 let mayStore = 1;
651
652 // Encoding
653 let offen = 0;
654 let idxen = 0;
655 let glc = 0;
656 let addr64 = 1;
657 let lds = 0;
658 let slc = 0;
659 let tfe = 0;
660 let soffset = 128; // ZERO
644 multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
645 ValueType store_vt, SDPatternOperator st> {
646
647 def "" : MUBUF <
648 op, (outs),
649 (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset,
650 u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$slc,
651 i1imm:$tfe),
652 name#" $vdata, $srsrc, $vaddr, $soffset, $offset $offen $idxen $glc $slc $tfe",
653 []
654 > {
655 let addr64 = 0;
656 }
657
658 def _ADDR64 : MUBUF <
659 op, (outs),
660 (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
661 name#" $vdata, $srsrc + $vaddr + $offset",
662 [(st store_vt:$vdata,
663 (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {
664
665 let mayLoad = 0;
666 let mayStore = 1;
667
668 // Encoding
669 let offen = 0;
670 let idxen = 0;
671 let glc = 0;
672 let addr64 = 1;
673 let lds = 0;
674 let slc = 0;
675 let tfe = 0;
676 let soffset = 128; // ZERO
677 }
661678 }
662679
663680 class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
871871 0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load
872872 >;
873873
874 def BUFFER_STORE_BYTE : MUBUF_Store_Helper <
874 defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
875875 0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global
876876 >;
877877
878 def BUFFER_STORE_SHORT : MUBUF_Store_Helper <
878 defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
879879 0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global
880880 >;
881881
882 def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
882 defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
883883 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store
884884 >;
885885
886 def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
886 defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
887887 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store
888888 >;
889889
890 def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
890 defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
891891 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store
892892 >;
893893 //def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
16661666 "", []
16671667 >;
16681668
1669 def SI_BUFFER_RSRC : InstSI <
1670 (outs SReg_128:$srsrc),
1671 (ins SReg_32:$ptr_lo, SReg_32:$ptr_hi, SSrc_32:$data_lo, SSrc_32:$data_hi),
1672 "", []
1673 >;
1674
16691675 def V_SUB_F64 : InstSI <
16701676 (outs VReg_64:$dst),
16711677 (ins VReg_64:$src0, VReg_64:$src1),
24092415 // Offset in a 32-bit VGPR
24102416 def : Pat <
24112417 (SIload_constant v4i32:$sbase, i32:$voff),
2412 (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0)
2418 (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0)
24132419 >;
24142420
24152421 // The multiplication scales from [0,1] to the unsigned integer range
25982604 (vt (constant_ld (add i64:$ptr, i64:$offset))),
25992605 (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
26002606 >;
2601 }
2602
2603 defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32,
2604 sextloadi8_constant>;
2605 defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32,
2606 az_extloadi8_constant>;
2607 defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32,
2608 sextloadi16_constant>;
2609 defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32,
2610 az_extloadi16_constant>;
2611 defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32,
2612 constant_load>;
2613 defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32,
2614 constant_load>;
2615 defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32,
2616 constant_load>;
2607
2608 }
2609
2610 defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
2611 defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
2612 defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
2613 defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
2614 defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>;
2615 defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>;
2616 defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>;
2617
2618 class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
2619 (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
2620 i32:$soffset, u16imm:$offset))),
2621 (Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0)
2622 >;
2623
2624 def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
2625 def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
2626 def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
2627 def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
2628 def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
2629 def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
2630 def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
26172631
26182632 // BUFFER_LOAD_DWORD*, addr64=0
26192633 multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen,
26292643
26302644 def : Pat <
26312645 (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
2632 imm, 1, 0, imm:$glc, imm:$slc,
2646 imm:$offset, 1, 0, imm:$glc, imm:$slc,
26332647 imm:$tfe)),
2634 (offen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
2648 (offen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
26352649 (as_i1imm $tfe))
26362650 >;
26372651
26582672 BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>;
26592673 defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
26602674 BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
2675
2676 class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
2677 (st vt:$value, (MUBUFAddr32 v4i32:$srsrc, i32:$vaddr, i32:$soffset,
2678 u16imm:$offset, i1imm:$offen, i1imm:$idxen,
2679 i1imm:$glc, i1imm:$slc, i1imm:$tfe)),
2680 (Instr $value, $srsrc, $vaddr, $soffset, $offset, $offen, $idxen,
2681 $glc, $slc, $tfe)
2682 >;
2683
2684 def : MUBUFScratchStorePat <BUFFER_STORE_BYTE, i32, truncstorei8_private>;
2685 def : MUBUFScratchStorePat <BUFFER_STORE_SHORT, i32, truncstorei16_private>;
2686 def : MUBUFScratchStorePat <BUFFER_STORE_DWORD, i32, store_private>;
2687 def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2, v2i32, store_private>;
2688 def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4, v4i32, store_private>;
2689
2690 /*
2691 class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
2692 (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)),
2693 (Instr $value, $srsrc, $vaddr, $offset)
2694 >;
2695
2696 def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i8, truncstorei8_private>;
2697 def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i16, truncstorei16_private>;
2698 def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>;
2699 def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>;
2700 def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>;
2701
2702 */
26612703
26622704 //===----------------------------------------------------------------------===//
26632705 // MTBUF Patterns
2626 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
2727 : AMDGPUMachineFunction(MF),
2828 PSInputAddr(0),
29 SpillTracker() { }
29 SpillTracker(),
30 NumUserSGPRs(0) { }
3031
3132 static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
3233 unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
5858 SIMachineFunctionInfo(const MachineFunction &MF);
5959 unsigned PSInputAddr;
6060 struct RegSpillTracker SpillTracker;
61 unsigned NumUserSGPRs;
6162 };
6263
6364 } // End namespace llvm
1515 #include "SIRegisterInfo.h"
1616 #include "AMDGPUSubtarget.h"
1717 #include "SIInstrInfo.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "llvm/CodeGen/MachineFrameInfo.h"
20 #include "llvm/CodeGen/MachineInstrBuilder.h"
21 #include "llvm/CodeGen/RegisterScavenging.h"
1822
1923 using namespace llvm;
2024
2630 BitVector Reserved(getNumRegs());
2731 Reserved.set(AMDGPU::EXEC);
2832 Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
29 const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
30 TII->reserveIndirectRegisters(Reserved, MF);
3133 return Reserved;
3234 }
3335
3436 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
3537 MachineFunction &MF) const {
3638 return RC->getNumRegs();
39 }
40
41 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
42 return Fn.getFrameInfo()->hasStackObjects();
43 }
44
45 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
46 int SPAdj, unsigned FIOperandNum,
47 RegScavenger *RS) const {
48 MachineFunction *MF = MI->getParent()->getParent();
49 MachineFrameInfo *FrameInfo = MF->getFrameInfo();
50 const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
51 MachineOperand &FIOp = MI->getOperand(FIOperandNum);
52 int Index = MI->getOperand(FIOperandNum).getIndex();
53 int64_t Offset = FrameInfo->getObjectOffset(Index);
54
55 FIOp.ChangeToImmediate(Offset);
56 if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
57 unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj);
58 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
59 TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
60 .addImm(Offset);
61 FIOp.ChangeToRegister(TmpReg, false);
62 }
3763 }
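Conceptually, the rewrite above has two outcomes: the frame object's byte offset either folds into the instruction as an immediate, or it is first materialized into a scavenged VGPR with V_MOV_B32_e32. A simplified toy model of that decision, assuming the operand's legal immediate range is the unsigned 12-bit MUBUF field checked elsewhere in this patch:

  #include <cstdint>
  #include <cstdio>

  struct Rewrite { bool NeedsTempVGPR; int64_t Imm; };

  // Toy model of eliminateFrameIndex: keep the offset inline when it encodes,
  // otherwise pretend a VGPR was scavenged and the offset moved into it.
  static Rewrite rewriteFrameIndex(int64_t ObjectOffset) {
    if (ObjectOffset >= 0 && ObjectOffset < (1 << 12))
      return {false, ObjectOffset}; // FIOp.ChangeToImmediate(Offset)
    return {true, ObjectOffset};    // V_MOV_B32_e32 Tmp, Offset; FIOp.ChangeToRegister(Tmp)
  }

  int main() {
    printf("%d %d\n", rewriteFrameIndex(16).NeedsTempVGPR,    // 0: folds as an immediate
                      rewriteFrameIndex(8192).NeedsTempVGPR); // 1: needs a temporary VGPR
    return 0;
  }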
3864
3965 const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
140166 const TargetRegisterClass *RC) const {
141167 return regClassCanUseImmediate(RC->getID());
142168 }
169
170 unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
171 enum PreloadedValue Value) const {
172
173 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
174 switch (Value) {
175 case SIRegisterInfo::TGID_X:
176 return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
177 case SIRegisterInfo::TGID_Y:
178 return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
179 case SIRegisterInfo::TGID_Z:
180 return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
181 case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
182 return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
183 case SIRegisterInfo::SCRATCH_PTR:
184 return AMDGPU::SGPR2_SGPR3;
185 }
186 }
2727
2828 unsigned getRegPressureLimit(const TargetRegisterClass *RC,
2929 MachineFunction &MF) const override;
30
31 bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
32
33 void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
34 unsigned FIOperandNum,
35 RegScavenger *RS) const override;
3036
3137 /// \brief get the register class of the specified type to use in the
3238 /// CFGStructurizer
6773 /// \returns True if operands defined with this register class can accept
6874 /// inline immediates.
6975 bool regClassCanUseImmediate(const TargetRegisterClass *RC) const;
76
77 enum PreloadedValue {
78 TGID_X,
79 TGID_Y,
80 TGID_Z,
81 SCRATCH_WAVE_OFFSET,
82 SCRATCH_PTR
83 };
84
85 /// \brief Returns the physical register that \p Value is stored in.
86 unsigned getPreloadedValue(const MachineFunction &MF,
87 enum PreloadedValue Value) const;
88
7089 };
7190
7291 } // End namespace llvm
1010
1111 ; SI-LABEL: @test_private_array_ptr_calc:
1212
13 ; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
14
15 ; SI-ALLOCA: V_MOVRELD_B32_e32 {{v[0-9]+}}, [[PTRREG]]
13 ; FIXME: We end up with a zero argument for the ADD, because
14 ; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
15 ; with the appropriate offset. We should fold this into the store.
16 ; SI-ALLOCA: V_ADD_I32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}}
17 ; SI-ALLOCA: BUFFER_STORE_DWORD {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], [[PTRREG]]
1618 ;
1719 ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
1820 ; alloca to a vector. It currently fails because it does not know how
1921 ; to interpret:
2022 ; getelementptr [4 x i32]* %alloca, i32 1, i32 %b
2123
24 ; SI-PROMOTE: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
2225 ; SI-PROMOTE: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]]
2326 define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
2427 %alloca = alloca [4 x i32], i32 4, align 16
7575 store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
7676 ret void
7777 }
78
79 define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
80 entry:
81 %0 = icmp eq i32 0, %a
82 br i1 %0, label %if, label %else
83
84 if:
85 %1 = getelementptr inbounds [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
86 %2 = load float addrspace(2)* %1
87 store float %2, float addrspace(1)* %out
88 br label %endif
89
90 else:
91 store float 1.0, float addrspace(1)* %out
92 br label %endif
93
94 endif:
95 ret void
96 }
55
66 ; SI-LABEL: @private_access_f64_alloca:
77
8 ; SI-ALLOCA: V_MOVRELD_B32_e32
9 ; SI-ALLOCA: V_MOVRELD_B32_e32
10 ; SI-ALLOCA: V_MOVRELS_B32_e32
11 ; SI-ALLOCA: V_MOVRELS_B32_e32
8 ; SI-ALLOCA: BUFFER_STORE_DWORDX2
9 ; FIXME: We should be able to use BUFFER_LOAD_DWORDX2
10 ; SI-ALLOCA: BUFFER_LOAD_DWORD
11 ; SI-ALLOCA: BUFFER_LOAD_DWORD
1212
1313 ; SI-PROMOTE: DS_WRITE_B64
1414 ; SI-PROMOTE: DS_READ_B64
2525
2626 ; SI-LABEL: @private_access_v2f64_alloca:
2727
28 ; SI-ALLOCA: V_MOVRELD_B32_e32
29 ; SI-ALLOCA: V_MOVRELD_B32_e32
30 ; SI-ALLOCA: V_MOVRELS_B32_e32
31 ; SI-ALLOCA: V_MOVRELS_B32_e32
28 ; SI-ALLOCA: BUFFER_STORE_DWORDX4
29 ; FIXME: We should be able to use BUFFER_LOAD_DWORDX4
30 ; SI-ALLOCA: BUFFER_LOAD_DWORD
31 ; SI-ALLOCA: BUFFER_LOAD_DWORD
32 ; SI-ALLOCA: BUFFER_LOAD_DWORD
33 ; SI-ALLOCA: BUFFER_LOAD_DWORD
3234
3335 ; SI-PROMOTE: DS_WRITE_B32
3436 ; SI-PROMOTE: DS_WRITE_B32
5153
5254 ; SI-LABEL: @private_access_i64_alloca:
5355
54 ; SI-ALLOCA: V_MOVRELD_B32_e32
55 ; SI-ALLOCA: V_MOVRELD_B32_e32
56 ; SI-ALLOCA: V_MOVRELS_B32_e32
57 ; SI-ALLOCA: V_MOVRELS_B32_e32
56 ; SI-ALLOCA: BUFFER_STORE_DWORDX2
57 ; FIXME: We should be able to use BUFFER_LOAD_DWORDX2
58 ; SI-ALLOCA: BUFFER_LOAD_DWORD
59 ; SI-ALLOCA: BUFFER_LOAD_DWORD
5860
5961 ; SI-PROMOTE: DS_WRITE_B64
6062 ; SI-PROMOTE: DS_READ_B64
7173
7274 ; SI-LABEL: @private_access_v2i64_alloca:
7375
74 ; SI-ALLOCA: V_MOVRELD_B32_e32
75 ; SI-ALLOCA: V_MOVRELD_B32_e32
76 ; SI-ALLOCA: V_MOVRELD_B32_e32
77 ; SI-ALLOCA: V_MOVRELD_B32_e32
78 ; SI-ALLOCA: V_MOVRELS_B32_e32
79 ; SI-ALLOCA: V_MOVRELS_B32_e32
80 ; SI-ALLOCA: V_MOVRELS_B32_e32
81 ; SI-ALLOCA: V_MOVRELS_B32_e32
76 ; SI-ALLOCA: BUFFER_STORE_DWORDX4
77 ; FIXME: We should be able to use BUFFER_LOAD_DWORDX4
78 ; SI-ALLOCA: BUFFER_LOAD_DWORD
79 ; SI-ALLOCA: BUFFER_LOAD_DWORD
80 ; SI-ALLOCA: BUFFER_LOAD_DWORD
81 ; SI-ALLOCA: BUFFER_LOAD_DWORD
8282
8383 ; SI-PROMOTE: DS_WRITE_B32
8484 ; SI-PROMOTE: DS_WRITE_B32
1515 ; SI-PROMOTE: DS_READ_B32
1616 ; SI-PROMOTE: DS_READ_B32
1717
18 ; SI-ALLOCA: V_READFIRSTLANE_B32 vcc_lo
19 ; SI-ALLOCA: V_MOVRELD
20 ; SI-ALLOCA: S_CBRANCH
21 ; SI-ALLOCA: V_READFIRSTLANE_B32 vcc_lo
22 ; SI-ALLOCA: V_MOVRELD
23 ; SI-ALLOCA: S_CBRANCH
18 ; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
19 ; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
2420 define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
2521 entry:
2622 %stack = alloca [5 x i32], align 4
119115
120116 ; R600: MOVA_INT
121117
122 ; SI-PROMOTE: V_MOVRELS_B32_e32
118 ; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
119 ; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
120 ; SI-PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] + v{{[0-9]+}}, s{{[0-9]+}}
123121 define void @short_array(i32 addrspace(1)* %out, i32 %index) {
124122 entry:
125123 %0 = alloca [2 x i16]
138136
139137 ; R600: MOVA_INT
140138
141 ; SI: V_OR_B32_e32 v{{[0-9]}}, 0x100
142 ; SI: V_MOVRELS_B32_e32
139 ; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}, 0x0
140 ; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}, 0x1
143141 define void @char_array(i32 addrspace(1)* %out, i32 %index) {
144142 entry:
145143 %0 = alloca [2 x i8]
126126 ret void
127127 }
128128
129 ; The tgid values are stored in ss offset by the number of user ss.
130 ; Currently we always use exactly 2 user ss for the pointer to the
129 ; The tgid values are stored in sgprs offset by the number of user sgprs.
130 ; Currently we always use exactly 2 user sgprs for the pointer to the
131131 ; kernel arguments, but this may change in the future.
132132
133133 ; SI-CHECK: @tgid_x
134 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s2
134 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
135135 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
136136 define void @tgid_x (i32 addrspace(1)* %out) {
137137 entry:
141141 }
142142
143143 ; SI-CHECK: @tgid_y
144 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s3
144 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s5
145145 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
146146 define void @tgid_y (i32 addrspace(1)* %out) {
147147 entry:
151151 }
152152
153153 ; SI-CHECK: @tgid_z
154 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
154 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s6
155155 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
156156 define void @tgid_z (i32 addrspace(1)* %out) {
157157 entry: