llvm.org GIT mirror: llvm, commit e3924b1 (Nicolai Haehnle)

AMDGPU: Divergence-driven selection of scalar buffer load intrinsics

Summary:
Moving SMRD to VMEM in SIFixSGPRCopies is rather bad for performance if the
load is really uniform. So select the scalar load intrinsics directly to
either VMEM or SMRD buffer loads based on divergence analysis.

If an offset happens to end up in a VGPR -- either because a floating point
calculation was involved, or due to other remaining deficiencies in
SIFixSGPRCopies -- we use v_readfirstlane.

There is some unrelated churn in tests since we now select MUBUF offsets in a
unified way with non-scalar buffer loads.

Change-Id: I170e6816323beb1348677b358c9d380865cd1a19

Reviewers: arsenm, alex-t, rampitec, tpr

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D53283

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@348050 91177308-0d34-0410-b5e6-96231b3b80d8
8 changed file(s) with 124 addition(s) and 241 deletion(s).
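To make the new behaviour concrete, here is a small standalone C++ model of the decision this patch adds (illustrative only: the function name is invented, and only the uniform/divergent split, the 4-dword pieces and the 16-byte stride are taken from the lowerSBuffer code in the diff below).

#include <cstdio>

// Toy model: a uniform offset keeps the scalar S_BUFFER_LOAD path, while a
// divergent offset is lowered to one or more MUBUF loads of at most 4 dwords
// each, spaced 16 bytes apart.
static void selectSBufferLoad(bool OffsetIsDivergent, unsigned NumDwords) {
  if (!OffsetIsDivergent) {
    printf("s_buffer_load_dwordx%u\n", NumDwords);
    return;
  }
  unsigned NumLoads = NumDwords <= 4 ? 1 : NumDwords / 4;
  unsigned DwordsPerLoad = NumDwords <= 4 ? NumDwords : 4;
  for (unsigned I = 0; I < NumLoads; ++I)
    printf("buffer_load_dwordx%u offset:%u\n", DwordsPerLoad, 16 * I);
}

int main() {
  selectSBufferLoad(false, 4);  // uniform offset: one s_buffer_load_dwordx4
  selectSBufferLoad(true, 16);  // divergent offset: four buffer_load_dwordx4
}

The divergent 16-dword case prints offsets 0, 16, 32 and 48, which is the same split that shows up in the smrd_load_nonconst4/5 test updates further down.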
48474847 return SDValue(NewNode, 0);
48484848 }
48494849
4850 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
4851 SDValue Offset, SDValue GLC,
4852 SelectionDAG &DAG) const {
4853 MachineFunction &MF = DAG.getMachineFunction();
4854 MachineMemOperand *MMO = MF.getMachineMemOperand(
4855 MachinePointerInfo(),
4856 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4857 MachineMemOperand::MOInvariant,
4858 VT.getStoreSize(), VT.getStoreSize());
4859
4860 if (!Offset->isDivergent()) {
4861 SDValue Ops[] = {
4862 Rsrc,
4863 Offset, // Offset
4864 GLC // glc
4865 };
4866 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
4867 DAG.getVTList(VT), Ops, VT, MMO);
4868 }
4869
4870 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
4871 // assume that the buffer is unswizzled.
4872 SmallVector<SDValue, 4> Loads;
4873 unsigned NumLoads = 1;
4874 MVT LoadVT = VT.getSimpleVT();
4875
4876 assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
4877 LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
4878
4879 if (VT == MVT::v8i32 || VT == MVT::v16i32) {
4880 NumLoads = VT == MVT::v16i32 ? 4 : 2;
4881 LoadVT = MVT::v4i32;
4882 }
4883
4884 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
4885 unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
4886 SDValue Ops[] = {
4887 DAG.getEntryNode(), // Chain
4888 Rsrc, // rsrc
4889 DAG.getConstant(0, DL, MVT::i32), // vindex
4890 {}, // voffset
4891 {}, // soffset
4892 {}, // offset
4893 DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
4894 DAG.getConstant(0, DL, MVT::i1), // idxen
4895 };
4896
4897 // Use the alignment to ensure that the required offsets will fit into the
4898 // immediate offsets.
4899 setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
4900
4901 uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
4902 for (unsigned i = 0; i < NumLoads; ++i) {
4903 Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
4904 Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
4905 Ops, LoadVT, MMO));
4906 }
4907
4908 if (VT == MVT::v8i32 || VT == MVT::v16i32)
4909 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
4910
4911 return Loads[0];
4912 }
4913
48504914 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
48514915 SelectionDAG &DAG) const {
48524916 MachineFunction &MF = DAG.getMachineFunction();
50015065 SDLoc(DAG.getEntryNode()),
50025066 MFI->getArgInfo().WorkItemIDZ);
50035067 case AMDGPUIntrinsic::SI_load_const: {
5004 SDValue Ops[] = {
5005 Op.getOperand(1), // Ptr
5006 Op.getOperand(2), // Offset
5007 DAG.getTargetConstant(0, DL, MVT::i1) // glc
5008 };
5009
5010 MachineMemOperand *MMO = MF.getMachineMemOperand(
5011 MachinePointerInfo(),
5012 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5013 MachineMemOperand::MOInvariant,
5014 VT.getStoreSize(), 4);
5015 SDVTList VTList = DAG.getVTList(MVT::i32);
5016 SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
5017 VTList, Ops, MVT::i32, MMO);
5018
5068 SDValue Load =
5069 lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
5070 DAG.getTargetConstant(0, DL, MVT::i1), DAG);
50195071 return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
50205072 }
50215073 case Intrinsic::amdgcn_s_buffer_load: {
50225074 unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
5023 SDValue Ops[] = {
5024 Op.getOperand(1), // Ptr
5025 Op.getOperand(2), // Offset
5026 DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
5027 };
5028
5029 MachineMemOperand *MMO = MF.getMachineMemOperand(
5030 MachinePointerInfo(),
5031 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5032 MachineMemOperand::MOInvariant,
5033 VT.getStoreSize(), VT.getStoreSize());
5034 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
5035 Op->getVTList(), Ops, VT, MMO);
5075 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
5076 DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
50365077 }
50375078 case Intrinsic::amdgcn_fdiv_fast:
50385079 return lowerFDIV_FAST(Op, DAG);
60676108 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
60686109 // pointed to by Offsets.
60696110 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
6070 SelectionDAG &DAG,
6071 SDValue *Offsets) const {
6111 SelectionDAG &DAG, SDValue *Offsets,
6112 unsigned Align) const {
60726113 SDLoc DL(CombinedOffset);
60736114 if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
60746115 uint32_t Imm = C->getZExtValue();
60756116 uint32_t SOffset, ImmOffset;
6076 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) {
6117 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
60776118 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
60786119 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
60796120 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
60856126 SDValue N1 = CombinedOffset.getOperand(1);
60866127 uint32_t SOffset, ImmOffset;
60876128 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
6088 if (Offset >= 0
6089 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) {
6129 if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
6130 Subtarget, Align)) {
60906131 Offsets[0] = N0;
60916132 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
60926133 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
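Note on the new Align parameter: for the NumLoads > 1 case in lowerSBuffer above, setBufferOffsets is called with a 16 * NumLoads alignment, so the immediate part of the split offset is at most alignDown(4095, 64) = 4032 for a v16i32 load; the per-piece offsets 0, 16, 32 and 48 added on top of it then still stay within the 4095 MUBUF immediate limit (4032 + 48 = 4080).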
5959 MVT VT, unsigned Offset) const;
6060 SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
6161 SelectionDAG &DAG) const;
62 SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
63 SDValue GLC, SelectionDAG &DAG) const;
6264
6365 SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
6466 SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
190192 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
191193 // pointed to by Offsets.
192194 void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
193 SDValue *Offsets) const;
195 SDValue *Offsets, unsigned Align = 4) const;
194196
195197 public:
196198 SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
36013601 // pointer value is uniform.
36023602 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
36033603 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
3604 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3605 SBase->setReg(SGPR);
3604 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3605 SBase->setReg(SGPR);
3606 }
3607 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
3608 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
3609 unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
3610 SOff->setReg(SGPR);
36063611 }
36073612 }
36083613
42714276 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
42724277 Inst.eraseFromParent();
42734278 continue;
4274
4275 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
4276 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
4277 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
4278 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
4279 case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
4280 unsigned VDst;
4281 unsigned NewOpcode;
4282
4283 switch(Opcode) {
4284 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
4285 NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
4286 VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4287 break;
4288 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
4289 NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
4290 VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4291 break;
4292 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
4293 NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
4294 VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
4295 break;
4296 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
4297 case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
4298 splitScalarBuffer(Worklist, Inst);
4299 Inst.eraseFromParent();
4300 continue;
4301 }
4302
4303 const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
4304 auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
4305 unsigned Offset = 0;
4306
4307 // FIXME: This isn't safe because the addressing mode doesn't work
4308 // correctly if vaddr is negative.
4309 //
4310 // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
4311 //
4312 // See if we can extract an immediate offset by recognizing one of these:
4313 // V_ADD_I32_e32 dst, imm, src1
4314 // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
4315 // V_ADD will be removed by "Remove dead machine instructions".
4316 if (Add &&
4317 (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
4318 Add->getOpcode() == AMDGPU::V_ADD_U32_e32 ||
4319 Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
4320 static const unsigned SrcNames[2] = {
4321 AMDGPU::OpName::src0,
4322 AMDGPU::OpName::src1,
4323 };
4324
4325 // Find a literal offset in one of source operands.
4326 for (int i = 0; i < 2; i++) {
4327 const MachineOperand *Src =
4328 getNamedOperand(*Add, SrcNames[i]);
4329
4330 if (Src->isReg()) {
4331 MachineInstr *Def = MRI.getUniqueVRegDef(Src->getReg());
4332 if (Def) {
4333 if (Def->isMoveImmediate())
4334 Src = &Def->getOperand(1);
4335 else if (Def->isCopy()) {
4336 auto Mov = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
4337 if (Mov && Mov->isMoveImmediate()) {
4338 Src = &Mov->getOperand(1);
4339 }
4340 }
4341 }
4342 }
4343
4344 if (Src) {
4345 if (Src->isImm())
4346 Offset = Src->getImm();
4347 else if (Src->isCImm())
4348 Offset = Src->getCImm()->getZExtValue();
4349 }
4350
4351 if (Offset && isLegalMUBUFImmOffset(Offset)) {
4352 VAddr = getNamedOperand(*Add, SrcNames[!i]);
4353 break;
4354 }
4355
4356 Offset = 0;
4357 }
4358 }
4359
4360 MachineInstr *NewInstr =
4361 BuildMI(*MBB, Inst, Inst.getDebugLoc(),
4362 get(NewOpcode), VDst)
4363 .add(*VAddr) // vaddr
4364 .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
4365 .addImm(0) // soffset
4366 .addImm(Offset) // offset
4367 .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
4368 .addImm(0) // slc
4369 .addImm(0) // tfe
4370 .cloneMemRefs(Inst)
4371 .getInstr();
4372
4373 MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
4374 VDst);
4375 addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
4376 Inst.eraseFromParent();
4377
4378 // Legalize all operands other than the offset. Notably, convert the srsrc
4379 // into SGPRs using v_readfirstlane if needed.
4380 legalizeOperands(*NewInstr, MDT);
4381 continue;
4382 }
43834279 }
43844280
43854281 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
49544850 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
49554851 }
49564852
4957 void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
4958 MachineInstr &Inst) const {
4959 MachineBasicBlock &MBB = *Inst.getParent();
4960 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4961
4962 MachineBasicBlock::iterator MII = Inst;
4963 auto &DL = Inst.getDebugLoc();
4964
4965 MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);
4966 MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
4967 MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
4968 MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
4969
4970 unsigned Opcode = Inst.getOpcode();
4971 unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
4972 unsigned Count = 0;
4973 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4974 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4975
4976 switch(Opcode) {
4977 default:
4978 return;
4979 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
4980 Count = 2;
4981 break;
4982 case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
4983 Count = 4;
4984 break;
4985 }
4986
4987 // FIXME: Should also attempt to build VAddr and Offset like the non-split
4988 // case (see call site for this function)
4989
4990 // Create a vector of result registers
4991 SmallVector<unsigned, 16> ResultRegs;
4992 for (unsigned i = 0; i < Count ; ++i) {
4993 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
4994 MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
4995 .addReg(Offset.getReg()) // offset
4996 .addReg(Rsrc.getReg()) // rsrc
4997 .addImm(0) // soffset
4998 .addImm(i << 4) // inst_offset
4999 .addImm(Glc.getImm()) // glc
5000 .addImm(0) // slc
5001 .addImm(0) // tfe
5002 .addMemOperand(*Inst.memoperands_begin());
5003 // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE
5004 auto &NewDestOp = NewMI.getOperand(0);
5005 for (unsigned i = 0 ; i < 4 ; i++)
5006 ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
5007 RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass));
5008 }
5009 // Create a new combined result to replace original with
5010 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
5011 MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
5012 get(TargetOpcode::REG_SEQUENCE), FullDestReg);
5013
5014 for (unsigned i = 0 ; i < Count * 4 ; ++i) {
5015 CombinedResBuilder
5016 .addReg(ResultRegs[i])
5017 .addImm(RI.getSubRegFromChannel(i));
5018 }
5019
5020 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
5021 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
5022 }
5023
50244853 void SIInstrInfo::addUsersToMoveToVALUWorklist(
50254854 unsigned DstReg,
50264855 MachineRegisterInfo &MRI,
110110 MachineInstr &Inst) const;
111111 void splitScalar64BitBFE(SetVectorType &Worklist,
112112 MachineInstr &Inst) const;
113 void splitScalarBuffer(SetVectorType &Worklist,
114 MachineInstr &Inst) const;
115113 void movePackToVALU(SetVectorType &Worklist,
116114 MachineRegisterInfo &MRI,
117115 MachineInstr &Inst) const;
907907 // Given Imm, split it into the values to put into the SOffset and ImmOffset
908908 // fields in an MUBUF instruction. Return false if it is not possible (due to a
909909 // hardware bug needing a workaround).
910 //
911 // The required alignment ensures that individual address components remain
912 // aligned if they are aligned to begin with. It also ensures that additional
913 // offsets within the given alignment can be added to the resulting ImmOffset.
910914 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
911 const GCNSubtarget *Subtarget) {
912 const uint32_t Align = 4;
915 const GCNSubtarget *Subtarget, uint32_t Align) {
913916 const uint32_t MaxImm = alignDown(4095, Align);
914917 uint32_t Overflow = 0;
915918
470470 /// not the encoded offset.
471471 bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
472472
473 // Given Imm, split it into the values to put into the SOffset and ImmOffset
474 // fields in an MUBUF instruction. Return false if it is not possible (due to a
475 // hardware bug needing a workaround).
476473 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
477 const GCNSubtarget *Subtarget);
474 const GCNSubtarget *Subtarget, uint32_t Align = 4);
478475
479476 /// \returns true if the intrinsic is divergent
480477 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
0 # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies -o - %s | FileCheck -check-prefix=GCN %s
11
2 # GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
2 # GCN-LABEL: name: smrd_vgpr_offset_imm
3 # GCN: V_READFIRSTLANE_B32
4 # GCN: S_BUFFER_LOAD_DWORD_SGPR
35 ---
46 name: smrd_vgpr_offset_imm
57 body: |
2123 SI_RETURN_TO_EPILOG $vgpr0
2224 ...
2325
24 # GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
26 # GCN-LABEL: name: smrd_vgpr_offset_imm_add_u32
27 # GCN: V_READFIRSTLANE_B32
28 # GCN: S_BUFFER_LOAD_DWORD_SGPR
2529 ---
2630 name: smrd_vgpr_offset_imm_add_u32
2731 body: |
291291
292292 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm:
293293 ; GCN-NEXT: %bb.
294 ; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ;
294 ; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ;
295295 define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
296296 main_body:
297 %off = add i32 %offset, 4095
297 %off = add i32 %offset, 4092
298298 %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
299299 ret float %r
300300 }
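The switch from 4095 to 4092 in this test follows from the 4-byte alignment now applied when folding MUBUF offsets: alignDown(4095, 4) = 4092 is the largest value that still fits entirely in the immediate field, while 4095 would now be split into an soffset/immediate pair, so the test uses 4092 to keep exercising the plain immediate fold.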
301301
302302 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large:
303303 ; GCN-NEXT: %bb.
304 ; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
305 ; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
304 ; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
305 ; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
306 ; VIGFX9-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ;
306307 define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
307308 main_body:
308309 %off = add i32 %offset, 4096
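For reference, the new VIGFX9 expectation above follows from the same split with a 4-byte alignment: the overflow 4096 - 4092 = 4 is small enough to go into soffset as an inline constant, leaving offset 4092. SI and CI keep the v_add plus plain offen form because a non-zero soffset runs into the hardware bug mentioned in the splitMUBUFOffset comment, so the split is rejected there.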
511512 }
512513
513514 ; GCN-LABEL: {{^}}smrd_load_nonconst4:
514 ; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
515 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0xff8, v0 ;
516 ; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
517 ; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
518 ; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
519 ; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
515 ; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
516 ; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
517 ; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
518 ; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
519 ; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
520 ; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 56 offen offset:4032 ;
521 ; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 56 offen offset:4048 ;
522 ; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ;
523 ; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ;
520524 ; GCN: ; return to shader part epilog
521525 define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 {
522526 main_body:
527531 }
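Worked numbers for the VIGFX9 checks above: four adjacent dwordx4 loads make setBufferOffsets use a 64-byte alignment, so the largest aligned immediate is alignDown(4095, 64) = 4032; the base offset 0xff8 = 4088 exceeds that by 56, which becomes the soffset, and the four 16-byte pieces land at immediate offsets 4032, 4048, 4064 and 4080.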
528532
529533 ; GCN-LABEL: {{^}}smrd_load_nonconst5:
530 ; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0x1004, v0
531 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x1004, v0
532 ; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
533 ; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
534 ; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
535 ; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
534 ; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x1004, v0
535 ; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
536 ; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
537 ; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
538 ; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
539 ; VIGFX9: s_movk_i32 s4, 0xfc0
540 ; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], s4 offen offset:68 ;
541 ; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], s4 offen offset:84 ;
542 ; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ;
543 ; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ;
536544 ; GCN: ; return to shader part epilog
537545 define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 {
538546 main_body:
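Here the base offset 0x1004 = 4100 is more than 64 above the 4032 limit, so instead of a small inline soffset the split re-bases the address: soffset becomes 0xfc0 = 4032 (materialised with s_movk_i32) and the remaining 68 bytes go into the immediate, giving per-piece offsets 68, 84, 100 and 116 for the four loads.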
560568
561569 ; GCN-LABEL: {{^}}smrd_uniform_loop:
562570 ;
563 ; TODO: this should use an s_buffer_load
564 ;
565 ; GCN: buffer_load_dword
571 ; TODO: we should keep the loop counter in an SGPR
572 ;
573 ; GCN: v_readfirstlane_b32
574 ; GCN: s_buffer_load_dword
566575 define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
567576 main_body:
568577 br label %loop