Revert "AMDGPU: Divergence-driven selection of scalar buffer load intrinsics" This reverts commit r344696 for now (except for some test additions). See https://bugs.freedesktop.org/show_bug.cgi?id=108611. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346364 91177308-0d34-0410-b5e6-96231b3b80d8 Nicolai Haehnle 10 months ago
8 changed files with 242 additions and 126 deletions.
48464846 return SDValue(NewNode, 0);
48474847 }
48484848
4849 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
4850 SDValue Offset, SDValue GLC,
4851 SelectionDAG &DAG) const {
4852 MachineFunction &MF = DAG.getMachineFunction();
4853 MachineMemOperand *MMO = MF.getMachineMemOperand(
4854 MachinePointerInfo(),
4855 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4856 MachineMemOperand::MOInvariant,
4857 VT.getStoreSize(), VT.getStoreSize());
4858
4859 if (!Offset->isDivergent()) {
4860 SDValue Ops[] = {
4861 Rsrc,
4862 Offset, // Offset
4863 GLC // glc
4864 };
4865 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
4866 DAG.getVTList(VT), Ops, VT, MMO);
4867 }
4868
4869 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
4870 // assume that the buffer is unswizzled.
4871 SmallVector<SDValue, 4> Loads;
4872 unsigned NumLoads = 1;
4873 MVT LoadVT = VT.getSimpleVT();
4874
4875 assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
4876 LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
4877
4878 if (VT == MVT::v8i32 || VT == MVT::v16i32) {
4879 NumLoads = VT == MVT::v16i32 ? 4 : 2;
4880 LoadVT = MVT::v4i32;
4881 }
4882
4883 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
4884 unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
4885 SDValue Ops[] = {
4886 DAG.getEntryNode(), // Chain
4887 Rsrc, // rsrc
4888 DAG.getConstant(0, DL, MVT::i32), // vindex
4889 {}, // voffset
4890 {}, // soffset
4891 {}, // offset
4892 DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
4893 DAG.getConstant(0, DL, MVT::i1), // idxen
4894 };
4895
4896 // Use the alignment to ensure that the required offsets will fit into the
4897 // immediate offsets.
4898 setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
4899
4900 uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
4901 for (unsigned i = 0; i < NumLoads; ++i) {
4902 Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
4903 Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
4904 Ops, LoadVT, MMO));
4905 }
4906
4907 if (VT == MVT::v8i32 || VT == MVT::v16i32)
4908 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
4909
4910 return Loads[0];
4911 }
4912
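For context, a minimal LLVM IR sketch, in the style of the smrd.ll tests further down, of the case the removed lowerSBuffer handled: an amdgcn.s.buffer.load whose offset arrives in a VGPR and is therefore divergent. The function and value names here are invented for illustration only. With the code removed above, a v8i32 load like this was split at selection time into two 16-byte MUBUF loads (at the base offset and base + 16); after the revert it is selected as an S_BUFFER_LOAD again and only rewritten later by the S_BUFFER_LOAD_*_SGPR handling re-added in SIInstrInfo below.

define amdgpu_ps <8 x float> @divergent_offset_sketch(<4 x i32> inreg %rsrc, i32 %voff) {
main_body:
  ; %voff is a VGPR argument, so the buffer offset is divergent
  %v = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %voff, i32 0)
  %r = bitcast <8 x i32> %v to <8 x float>
  ret <8 x float> %r
}
declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32)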
49134849 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
49144850 SelectionDAG &DAG) const {
49154851 MachineFunction &MF = DAG.getMachineFunction();
50645000 SDLoc(DAG.getEntryNode()),
50655001 MFI->getArgInfo().WorkItemIDZ);
50665002 case AMDGPUIntrinsic::SI_load_const: {
5067 SDValue Load =
5068 lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
5069 DAG.getTargetConstant(0, DL, MVT::i1), DAG);
5003 SDValue Ops[] = {
5004 Op.getOperand(1), // Ptr
5005 Op.getOperand(2), // Offset
5006 DAG.getTargetConstant(0, DL, MVT::i1) // glc
5007 };
5008
5009 MachineMemOperand *MMO = MF.getMachineMemOperand(
5010 MachinePointerInfo(),
5011 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5012 MachineMemOperand::MOInvariant,
5013 VT.getStoreSize(), 4);
5014 SDVTList VTList = DAG.getVTList(MVT::i32);
5015 SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
5016 VTList, Ops, MVT::i32, MMO);
5017
50705018 return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
50715019 }
50725020 case Intrinsic::amdgcn_s_buffer_load: {
50735021 unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
5074 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
5075 DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
5022 SDValue Ops[] = {
5023 Op.getOperand(1), // Ptr
5024 Op.getOperand(2), // Offset
5025 DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
5026 };
5027
5028 MachineMemOperand *MMO = MF.getMachineMemOperand(
5029 MachinePointerInfo(),
5030 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5031 MachineMemOperand::MOInvariant,
5032 VT.getStoreSize(), VT.getStoreSize());
5033 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
5034 Op->getVTList(), Ops, VT, MMO);
50765035 }
50775036 case Intrinsic::amdgcn_fdiv_fast:
50785037 return lowerFDIV_FAST(Op, DAG);
61076066 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
61086067 // pointed to by Offsets.
61096068 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
6110 SelectionDAG &DAG, SDValue *Offsets,
6111 unsigned Align) const {
6069 SelectionDAG &DAG,
6070 SDValue *Offsets) const {
61126071 SDLoc DL(CombinedOffset);
61136072 if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
61146073 uint32_t Imm = C->getZExtValue();
61156074 uint32_t SOffset, ImmOffset;
6116 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
6075 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) {
61176076 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
61186077 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
61196078 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
61256084 SDValue N1 = CombinedOffset.getOperand(1);
61266085 uint32_t SOffset, ImmOffset;
61276086 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
6128 if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
6129 Subtarget, Align)) {
6087 if (Offset >= 0
6088 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) {
61306089 Offsets[0] = N0;
61316090 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
61326091 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
5959 MVT VT, unsigned Offset) const;
6060 SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
6161 SelectionDAG &DAG) const;
62 SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
63 SDValue GLC, SelectionDAG &DAG) const;
6462
6563 SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
6664 SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
191189 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
192190 // pointed to by Offsets.
193191 void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
194 SDValue *Offsets, unsigned Align = 4) const;
192 SDValue *Offsets) const;
195193
196194 public:
197195 SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
35573557 // pointer value is uniform.
35583558 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
35593559 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
3560 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3561 SBase->setReg(SGPR);
3562 }
3563 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
3564 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
3565 unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
3566 SOff->setReg(SGPR);
3560 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3561 SBase->setReg(SGPR);
35673562 }
35683563 }
35693564
41924187 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
41934188 Inst.eraseFromParent();
41944189 continue;
4190
4191 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
4192 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
4193 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
4194 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
4195 case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
4196 unsigned VDst;
4197 unsigned NewOpcode;
4198
4199 switch(Opcode) {
4200 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
4201 NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
4202 VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4203 break;
4204 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
4205 NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
4206 VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4207 break;
4208 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
4209 NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
4210 VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
4211 break;
4212 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
4213 case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
4214 splitScalarBuffer(Worklist, Inst);
4215 Inst.eraseFromParent();
4216 continue;
4217 }
4218
4219 const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
4220 auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
4221 unsigned Offset = 0;
4222
4223 // FIXME: This isn't safe because the addressing mode doesn't work
4224 // correctly if vaddr is negative.
4225 //
4226 // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
4227 //
4228 // See if we can extract an immediate offset by recognizing one of these:
4229 // V_ADD_I32_e32 dst, imm, src1
4230 // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
4231 // V_ADD will be removed by "Remove dead machine instructions".
4232 if (Add &&
4233 (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
4234 Add->getOpcode() == AMDGPU::V_ADD_U32_e32 ||
4235 Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
4236 static const unsigned SrcNames[2] = {
4237 AMDGPU::OpName::src0,
4238 AMDGPU::OpName::src1,
4239 };
4240
4241 // Find a literal offset in one of source operands.
4242 for (int i = 0; i < 2; i++) {
4243 const MachineOperand *Src =
4244 getNamedOperand(*Add, SrcNames[i]);
4245
4246 if (Src->isReg()) {
4247 MachineInstr *Def = MRI.getUniqueVRegDef(Src->getReg());
4248 if (Def) {
4249 if (Def->isMoveImmediate())
4250 Src = &Def->getOperand(1);
4251 else if (Def->isCopy()) {
4252 auto Mov = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
4253 if (Mov && Mov->isMoveImmediate()) {
4254 Src = &Mov->getOperand(1);
4255 }
4256 }
4257 }
4258 }
4259
4260 if (Src) {
4261 if (Src->isImm())
4262 Offset = Src->getImm();
4263 else if (Src->isCImm())
4264 Offset = Src->getCImm()->getZExtValue();
4265 }
4266
4267 if (Offset && isLegalMUBUFImmOffset(Offset)) {
4268 VAddr = getNamedOperand(*Add, SrcNames[!i]);
4269 break;
4270 }
4271
4272 Offset = 0;
4273 }
4274 }
4275
4276 MachineInstr *NewInstr =
4277 BuildMI(*MBB, Inst, Inst.getDebugLoc(),
4278 get(NewOpcode), VDst)
4279 .add(*VAddr) // vaddr
4280 .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
4281 .addImm(0) // soffset
4282 .addImm(Offset) // offset
4283 .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
4284 .addImm(0) // slc
4285 .addImm(0) // tfe
4286 .cloneMemRefs(Inst)
4287 .getInstr();
4288
4289 MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
4290 VDst);
4291 addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
4292 Inst.eraseFromParent();
4293
4294 // Legalize all operands other than the offset. Notably, convert the srsrc
4295 // into SGPRs using v_readfirstlane if needed.
4296 legalizeOperands(*NewInstr, MDT);
4297 continue;
4298 }
41954299 }
41964300
41974301 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
46714775
46724776 MRI.replaceRegWith(Dest.getReg(), ResultReg);
46734777 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4778 }
4779
4780 void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
4781 MachineInstr &Inst) const {
4782 MachineBasicBlock &MBB = *Inst.getParent();
4783 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4784
4785 MachineBasicBlock::iterator MII = Inst;
4786 auto &DL = Inst.getDebugLoc();
4787
4788 MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);
4789 MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
4790 MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
4791 MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
4792
4793 unsigned Opcode = Inst.getOpcode();
4794 unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
4795 unsigned Count = 0;
4796 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4797 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4798
4799 switch(Opcode) {
4800 default:
4801 return;
4802 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
4803 Count = 2;
4804 break;
4805 case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
4806 Count = 4;
4807 break;
4808 }
4809
4810 // FIXME: Should also attempt to build VAddr and Offset like the non-split
4811 // case (see call site for this function)
4812
4813 // Create a vector of result registers
4814 SmallVector<unsigned, 16> ResultRegs;
4815 for (unsigned i = 0; i < Count ; ++i) {
4816 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
4817 MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
4818 .addReg(Offset.getReg()) // offset
4819 .addReg(Rsrc.getReg()) // rsrc
4820 .addImm(0) // soffset
4821 .addImm(i << 4) // inst_offset
4822 .addImm(Glc.getImm()) // glc
4823 .addImm(0) // slc
4824 .addImm(0) // tfe
4825 .addMemOperand(*Inst.memoperands_begin());
4826 // Extract the four 32-bit sub-registers from the result to add into the final REG_SEQUENCE
4827 auto &NewDestOp = NewMI.getOperand(0);
4828 for (unsigned i = 0 ; i < 4 ; i++)
4829 ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
4830 RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass));
4831 }
4832 // Create a new combined result to replace original with
4833 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4834 MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
4835 get(TargetOpcode::REG_SEQUENCE), FullDestReg);
4836
4837 for (unsigned i = 0 ; i < Count * 4 ; ++i) {
4838 CombinedResBuilder
4839 .addReg(ResultRegs[i])
4840 .addImm(RI.getSubRegFromChannel(i));
4841 }
4842
4843 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4844 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
46744845 }
46754846
46764847 void SIInstrInfo::addUsersToMoveToVALUWorklist(
102102 MachineInstr &Inst) const;
103103 void splitScalar64BitBFE(SetVectorType &Worklist,
104104 MachineInstr &Inst) const;
105 void splitScalarBuffer(SetVectorType &Worklist,
106 MachineInstr &Inst) const;
105107 void movePackToVALU(SetVectorType &Worklist,
106108 MachineRegisterInfo &MRI,
107109 MachineInstr &Inst) const;
893893 // Given Imm, split it into the values to put into the SOffset and ImmOffset
894894 // fields in an MUBUF instruction. Return false if it is not possible (due to a
895895 // hardware bug needing a workaround).
896 //
897 // The required alignment ensures that individual address components remain
898 // aligned if they are aligned to begin with. It also ensures that additional
899 // offsets within the given alignment can be added to the resulting ImmOffset.
900896 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
901 const GCNSubtarget *Subtarget, uint32_t Align) {
897 const GCNSubtarget *Subtarget) {
898 const uint32_t Align = 4;
902899 const uint32_t MaxImm = alignDown(4095, Align);
903900 uint32_t Overflow = 0;
904901
440440 /// not the encoded offset.
441441 bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
442442
443 // Given Imm, split it into the values to put into the SOffset and ImmOffset
444 // fields in an MUBUF instruction. Return false if it is not possible (due to a
445 // hardware bug needing a workaround).
443446 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
444 const GCNSubtarget *Subtarget, uint32_t Align = 4);
447 const GCNSubtarget *Subtarget);
445448
446449 /// \returns true if the intrinsic is divergent
447450 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
0 # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies -o - %s | FileCheck -check-prefix=GCN %s
11
2 # GCN-LABEL: name: smrd_vgpr_offset_imm
3 # GCN: V_READFIRSTLANE_B32
4 # GCN: S_BUFFER_LOAD_DWORD_SGPR
2 # GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
53 ---
64 name: smrd_vgpr_offset_imm
75 body: |
2321 SI_RETURN_TO_EPILOG $vgpr0
2422 ...
2523
26 # GCN-LABEL: name: smrd_vgpr_offset_imm_add_u32
27 # GCN: V_READFIRSTLANE_B32
28 # GCN: S_BUFFER_LOAD_DWORD_SGPR
24 # GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
2925 ---
3026 name: smrd_vgpr_offset_imm_add_u32
3127 body: |
291291
292292 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm:
293293 ; GCN-NEXT: %bb.
294 ; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ;
294 ; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ;
295295 define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
296296 main_body:
297 %off = add i32 %offset, 4092
297 %off = add i32 %offset, 4095
298298 %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
299299 ret float %r
300300 }
301301
302302 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large:
303303 ; GCN-NEXT: %bb.
304 ; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
305 ; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
306 ; VIGFX9-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ;
304 ; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
305 ; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
307306 define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
308307 main_body:
309308 %off = add i32 %offset, 4096
510509 }
511510
512511 ; GCN-LABEL: {{^}}smrd_load_nonconst4:
513 ; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
514 ; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
515 ; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
516 ; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
517 ; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
518 ; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 56 offen offset:4032 ;
519 ; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 56 offen offset:4048 ;
520 ; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ;
521 ; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ;
512 ; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
513 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0xff8, v0 ;
514 ; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
515 ; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
516 ; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
517 ; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
522518 ; GCN: ; return to shader part epilog
523519 define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 {
524520 main_body:
529525 }
530526
531527 ; GCN-LABEL: {{^}}smrd_load_nonconst5:
532 ; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x1004, v0
533 ; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
534 ; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
535 ; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
536 ; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
537 ; VIGFX9: s_movk_i32 s4, 0xfc0
538 ; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], s4 offen offset:68 ;
539 ; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], s4 offen offset:84 ;
540 ; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ;
541 ; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ;
528 ; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0x1004, v0
529 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x1004, v0
530 ; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
531 ; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
532 ; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
533 ; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
542534 ; GCN: ; return to shader part epilog
543535 define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 {
544536 main_body:
566558
567559 ; GCN-LABEL: {{^}}smrd_uniform_loop:
568560 ;
569 ; TODO: we should keep the loop counter in an SGPR
561 ; TODO: this should use an s_buffer_load
570562 ;
571 ; GCN: v_readfirstlane_b32
572 ; GCN: s_buffer_load_dword
563 ; GCN: buffer_load_dword
573564 define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
574565 main_body:
575566 br label %loop
593584 ; (this test differs from smrd_uniform_loop by the more complex structure of phis,
594585 ; which used to confuse the DivergenceAnalysis after structurization)
595586 ;
596 ; TODO: we should keep the loop counter in an SGPR
587 ; TODO: we should keep the loop counter in an SGPR and use an S_BUFFER_LOAD
597588 ;
598 ; GCN: v_readfirstlane_b32
599 ; GCN: s_buffer_load_dword
589 ; GCN: buffer_load_dword
600590 define amdgpu_ps float @smrd_uniform_loop2(<4 x i32> inreg %desc, i32 %bound, i32 %bound.a) #0 {
601591 main_body:
602592 br label %loop