llvm.org GIT mirror llvm / ee9a2bd
[AMDGPU] Add support for multi-dword s.buffer.load intrinsic Summary: Patch by Marek Olsak and David Stuttard, both of AMD. This adds a new amdgcn intrinsic supporting s.buffer.load, in particular multiple dword variants. These are convenient to use from some front-end implementations. Also modified the existing llvm.SI.load.const intrinsic to common up the underlying implementation. This modification also requires that we can lower to non-uniform loads correctly by splitting larger dword variants into sizes supported by the non-uniform versions of the load. V2: Addressed minor review comments. V3: i1 glc is now i32 cachepolicy for consistency with buffer and tbuffer intrinsics, plus fixed formatting issue. V4: Added glc test. Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D51098 Change-Id: I83a6e00681158bb243591a94a51c7baa445f169b git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@340684 91177308-0d34-0410-b5e6-96231b3b80d8 Tim Renouf 1 year, 21 days ago
10 changed file(s) with 390 addition(s) and 35 deletion(s). Raw diff Collapse all Expand all
800800 AMDGPURsrcIntrinsic<0>;
801801 def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
802802 def int_amdgcn_buffer_load : AMDGPUBufferLoad;
803
// Scalar buffer load: llvm.amdgcn.s.buffer.load.* — multi-dword SMEM load
// through a buffer resource descriptor (V#).
// IntrNoMem is deliberate: the source is treated as constant memory, so two
// identical calls may be CSE'd (exercised by the early-cse test in this patch).
def int_amdgcn_s_buffer_load : Intrinsic <
  [llvm_anyint_ty],
  [llvm_v4i32_ty, // rsrc(SGPR)
   llvm_i32_ty,   // byte offset(SGPR/VGPR/imm)
   llvm_i32_ty],  // cachepolicy(imm; bit 0 = glc)
  [IntrNoMem]>,
  AMDGPURsrcIntrinsic<0>;
803811
804812 class AMDGPUBufferStore : Intrinsic <
805813 [],
41694169 NODE_NAME_CASE(BUFFER_LOAD)
41704170 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
41714171 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
4172 NODE_NAME_CASE(SBUFFER_LOAD)
41724173 NODE_NAME_CASE(BUFFER_STORE)
41734174 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
41744175 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
485485 BUFFER_LOAD,
486486 BUFFER_LOAD_FORMAT,
487487 BUFFER_LOAD_FORMAT_D16,
488 SBUFFER_LOAD,
488489 BUFFER_STORE,
489490 BUFFER_STORE_FORMAT,
490491 BUFFER_STORE_FORMAT_D16,
49204920 MFI->getArgInfo().WorkItemIDZ);
49214921 case AMDGPUIntrinsic::SI_load_const: {
49224922 SDValue Ops[] = {
4923 Op.getOperand(1),
4924 Op.getOperand(2)
4923 Op.getOperand(1), // Ptr
4924 Op.getOperand(2), // Offset
4925 DAG.getTargetConstant(0, DL, MVT::i1) // glc
49254926 };
49264927
49274928 MachineMemOperand *MMO = MF.getMachineMemOperand(
49294930 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
49304931 MachineMemOperand::MOInvariant,
49314932 VT.getStoreSize(), 4);
4932 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
4933 SDVTList VTList = DAG.getVTList(MVT::i32);
4934 SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
4935 VTList, Ops, MVT::i32, MMO);
4936
4937 return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
4938 }
4939 case Intrinsic::amdgcn_s_buffer_load: {
4940 unsigned Cache = cast(Op.getOperand(3))->getZExtValue();
4941 SDValue Ops[] = {
4942 Op.getOperand(1), // Ptr
4943 Op.getOperand(2), // Offset
4944 DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
4945 };
4946
4947 MachineMemOperand *MMO = MF.getMachineMemOperand(
4948 MachinePointerInfo(),
4949 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4950 MachineMemOperand::MOInvariant,
4951 VT.getStoreSize(), VT.getStoreSize());
4952 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
49334953 Op->getVTList(), Ops, VT, MMO);
49344954 }
49354955 case Intrinsic::amdgcn_fdiv_fast:
39033903 Inst.eraseFromParent();
39043904 continue;
39053905
3906 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: {
3907 unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3906 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
3907 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
3908 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
3909 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
3910 case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
3911 unsigned VDst;
3912 unsigned NewOpcode;
3913
3914 switch(Opcode) {
3915 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
3916 NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
3917 VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3918 break;
3919 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
3920 NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
3921 VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3922 break;
3923 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
3924 NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
3925 VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
3926 break;
3927 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
3928 case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
3929 splitScalarBuffer(Worklist, Inst);
3930 Inst.eraseFromParent();
3931 continue;
3932 }
3933
39083934 const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
39093935 auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
39103936 unsigned Offset = 0;
39553981
39563982 MachineInstr *NewInstr =
39573983 BuildMI(*MBB, Inst, Inst.getDebugLoc(),
3958 get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst)
3984 get(NewOpcode), VDst)
39593985 .add(*VAddr) // vaddr
39603986 .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
39613987 .addImm(0) // soffset
44544480
44554481 MRI.replaceRegWith(Dest.getReg(), ResultReg);
44564482 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4483 }
4484
4485 void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
4486 MachineInstr &Inst) const {
4487 MachineBasicBlock &MBB = *Inst.getParent();
4488 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4489
4490 MachineBasicBlock::iterator MII = Inst;
4491 auto &DL = Inst.getDebugLoc();
4492
4493 MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);;
4494 MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
4495 MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
4496 MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
4497
4498 unsigned Opcode = Inst.getOpcode();
4499 unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
4500 unsigned Count = 0;
4501 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4502 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4503
4504 switch(Opcode) {
4505 default:
4506 return;
4507 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
4508 Count = 2;
4509 break;
4510 case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
4511 Count = 4;
4512 break;
4513 }
4514
4515 // FIXME: Should also attempt to build VAddr and Offset like the non-split
4516 // case (see call site for this function)
4517
4518 // Create a vector of result registers
4519 SmallVector ResultRegs;
4520 for (unsigned i = 0; i < Count ; ++i) {
4521 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
4522 MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
4523 .addReg(Offset.getReg()) // offset
4524 .addReg(Rsrc.getReg()) // rsrc
4525 .addImm(0) // soffset
4526 .addImm(i << 4) // inst_offset
4527 .addImm(Glc.getImm()) // glc
4528 .addImm(0) // slc
4529 .addImm(0) // tfe
4530 .addMemOperand(*Inst.memoperands_begin());
4531 // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE
4532 auto &NewDestOp = NewMI.getOperand(0);
4533 for (unsigned i = 0 ; i < 4 ; i++)
4534 ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
4535 RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass));
4536 }
4537 // Create a new combined result to replace original with
4538 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4539 MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
4540 get(TargetOpcode::REG_SEQUENCE), FullDestReg);
4541
4542 for (unsigned i = 0 ; i < Count * 4 ; ++i) {
4543 CombinedResBuilder
4544 .addReg(ResultRegs[i])
4545 .addImm(RI.getSubRegFromChannel(i));
4546 }
4547
4548 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4549 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
44574550 }
44584551
44594552 void SIInstrInfo::addUsersToMoveToVALUWorklist(
100100 MachineInstr &Inst) const;
101101 void splitScalar64BitBFE(SetVectorType &Worklist,
102102 MachineInstr &Inst) const;
103 void splitScalarBuffer(SetVectorType &Worklist,
104 MachineInstr &Inst) const;
103105 void movePackToVALU(SetVectorType &Worklist,
104106 MachineRegisterInfo &MRI,
105107 MachineInstr &Inst) const;
3939
4040 def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
4141
42 def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
43 SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
44 [SDNPMayLoad, SDNPMemOperand]
42 def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
43 SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>,
44 [SDNPMayLoad, SDNPMemOperand]
4545 >;
4646
4747 def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
408408 >;
409409 }
410410
// Selection patterns for SIsbuffer_load at one S_BUFFER_LOAD width.
// NOTE(review): the multiclass and !cast template arguments were stripped by
// the HTML scrape; <string Instr, ValueType vt> and <SM_Pseudo> are
// reconstructed from the instantiations below — confirm against upstream
// r340684.
multiclass SMLoad_Pattern <string Instr, ValueType vt> {
  // 1. Offset as an immediate.
  // Name this pattern to reuse AddedComplexity on CI.
  def _IMM : GCNPat <
    (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc),
    (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc)))
  >;

  // 2. Offset loaded in a 32-bit SGPR.
  def : GCNPat <
    (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc),
    (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc)))
  >;
}
425
426
411427 let OtherPredicates = [isSICI] in {
412428 def : GCNPat <
413429 (i64 (readcyclecounter)),
426442 defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
427443 defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
428444
429 // 1. Offset as an immediate
430 def SM_LOAD_PATTERN : GCNPat < // name this pattern to reuse AddedComplexity on CI
431 (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
432 (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
433 >;
434
435 // 2. Offset loaded in an 32bit SGPR
436 def : GCNPat <
437 (SIload_constant v4i32:$sbase, i32:$offset),
438 (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
439 >;
440
445 // Name the pattern to reuse AddedComplexity on CI
446 defm SM_LOAD_PATTERN : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>;
447 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>;
448 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>;
449 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>;
450 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>;
441451 } // End let AddedComplexity = 100
442452
443453 let OtherPredicates = [isVI] in {
756766
757767 def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>;
758768
759 let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in {
769 let AddedComplexity = SM_LOAD_PATTERN_IMM.AddedComplexity in {
760770
761771 class SMRD_Pattern_ci : GCNPat <
762772 (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
770780 def : SMRD_Pattern_ci <"S_LOAD_DWORDX8", v8i32>;
771781 def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>;
772782
773 def : GCNPat <
774 (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
775 (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
// CI-only pattern: SIsbuffer_load with a 32-bit immediate offset, selected to
// the _IMM_ci instruction forms.
// NOTE(review): the class and !cast template arguments were stripped by the
// HTML scrape; <string Instr, ValueType vt> and the !cast target are
// reconstructed from the instantiations below — confirm against upstream
// r340684.
class SMLoad_Pattern_ci <string Instr, ValueType vt> : GCNPat <
  (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)),
  (!cast<InstSI>(Instr) $sbase, $offset, (as_i1imm $glc))> {
  let OtherPredicates = [isCI]; // should this be isCIOnly?
}
778788
789 def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORD_IMM_ci", i32>;
790 def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX2_IMM_ci", v2i32>;
791 def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX4_IMM_ci", v4i32>;
792 def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX8_IMM_ci", v8i32>;
793 def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX16_IMM_ci", v16i32>;
794
779795 } // End let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity
780796
105105 ; GCN-LABEL: {{^}}smrd_load_const0:
106106 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
107107 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
108 define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
108 define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
109109 main_body:
110110 %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
111111 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
118118 ; offset.
119119 ; GCN-LABEL: {{^}}smrd_load_const1:
120120 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
121 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff glc ; encoding: [0xff
121122 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
122 define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
123 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc glc
124 define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
123125 main_body:
124126 %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
125127 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
126128 %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1020)
127 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
129 %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
130 %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1020, i32 1)
131 %s.buffer.float = bitcast i32 %s.buffer to float
132 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
128133 ret void
129134 }
130135
134139 ; GCN-LABEL: {{^}}smrd_load_const2:
135140 ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
136141 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
142 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
143 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
137144 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
138145 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
139 define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
146 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
147 define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
140148 main_body:
141149 %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
142150 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
143151 %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1024)
144 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
152 %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
153 %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1024, i32 0)
154 %s.buffer.float = bitcast i32 %s.buffer to float
155 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
145156 ret void
146157 }
147158
149160 ; GCN-LABEL: {{^}}smrd_load_const3:
150161 ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
151162 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
163 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
164 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
152165 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
153166 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
154 define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
167 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
168 define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
155169 main_body:
156170 %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
157171 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
158172 %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048572)
159 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
173 %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
174 %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048572, i32 0)
175 %s.buffer.float = bitcast i32 %s.buffer to float
176 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
160177 ret void
161178 }
162179
164181 ; GCN-LABEL: {{^}}smrd_load_const4:
165182 ; SIVIGFX9: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
166183 ; SIVIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
184 ; SIVIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
167185 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
168 ; GCN: s_endpgm
169 define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
186 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
187 ; GCN: s_endpgm
188 define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
170189 main_body:
171190 %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
172191 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
173192 %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048576)
174 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
193 %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
194 %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048576, i32 0)
195 %s.buffer.float = bitcast i32 %s.buffer to float
196 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
197 ret void
198 }
199
200 ; dwordx2 s.buffer.load
201 ; GCN-LABEL: {{^}}s_buffer_load_dwordx2:
202 ; VIGFX9: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
203 ; SICI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
204 define amdgpu_ps void @s_buffer_load_dwordx2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
205 main_body:
206 %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
207 %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 128, i32 0)
208 %s.buffer.0 = extractelement <2 x i32> %s.buffer, i32 0
209 %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
210 %s.buffer.1 = extractelement <2 x i32> %s.buffer, i32 1
211 %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
212 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.0.float, float %s.buffer.1.float, i1 true, i1 true) #0
213 ret void
214 }
215
216 ; dwordx4 s.buffer.load
217 ; GCN-LABEL: {{^}}s_buffer_load_dwordx4:
218 ; VIGFX9: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
219 ; SICI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
220 define amdgpu_ps void @s_buffer_load_dwordx4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
221 main_body:
222 %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
223 %s.buffer = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %tmp22, i32 128, i32 0)
224 %s.buffer.0 = extractelement <4 x i32> %s.buffer, i32 0
225 %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
226 %s.buffer.1 = extractelement <4 x i32> %s.buffer, i32 1
227 %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
228 %s.buffer.2 = extractelement <4 x i32> %s.buffer, i32 2
229 %s.buffer.2.float = bitcast i32 %s.buffer.2 to float
230 %s.buffer.3 = extractelement <4 x i32> %s.buffer, i32 3
231 %s.buffer.3.float = bitcast i32 %s.buffer.3 to float
232 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0
233 ret void
234 }
235
236 ; dwordx8 s.buffer.load
237 ; GCN-LABEL: {{^}}s_buffer_load_dwordx8:
238 ; VIGFX9: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
239 ; SICI: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
240 define amdgpu_ps void @s_buffer_load_dwordx8(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
241 main_body:
242 %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
243 %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 128, i32 0)
244 %s.buffer.0 = extractelement <8 x i32> %s.buffer, i32 0
245 %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
246 %s.buffer.1 = extractelement <8 x i32> %s.buffer, i32 2
247 %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
248 %s.buffer.2 = extractelement <8 x i32> %s.buffer, i32 5
249 %s.buffer.2.float = bitcast i32 %s.buffer.2 to float
250 %s.buffer.3 = extractelement <8 x i32> %s.buffer, i32 7
251 %s.buffer.3.float = bitcast i32 %s.buffer.3 to float
252 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0
253 ret void
254 }
255
256 ; dwordx16 s.buffer.load
257 ; GCN-LABEL: {{^}}s_buffer_load_dwordx16:
258 ; VIGFX9: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
259 ; SICI: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
260 define amdgpu_ps void @s_buffer_load_dwordx16(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
261 main_body:
262 %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
263 %s.buffer = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %tmp22, i32 128, i32 0)
264 %s.buffer.0 = extractelement <16 x i32> %s.buffer, i32 0
265 %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
266 %s.buffer.1 = extractelement <16 x i32> %s.buffer, i32 3
267 %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
268 %s.buffer.2 = extractelement <16 x i32> %s.buffer, i32 12
269 %s.buffer.2.float = bitcast i32 %s.buffer.2 to float
270 %s.buffer.3 = extractelement <16 x i32> %s.buffer, i32 15
271 %s.buffer.3.float = bitcast i32 %s.buffer.3 to float
272 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0
175273 ret void
176274 }
177275
338436 br i1 %outer_br, label %.outer_loop_header, label %ret_block
339437 }
340438
439 ; SMRD load with a non-const offset
440 ; GCN-LABEL: {{^}}smrd_load_nonconst0:
441 ; SIVIGFX9: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
442 ; SIVIGFX9: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
443 ; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
444 ; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
445 ; GCN: s_endpgm
446 define amdgpu_ps void @smrd_load_nonconst0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 inreg %ncoff) #0 {
447 main_body:
448 %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
449 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
450 %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff)
451 %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
452 %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
453 %s.buffer.float = bitcast i32 %s.buffer to float
454 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
455 ret void
456 }
457
458 ; SMRD load with a non-const non-uniform offset
459 ; GCN-LABEL: {{^}}smrd_load_nonconst1:
460 ; SIVIGFX9: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
461 ; SIVIGFX9: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
462 ; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
463 ; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
464 ; GCN: s_endpgm
465 define amdgpu_ps void @smrd_load_nonconst1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 %ncoff) #0 {
466 main_body:
467 %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
468 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
469 %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff)
470 %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
471 %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
472 %s.buffer.float = bitcast i32 %s.buffer to float
473 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
474 ret void
475 }
476
477 ; SMRD load with a non-const non-uniform offset of > 4 dwords (requires splitting)
478 ; GCN-LABEL: {{^}}smrd_load_nonconst2:
479 ; SIVIGFX9: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
480 ; SIVIGFX9: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
481 ; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
482 ; CI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
483 ; GCN: s_endpgm
484 define amdgpu_ps void @smrd_load_nonconst2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 %ncoff) #0 {
485 main_body:
486 %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
487 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
488 %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff)
489 %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
490 %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
491 %s.buffer.elt = extractelement <8 x i32> %s.buffer, i32 1
492 %s.buffer.float = bitcast i32 %s.buffer.elt to float
493 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
494 ret void
495 }
496
497 ; SMRD load dwordx2
498 ; GCN-LABEL: {{^}}smrd_load_dwordx2:
499 ; SIVIGFX9: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
500 ; CI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
501 ; GCN: s_endpgm
502 define amdgpu_ps void @smrd_load_dwordx2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 inreg %ncoff) #0 {
503 main_body:
504 %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
505 %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
506 %s.buffer.float = bitcast <2 x i32> %s.buffer to <2 x float>
507 %r.1 = extractelement <2 x float> %s.buffer.float, i32 0
508 %r.2 = extractelement <2 x float> %s.buffer.float, i32 1
509 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r.1, float %r.1, float %r.1, float %r.2, i1 true, i1 true) #0
510 ret void
511 }
512
513
341514 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
342515 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
343516 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
344517 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
518 declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
519 declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
520 declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)
521 declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32)
522 declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32)
345523
346524 attributes #0 = { nounwind }
347525 attributes #1 = { nounwind readnone }
0 ; RUN: opt < %s -S -mtriple=amdgcn-- -early-cse | FileCheck %s
1
2 ; CHECK-LABEL: @no_cse
3 ; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
4 ; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
5 define void @no_cse(i32 addrspace(1)* %out, <4 x i32> %in) {
6 %a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
7 %b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
8 %c = add i32 %a, %b
9 store i32 %c, i32 addrspace(1)* %out
10 ret void
11 }
12
13 ; CHECK-LABEL: @cse_zero_offset
14 ; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
15 ; CHECK: add i32 [[CSE]], [[CSE]]
16 define void @cse_zero_offset(i32 addrspace(1)* %out, <4 x i32> %in) {
17 %a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
18 %b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
19 %c = add i32 %a, %b
20 store i32 %c, i32 addrspace(1)* %out
21 ret void
22 }
23
24 ; CHECK-LABEL: @cse_nonzero_offset
25 ; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
26 ; CHECK: add i32 [[CSE]], [[CSE]]
27 define void @cse_nonzero_offset(i32 addrspace(1)* %out, <4 x i32> %in) {
28 %a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
29 %b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
30 %c = add i32 %a, %b
31 store i32 %c, i32 addrspace(1)* %out
32 ret void
33 }
34
35 declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> nocapture, i32, i32)