AMDGPU: Make s34 the FP register

Make the FP register callee saved. This is tricky because now the FP needs to be spilled in the prolog relative to the incoming SP register, rather than the frame register used throughout the rest of the function. I don't like how this bypasses the standard mechanism for CSR spills just to get the correct insert point. I may look for a better solution, since all CSR VGPRs may also need to have all lanes activated. Another option might be to make getFrameIndexReference change the base register if the frame index is a CSR, and then try to figure out the right insertion point in emitProlog.

If there is a free VGPR lane available for SGPR spilling, try to use it for the FP. If that would require introducing a new VGPR spill, try to use a free call-clobbered SGPR. Only fall back to introducing a new VGPR spill as a last resort.

This also doesn't attempt to handle SGPR spilling with scalar stores.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@365372 91177308-0d34-0410-b5e6-96231b3b80d8

Matt Arsenault, 4 months ago
29 changed file(s) with 1245 addition(s) and 888 deletion(s).
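In outline, the new FP save logic (see determineCalleeSaves in the SIFrameLowering.cpp hunk below) tries three options in cost order. A minimal pseudocode sketch of that priority; createFPSaveSlot is a hypothetical stand-in for the CreateStackObject/allocateSGPRSpillToVGPR pair used in the actual patch:

    // 1. Prefer a free lane in a VGPR already used for SGPR spills.
    if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
      MFI->FramePointerSaveIndex = createFPSaveSlot(MF);
    // 2. Otherwise, copy the FP into a free call-clobbered SGPR.
    } else if ((MFI->SGPRForFPSaveRestoreCopy =
                    findUnusedSGPRNonCalleeSaved(MF.getRegInfo())) == 0) {
      // 3. Last resort: introduce a new CSR VGPR spill and use one of its lanes.
      MFI->FramePointerSaveIndex = createFPSaveSlot(MF);
    }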
2020
2121 using namespace llvm;
2222
23 #define DEBUG_TYPE "frame-info"
24
25
2326 static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
2427 const MachineFunction &MF) {
2528 return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
3033 const MachineFunction &MF) {
3134 return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
3235 ST.getMaxNumSGPRs(MF));
36 }
37
38 // Find a scratch register that we can use at the start of the prologue to
39 // re-align the stack pointer. We avoid using callee-save registers since they
40 // may appear to be free when this is called from canUseAsPrologue (during
41 // shrink wrapping), but then no longer be free when this is called from
42 // emitPrologue.
43 //
44 // FIXME: This is a bit conservative, since in the above case we could use one
45 // of the callee-save registers as a scratch temp to re-align the stack pointer,
46 // but we would then have to make sure that we were in fact saving at least one
47 // callee-save register in the prologue, which is additional complexity that
48 // doesn't seem worth the benefit.
49 static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
50 LivePhysRegs &LiveRegs,
51 const TargetRegisterClass &RC,
52 bool Unused = false) {
53 // Mark callee saved registers as used so we will not choose them.
54 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
55 for (unsigned i = 0; CSRegs[i]; ++i)
56 LiveRegs.addReg(CSRegs[i]);
57
58 if (Unused) {
59 // We are looking for a register that can be used throughout the entire
60 // function, so any use is unacceptable.
61 for (unsigned Reg : RC) {
62 if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
63 return Reg;
64 }
65 } else {
66 for (unsigned Reg : RC) {
67 if (LiveRegs.available(MRI, Reg))
68 return Reg;
69 }
70 }
71
72 // If we require an unused register, we are in a context where failure is an
73 // option and the caller has an alternative plan. In other contexts, this must
74 // succeed.
75 if (!Unused)
76 report_fatal_error("failed to find free scratch register");
77
78 return AMDGPU::NoRegister;
79 }
80
81 static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
82 LivePhysRegs LiveRegs;
83 LiveRegs.init(*MRI.getTargetRegisterInfo());
84 return findScratchNonCalleeSaveRegister(
85 MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
86 }
87
88 // We need to specially emit stack operations here because a different frame
89 // register is used here than in the rest of the function (i.e. than the one
90 // getFrameRegister would return).
91 static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
92 MachineBasicBlock::iterator I,
93 const SIInstrInfo *TII, unsigned SpillReg,
94 unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
95 MachineFunction *MF = MBB.getParent();
96 MachineFrameInfo &MFI = MF->getFrameInfo();
97
98 int64_t Offset = MFI.getObjectOffset(FI);
99
100 MachineMemOperand *MMO = MF->getMachineMemOperand(
101 MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
102 MFI.getObjectAlignment(FI));
103
104 if (isUInt<12>(Offset)) {
105 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
106 .addReg(SpillReg, RegState::Kill)
107 .addReg(ScratchRsrcReg)
108 .addReg(SPReg)
109 .addImm(Offset)
110 .addImm(0) // glc
111 .addImm(0) // slc
112 .addImm(0) // tfe
113 .addImm(0) // dlc
114 .addMemOperand(MMO);
115 return;
116 }
117
118 MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
119 MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
120
121 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
122 .addImm(Offset);
123
124 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
125 .addReg(SpillReg, RegState::Kill)
126 .addReg(OffsetReg, RegState::Kill)
127 .addReg(ScratchRsrcReg)
128 .addReg(SPReg)
129 .addImm(0)
130 .addImm(0) // glc
131 .addImm(0) // slc
132 .addImm(0) // tfe
133 .addImm(0) // dlc
134 .addMemOperand(MMO);
135 }
136
137 static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
138 MachineBasicBlock::iterator I,
139 const SIInstrInfo *TII, unsigned SpillReg,
140 unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
141 MachineFunction *MF = MBB.getParent();
142 MachineFrameInfo &MFI = MF->getFrameInfo();
143 int64_t Offset = MFI.getObjectOffset(FI);
144
145 MachineMemOperand *MMO = MF->getMachineMemOperand(
146 MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
147 MFI.getObjectAlignment(FI));
148
149 if (isUInt<12>(Offset)) {
150 BuildMI(MBB, I, DebugLoc(),
151 TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
152 .addReg(ScratchRsrcReg)
153 .addReg(SPReg)
154 .addImm(Offset)
155 .addImm(0) // glc
156 .addImm(0) // slc
157 .addImm(0) // tfe
158 .addImm(0) // dlc
159 .addMemOperand(MMO);
160 return;
161 }
162
163 MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
164 MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
165
166 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
167 .addImm(Offset);
168
169 BuildMI(MBB, I, DebugLoc(),
170 TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
171 .addReg(OffsetReg, RegState::Kill)
172 .addReg(ScratchRsrcReg)
173 .addReg(SPReg)
174 .addImm(0)
175 .addImm(0) // glc
176 .addImm(0) // slc
177 .addImm(0) // tfe
178 .addImm(0) // dlc
179 .addMemOperand(MMO);
33180 }
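Both helpers above special-case the store/load offset because the MUBUF immediate offset field is only 12 bits; larger frame offsets have to be materialized in a scratch VGPR and addressed with the OFFEN form. A sketch of the two shapes, with illustrative registers and an assumed out-of-range offset of 0x1008 (compare the scratch_reg_needed_mubuf_offset test near the end of this diff):

    buffer_store_dword v32, off, s[0:3], s32 offset:8   ; offset fits in 12 bits
    v_mov_b32_e32 v33, 0x1008                           ; offset too large, so
    buffer_store_dword v32, v33, s[0:3], s32 offen      ; address through a VGPR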
34181
35182 void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
510657 }
511658 }
512659
513 // Find a scratch register that we can use at the start of the prologue to
514 // re-align the stack pointer. We avoid using callee-save registers since they
515 // may appear to be free when this is called from canUseAsPrologue (during
516 // shrink wrapping), but then no longer be free when this is called from
517 // emitPrologue.
518 //
519 // FIXME: This is a bit conservative, since in the above case we could use one
520 // of the callee-save registers as a scratch temp to re-align the stack pointer,
521 // but we would then have to make sure that we were in fact saving at least one
522 // callee-save register in the prologue, which is additional complexity that
523 // doesn't seem worth the benefit.
524 static unsigned findScratchNonCalleeSaveRegister(MachineFunction &MF,
525 LivePhysRegs &LiveRegs,
526 const TargetRegisterClass &RC) {
527 MachineRegisterInfo &MRI = MF.getRegInfo();
528
529 // Mark callee saved registers as used so we will not choose them.
530 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
531 for (unsigned i = 0; CSRegs[i]; ++i)
532 LiveRegs.addReg(CSRegs[i]);
533
534 for (unsigned Reg : RC) {
535 if (LiveRegs.available(MRI, Reg))
536 return Reg;
537 }
538
539 return AMDGPU::NoRegister;
540 }
541
542660 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
543661 switch (ID) {
544662 case TargetStackID::Default:
558676 }
559677
560678 const MachineFrameInfo &MFI = MF.getFrameInfo();
679 MachineRegisterInfo &MRI = MF.getRegInfo();
561680 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
562681 const SIInstrInfo *TII = ST.getInstrInfo();
563682 const SIRegisterInfo &TRI = TII->getRegisterInfo();
572691 bool HasFP = false;
573692 uint32_t NumBytes = MFI.getStackSize();
574693 uint32_t RoundedSize = NumBytes;
694 // To avoid clobbering VGPRs in lanes that weren't active on function entry,
695 // turn on all lanes before doing the spill to memory.
696 unsigned ScratchExecCopy = AMDGPU::NoRegister;
697
698 // Emit the copy if we need an FP, and are using a free SGPR to save it.
699 if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
700 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
701 .addReg(FramePtrReg)
702 .setMIFlag(MachineInstr::FrameSetup);
703 }
704
705 for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
706 : FuncInfo->getSGPRSpillVGPRs()) {
707 if (!Reg.FI.hasValue())
708 continue;
709
710 if (ScratchExecCopy == AMDGPU::NoRegister) {
711 if (LiveRegs.empty()) {
712 LiveRegs.init(TRI);
713 LiveRegs.addLiveIns(MBB);
714 if (FuncInfo->SGPRForFPSaveRestoreCopy)
715 LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
716 }
717
718 ScratchExecCopy
719 = findScratchNonCalleeSaveRegister(MRI, LiveRegs,
720 *TRI.getWaveMaskRegClass());
721 assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy);
722
723 const unsigned OrSaveExec = ST.isWave32() ?
724 AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
725 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
726 ScratchExecCopy)
727 .addImm(-1);
728 }
729
730 buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
731 FuncInfo->getScratchRSrcReg(),
732 StackPtrReg,
733 Reg.FI.getValue());
734 }
735
736 if (ScratchExecCopy != AMDGPU::NoRegister) {
737 // FIXME: Split block and make terminator.
738 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
739 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
740 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
741 .addReg(ScratchExecCopy, RegState::Kill);
742 LiveRegs.addReg(ScratchExecCopy);
743 }
744
745
746 if (FuncInfo->FramePointerSaveIndex) {
747 const int FI = FuncInfo->FramePointerSaveIndex.getValue();
748 assert(!MFI.isDeadObjectIndex(FI) &&
749 MFI.getStackID(FI) == TargetStackID::SGPRSpill);
750 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
751 = FuncInfo->getSGPRToVGPRSpills(FI);
752 assert(Spill.size() == 1);
753
754 // Save FP before setting it up.
755 // FIXME: This should respect spillSGPRToVGPR.
756 BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
757 Spill[0].VGPR)
758 .addReg(FramePtrReg)
759 .addImm(Spill[0].Lane)
760 .addReg(Spill[0].VGPR, RegState::Undef);
761 }
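When the FP is kept in a VGPR lane, the save/restore pair reduces to a writelane in the prologue and a matching readlane in the epilogue; for example (lane number illustrative, matching the no_unused_non_csr_sgpr_for_fp test below):

    v_writelane_b32 v1, s34, 0   ; prologue: stash the incoming FP in a spare lane
    ...
    v_readlane_b32 s34, v1, 0    ; epilogue: restore FP before returning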
575762
576763 if (TRI.needsStackRealignment(MF)) {
577764 HasFP = true;
578765 const unsigned Alignment = MFI.getMaxAlignment();
579766
580767 RoundedSize += Alignment;
581
582 LiveRegs.init(TRI);
583 LiveRegs.addLiveIns(MBB);
584
585 unsigned ScratchSPReg
586 = findScratchNonCalleeSaveRegister(MF, LiveRegs,
587 AMDGPU::SReg_32_XM0RegClass);
588 assert(ScratchSPReg != AMDGPU::NoRegister);
768 if (LiveRegs.empty()) {
769 LiveRegs.init(TRI);
770 LiveRegs.addLiveIns(MBB);
771 LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
772 }
773
774 unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(
775 MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
776 assert(ScratchSPReg != AMDGPU::NoRegister &&
777 ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
589778
590779 // s_add_u32 tmp_reg, s32, NumBytes
591780 // s_and_b32 s32, tmp_reg, 0b111...0000
615804 .setMIFlag(MachineInstr::FrameSetup);
616805 }
617806
618 // To avoid clobbering VGPRs in lanes that weren't active on function entry,
619 // turn on all lanes before doing the spill to memory.
620 unsigned ScratchExecCopy = AMDGPU::NoRegister;
621
622 for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
623 : FuncInfo->getSGPRSpillVGPRs()) {
624 if (!Reg.FI.hasValue())
625 continue;
626
627 if (ScratchExecCopy == AMDGPU::NoRegister) {
628 if (LiveRegs.empty()) {
629 LiveRegs.init(TRI);
630 LiveRegs.addLiveIns(MBB);
631 }
632
633 ScratchExecCopy
634 = findScratchNonCalleeSaveRegister(MF, LiveRegs,
635 *TRI.getWaveMaskRegClass());
636
637 const unsigned OrSaveExec = ST.isWave32() ?
638 AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
639 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
640 ScratchExecCopy)
641 .addImm(-1);
642 }
643
644 TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
645 Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
646 &TII->getRegisterInfo());
647 }
648
649 if (ScratchExecCopy != AMDGPU::NoRegister) {
650 // FIXME: Split block and make terminator.
651 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
652 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
653 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
654 .addReg(ScratchExecCopy);
655 }
807 assert((!HasFP || FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister ||
808 FuncInfo->FramePointerSaveIndex) &&
809 "Needed to save FP but didn't save it anywhere");
810
811 assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister &&
812 !FuncInfo->FramePointerSaveIndex)) &&
813 "Saved FP but didn't need it");
656814 }
657815
658816 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
663821
664822 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
665823 const SIInstrInfo *TII = ST.getInstrInfo();
824 MachineRegisterInfo &MRI = MF.getRegInfo();
666825 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
826 LivePhysRegs LiveRegs;
667827 DebugLoc DL;
668
669 unsigned ScratchExecCopy = AMDGPU::NoRegister;
670 for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
671 : FuncInfo->getSGPRSpillVGPRs()) {
672 if (!Reg.FI.hasValue())
673 continue;
674
675 const SIRegisterInfo &TRI = TII->getRegisterInfo();
676 if (ScratchExecCopy == AMDGPU::NoRegister) {
677 // See emitPrologue
678 LivePhysRegs LiveRegs(*ST.getRegisterInfo());
679 LiveRegs.addLiveOuts(MBB);
680 LiveRegs.stepBackward(*MBBI);
681
682 ScratchExecCopy
683 = findScratchNonCalleeSaveRegister(MF, LiveRegs,
684 *TRI.getWaveMaskRegClass());
685
686 const unsigned OrSaveExec = ST.isWave32() ?
687 AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
688
689 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
690 .addImm(-1);
691 }
692
693 TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
694 Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
695 &TII->getRegisterInfo());
696 }
697
698 if (ScratchExecCopy != AMDGPU::NoRegister) {
699 // FIXME: Split block and make terminator.
700 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
701 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
702 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
703 .addReg(ScratchExecCopy);
704 }
705828
706829 const MachineFrameInfo &MFI = MF.getFrameInfo();
707830 uint32_t NumBytes = MFI.getStackSize();
715838 .addImm(RoundedSize * ST.getWavefrontSize())
716839 .setMIFlag(MachineInstr::FrameDestroy);
717840 }
841
842 if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
843 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg())
844 .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
845 .setMIFlag(MachineInstr::FrameDestroy);
846 }
847
848 if (FuncInfo->FramePointerSaveIndex) {
849 const int FI = FuncInfo->FramePointerSaveIndex.getValue();
850 const MachineFrameInfo &MFI = MF.getFrameInfo();
851
852 assert(!MFI.isDeadObjectIndex(FI));
853 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
854 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
855 = FuncInfo->getSGPRToVGPRSpills(FI);
856 assert(Spill.size() == 1);
857 BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
858 FuncInfo->getFrameOffsetReg())
859 .addReg(Spill[0].VGPR)
860 .addImm(Spill[0].Lane);
861 }
862
863 unsigned ScratchExecCopy = AMDGPU::NoRegister;
864 for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
865 : FuncInfo->getSGPRSpillVGPRs()) {
866 if (!Reg.FI.hasValue())
867 continue;
868
869 const SIRegisterInfo &TRI = TII->getRegisterInfo();
870 if (ScratchExecCopy == AMDGPU::NoRegister) {
871 // See emitPrologue
872 if (LiveRegs.empty()) {
873 LiveRegs.init(*ST.getRegisterInfo());
874 LiveRegs.addLiveOuts(MBB);
875 LiveRegs.stepBackward(*MBBI);
876 }
877
878 ScratchExecCopy = findScratchNonCalleeSaveRegister(
879 MRI, LiveRegs, *TRI.getWaveMaskRegClass());
880 LiveRegs.removeReg(ScratchExecCopy);
881
882 const unsigned OrSaveExec =
883 ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
884
885 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
886 .addImm(-1);
887 }
888
889 buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
890 FuncInfo->getScratchRSrcReg(),
891 FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
892 }
893
894 if (ScratchExecCopy != AMDGPU::NoRegister) {
895 // FIXME: Split block and make terminator.
896 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
897 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
898 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
899 .addReg(ScratchExecCopy, RegState::Kill);
900 }
718901 }
719902
720903 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
721 // memory.
904 // memory. They should have been removed by now.
722905 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
723906 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
724907 I != E; ++I) {
729912 return true;
730913 }
731914
915
916 #ifndef NDEBUG
917 static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
918 Optional<int> FramePointerSaveIndex) {
919 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
920 I != E; ++I) {
921 if (!MFI.isDeadObjectIndex(I) &&
922 MFI.getStackID(I) == TargetStackID::SGPRSpill &&
923 (!FramePointerSaveIndex || I != FramePointerSaveIndex)) {
924 return false;
925 }
926 }
927
928 return true;
929 }
930 #endif
931
732932 int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
733933 unsigned &FrameReg) const {
734934 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
742942 RegScavenger *RS) const {
743943 MachineFrameInfo &MFI = MF.getFrameInfo();
744944
745 if (!MFI.hasStackObjects())
746 return;
747
748945 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
749 const SIInstrInfo *TII = ST.getInstrInfo();
750 const SIRegisterInfo &TRI = TII->getRegisterInfo();
946 const SIRegisterInfo *TRI = ST.getRegisterInfo();
751947 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
752948
753 FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
949 assert(allSGPRSpillsAreDead(MFI, None) &&
950 "SGPR spills should have been removed in SILowerSGPRSpills");
754951
755952 // FIXME: The other checks should be redundant with allStackObjectsAreDead,
756953 // but currently hasNonSpillStackObjects is set only from source
760957
761958 if (FuncInfo->isEntryFunction()) {
762959 int ScavengeFI = MFI.CreateFixedObject(
763 TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
960 TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
764961 RS->addScavengingFrameIndex(ScavengeFI);
765962 } else {
766963 int ScavengeFI = MFI.CreateStackObject(
767 TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
768 TRI.getSpillAlignment(AMDGPU::SGPR_32RegClass),
964 TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
965 TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
769966 false);
770967 RS->addScavengingFrameIndex(ScavengeFI);
771968 }
774971
775972 // Only report VGPRs to generic code.
776973 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
777 BitVector &SavedRegs,
974 BitVector &SavedVGPRs,
778975 RegScavenger *RS) const {
779 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
976 TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
977
978 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
979 const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
780980 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
781981 const SIRegisterInfo *TRI = ST.getRegisterInfo();
782 SavedRegs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
783
784 // VGPRs used for SGPR spilling need to be specially inserted in the prolog.
785 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
982
983 // Ignore the SGPRs the default implementation found.
984 SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
985
986 // hasFP only knows about stack objects that already exist. We're now
987 // determining the stack slots that will be created, so we have to predict
988 // them. Stack objects force FP usage with calls.
989 //
990 // Note a new VGPR CSR may be introduced if one is used for the spill, but we
991 // don't want to report it here.
992 //
993 // FIXME: Is this really hasReservedCallFrame?
994 const bool WillHaveFP =
995 FrameInfo.hasCalls() &&
996 (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
997
998 // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
999 // so don't allow the default insertion to handle them.
7861000 for (auto SSpill : MFI->getSGPRSpillVGPRs())
787 SavedRegs.reset(SSpill.VGPR);
1001 SavedVGPRs.reset(SSpill.VGPR);
1002
1003 const bool HasFP = WillHaveFP || hasFP(MF);
1004 if (!HasFP)
1005 return;
1006
1007 if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
1008 int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
1009 TargetStackID::SGPRSpill);
1010
1011 // If there is already a VGPR with free lanes, use it. We may already have
1012 // to pay the penalty for spilling a CSR VGPR.
1013 if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
1014 llvm_unreachable("allocate SGPR spill should have worked");
1015
1016 MFI->FramePointerSaveIndex = NewFI;
1017
1018 LLVM_DEBUG(
1019 auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
1020 dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
1021 << ':' << Spill.Lane << '\n');
1022 return;
1023 }
1024
1025 MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
1026
1027 if (!MFI->SGPRForFPSaveRestoreCopy) {
1028 // There's no free lane to spill, and no free register to save FP, so we're
1029 // forced to spill another VGPR to use for the spill.
1030 int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
1031 TargetStackID::SGPRSpill);
1032 if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
1033 llvm_unreachable("allocate SGPR spill should have worked");
1034 MFI->FramePointerSaveIndex = NewFI;
1035
1036 LLVM_DEBUG(
1037 auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
1038 dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
1039 << ':' << Spill.Lane << '\n';);
1040 } else {
1041 LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
1042 printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
1043 }
7881044 }
7891045
7901046 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
7991055 // The SP is specifically managed and we don't want extra spills of it.
8001056 SavedRegs.reset(MFI->getStackPtrOffsetReg());
8011057 SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
1058 }
1059
1060 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1061 MachineFunction &MF, const TargetRegisterInfo *TRI,
1062 std::vector<CalleeSavedInfo> &CSI) const {
1063 if (CSI.empty())
1064 return true; // Early exit if no callee saved registers are modified!
1065
1066 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1067 if (!FuncInfo->SGPRForFPSaveRestoreCopy)
1068 return false;
1069
1070 for (auto &CS : CSI) {
1071 if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
1072 if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
1073 CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
1074 break;
1075 }
1076 }
1077
1078 return false;
8021079 }
8031080
8041081 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
8401117 if (MFI.hasCalls()) {
8411118 // All offsets are unsigned, so need to be addressed in the same direction
8421119 // as stack growth.
1120
1121 // FIXME: This function is pretty broken, since it can be called before the
1122 // frame layout is determined or CSR spills are inserted.
8431123 if (MFI.getStackSize() != 0)
8441124 return true;
8451125
3737 RegScavenger *RS = nullptr) const override;
3838 void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs,
3939 RegScavenger *RS = nullptr) const;
40 bool
41 assignCalleeSavedSpillSlots(MachineFunction &MF,
42 const TargetRegisterInfo *TRI,
43 std::vector<CalleeSavedInfo> &CSI) const override;
4044
4145 bool isSupportedStackID(TargetStackID::Value ID) const override;
4246
26932693 MachineFrameInfo &MFI = MF.getFrameInfo();
26942694 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
26952695
2696 SDValue CallerSavedFP;
2697
26982696 // Adjust the stack pointer for the new arguments...
26992697 // These operations are automatically eliminated by the prolog/epilog pass
27002698 if (!IsSibCall) {
27072705 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
27082706 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
27092707 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
2710
2711 if (!Info->isEntryFunction()) {
2712 // Avoid clobbering this function's FP value. In the current convention
2713 // callee will overwrite this, so do save/restore around the call site.
2714 CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2715 Info->getFrameOffsetReg(), MVT::i32);
2716 CopyFromChains.push_back(CallerSavedFP.getValue(1));
2717 }
2718
27192708 Chain = DAG.getTokenFactor(DL, CopyFromChains);
27202709 }
27212710
29032892 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
29042893 Chain = Call.getValue(0);
29052894 InFlag = Call.getValue(1);
2906
2907 if (CallerSavedFP) {
2908 SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2909 Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2910 InFlag = Chain.getValue(1);
2911 }
29122895
29132896 uint64_t CalleePopBytes = NumBytes;
29142897 Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
956956 // Add the scratch resource registers as implicit uses because we may end up
957957 // needing them, and need to ensure that the reserved registers are
958958 // correctly handled.
959
960 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
959 if (RI.spillSGPRToVGPR())
960 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
961961 if (ST.hasScalarStores()) {
962962 // m0 is used for offset to scalar stores if used to spill.
963963 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
10511051 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
10521052 }
10531053
1054 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1054 if (RI.spillSGPRToVGPR())
1055 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
10551056 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
10561057 .addFrameIndex(FrameIndex) // addr
10571058 .addMemOperand(MMO)
7070 // required for scratch access.
7171 ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
7272 ScratchWaveOffsetReg = AMDGPU::SGPR33;
73 FrameOffsetReg = AMDGPU::SGPR5;
73
74 // TODO: Pick a high register, and shift down, similar to a kernel.
75 FrameOffsetReg = AMDGPU::SGPR34;
7476 StackPtrOffsetReg = AMDGPU::SGPR32;
7577
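For reference, the non-kernel function register convention after this change, collected from the assignments above:

    // s[0:3] - scratch resource descriptor
    // s32    - stack pointer (SP)
    // s33    - scratch wave offset
    // s34    - frame pointer (FP), now callee saved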
7678 ArgInfo.PrivateSegmentBuffer =
244246 return false;
245247 }
246248
249 /// \returns true if \p NumNeed slots are available in VGPRs already used for
250 /// SGPR spilling.
251 //
252 // FIXME: This only works after processFunctionBeforeFrameFinalized
253 bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF,
254 unsigned NumNeed) const {
255 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
256 unsigned WaveSize = ST.getWavefrontSize();
257 return NumVGPRSpillLanes + NumNeed <= WaveSize * SpillVGPRs.size();
258 }
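A worked reading of the check above, with assumed values:

    // wave64 target, one spill VGPR already allocated:
    //   WaveSize = 64, SpillVGPRs.size() = 1  =>  capacity is 64 lanes
    //   NumVGPRSpillLanes = 10, NumNeed = 1   =>  10 + 1 <= 64, a lane is free
    //   NumVGPRSpillLanes = 64, NumNeed = 1   =>  65 > 64, a new VGPR is needed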
259
247260 /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
248261 bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
249262 int FI) {
306319 }
307320
308321 void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
309 for (auto &R : SGPRToVGPRSpills)
310 MFI.RemoveStackObject(R.first);
311 // All other SGPRs must be allocated on the default stack, so reset
312 // the stack ID.
313 for (unsigned i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd();
314 i != e; ++i)
315 MFI.setStackID(i, 0);
322 // The FP spill hasn't been inserted yet, so keep it around.
323 for (auto &R : SGPRToVGPRSpills) {
324 if (R.first != FramePointerSaveIndex)
325 MFI.RemoveStackObject(R.first);
326 }
327
328 // All other SGPRs must be allocated on the default stack, so reset the stack
329 // ID.
330 for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e;
331 ++i)
332 if (i != FramePointerSaveIndex)
333 MFI.setStackID(i, TargetStackID::Default);
316334 }
317335
318336 MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
428428 unsigned NumVGPRSpillLanes = 0;
429429 SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
430430
431 public: // FIXME
432 /// If this is set, it is an SGPR used to save and restore the register used
433 /// for the frame pointer.
434 unsigned SGPRForFPSaveRestoreCopy = 0;
435 Optional<int> FramePointerSaveIndex;
436
431437 public:
432438 SIMachineFunctionInfo(const MachineFunction &MF);
433439
447453 return Mode;
448454 }
449455
456 bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
457 unsigned NumNeed) const;
450458 bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
451459 void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
452460
55 ; GCN: ; %bb.0:
66 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
77 ; GCN-NEXT: ds_read_b32 v2, v0
8 ; GCN-NEXT: s_mov_b64 s[6:7], 0
8 ; GCN-NEXT: s_mov_b64 s[4:5], 0
99 ; GCN-NEXT: BB0_1: ; %atomicrmw.start
1010 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
1111 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1717 ; GCN-NEXT: buffer_wbinvl1_vol
1818 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
1919 ; GCN-NEXT: v_mov_b32_e32 v2, v1
20 ; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
21 ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
20 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
21 ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
2222 ; GCN-NEXT: s_cbranch_execnz BB0_1
2323 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
24 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
24 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
2525 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2626 ; GCN-NEXT: s_setpc_b64 s[30:31]
2727 %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
3333 ; GCN: ; %bb.0:
3434 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3535 ; GCN-NEXT: global_load_dword v3, v[0:1], off
36 ; GCN-NEXT: s_mov_b64 s[6:7], 0
36 ; GCN-NEXT: s_mov_b64 s[4:5], 0
3737 ; GCN-NEXT: BB1_1: ; %atomicrmw.start
3838 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
3939 ; GCN-NEXT: s_waitcnt vmcnt(0)
4040 ; GCN-NEXT: v_not_b32_e32 v2, v3
4141 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2
42 ; GCN-NEXT: s_waitcnt vmcnt(0)
42 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4343 ; GCN-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
44 ; GCN-NEXT: s_waitcnt vmcnt(0)
44 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4545 ; GCN-NEXT: buffer_wbinvl1_vol
4646 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4747 ; GCN-NEXT: v_mov_b32_e32 v3, v2
48 ; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
49 ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
48 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
49 ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
5050 ; GCN-NEXT: s_cbranch_execnz BB1_1
5151 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
52 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
52 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
5353 ; GCN-NEXT: v_mov_b32_e32 v0, v2
5454 ; GCN-NEXT: s_setpc_b64 s[30:31]
5555 %result = atomicrmw nand i32 addrspace(1)* %ptr, i32 4 seq_cst
6161 ; GCN: ; %bb.0:
6262 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6363 ; GCN-NEXT: flat_load_dword v3, v[0:1]
64 ; GCN-NEXT: s_mov_b64 s[6:7], 0
64 ; GCN-NEXT: s_mov_b64 s[4:5], 0
6565 ; GCN-NEXT: BB2_1: ; %atomicrmw.start
6666 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
6767 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6868 ; GCN-NEXT: v_not_b32_e32 v2, v3
6969 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2
70 ; GCN-NEXT: s_waitcnt vmcnt(0)
70 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7171 ; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
72 ; GCN-NEXT: s_waitcnt vmcnt(0)
72 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7373 ; GCN-NEXT: buffer_wbinvl1_vol
7474 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
7575 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
7676 ; GCN-NEXT: v_mov_b32_e32 v3, v2
77 ; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
78 ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
77 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
78 ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
7979 ; GCN-NEXT: s_cbranch_execnz BB2_1
8080 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
81 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
81 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
8282 ; GCN-NEXT: v_mov_b32_e32 v0, v2
8383 ; GCN-NEXT: s_setpc_b64 s[30:31]
8484 %result = atomicrmw nand i32* %ptr, i32 4 seq_cst
6868 }
6969
7070 ; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
71 ; GCN: s_mov_b32 s5, s32
72 ; GCN: s_add_u32 s32, s32, 0xc00{{$}}
73 ; GCN-DAG: buffer_store_dword v32
74 ; GCN-DAG: buffer_store_dword v33
71 ; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:36
72 ; GCN-DAG: v_writelane_b32 v33, s34,
73 ; GCN: s_mov_b32 s34, s32
74 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}}
75 ; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
76 ; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:32
7577 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
76 ; GCN-DAG: v_writelane_b32
77 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
78
7879 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
79 ; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5{{$}}
80
81 ; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}}
80 ; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s34{{$}}
81
82 ; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:16{{$}}
8283 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
8384
8485 ; GCN: s_swappc_b64
8586
86 ; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s5 offset:16{{$}}
87 ; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s34 offset:16{{$}}
8788
8889 ; GCN: v_readlane_b32
8990 ; GCN-NOT: v_readlane_b32 s32
90 ; GCN-DAG: buffer_load_dword v32,
91 ; GCN-DAG: buffer_load_dword v33,
91 ; GCN-DAG: buffer_load_dword v32, off, s[0:3], s34 offset:32
9292 ; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
93 ; GCN: v_readlane_b32 s34, v33,
94 ; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
9395 ; GCN: s_setpc_b64
9496 define void @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 {
9597 entry:
107109 }
108110
109111 ; GCN-LABEL: {{^}}call_void_func_byval_struct_func:
110 ; GCN: s_mov_b32 s5, s32
112 ; GCN: s_mov_b32 s34, s32
111113 ; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
112114 ; GCN-DAG: v_writelane_b32
113115
114116 ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
115117 ; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
116118
117 ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5{{$}}
118 ; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:16
119
120 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
121 ; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:4
122 ; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8
123 ; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12
119 ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s34{{$}}
120 ; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s34 offset:16
121
122 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}}
123 ; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:4
124 ; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s34 offset:8
125 ; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s34 offset:12
124126
125127 ; GCN-NOT: s_add_u32 s32, s32, 0x800
126128
130132 ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
131133 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
132134
133 ; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:16
134 ; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:20
135 ; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24
136 ; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28
135 ; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
136 ; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20
137 ; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24
138 ; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28
137139
138140 ; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
139141 ; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
148150 ; GCN-NOT: s_sub_u32 s32, s32, 0x800
149151
150152 ; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
151 ; GCN-NEXT: s_waitcnt
152 ; GCN-NEXT: s_setpc_b64
153 ; GCN: v_readlane_b32 s34, v
154 ; GCN: s_waitcnt
155 ; GCN: s_setpc_b64
153156 define void @call_void_func_byval_struct_func() #1 {
154157 entry:
155158 %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
305308 }
306309
307310 ; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_func:
308 ; GCN: s_mov_b32 s5, s32
311 ; GCN: s_mov_b32 s34, s32
309312 ; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
310313 ; GCN-DAG: v_writelane_b32
311314
312315 ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
313316 ; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
314317
315 ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5{{$}}
316 ; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:16
317
318 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
319 ; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:4
320 ; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8
321 ; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12
318 ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s34{{$}}
319 ; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s34 offset:16
320
321 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}}
322 ; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:4
323 ; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s34 offset:8
324 ; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s34 offset:12
322325
323326 ; GCN-NOT: s_add_u32 s32, s32, 0x800
324327
327330 ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
328331 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
329332
330 ; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:16
331 ; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:20
332 ; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24
333 ; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28
333 ; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
334 ; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20
335 ; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24
336 ; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28
334337 ; GCN: s_waitcnt vmcnt(0)
335338 ; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
336339 ; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
345348 ; GCN-NOT: s_sub_u32 s32, s32, 0x800
346349
347350 ; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
348 ; GCN-NEXT: s_waitcnt
351 ; GCN: v_readlane_b32 s34, v
352 ; GCN: s_waitcnt
349353 ; GCN-NEXT: s_setpc_b64
350354 define void @call_void_func_byval_struct_align8_func() #0 {
351355 entry:
384388 ret void
385389 }
386390
387 declare void @external_void_func_void() #0
391 declare hidden void @external_void_func_void() #0
388392
389393 declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #3
390394 declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #3
1212 }
1313
1414 ; GCN-LABEL: {{^}}indirect_use_vcc:
15 ; GCN: v_writelane_b32 v32, s34, 0
16 ; GCN: v_writelane_b32 v32, s35, 1
17 ; GCN: v_writelane_b32 v32, s36, 2
15 ; GCN: v_writelane_b32 v32, s34, 2
16 ; GCN: v_writelane_b32 v32, s36, 0
17 ; GCN: v_writelane_b32 v32, s37, 1
1818 ; GCN: s_swappc_b64
19 ; GCN: v_readlane_b32 s36, v32, 2
20 ; GCN: v_readlane_b32 s35, v32, 1
21 ; GCN: v_readlane_b32 s34, v32, 0
22 ; GCN: ; NumSgprs: 39
19 ; GCN: v_readlane_b32 s37, v32, 1
20 ; GCN: v_readlane_b32 s36, v32, 0
21 ; GCN: v_readlane_b32 s34, v32, 2
22 ; GCN: ; NumSgprs: 40
2323 ; GCN: ; NumVgprs: 33
2424 define void @indirect_use_vcc() #1 {
2525 call void @use_vcc()
2828
2929 ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
3030 ; GCN: is_dynamic_callstack = 0
31 ; CI: ; NumSgprs: 41
32 ; VI-NOBUG: ; NumSgprs: 43
31 ; CI: ; NumSgprs: 42
32 ; VI-NOBUG: ; NumSgprs: 44
3333 ; VI-BUG: ; NumSgprs: 96
3434 ; GCN: ; NumVgprs: 33
3535 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 {
4747 }
4848
4949 ; GCN-LABEL: {{^}}indirect_use_flat_scratch:
50 ; CI: ; NumSgprs: 41
51 ; VI: ; NumSgprs: 43
50 ; CI: ; NumSgprs: 42
51 ; VI: ; NumSgprs: 44
5252 ; GCN: ; NumVgprs: 33
5353 define void @indirect_use_flat_scratch() #1 {
5454 call void @use_flat_scratch()
5757
5858 ; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel:
5959 ; GCN: is_dynamic_callstack = 0
60 ; CI: ; NumSgprs: 41
61 ; VI-NOBUG: ; NumSgprs: 43
60 ; CI: ; NumSgprs: 42
61 ; VI-NOBUG: ; NumSgprs: 44
6262 ; VI-BUG: ; NumSgprs: 96
6363 ; GCN: ; NumVgprs: 33
6464 define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 {
2222 }
2323
2424 ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
25 ; GCN: v_writelane_b32 v32, s34, 0
26 ; GCN: v_writelane_b32 v32, s35, 1
27 ; GCN: v_writelane_b32 v32, s36, 2
28 ; GCN: v_writelane_b32 v32, s37, 3
29 ; GCN: v_writelane_b32 v32, s38, 4
30
31 ; GCN: s_mov_b32 [[COPY_FP:s[0-9]+]], s5
32 ; GCN-NEXT: s_swappc_b64
33 ; GCN-NEXT: s_mov_b32 s5, [[COPY_FP]]
34 ; GCN-NEXT: s_mov_b32 [[COPY_FP]], s5
25 ; GCN: buffer_store_dword
26 ; GCN: v_writelane_b32 v32, s34, 4
27 ; GCN: v_writelane_b32 v32, s36, 0
28 ; GCN: v_writelane_b32 v32, s37, 1
29 ; GCN: v_writelane_b32 v32, s38, 2
30
31 ; GCN: s_swappc_b64
3532 ; GCN-NEXT: ;;#ASMSTART
3633 ; GCN-NEXT: ;;#ASMEND
3734 ; GCN-NEXT: s_swappc_b64
38 ; GCN-DAG: s_mov_b32 s5, [[COPY_FP]]
39 ; GCN-DAG: v_readlane_b32 s38, v32, 4
40 ; GCN: v_readlane_b32 s37, v32, 3
41 ; GCN: v_readlane_b32 s36, v32, 2
42 ; GCN: v_readlane_b32 s35, v32, 1
43 ; GCN: v_readlane_b32 s34, v32, 0
35 ; GCN-DAG: v_readlane_b32 s39, v32, 3
36 ; GCN-DAG: v_readlane_b32 s38, v32, 2
37 ; GCN: v_readlane_b32 s37, v32, 1
38 ; GCN: v_readlane_b32 s36, v32, 0
39
40 ; GCN: v_readlane_b32 s34, v32, 4
41 ; GCN: buffer_load_dword
4442 ; GCN: s_setpc_b64
4543 define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
4644 call void @external_void_func_void()
4947 ret void
5048 }
5149
52 ; FIXME: Avoid extra restore of FP in between calls.
5350 ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2:
54 ; GCN: s_mov_b32 [[COPY_FP:s[0-9]+]], s5
55 ; GCN-NEXT: s_swappc_b64
56 ; GCN-NEXT: s_mov_b32 s5, [[COPY_FP]]
57 ; GCN-NEXT: s_mov_b32 [[COPY_FP]], s5
58 ; GCN-NEXT: s_swappc_b64
59 ; GCN: s_mov_b32 s5, [[COPY_FP]]
51 ; GCN: buffer_store_dword v32
52 ; GCN: v_writelane_b32 v32, s34, 4
53
54 ; GCN: s_mov_b32 s34, s32
55 ; GCN: s_add_u32 s32, s32, 0x400
56 ; GCN: s_swappc_b64
57 ; GCN-NEXT: s_swappc_b64
58
59 ; GCN: v_readlane_b32 s34, v32, 4
60 ; GCN: buffer_load_dword v32,
6061 define void @test_func_call_external_void_funcx2() #0 {
6162 call void @external_void_func_void()
6263 call void @external_void_func_void()
118118 ; GCN-LABEL: tailcall_got_load:
119119 ; GCN: ; %bb.0:
120120 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121 ; GCN-NEXT: s_getpc_b64 s[6:7]
122 ; GCN-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4
123 ; GCN-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+4
124 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
121 ; GCN-NEXT: s_getpc_b64 s[4:5]
122 ; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
123 ; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+4
124 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
125125 ; GCN-NEXT: v_mov_b32_e32 v0, 0
126126 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
127 ; GCN-NEXT: s_setpc_b64 s[6:7]
127 ; GCN-NEXT: s_setpc_b64 s[4:5]
128128 tail call void @got.func(i32 0)
129129 ret void
130130 }
135135 ; GCN: ; %bb.0:
136136 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137137 ; GCN-NEXT: ds_read_b32 v0, v0
138 ; GCN-NEXT: s_getpc_b64 s[6:7]
139 ; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
140 ; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4
141 ; GCN-NEXT: s_setpc_b64 s[6:7]
138 ; GCN-NEXT: s_getpc_b64 s[4:5]
139 ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
140 ; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4
141 ; GCN-NEXT: s_setpc_b64 s[4:5]
142142 %vgpr = load volatile i32, i32 addrspace(3)* %ptr
143143 tail call void @func(i32 %vgpr)
144144 ret void
1111 ; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_all:
1212 ; GCN: ; %bb.0:
1313 ; GCN-NEXT: s_waitcnt
14 ; GCN-NEXT: s_mov_b32 s5, s32
14 ; GCN-NEXT: s_mov_b32 s4, s34
15 ; GCN-NEXT: s_mov_b32 s34, s32
16 ; GCN-NEXT: s_mov_b32 s34, s4
1517 ; GCN-NEXT: s_setpc_b64
1618 define void @callee_no_stack_no_fp_elim_all() #1 {
1719 ret void
3840 ret void
3941 }
4042
43 ; Can use a free call-clobbered register to preserve the original FP value.
44
4145 ; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_all:
4246 ; GCN: ; %bb.0:
4347 ; GCN-NEXT: s_waitcnt
44 ; GCN-NEXT: s_mov_b32 s5, s32
48 ; GCN-NEXT: s_mov_b32 s4, s34
49 ; GCN-NEXT: s_mov_b32 s34, s32
4550 ; GCN-NEXT: s_add_u32 s32, s32, 0x200
4651 ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
47 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
52 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s34 offset:4{{$}}
4853 ; GCN-NEXT: s_sub_u32 s32, s32, 0x200
54 ; GCN-NEXT: s_mov_b32 s34, s4
4955 ; GCN-NEXT: s_waitcnt vmcnt(0)
5056 ; GCN-NEXT: s_setpc_b64
5157 define void @callee_with_stack_no_fp_elim_all() #1 {
7076 ; GCN-LABEL: {{^}}callee_with_stack_and_call:
7177 ; GCN: ; %bb.0:
7278 ; GCN-NEXT: s_waitcnt
73 ; GCN: s_mov_b32 s5, s32
79 ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
80 ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
81 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
82 ; GCN: v_writelane_b32 [[CSR_VGPR]], s34, 2
83 ; GCN-DAG: s_mov_b32 s34, s32
7484 ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
75 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
76
77 ; GCN-DAG: v_writelane_b32 v32, s34,
78 ; GCN-DAG: v_writelane_b32 v32, s35,
79 ; GCN-DAG: v_writelane_b32 v32, s36,
80 ; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
81 ; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5{{$}}
82 ; GCN-DAG: s_mov_b32 [[COPY_FP:s[0-9]+]], s5
83
85 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
86 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s36,
87 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s37,
88
89 ; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s34{{$}}
8490
8591 ; GCN: s_swappc_b64
86 ; GCN-DAG: s_mov_b32 s5, [[COPY_FP]]
87 ; GCN-DAG: v_readlane_b32 s34,
88 ; GCN-DAG: v_readlane_b32 s35,
89 ; GCN-DAG: v_readlane_b32 s36,
90 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
91 ; GCN: s_waitcnt
92
93 ; GCN-DAG: v_readlane_b32 s36, [[CSR_VGPR]]
94 ; GCN-DAG: v_readlane_b32 s37, [[CSR_VGPR]]
95
96 ; GCN: s_sub_u32 s32, s32, 0x400{{$}}
97 ; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], 2
98 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
99 ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
100 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
101 ; GCN-NEXT: s_waitcnt vmcnt(0)
102
92103 ; GCN-NEXT: s_setpc_b64
93104 define void @callee_with_stack_and_call() #0 {
94105 %alloca = alloca i32, addrspace(5)
105116
106117 ; GCN-LABEL: {{^}}callee_no_stack_with_call:
107118 ; GCN: s_waitcnt
108 ; GCN: s_mov_b32 s5, s32
109 ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
110 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
119 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
120 ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill
111121 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
112 ; GCN-DAG: v_writelane_b32 v32, s34, 0
113 ; GCN-DAG: v_writelane_b32 v32, s35, 1
114 ; GCN-DAG: v_writelane_b32 v32, s36, 2
115 ; GCN-DAG: s_mov_b32 [[COPY_FP:s[0-9]+]], s5
122 ; GCN-DAG: s_add_u32 s32, s32, 0x400
123 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s34, [[FP_SPILL_LANE:[0-9]+]]
124
125 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s36, 0
126 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s37, 1
116127 ; GCN: s_swappc_b64
117 ; GCN: s_mov_b32 s5, [[COPY_FP]]
118
119 ; GCN-DAG: v_readlane_b32 s34, v32, 0
120 ; GCN-DAG: v_readlane_b32 s35, v32, 1
121 ; GCN-DAG: v_readlane_b32 s36, v32, 2
122
123 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
124 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
128
129 ; GCN-DAG: v_readlane_b32 s36, v32, 0
130 ; GCN-DAG: v_readlane_b32 s37, v32, 1
131
132 ; GCN: s_sub_u32 s32, s32, 0x400
133 ; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], [[FP_SPILL_LANE]]
134 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
135 ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload
125136 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
126
127 ; GCN: s_sub_u32 s32, s32, 0x400
128 ; GCN: s_setpc_b64
137 ; GCN-NEXT: s_waitcnt vmcnt(0)
138 ; GCN-NEXT: s_setpc_b64
129139 define void @callee_no_stack_with_call() #0 {
130140 call void @external_void_func_void()
131141 ret void
132142 }
133143
134 declare void @external_void_func_void() #0
135
136 ; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and restored
144 declare hidden void @external_void_func_void() #0
145
146 ; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and
147 ; restored. No FP is required.
148 ;
137149 ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls:
138150 ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
139 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
151 ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill
140152 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
141
142 ; GCN: v_writelane_b32 v32
143 ; GCN: ;;#ASMSTART
144 ; GCN: v_readlane_b32 s{{[0-9]+}}, v32
153 ; GCN: v_writelane_b32 [[CSR_VGPR]], s
154 ; GCN: v_writelane_b32 [[CSR_VGPR]], s
155
156 ; GCN: ;;#ASMSTART
157 ; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]]
158 ; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]]
145159
146160 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
147 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
161 ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload
148162 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
149
150163 ; GCN-NEXT: s_waitcnt
151164 ; GCN-NEXT: s_setpc_b64
152165 define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
187200 ret void
188201 }
189202
203 ; TODO: Can the SP inc/dec be removed?
204 ; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_csr_vgpr:
205 ; GCN: s_waitcnt
206 ; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s34
207 ; GCN-NEXT: s_mov_b32 s34, s32
208 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
209 ; GCN-DAG: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill
210 ; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s34 offset:8
211
212 ; GCN: ;;#ASMSTART
213 ; GCN-NEXT: ; clobber v33
214 ; GCN-NEXT: ;;#ASMEND
215
216 ; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload
217 ; GCN: s_add_u32 s32, s32, 0x300
218 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300
219 ; GCN-NEXT: s_mov_b32 s34, [[FP_COPY]]
220 ; GCN-NEXT: s_waitcnt vmcnt(0)
221 ; GCN-NEXT: s_setpc_b64
222 define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
223 %alloca = alloca i32, addrspace(5)
224 store volatile i32 0, i32 addrspace(5)* %alloca
225 call void asm sideeffect "; clobber v33", "~{v33}"()
226 ret void
227 }
228
229 ; Use the last free lane of a VGPR already used for SGPR spilling, instead of
230 ; introducing a second CSR VGPR.
230 ; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr:
231 ; GCN: s_waitcnt
232 ; GCN-NEXT: v_writelane_b32 v1, s34, 63
233 ; GCN-NEXT: s_mov_b32 s34, s32
234 ; GCN: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill
235 ; GCN-COUNT-63: v_writelane_b32 v1
236 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:8
237 ; GCN: ;;#ASMSTART
238 ; GCN-COUNT-63: v_readlane_b32 s{{[0-9]+}}, v1
239
240 ; GCN: s_add_u32 s32, s32, 0x300
241 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300
242 ; GCN-NEXT: v_readlane_b32 s34, v1, 63
243 ; GCN-NEXT: s_waitcnt vmcnt(0)
244 ; GCN-NEXT: s_setpc_b64
245 define void @last_lane_vgpr_for_fp_csr() #1 {
246 %alloca = alloca i32, addrspace(5)
247 store volatile i32 0, i32 addrspace(5)* %alloca
248 call void asm sideeffect "; clobber v33", "~{v33}"()
249 call void asm sideeffect "",
250 "~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49}
251 ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59}
252 ,~{s60},~{s61},~{s62},~{s63},~{s64},~{s65},~{s66},~{s67},~{s68},~{s69}
253 ,~{s70},~{s71},~{s72},~{s73},~{s74},~{s75},~{s76},~{s77},~{s78},~{s79}
254 ,~{s80},~{s81},~{s82},~{s83},~{s84},~{s85},~{s86},~{s87},~{s88},~{s89}
255 ,~{s90},~{s91},~{s92},~{s93},~{s94},~{s95},~{s96},~{s97},~{s98},~{s99}
256 ,~{s100},~{s101},~{s102}"() #1
257
258 ret void
259 }
260
261 ; Use a copy to a free SGPR instead of introducing a second CSR VGPR.
262 ; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr:
263 ; GCN: s_waitcnt
264 ; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s34
265 ; GCN-NEXT: s_mov_b32 s34, s32
266 ; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill
267 ; GCN-COUNT-64: v_writelane_b32 v1,
268
269 ; GCN: buffer_store_dword
270 ; GCN: ;;#ASMSTART
271 ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1
272
273 ; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload
274 ; GCN: s_add_u32 s32, s32, 0x300
275 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300
276 ; GCN-NEXT: s_mov_b32 s34, [[FP_COPY]]
277 ; GCN-NEXT: s_waitcnt vmcnt(0)
278 ; GCN-NEXT: s_setpc_b64
279 define void @no_new_vgpr_for_fp_csr() #1 {
280 %alloca = alloca i32, addrspace(5)
281 store volatile i32 0, i32 addrspace(5)* %alloca
282 call void asm sideeffect "; clobber v33", "~{v33}"()
283 call void asm sideeffect "",
284 "~{s39},~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49}
285 ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59}
286 ,~{s60},~{s61},~{s62},~{s63},~{s64},~{s65},~{s66},~{s67},~{s68},~{s69}
287 ,~{s70},~{s71},~{s72},~{s73},~{s74},~{s75},~{s76},~{s77},~{s78},~{s79}
288 ,~{s80},~{s81},~{s82},~{s83},~{s84},~{s85},~{s86},~{s87},~{s88},~{s89}
289 ,~{s90},~{s91},~{s92},~{s93},~{s94},~{s95},~{s96},~{s97},~{s98},~{s99}
290 ,~{s100},~{s101},~{s102}"() #1
291
292 ret void
293 }
294
295 ; GCN-LABEL: {{^}}realign_stack_no_fp_elim:
296 ; GCN: s_waitcnt
297 ; GCN-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0
298 ; GCN-NEXT: s_mov_b32 s4, s34
299 ; GCN-NEXT: s_and_b32 s34, [[SCRATCH]], 0xfff80000
300 ; GCN-NEXT: s_add_u32 s32, s32, 0x100000
301 ; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
302 ; GCN-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s34
303 ; GCN-NEXT: s_sub_u32 s32, s32, 0x100000
304 ; GCN-NEXT: s_mov_b32 s34, s4
305 ; GCN-NEXT: s_waitcnt vmcnt(0)
306 ; GCN-NEXT: s_setpc_b64
307 define void @realign_stack_no_fp_elim() #1 {
308 %alloca = alloca i32, align 8192, addrspace(5)
309 store volatile i32 0, i32 addrspace(5)* %alloca
310 ret void
311 }
312
313 ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp:
314 ; GCN: s_waitcnt
315 ; GCN-NEXT: v_writelane_b32 v1, s34, 0
316 ; GCN-NEXT: s_mov_b32 s34, s32
317 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
318 ; GCN: buffer_store_dword [[ZERO]], off, s[0:3], s34 offset:4
319 ; GCN: ;;#ASMSTART
320 ; GCN: s_add_u32 s32, s32, 0x200
321 ; GCN-NEXT: s_mov_b64 s[30:31], vcc
322 ; GCN-NEXT: s_sub_u32 s32, s32, 0x200
323 ; GCN-NEXT: v_readlane_b32 s34, v1, 0
324 ; GCN-NEXT: s_waitcnt vmcnt(0)
325 ; GCN-NEXT: s_setpc_b64 s[30:31]
326 define void @no_unused_non_csr_sgpr_for_fp() #1 {
327 %alloca = alloca i32, addrspace(5)
328 store volatile i32 0, i32 addrspace(5)* %alloca
329
330 ; Use all clobberable SGPRs, so the FP has to spill to a VGPR lane.
331 call void asm sideeffect "",
332 "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
333 ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
334 ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
335 ,~{s30},~{s31}"() #0
336
337 ret void
338 }
339
340 ; Need a new CSR VGPR to satisfy the FP spill.
341 ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr:
342 ; GCN: s_waitcnt
343 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
344 ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
345 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
346 ; GCN-NEXT: v_writelane_b32 v32, s34, 0
347 ; GCN-NEXT: s_mov_b32 s34, s32
348 ; GCN: s_add_u32 s32, s32, 0x300{{$}}
349
350 ; GCN-DAG: s_mov_b64 vcc, s[30:31]
351 ; GCN-DAG: buffer_store_dword
352
353 ; GCN: ;;#ASMSTART
354 ; GCN: s_mov_b64 s[30:31], vcc
355
356 ; GCN: s_sub_u32 s32, s32, 0x300{{$}}
357 ; GCN-NEXT: v_readlane_b32 s34, v32, 0
358 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
359 ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
360 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
361 ; GCN-NEXT: s_waitcnt vmcnt(0)
362 ; GCN-NEXT: s_setpc_b64
363 define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
364 %alloca = alloca i32, addrspace(5)
365 store volatile i32 0, i32 addrspace(5)* %alloca
366
367 ; Use all clobberable SGPRs, so the FP has to spill to a VGPR lane.
368 call void asm sideeffect "",
369 "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
370 ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
371 ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
372 ,~{s30},~{s31}"() #0
373
374 call void asm sideeffect "; clobber nonpreserved VGPRs",
375 "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
376 ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
377 ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
378 ,~{v30},~{v31}"() #1
379
380 ret void
381 }
382
383 ; The byval argument pushes the CSR VGPR slot past the maximum MUBUF
384 ; immediate offset, so a scratch register is needed to access it (the
; arithmetic is sketched after this test).
385 ; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset:
386 ; GCN: s_waitcnt
387 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
388 ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008
389 ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill
390 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
391 ; GCN-NEXT: v_writelane_b32 v32, s34, 0
392 ; GCN-NEXT: s_mov_b32 s34, s32
393 ; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}}
394 ; GCN-DAG: s_mov_b64 vcc, s[30:31]
395 ; GCN-DAG: buffer_store_dword
396
397 ; GCN: ;;#ASMSTART
398 ; GCN: s_mov_b64 s[30:31], vcc
399
400 ; GCN: s_sub_u32 s32, s32, 0x40300{{$}}
401 ; GCN-NEXT: v_readlane_b32 s34, v32, 0
402 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
403 ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008
404 ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload
405 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
406 ; GCN-NEXT: s_waitcnt vmcnt(0)
407 ; GCN-NEXT: s_setpc_b64
408 define void @scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval align 4 %arg) #1 {
409 %alloca = alloca i32, addrspace(5)
410 store volatile i32 0, i32 addrspace(5)* %alloca
411
412 ; Use all clobberable SGPRs, so the FP has to spill to a VGPR lane.
413 call void asm sideeffect "; clobber nonpreserved SGPRs",
414 "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
415 ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
416 ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
417 ,~{s30},~{s31}"() #0
418
419 ; Use all clobberable VGPRs, so a CSR VGPR spill is needed.
420 call void asm sideeffect "; clobber nonpreserved VGPRs",
421 "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
422 ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
423 ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
424 ,~{v30},~{v31}"() #1
425
426 ret void
427 }
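; Why the scratch VGPR is needed (editor's sketch of the arithmetic):
; the MUBUF immediate offset field is 12 bits, so the largest directly
; encodable offset is 4095. The 4096-byte byval pushes the CSR VGPR
; slot to 4096 + 8 = 4104 = 0x1008, which no longer fits, so the offset
; is materialized with v_mov_b32 and the spill/reload use the offen
; (register-offset) addressing form instead of an immediate offset.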
428
429 ; GCN-LABEL: {{^}}local_empty_func:
430 ; GCN: s_waitcnt
431 ; GCN-NEXT: s_setpc_b64
432 define internal void @local_empty_func() #0 {
433 ret void
434 }
435
436 ; An FP is needed, despite not needing any spills.
437 ; TODO: Could see that the callee does not use the stack and omit the FP.
438 ; GCN-LABEL: {{^}}ipra_call_with_stack:
439 ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s34
440 ; GCN: s_mov_b32 s34, s32
441 ; GCN: s_add_u32 s32, s32, 0x400
442 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34{{$}}
443 ; GCN: s_swappc_b64
444 ; GCN: s_sub_u32 s32, s32, 0x400
445 ; GCN: s_mov_b32 s34, [[FP_COPY]]
446 define void @ipra_call_with_stack() #0 {
447 %alloca = alloca i32, addrspace(5)
448 store volatile i32 0, i32 addrspace(5)* %alloca
449 call void @local_empty_func()
450 ret void
451 }
452
190453 attributes #0 = { nounwind }
191454 attributes #1 = { nounwind "frame-pointer"="all" }
192455 attributes #2 = { nounwind "frame-pointer"="non-leaf" }
11 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
22
33 ; GCN-LABEL: {{^}}use_dispatch_ptr:
4 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
5 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
4 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
5 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
66 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
77 define hidden void @use_dispatch_ptr() #1 {
88 %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
1313
1414 ; GCN-LABEL: {{^}}kern_indirect_use_dispatch_ptr:
1515 ; GCN: enable_sgpr_dispatch_ptr = 1
16 ; GCN: s_mov_b64 s[6:7], s[4:5]
16 ; GCN-NOT: s[4:5]
17 ; GCN-NOT: s4
18 ; GCN-NOT: s5
1719 define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
1820 call void @use_dispatch_ptr()
1921 ret void
2022 }
2123
2224 ; GCN-LABEL: {{^}}use_queue_ptr:
23 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
24 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
25 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
26 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
2527 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
2628 define hidden void @use_queue_ptr() #1 {
2729 %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
3234
3335 ; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr:
3436 ; GCN: enable_sgpr_queue_ptr = 1
35 ; GCN: s_mov_b64 s[6:7], s[4:5]
36 ; GCN: s_swappc_b64
37 ; GCN-NOT: s[4:5]
38 ; GCN-NOT: s4
39 ; GCN-NOT: s5
3740 define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 {
3841 call void @use_queue_ptr()
3942 ret void
5457
5558 ; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr_addrspacecast:
5659 ; CIVI: enable_sgpr_queue_ptr = 1
57
58 ; CIVI: s_mov_b64 s[6:7], s[4:5]
59 ; GFX9-NOT: s_mov_b64
60 ; GCN: s_swappc_b64
60 ; CIVI-NOT: s[4:5]
61 ; CIVI-NOT: s4
62 ; CIVI-NOT: s5
6163 define amdgpu_kernel void @kern_indirect_use_queue_ptr_addrspacecast(i32) #1 {
6264 call void @use_queue_ptr_addrspacecast()
6365 ret void
6466 }
6567
6668 ; GCN-LABEL: {{^}}use_kernarg_segment_ptr:
67 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
68 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
69 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
70 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
6971 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
7072 define hidden void @use_kernarg_segment_ptr() #1 {
7173 %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
7678
7779 ; GCN-LABEL: {{^}}kern_indirect_use_kernarg_segment_ptr:
7880 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
79 ; GCN: s_mov_b64 s[6:7], s[4:5]
81 ; GCN-NOT: s[4:5]
82 ; GCN-NOT: s4
83 ; GCN-NOT: s5
8084 ; GCN: s_swappc_b64
8185 define amdgpu_kernel void @kern_indirect_use_kernarg_segment_ptr(i32) #1 {
8286 call void @use_kernarg_segment_ptr()
8488 }
8589
8690 ; GCN-LABEL: {{^}}use_dispatch_id:
87 ; GCN: ; use s[6:7]
91 ; GCN: ; use s[4:5]
8892 define hidden void @use_dispatch_id() #1 {
8993 %id = call i64 @llvm.amdgcn.dispatch.id()
9094 call void asm sideeffect "; use $0", "s"(i64 %id)
96100
97101 ; GCN-LABEL: {{^}}kern_indirect_use_dispatch_id:
98102 ; GCN: enable_sgpr_dispatch_id = 1
99
100 ; GCN: s_mov_b64 s[6:7], s[4:5]
103 ; GCN-NOT: s[4:5]
104 ; GCN-NOT: s4
105 ; GCN-NOT: s5
101106 define amdgpu_kernel void @kern_indirect_use_dispatch_id() #1 {
102107 call void @use_dispatch_id()
103108 ret void
146151
147152 ; GCN-LABEL: {{^}}use_workgroup_id_xy:
148153 ; GCN: ; use s4
149 ; GCN: ; use s6
154 ; GCN: ; use s5
150155 define hidden void @use_workgroup_id_xy() #1 {
151156 %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
152157 %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
157162
158163 ; GCN-LABEL: {{^}}use_workgroup_id_xyz:
159164 ; GCN: ; use s4
165 ; GCN: ; use s5
160166 ; GCN: ; use s6
161 ; GCN: ; use s7
162167 define hidden void @use_workgroup_id_xyz() #1 {
163168 %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
164169 %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
171176
172177 ; GCN-LABEL: {{^}}use_workgroup_id_xz:
173178 ; GCN: ; use s4
174 ; GCN: ; use s6
179 ; GCN: ; use s5
175180 define hidden void @use_workgroup_id_xz() #1 {
176181 %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
177182 %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
182187
183188 ; GCN-LABEL: {{^}}use_workgroup_id_yz:
184189 ; GCN: ; use s4
185 ; GCN: ; use s6
190 ; GCN: ; use s5
186191 define hidden void @use_workgroup_id_yz() #1 {
187192 %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
188193 %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
243248
244249 ; GCN: s_mov_b32 s33, s8
245250
251 ; GCN: s_mov_b32 s5, s7
246252 ; GCN: s_mov_b32 s4, s6
247 ; GCN: s_mov_b32 s6, s7
248253 ; GCN: s_mov_b32 s32, s33
249254 ; GCN: s_swappc_b64
250255 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
260265 ; GCN: s_mov_b32 s33, s9
261266
262267 ; GCN: s_mov_b32 s4, s6
263 ; GCN: s_mov_b32 s6, s7
264 ; GCN: s_mov_b32 s7, s8
268 ; GCN: s_mov_b32 s5, s7
269 ; GCN: s_mov_b32 s6, s8
265270
266271 ; GCN: s_mov_b32 s32, s33
267272 ; GCN: s_swappc_b64
276281 ; GCN: enable_sgpr_workgroup_id_z = 1
277282
278283 ; GCN: s_mov_b32 s33, s8
284 ; GCN: s_mov_b32 s5, s7
279285 ; GCN: s_mov_b32 s4, s6
280 ; GCN: s_mov_b32 s6, s7
281286
282287 ; GCN: s_mov_b32 s32, s33
283288
293298 ; GCN: enable_sgpr_workgroup_id_z = 1
294299
295300 ; GCN: s_mov_b32 s33, s9
296 ; GCN: s_mov_b32 s6, s8
297301 ; GCN: s_mov_b32 s4, s7
302 ; GCN: s_mov_b32 s5, s8
298303 ; GCN: s_mov_b32 s32, s33
299304 ; GCN: s_swappc_b64
300305 define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 {
403408
404409 ; GCN-LABEL: {{^}}use_every_sgpr_input:
405410 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
411 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
412 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
413 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
406414 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
407415 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
408416 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
409417 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8
410418 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9
411419 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
412 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s10
413 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s11
414 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
415 ; GCN: ; use s[12:13]
416 ; GCN: ; use s4
420 ; GCN: ; use s[10:11]
421 ; GCN: ; use s12
422 ; GCN: ; use s13
417423 ; GCN: ; use s14
418 ; GCN: ; use s15
419424 define hidden void @use_every_sgpr_input() #1 {
420425 %alloca = alloca i32, align 4, addrspace(5)
421426 store volatile i32 0, i32 addrspace(5)* %alloca
461466 ; GCN: enable_sgpr_flat_scratch_init = 1
462467
463468 ; GCN: s_mov_b32 s33, s17
464 ; GCN: s_mov_b64 s[12:13], s[10:11]
465 ; GCN: s_mov_b64 s[10:11], s[8:9]
466 ; GCN: s_mov_b64 s[8:9], s[6:7]
467 ; GCN: s_mov_b64 s[6:7], s[4:5]
468 ; GCN: s_mov_b32 s4, s14
469 ; GCN: s_mov_b32 s14, s15
470 ; GCN: s_mov_b32 s15, s16
469 ; GCN: s_mov_b32 s12, s14
470 ; GCN: s_mov_b32 s13, s15
471 ; GCN: s_mov_b32 s14, s16
471472 ; GCN: s_mov_b32 s32, s33
472473 ; GCN: s_swappc_b64
473474 define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 {
488489 ; GCN-NOT: s[8:9]
489490 ; GCN-NOT: s[10:11]
490491 ; GCN-NOT: s[12:13]
491 ; GCN: s_or_saveexec_b64 s[6:7], -1
492 ; GCN: s_or_saveexec_b64 s[4:5], -1
492493 define hidden void @func_indirect_use_every_sgpr_input() #1 {
493494 call void @use_every_sgpr_input()
494495 ret void
495496 }
496497
497498 ; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz:
498 ; GCN-NOT: s_mov_b32 s4
499 ; GCN: s_mov_b32 s4, s12
500 ; GCN: s_mov_b32 s5, s13
499501 ; GCN: s_mov_b32 s6, s14
500 ; GCN-NEXT: s_mov_b32 s7, s15
501 ; GCN-NOT: s_mov_b32 s4
502 ; GCN: ; use s[10:11]
503 ; GCN: ; use s12
504 ; GCN: ; use s13
505 ; GCN: ; use s14
502506
503507 ; GCN: s_swappc_b64
504508 define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
534538 }
535539
536540 ; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill:
537 ; GCN: s_mov_b32 s5, s32
538
541 ; GCN-DAG: s_mov_b32 s34, s32
539542 ; GCN-DAG: s_add_u32 s32, s32, 0x400
540
541 ; GCN: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[6:7]
542 ; GCN-NOT: s_mov_b32 s4,
543 ; GCN-DAG: s_mov_b32 s6, s14
544 ; GCN-DAG: s_mov_b32 s7, s15
545
546 ; GCN: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[8:9]
547
548 ; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s4
549 ; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-57-9][0-9]*]], s14
550 ; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-68-9][0-9]*]], s15
551
552 ; GCN: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[10:11]
553
554 ; GCN: s_swappc_b64
555
556 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5{{$}}
543 ; GCN-DAG: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[4:5]
544 ; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[6:7]
545
546
547 ; GCN: s_mov_b32 s4, s12
548 ; GCN: s_mov_b32 s5, s13
549 ; GCN: s_mov_b32 s6, s14
550
551 ; GCN: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[8:9]
552
553 ; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s12
554 ; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-57-9][0-9]*]], s13
555 ; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-68-9][0-9]*]], s14
556
557
558
559 ; GCN: s_swappc_b64
560
561 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34{{$}}
557562 ; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]]
558563 ; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]]
559564 ; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}}
378378 }
379379
380380 ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
381 ; GCN: s_mov_b32 s5, s32
381 ; GCN: s_mov_b32 s34, s32
382382 ; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
383383 ; GCN: s_swappc_b64
384384 define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
397397
398398 ; Requires loading and storing to stack slot.
399399 ; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
400 ; GCN: s_add_u32 s32, s32, 0x400{{$}}
401 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
402 ; GCN: buffer_load_dword v32, off, s[0:3], s5{{$}}
400 ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
401 ; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill
402 ; GCN: buffer_load_dword v32, off, s[0:3], s34{{$}}
403403
404404 ; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
405405
406406 ; GCN: s_swappc_b64
407407
408 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
408 ; GCN: buffer_load_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload
409409 ; GCN: s_sub_u32 s32, s32, 0x400{{$}}
410410 ; GCN: s_setpc_b64
411411 define void @too_many_args_call_too_many_args_use_workitem_id_x(
519519
520520 ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
521521 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
522 ; GCN: buffer_store_dword [[K]], off, s[0:3], s5{{$}}
522 ; GCN: buffer_store_dword [[K]], off, s[0:3], s34{{$}}
523523 ; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
524524
525 ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5{{$}}
525 ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}}
526526 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
527527 ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
528528 ; GCN: s_swappc_b64
2626 ; GCN-LABEL: call_split_type_used_outside_block_v2f32:
2727 ; GCN: ; %bb.0: ; %bb0
2828 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29 ; GCN-NEXT: s_mov_b32 s5, s32
29 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
30 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
31 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
32 ; GCN-NEXT: v_writelane_b32 v32, s34, 2
33 ; GCN-NEXT: v_writelane_b32 v32, s36, 0
34 ; GCN-NEXT: s_mov_b32 s34, s32
3035 ; GCN-NEXT: s_add_u32 s32, s32, 0x400
31 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
32 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
33 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
34 ; GCN-NEXT: v_writelane_b32 v32, s34, 0
35 ; GCN-NEXT: v_writelane_b32 v32, s35, 1
36 ; GCN-NEXT: v_writelane_b32 v32, s36, 2
37 ; GCN-NEXT: s_getpc_b64 s[6:7]
38 ; GCN-NEXT: s_add_u32 s6, s6, func_v2f32@rel32@lo+4
39 ; GCN-NEXT: s_addc_u32 s7, s7, func_v2f32@rel32@hi+4
40 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
41 ; GCN-NEXT: s_mov_b32 s36, s5
42 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
43 ; GCN-NEXT: s_mov_b32 s5, s36
44 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
45 ; GCN-NEXT: v_readlane_b32 s36, v32, 2
46 ; GCN-NEXT: v_readlane_b32 s35, v32, 1
47 ; GCN-NEXT: v_readlane_b32 s34, v32, 0
48 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
49 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
50 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
36 ; GCN-NEXT: v_writelane_b32 v32, s37, 1
37 ; GCN-NEXT: s_getpc_b64 s[4:5]
38 ; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4
39 ; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+4
40 ; GCN-NEXT: s_mov_b64 s[36:37], s[30:31]
41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
42 ; GCN-NEXT: s_mov_b64 s[30:31], s[36:37]
43 ; GCN-NEXT: v_readlane_b32 s37, v32, 1
44 ; GCN-NEXT: v_readlane_b32 s36, v32, 0
5145 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
46 ; GCN-NEXT: v_readlane_b32 s34, v32, 2
47 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
48 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
49 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
5250 ; GCN-NEXT: s_waitcnt vmcnt(0)
5351 ; GCN-NEXT: s_setpc_b64 s[30:31]
5452 bb0:
6462 ; GCN-LABEL: call_split_type_used_outside_block_v3f32:
6563 ; GCN: ; %bb.0: ; %bb0
6664 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
67 ; GCN-NEXT: s_mov_b32 s5, s32
65 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
66 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
67 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
68 ; GCN-NEXT: v_writelane_b32 v32, s34, 2
69 ; GCN-NEXT: v_writelane_b32 v32, s36, 0
70 ; GCN-NEXT: s_mov_b32 s34, s32
6871 ; GCN-NEXT: s_add_u32 s32, s32, 0x400
69 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
70 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
71 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
72 ; GCN-NEXT: v_writelane_b32 v32, s34, 0
73 ; GCN-NEXT: v_writelane_b32 v32, s35, 1
74 ; GCN-NEXT: v_writelane_b32 v32, s36, 2
75 ; GCN-NEXT: s_getpc_b64 s[6:7]
76 ; GCN-NEXT: s_add_u32 s6, s6, func_v3f32@rel32@lo+4
77 ; GCN-NEXT: s_addc_u32 s7, s7, func_v3f32@rel32@hi+4
78 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
79 ; GCN-NEXT: s_mov_b32 s36, s5
80 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
81 ; GCN-NEXT: s_mov_b32 s5, s36
82 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
83 ; GCN-NEXT: v_readlane_b32 s36, v32, 2
84 ; GCN-NEXT: v_readlane_b32 s35, v32, 1
85 ; GCN-NEXT: v_readlane_b32 s34, v32, 0
86 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
87 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
88 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
72 ; GCN-NEXT: v_writelane_b32 v32, s37, 1
73 ; GCN-NEXT: s_getpc_b64 s[4:5]
74 ; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4
75 ; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+4
76 ; GCN-NEXT: s_mov_b64 s[36:37], s[30:31]
77 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
78 ; GCN-NEXT: s_mov_b64 s[30:31], s[36:37]
79 ; GCN-NEXT: v_readlane_b32 s37, v32, 1
80 ; GCN-NEXT: v_readlane_b32 s36, v32, 0
8981 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
82 ; GCN-NEXT: v_readlane_b32 s34, v32, 2
83 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
84 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
85 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
9086 ; GCN-NEXT: s_waitcnt vmcnt(0)
9187 ; GCN-NEXT: s_setpc_b64 s[30:31]
9288 bb0:
10298 ; GCN-LABEL: call_split_type_used_outside_block_v4f16:
10399 ; GCN: ; %bb.0: ; %bb0
104100 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105 ; GCN-NEXT: s_mov_b32 s5, s32
101 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
102 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
103 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
104 ; GCN-NEXT: v_writelane_b32 v32, s34, 2
105 ; GCN-NEXT: v_writelane_b32 v32, s36, 0
106 ; GCN-NEXT: s_mov_b32 s34, s32
106107 ; GCN-NEXT: s_add_u32 s32, s32, 0x400
107 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
108 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
109 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
110 ; GCN-NEXT: v_writelane_b32 v32, s34, 0
111 ; GCN-NEXT: v_writelane_b32 v32, s35, 1
112 ; GCN-NEXT: v_writelane_b32 v32, s36, 2
113 ; GCN-NEXT: s_getpc_b64 s[6:7]
114 ; GCN-NEXT: s_add_u32 s6, s6, func_v4f16@rel32@lo+4
115 ; GCN-NEXT: s_addc_u32 s7, s7, func_v4f16@rel32@hi+4
116 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
117 ; GCN-NEXT: s_mov_b32 s36, s5
118 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
119 ; GCN-NEXT: s_mov_b32 s5, s36
120 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
121 ; GCN-NEXT: v_readlane_b32 s36, v32, 2
122 ; GCN-NEXT: v_readlane_b32 s35, v32, 1
123 ; GCN-NEXT: v_readlane_b32 s34, v32, 0
124 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
125 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
126 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
108 ; GCN-NEXT: v_writelane_b32 v32, s37, 1
109 ; GCN-NEXT: s_getpc_b64 s[4:5]
110 ; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4
111 ; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+4
112 ; GCN-NEXT: s_mov_b64 s[36:37], s[30:31]
113 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
114 ; GCN-NEXT: s_mov_b64 s[30:31], s[36:37]
115 ; GCN-NEXT: v_readlane_b32 s37, v32, 1
116 ; GCN-NEXT: v_readlane_b32 s36, v32, 0
127117 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
118 ; GCN-NEXT: v_readlane_b32 s34, v32, 2
119 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
120 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
121 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
128122 ; GCN-NEXT: s_waitcnt vmcnt(0)
129123 ; GCN-NEXT: s_setpc_b64 s[30:31]
130124 bb0:
140134 ; GCN-LABEL: call_split_type_used_outside_block_struct:
141135 ; GCN: ; %bb.0: ; %bb0
142136 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143 ; GCN-NEXT: s_mov_b32 s5, s32
137 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
138 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
139 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
140 ; GCN-NEXT: v_writelane_b32 v32, s34, 2
141 ; GCN-NEXT: v_writelane_b32 v32, s36, 0
142 ; GCN-NEXT: s_mov_b32 s34, s32
144143 ; GCN-NEXT: s_add_u32 s32, s32, 0x400
145 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
146 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
147 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
148 ; GCN-NEXT: v_writelane_b32 v32, s34, 0
149 ; GCN-NEXT: v_writelane_b32 v32, s35, 1
150 ; GCN-NEXT: v_writelane_b32 v32, s36, 2
151 ; GCN-NEXT: s_getpc_b64 s[6:7]
152 ; GCN-NEXT: s_add_u32 s6, s6, func_struct@rel32@lo+4
153 ; GCN-NEXT: s_addc_u32 s7, s7, func_struct@rel32@hi+4
154 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
155 ; GCN-NEXT: s_mov_b32 s36, s5
156 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
157 ; GCN-NEXT: s_mov_b32 s5, s36
144 ; GCN-NEXT: v_writelane_b32 v32, s37, 1
145 ; GCN-NEXT: s_getpc_b64 s[4:5]
146 ; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4
147 ; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+4
148 ; GCN-NEXT: s_mov_b64 s[36:37], s[30:31]
149 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
150 ; GCN-NEXT: s_mov_b64 s[30:31], s[36:37]
151 ; GCN-NEXT: v_readlane_b32 s37, v32, 1
152 ; GCN-NEXT: v_readlane_b32 s36, v32, 0
158153 ; GCN-NEXT: v_mov_b32_e32 v1, v4
159 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
160 ; GCN-NEXT: v_readlane_b32 s36, v32, 2
161 ; GCN-NEXT: v_readlane_b32 s35, v32, 1
162 ; GCN-NEXT: v_readlane_b32 s34, v32, 0
163 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
164 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
165 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
166154 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
155 ; GCN-NEXT: v_readlane_b32 s34, v32, 2
156 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
157 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
158 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
167159 ; GCN-NEXT: s_waitcnt vmcnt(0)
168160 ; GCN-NEXT: s_setpc_b64 s[30:31]
169161 bb0:
2828
2929 ; CI: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
3030 ; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
31 ; CI-NEXT: v_add_i32_e64 v1, s[6:7], 4, [[SCALED]]
31 ; CI-NEXT: v_add_i32_e64 v1, s{{\[[0-9]+:[0-9]+\]}}, 4, [[SCALED]]
3232 ; CI-NOT: v_mov
3333 ; CI: ds_write_b32 v0, v0
3434 ; CI-NEXT: ds_write_b32 v0, v1
180180 ; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x200
181181
182182 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
183 ; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s[6:7], [[K]], [[SCALED]]
183 ; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[SCALED]]
184184
185185 ; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]]
186186 ; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[K]], [[SCALED]]
232232
233233 ; GCN-LABEL: {{^}}undefined_stack_store_reg:
234234 ; GCN: s_and_saveexec_b64
235 ; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:
236 ; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:
237 ; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:
238 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:
235 ; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
236 ; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
237 ; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
238 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:
239239 define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 {
240240 bb:
241241 %tmp = alloca <4 x float>, align 16, addrspace(5)
103103 }
104104
105105 ; GCN-LABEL: {{^}}f32_func_void:
106 ; GCN: buffer_load_dword v0, off, s[8:11], 0
106 ; GCN: buffer_load_dword v0, off, s[4:7], 0
107107 ; GCN-NEXT: s_waitcnt vmcnt(0)
108108 ; GCN-NEXT: s_setpc_b64
109109 define float @f32_func_void() #0 {
6262
6363 ; GCN-LABEL: {{^}}func_tail_call:
6464 ; GCN: s_waitcnt
65 ; GCN-NEXT: s_getpc_b64 s[6:7]
66 ; GCN-NEXT: s_add_u32 s6,
67 ; GCN-NEXT: s_addc_u32 s7,
68 ; GCN-NEXT: s_setpc_b64 s[6:7]
65 ; GCN-NEXT: s_getpc_b64 s[4:5]
66 ; GCN-NEXT: s_add_u32 s4,
67 ; GCN-NEXT: s_addc_u32 s5,
68 ; GCN-NEXT: s_setpc_b64 s[4:5]
6969
7070 ; GCN: ; NumSgprs: 32
7171 ; GCN: ; NumVgprs: 8
508508
509509 ; SI-LABEL: {{^}}test_fold_and_ord:
510510 ; SI: s_waitcnt
511 ; SI-NEXT: v_cmp_class_f32_e64 s[6:7], v0, 32{{$}}
512 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
511 ; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 32{{$}}
512 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
513513 ; SI-NEXT: s_setpc_b64
514514 define i1 @test_fold_and_ord(float %a) {
515515 %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
520520
521521 ; SI-LABEL: {{^}}test_fold_and_unord:
522522 ; SI: s_waitcnt
523 ; SI-NEXT: v_cmp_class_f32_e64 s[6:7], v0, 3{{$}}
524 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
523 ; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 3{{$}}
524 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
525525 ; SI-NEXT: s_setpc_b64
526526 define i1 @test_fold_and_unord(float %a) {
527527 %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
5858
5959 ; GCN-LABEL: {{^}}func_implicitarg_ptr:
6060 ; GCN: s_waitcnt
61 ; MESA: v_mov_b32_e32 v0, s6
62 ; MESA: v_mov_b32_e32 v1, s7
63 ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
64 ; HSA: v_mov_b32_e32 v0, s6
65 ; HSA: v_mov_b32_e32 v1, s7
61 ; MESA: v_mov_b32_e32 v0, s4
62 ; MESA: v_mov_b32_e32 v1, s5
63 ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
64 ; HSA: v_mov_b32_e32 v0, s4
65 ; HSA: v_mov_b32_e32 v1, s5
6666 ; HSA: flat_load_dword v0, v[0:1]
6767 ; GCN-NEXT: s_waitcnt
6868 ; GCN-NEXT: s_setpc_b64
7575
7676 ; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
7777 ; GCN: s_waitcnt
78 ; MESA: v_mov_b32_e32 v0, s6
79 ; MESA: v_mov_b32_e32 v1, s7
80 ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
81 ; HSA: v_mov_b32_e32 v0, s6
82 ; HSA: v_mov_b32_e32 v1, s7
78 ; MESA: v_mov_b32_e32 v0, s4
79 ; MESA: v_mov_b32_e32 v1, s5
80 ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
81 ; HSA: v_mov_b32_e32 v0, s4
82 ; HSA: v_mov_b32_e32 v1, s5
8383 ; HSA: flat_load_dword v0, v[0:1]
8484 ; GCN-NEXT: s_waitcnt
8585 ; GCN-NEXT: s_setpc_b64
9494 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
9595 ; HSA: kernarg_segment_byte_size = 0
9696 ; MESA: kernarg_segment_byte_size = 16
97 ; GCN: s_mov_b64 s[6:7], s[4:5]
97 ; GCN-NOT: s[4:5]
98 ; GCN-NOT: s4
99 ; GCN-NOT: s5
98100 ; GCN: s_swappc_b64
99101 define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
100102 call void @func_implicitarg_ptr()
105107 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
106108 ; HSA: kernarg_segment_byte_size = 48
107109 ; MESA: kernarg_segment_byte_size = 16
108 ; GCN: s_mov_b64 s[6:7], s[4:5]
110 ; GCN-NOT: s[4:5]
111 ; GCN-NOT: s4
112 ; GCN-NOT: s5
109113 ; GCN: s_swappc_b64
110114 define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
111115 call void @func_implicitarg_ptr()
117121 ; HSA: kernarg_segment_byte_size = 112
118122 ; MESA: kernarg_segment_byte_size = 128
119123
120 ; HSA: s_add_u32 s6, s4, 0x70
121 ; MESA: s_add_u32 s6, s4, 0x70
122
123 ; GCN: s_addc_u32 s7, s5, 0{{$}}
124 ; HSA: s_add_u32 s4, s4, 0x70
125 ; MESA: s_add_u32 s4, s4, 0x70
126
127 ; GCN: s_addc_u32 s5, s5, 0{{$}}
124128 ; GCN: s_swappc_b64
125129 define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
126130 call void @func_implicitarg_ptr()
132136 ; HSA: kernarg_segment_byte_size = 160
133137 ; MESA: kernarg_segment_byte_size = 128
134138
135 ; GCN: s_add_u32 s6, s4, 0x70
136
137 ; GCN: s_addc_u32 s7, s5, 0{{$}}
139 ; GCN: s_add_u32 s4, s4, 0x70
140 ; GCN: s_addc_u32 s5, s5, 0{{$}}
138141 ; GCN: s_swappc_b64
139142 define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
140143 call void @func_implicitarg_ptr()
142145 }
143146
144147 ; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
145 ; GCN-NOT: s6
146 ; GCN-NOT: s7
147 ; GCN-NOT: s[6:7]
148 ; GCN-NOT: s4
149 ; GCN-NOT: s5
150 ; GCN-NOT: s[4:5]
148151 define void @func_call_implicitarg_ptr_func() #0 {
149152 call void @func_implicitarg_ptr()
150153 ret void
151154 }
152155
153156 ; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
154 ; GCN-NOT: s6
155 ; GCN-NOT: s7
156 ; GCN-NOT: s[6:7]
157 ; GCN-NOT: s4
158 ; GCN-NOT: s5
159 ; GCN-NOT: s[4:5]
157160 define void @opencl_func_call_implicitarg_ptr_func() #0 {
158161 call void @func_implicitarg_ptr()
159162 ret void
161164
162165 ; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
163166 ; GCN: s_waitcnt
164 ; MESA-DAG: v_mov_b32_e32 v0, s6
165 ; MESA-DAG: v_mov_b32_e32 v1, s7
166 ; MESA-DAG: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
167 ; MESA: v_mov_b32_e32 v0, s8
168 ; MESA: v_mov_b32_e32 v1, s9
169 ; MESA: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
170
167 ; MESA-DAG: v_mov_b32_e32 v0, s4
168 ; MESA-DAG: v_mov_b32_e32 v1, s5
169 ; MESA-DAG: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
170 ; MESA: v_mov_b32_e32 v0, s6
171 ; MESA: v_mov_b32_e32 v1, s7
172 ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
173
174 ; HSA: v_mov_b32_e32 v0, s4
175 ; HSA: v_mov_b32_e32 v1, s5
176 ; HSA: flat_load_dword v0, v[0:1]
171177 ; HSA: v_mov_b32_e32 v0, s6
172178 ; HSA: v_mov_b32_e32 v1, s7
173 ; HSA: flat_load_dword v0, v[0:1]
174 ; HSA: v_mov_b32_e32 v0, s8
175 ; HSA: v_mov_b32_e32 v1, s9
176179 ; HSA: flat_load_dword v0, v[0:1]
177180
178181 ; GCN: s_waitcnt vmcnt(0)
188191
189192 ; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
190193 ; GCN: s_waitcnt
194 ; MESA-DAG: v_mov_b32_e32 v0, s4
195 ; MESA-DAG: v_mov_b32_e32 v1, s5
196 ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
191197 ; MESA-DAG: v_mov_b32_e32 v0, s6
192198 ; MESA-DAG: v_mov_b32_e32 v1, s7
193 ; MESA: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
194 ; MESA-DAG: v_mov_b32_e32 v0, s8
195 ; MESA-DAG: v_mov_b32_e32 v1, s9
196 ; MESA: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
197
199 ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
200
201
202 ; HSA: v_mov_b32_e32 v0, s4
203 ; HSA: v_mov_b32_e32 v1, s5
204 ; HSA: flat_load_dword v0, v[0:1]
198205
199206 ; HSA: v_mov_b32_e32 v0, s6
200207 ; HSA: v_mov_b32_e32 v1, s7
201 ; HSA: flat_load_dword v0, v[0:1]
202
203 ; HSA: v_mov_b32_e32 v0, s8
204 ; HSA: v_mov_b32_e32 v1, s9
205208 ; HSA: flat_load_dword v0, v[0:1]
206209
207210 ; GCN: s_waitcnt vmcnt(0)
216219 }
217220
218221 ; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
219 ; GCN: s_add_u32 s8, s4, 0x70
220 ; GCN: s_addc_u32 s9, s5, 0
221
222 ; GCN: s_mov_b64 s[6:7], s[4:5]
222 ; GCN: s_add_u32 s6, s4, 0x70
223 ; GCN: s_addc_u32 s7, s5, 0
223224 ; GCN: s_swappc_b64
224225 define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
225226 call void @func_kernarg_implicitarg_ptr()
11 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
22
33 ; GCN-LABEL: {{^}}mad_i64_i32_sextops:
4 ; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
4 ; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3]
55
66 ; SI: v_mul_lo_u32
77 ; SI: v_mul_hi_i32
1616 }
1717
1818 ; GCN-LABEL: {{^}}mad_i64_i32_sextops_commute:
19 ; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
19 ; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3]
2020
2121 ; SI-DAG: v_mul_lo_u32
2222 ; SI-DAG: v_mul_hi_i32
3131 }
3232
3333 ; GCN-LABEL: {{^}}mad_u64_u32_zextops:
34 ; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
34 ; CI: v_mad_u64_u32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3]
3535
3636 ; SI-DAG: v_mul_lo_u32
3737 ; SI-DAG: v_mul_hi_u32
4646 }
4747
4848 ; GCN-LABEL: {{^}}mad_u64_u32_zextops_commute:
49 ; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
49 ; CI: v_mad_u64_u32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3]
5050
5151 ; SI-DAG: v_mul_lo_u32
5252 ; SI-DAG: v_mul_hi_u32
8484 ; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i63:
8585 ; CI: v_lshl_b64
8686 ; CI: v_ashr
87 ; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
87 ; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3]
8888
8989 ; SI-NOT: v_mad_u64_u32
9090 define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
100100 ; CI: v_bfe_i32 v[[B1:[0-9]+]], v1, 0, 31
101101 ; CI: v_ashr_i64
102102 ; CI: v_bfe_i32 v[[B2:[0-9]+]], v0, 0, 31
103 ; CI: v_mad_i64_i32 v[0:1], s[6:7], v[[B2]], v[[B1]], v[1:2]
103 ; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v[[B2]], v[[B1]], v[1:2]
104104 define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
105105 %sext0 = sext i31 %arg0 to i63
106106 %sext1 = sext i31 %arg1 to i63
110110 }
111111
112112 ; GCN-LABEL: {{^}}mad_u64_u32_bitops:
113 ; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v2, v[4:5]
113 ; CI: v_mad_u64_u32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v2, v[4:5]
114114 define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
115115 %trunc.lhs = and i64 %arg0, 4294967295
116116 %trunc.rhs = and i64 %arg1, 4294967295
140140 }
141141
142142 ; GCN-LABEL: {{^}}mad_i64_i32_bitops:
143 ; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v2, v[4:5]
143 ; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v2, v[4:5]
144144 ; SI-NOT: v_mad_
145145 define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
146146 %shl.lhs = shl i64 %arg0, 32
154154
155155 ; Example from bug report
156156 ; GCN-LABEL: {{^}}mad_i64_i32_unpack_i64ops:
157 ; CI: v_mad_u64_u32 v[0:1], s[6:7], v1, v0, v[0:1]
157 ; CI: v_mad_u64_u32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v1, v0, v[0:1]
158158 ; SI-NOT: v_mad_u64_u32
159159 define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
160160 %tmp4 = lshr i64 %arg0, 32
88
99 ; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm:
1010 ; GCN: s_waitcnt
11 ; GCN: s_mov_b32 s5, s32
12 ; GCN-DAG: s_add_u32 s32, s32, 0x400
11
1312 ; Spill CSR VGPR used for SGPR spilling
1413 ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
15 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
14 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
1615 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
17
18 ; GCN-DAG: v_writelane_b32 v32, s34, 0
19 ; GCN-DAG: v_writelane_b32 v32, s35, 1
20 ; GCN-DAG: v_writelane_b32 v32, s36, 2
16 ; GCN-DAG: v_writelane_b32 v32, s34, 2
17 ; GCN-DAG: s_mov_b32 s34, s32
18 ; GCN-DAG: s_add_u32 s32, s32, 0x400
19 ; GCN-DAG: v_writelane_b32 v32, s36, 0
20 ; GCN-DAG: v_writelane_b32 v32, s37, 1
2121
2222 ; GCN: s_swappc_b64
2323
24 ; GCN: v_readlane_b32 s36, v32, 2
25 ; GCN: v_readlane_b32 s35, v32, 1
26 ; GCN: v_readlane_b32 s34, v32, 0
24 ; GCN: v_readlane_b32 s37, v32, 1
25 ; GCN: v_readlane_b32 s36, v32, 0
26
27 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
28 ; GCN-NEXT: v_readlane_b32 s34, v32, 2
2729 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
28 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
30 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
2931 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
30
31 ; GCN: s_sub_u32 s32, s32, 0x400
32 ; GCN: s_setpc_b64
32 ; GCN-NEXT: s_waitcnt vmcnt(0)
33 ; GCN-NEXT: s_setpc_b64
3334 define void @test_func_call_external_void_func_i32_imm() #0 {
3435 call void @external_void_func_i32(i32 42)
3536 ret void
3738
3839 ; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use:
3940 ; GCN: s_waitcnt
40 ; GCN: s_mov_b32 s5, s32
41 ; GCN: s_mov_b32 s34, s32
4142 ; GCN-DAG: s_add_u32 s32, s32, 0x1400{{$}}
42 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset
43 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:
4344 ; GCN: s_swappc_b64
4445 ; GCN: s_sub_u32 s32, s32, 0x1400{{$}}
4546 ; GCN: s_setpc_b64
77 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 64, v4
88 ; GCN-NEXT: v_lshl_b64 v[5:6], v[2:3], v4
99 ; GCN-NEXT: v_lshr_b64 v[7:8], v[0:1], v7
10 ; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4
10 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
1111 ; GCN-NEXT: v_or_b32_e32 v7, v5, v7
1212 ; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 64, v4
1313 ; GCN-NEXT: v_or_b32_e32 v8, v6, v8
1616 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
1717 ; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
1818 ; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
19 ; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7]
20 ; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[6:7]
19 ; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5]
20 ; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5]
2121 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
2222 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
2323 ; GCN-NEXT: s_setpc_b64 s[30:31]
3232 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 64, v4
3333 ; GCN-NEXT: v_lshr_b64 v[5:6], v[0:1], v4
3434 ; GCN-NEXT: v_lshl_b64 v[7:8], v[2:3], v7
35 ; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4
35 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
3636 ; GCN-NEXT: v_or_b32_e32 v7, v5, v7
3737 ; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 64, v4
3838 ; GCN-NEXT: v_or_b32_e32 v8, v6, v8
4141 ; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v4
4242 ; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
4343 ; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
44 ; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[6:7]
45 ; GCN-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[6:7]
44 ; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[4:5]
45 ; GCN-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[4:5]
4646 ; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
4747 ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
4848 ; GCN-NEXT: s_setpc_b64 s[30:31]
6363 ; GCN-NEXT: v_subrev_i32_e32 v10, vcc, 64, v4
6464 ; GCN-NEXT: v_ashr_i64 v[5:6], v[2:3], v4
6565 ; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], v10
66 ; GCN-NEXT: v_cmp_gt_u32_e64 s[6:7], 64, v4
66 ; GCN-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4
6767 ; GCN-NEXT: v_or_b32_e32 v7, v7, v9
68 ; GCN-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7]
69 ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[6:7]
68 ; GCN-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[4:5]
69 ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5]
7070 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
71 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7]
72 ; GCN-NEXT: v_cndmask_b32_e64 v6, v11, v6, s[6:7]
71 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5]
72 ; GCN-NEXT: v_cndmask_b32_e64 v6, v11, v6, s[4:5]
7373 ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
7474 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
7575 ; GCN-NEXT: v_mov_b32_e32 v2, v5
130130 ; GCN-NEXT: v_lshl_b64 v[4:5], 17, v3
131131 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
132132 ; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
133 ; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v0
134 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v2, s[6:7]
133 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
134 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v2, s[4:5]
135135 ; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v1, vcc
136136 ; GCN-NEXT: v_lshl_b64 v[0:1], 17, v0
137 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
137 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5]
138138 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
139139 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
140140 ; GCN-NEXT: s_setpc_b64 s[30:31]
146146 ; GCN-LABEL: v_lshr_i128_kv:
147147 ; GCN: ; %bb.0:
148148 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149 ; GCN-NEXT: s_mov_b32 s7, 0
150 ; GCN-NEXT: s_movk_i32 s6, 0x41
151 ; GCN-NEXT: v_lshr_b64 v[2:3], s[6:7], v0
149 ; GCN-NEXT: s_mov_b32 s5, 0
150 ; GCN-NEXT: s_movk_i32 s4, 0x41
151 ; GCN-NEXT: v_lshr_b64 v[2:3], s[4:5], v0
152152 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
153 ; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v0
153 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
154154 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
155155 ; GCN-NEXT: v_mov_b32_e32 v2, 0x41
156156 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
157 ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[6:7]
157 ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
158158 ; GCN-NEXT: v_mov_b32_e32 v2, 0
159 ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7]
159 ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5]
160160 ; GCN-NEXT: v_mov_b32_e32 v3, 0
161161 ; GCN-NEXT: s_setpc_b64 s[30:31]
162162 %shl = lshr i128 65, %rhs
169169 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
170170 ; GCN-NEXT: v_lshr_b64 v[2:3], 33, v0
171171 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
172 ; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v0
172 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
173173 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
174174 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
175175 ; GCN-NEXT: v_mov_b32_e32 v2, 0
176 ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7]
177 ; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v0, s[6:7]
176 ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5]
177 ; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v0, s[4:5]
178178 ; GCN-NEXT: v_mov_b32_e32 v3, 0
179179 ; GCN-NEXT: s_setpc_b64 s[30:31]
180180 %shl = ashr i128 33, %rhs
213213 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
214214 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
215215 ; GCN-NEXT: s_endpgm
216 ; GCN-NEXT: .section .rodata,#alloc
217 ; GCN-NEXT: .p2align 6
218 ; GCN-NEXT: .amdhsa_kernel s_shl_i128_ss
219 ; GCN-NEXT: .amdhsa_group_segment_fixed_size 0
220 ; GCN-NEXT: .amdhsa_private_segment_fixed_size 0
221 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
222 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
223 ; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
224 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
225 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
226 ; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
227 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
228 ; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
229 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
230 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
231 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
232 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
233 ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
234 ; GCN-NEXT: .amdhsa_next_free_vgpr 8
235 ; GCN-NEXT: .amdhsa_next_free_sgpr 12
236 ; GCN-NEXT: .amdhsa_reserve_flat_scratch 0
237 ; GCN-NEXT: .amdhsa_float_round_mode_32 0
238 ; GCN-NEXT: .amdhsa_float_round_mode_16_64 0
239 ; GCN-NEXT: .amdhsa_float_denorm_mode_32 0
240 ; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3
241 ; GCN-NEXT: .amdhsa_dx10_clamp 1
242 ; GCN-NEXT: .amdhsa_ieee_mode 1
243 ; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
244 ; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0
245 ; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
246 ; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0
247 ; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0
248 ; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0
249 ; GCN-NEXT: .amdhsa_exception_int_div_zero 0
250 ; GCN-NEXT: .end_amdhsa_kernel
251 ; GCN-NEXT: .text
252216 %shift = shl i128 %lhs, %rhs
253217 store i128 %shift, i128 addrspace(1)* null
254218 ret void
286250 ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
287251 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
288252 ; GCN-NEXT: s_endpgm
289 ; GCN-NEXT: .section .rodata,#alloc
290 ; GCN-NEXT: .p2align 6
291 ; GCN-NEXT: .amdhsa_kernel s_lshr_i128_ss
292 ; GCN-NEXT: .amdhsa_group_segment_fixed_size 0
293 ; GCN-NEXT: .amdhsa_private_segment_fixed_size 0
294 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
295 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
296 ; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
297 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
298 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
299 ; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
300 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
301 ; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
302 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
303 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
304 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
305 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
306 ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
307 ; GCN-NEXT: .amdhsa_next_free_vgpr 8
308 ; GCN-NEXT: .amdhsa_next_free_sgpr 12
309 ; GCN-NEXT: .amdhsa_reserve_flat_scratch 0
310 ; GCN-NEXT: .amdhsa_float_round_mode_32 0
311 ; GCN-NEXT: .amdhsa_float_round_mode_16_64 0
312 ; GCN-NEXT: .amdhsa_float_denorm_mode_32 0
313 ; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3
314 ; GCN-NEXT: .amdhsa_dx10_clamp 1
315 ; GCN-NEXT: .amdhsa_ieee_mode 1
316 ; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
317 ; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0
318 ; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
319 ; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0
320 ; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0
321 ; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0
322 ; GCN-NEXT: .amdhsa_exception_int_div_zero 0
323 ; GCN-NEXT: .end_amdhsa_kernel
324 ; GCN-NEXT: .text
325253 %shift = lshr i128 %lhs, %rhs
326254 store i128 %shift, i128 addrspace(1)* null
327255 ret void
361289 ; GCN-NEXT: v_mov_b32_e32 v5, 0
362290 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
363291 ; GCN-NEXT: s_endpgm
364 ; GCN-NEXT: .section .rodata,#alloc
365 ; GCN-NEXT: .p2align 6
366 ; GCN-NEXT: .amdhsa_kernel s_ashr_i128_ss
367 ; GCN-NEXT: .amdhsa_group_segment_fixed_size 0
368 ; GCN-NEXT: .amdhsa_private_segment_fixed_size 0
369 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
370 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
371 ; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
372 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
373 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
374 ; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
375 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
376 ; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
377 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
378 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
379 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
380 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
381 ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
382 ; GCN-NEXT: .amdhsa_next_free_vgpr 8
383 ; GCN-NEXT: .amdhsa_next_free_sgpr 12
384 ; GCN-NEXT: .amdhsa_reserve_flat_scratch 0
385 ; GCN-NEXT: .amdhsa_float_round_mode_32 0
386 ; GCN-NEXT: .amdhsa_float_round_mode_16_64 0
387 ; GCN-NEXT: .amdhsa_float_denorm_mode_32 0
388 ; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3
389 ; GCN-NEXT: .amdhsa_dx10_clamp 1
390 ; GCN-NEXT: .amdhsa_ieee_mode 1
391 ; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
392 ; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0
393 ; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
394 ; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0
395 ; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0
396 ; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0
397 ; GCN-NEXT: .amdhsa_exception_int_div_zero 0
398 ; GCN-NEXT: .end_amdhsa_kernel
399 ; GCN-NEXT: .text
400292 %shift = ashr i128 %lhs, %rhs
401293 store i128 %shift, i128 addrspace(1)* null
402294 ret void
409301 ; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8
410302 ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v8
411303 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v18
412 ; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11]
413 ; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[8:9]
304 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
305 ; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
414306 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11
415307 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
416308 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10
417309 ; GCN-NEXT: v_or_b32_e32 v19, v17, v19
418310 ; GCN-NEXT: v_or_b32_e32 v18, v16, v18
419311 ; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9
420 ; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7]
312 ; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
421313 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
422 ; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[6:7]
314 ; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[4:5]
423315 ; GCN-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
424 ; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[6:7]
316 ; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5]
425317 ; GCN-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
426318 ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 64, v12
427319 ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v12
428320 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v11
429 ; GCN-NEXT: v_cmp_gt_u64_e64 s[8:9], 64, v[12:13]
321 ; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
430322 ; GCN-NEXT: v_or_b32_e32 v16, v9, v16
431 ; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], 0, v[14:15]
323 ; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
432324 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12
433325 ; GCN-NEXT: v_or_b32_e32 v11, v10, v17
434326 ; GCN-NEXT: v_lshl_b64 v[9:10], v[4:5], v9
435 ; GCN-NEXT: s_and_b64 vcc, s[10:11], s[8:9]
327 ; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7]
436328 ; GCN-NEXT: v_cndmask_b32_e32 v17, v10, v11, vcc
437329 ; GCN-NEXT: v_or_b32_e32 v11, v13, v15
438330 ; GCN-NEXT: v_or_b32_e32 v10, v12, v14
439331 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
440332 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v12
441 ; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11]
333 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
442334 ; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc
443 ; GCN-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[8:9]
444 ; GCN-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[8:9]
445 ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7]
446 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[6:7]
335 ; GCN-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[6:7]
336 ; GCN-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[6:7]
337 ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5]
338 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
447339 ; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
448340 ; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
449341 ; GCN-NEXT: s_setpc_b64 s[30:31]
458350 ; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8
459351 ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8
460352 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v18
461 ; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11]
462 ; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[8:9]
353 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
354 ; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
463355 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11
464356 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
465357 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10
466358 ; GCN-NEXT: v_or_b32_e32 v19, v17, v19
467359 ; GCN-NEXT: v_or_b32_e32 v18, v16, v18
468360 ; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9
469 ; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7]
361 ; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
470362 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
471 ; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[6:7]
363 ; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[4:5]
472364 ; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
473 ; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[6:7]
365 ; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5]
474366 ; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
475367 ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 64, v12
476368 ; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v12
477369 ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v11
478 ; GCN-NEXT: v_cmp_gt_u64_e64 s[8:9], 64, v[12:13]
370 ; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
479371 ; GCN-NEXT: v_or_b32_e32 v16, v9, v16
480 ; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], 0, v[14:15]
372 ; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
481373 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12
482374 ; GCN-NEXT: v_or_b32_e32 v11, v10, v17
483375 ; GCN-NEXT: v_lshr_b64 v[9:10], v[6:7], v9
484 ; GCN-NEXT: s_and_b64 vcc, s[10:11], s[8:9]
376 ; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7]
485377 ; GCN-NEXT: v_cndmask_b32_e32 v17, v10, v11, vcc
486378 ; GCN-NEXT: v_or_b32_e32 v11, v13, v15
487379 ; GCN-NEXT: v_or_b32_e32 v10, v12, v14
488380 ; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v8
489381 ; GCN-NEXT: v_lshr_b64 v[6:7], v[6:7], v12
490 ; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11]
382 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
491383 ; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc
492 ; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[8:9]
493 ; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[8:9]
494 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[6:7]
495 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
384 ; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[6:7]
385 ; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7]
386 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5]
387 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5]
496388 ; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
497389 ; GCN-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
498390 ; GCN-NEXT: s_setpc_b64 s[30:31]
507399 ; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8
508400 ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8
509401 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v18
510 ; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11]
511 ; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[8:9]
402 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
403 ; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
512404 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11
513405 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
514406 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10
515407 ; GCN-NEXT: v_or_b32_e32 v19, v17, v19
516408 ; GCN-NEXT: v_or_b32_e32 v18, v16, v18
517409 ; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9
518 ; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7]
410 ; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
519411 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
520 ; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[6:7]
412 ; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[4:5]
521413 ; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
522 ; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[6:7]
414 ; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5]
523415 ; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
524416 ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 64, v12
525417 ; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v12
526418 ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v11
527 ; GCN-NEXT: v_cmp_gt_u64_e64 s[8:9], 64, v[12:13]
419 ; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
528420 ; GCN-NEXT: v_or_b32_e32 v16, v9, v16
529 ; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], 0, v[14:15]
421 ; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
530422 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12
531423 ; GCN-NEXT: v_or_b32_e32 v11, v10, v17
532424 ; GCN-NEXT: v_ashr_i64 v[9:10], v[6:7], v9
533 ; GCN-NEXT: s_and_b64 vcc, s[10:11], s[8:9]
425 ; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7]
534426 ; GCN-NEXT: v_cndmask_b32_e32 v17, v10, v11, vcc
535427 ; GCN-NEXT: v_or_b32_e32 v11, v13, v15
536428 ; GCN-NEXT: v_or_b32_e32 v10, v12, v14
537 ; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11]
429 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
538430 ; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc
539 ; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[8:9]
431 ; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7]
540432 ; GCN-NEXT: v_ashr_i64 v[8:9], v[2:3], v8
541433 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v3
542 ; GCN-NEXT: v_cndmask_b32_e64 v3, v2, v9, s[6:7]
543 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7]
434 ; GCN-NEXT: v_cndmask_b32_e64 v3, v2, v9, s[4:5]
435 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5]
544436 ; GCN-NEXT: v_ashr_i64 v[8:9], v[6:7], v12
545437 ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v7
546438 ; GCN-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc
547 ; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[8:9]
439 ; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[6:7]
548440 ; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
549441 ; GCN-NEXT: s_setpc_b64 s[30:31]
550442 %shl = ashr <2 x i128> %lhs, %rhs
616508 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
617509 ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
618510 ; GCN-NEXT: s_endpgm
619 ; GCN-NEXT: .section .rodata,#alloc
620 ; GCN-NEXT: .p2align 6
621 ; GCN-NEXT: .amdhsa_kernel s_shl_v2i128ss
622 ; GCN-NEXT: .amdhsa_group_segment_fixed_size 0
623 ; GCN-NEXT: .amdhsa_private_segment_fixed_size 0
624 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
625 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
626 ; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
627 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
628 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
629 ; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
630 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
631 ; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
632 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
633 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
634 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
635 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
636 ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
637 ; GCN-NEXT: .amdhsa_next_free_vgpr 16
638 ; GCN-NEXT: .amdhsa_next_free_sgpr 22
639 ; GCN-NEXT: .amdhsa_reserve_flat_scratch 0
640 ; GCN-NEXT: .amdhsa_float_round_mode_32 0
641 ; GCN-NEXT: .amdhsa_float_round_mode_16_64 0
642 ; GCN-NEXT: .amdhsa_float_denorm_mode_32 0
643 ; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3
644 ; GCN-NEXT: .amdhsa_dx10_clamp 1
645 ; GCN-NEXT: .amdhsa_ieee_mode 1
646 ; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
647 ; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0
648 ; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
649 ; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0
650 ; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0
651 ; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0
652 ; GCN-NEXT: .amdhsa_exception_int_div_zero 0
653 ; GCN-NEXT: .end_amdhsa_kernel
654 ; GCN-NEXT: .text
655511 %shift = shl <2 x i128> %lhs, %rhs
656512 store <2 x i128> %shift, <2 x i128> addrspace(1)* null
657513 ret void
722578 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
723579 ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
724580 ; GCN-NEXT: s_endpgm
725 ; GCN-NEXT: .section .rodata,#alloc
726 ; GCN-NEXT: .p2align 6
727 ; GCN-NEXT: .amdhsa_kernel s_lshr_v2i128_ss
728 ; GCN-NEXT: .amdhsa_group_segment_fixed_size 0
729 ; GCN-NEXT: .amdhsa_private_segment_fixed_size 0
730 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
731 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
732 ; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
733 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
734 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
735 ; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
736 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
737 ; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
738 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
739 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
740 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
741 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
742 ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
743 ; GCN-NEXT: .amdhsa_next_free_vgpr 16
744 ; GCN-NEXT: .amdhsa_next_free_sgpr 22
745 ; GCN-NEXT: .amdhsa_reserve_flat_scratch 0
746 ; GCN-NEXT: .amdhsa_float_round_mode_32 0
747 ; GCN-NEXT: .amdhsa_float_round_mode_16_64 0
748 ; GCN-NEXT: .amdhsa_float_denorm_mode_32 0
749 ; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3
750 ; GCN-NEXT: .amdhsa_dx10_clamp 1
751 ; GCN-NEXT: .amdhsa_ieee_mode 1
752 ; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
753 ; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0
754 ; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
755 ; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0
756 ; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0
757 ; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0
758 ; GCN-NEXT: .amdhsa_exception_int_div_zero 0
759 ; GCN-NEXT: .end_amdhsa_kernel
760 ; GCN-NEXT: .text
761581 %shift = lshr <2 x i128> %lhs, %rhs
762582 store <2 x i128> %shift, <2 x i128> addrspace(1)* null
763583 ret void
832652 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
833653 ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
834654 ; GCN-NEXT: s_endpgm
835 ; GCN-NEXT: .section .rodata,#alloc
836 ; GCN-NEXT: .p2align 6
837 ; GCN-NEXT: .amdhsa_kernel s_ashr_v2i128_ss
838 ; GCN-NEXT: .amdhsa_group_segment_fixed_size 0
839 ; GCN-NEXT: .amdhsa_private_segment_fixed_size 0
840 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
841 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
842 ; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
843 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
844 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
845 ; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
846 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
847 ; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
848 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
849 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
850 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
851 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
852 ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
853 ; GCN-NEXT: .amdhsa_next_free_vgpr 16
854 ; GCN-NEXT: .amdhsa_next_free_sgpr 23
855 ; GCN-NEXT: .amdhsa_reserve_flat_scratch 0
856 ; GCN-NEXT: .amdhsa_float_round_mode_32 0
857 ; GCN-NEXT: .amdhsa_float_round_mode_16_64 0
858 ; GCN-NEXT: .amdhsa_float_denorm_mode_32 0
859 ; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3
860 ; GCN-NEXT: .amdhsa_dx10_clamp 1
861 ; GCN-NEXT: .amdhsa_ieee_mode 1
862 ; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
863 ; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0
864 ; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
865 ; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0
866 ; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0
867 ; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0
868 ; GCN-NEXT: .amdhsa_exception_int_div_zero 0
869 ; GCN-NEXT: .end_amdhsa_kernel
870 ; GCN-NEXT: .text
871655 %shift = ashr <2 x i128> %lhs, %rhs
872656 store <2 x i128> %shift, <2 x i128> addrspace(1)* null
873657 ret void
201201
202202 ; Have another non-tail call in the function
203203 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
204 ; GCN: s_mov_b32 s5, s32
205 ; GCN: s_add_u32 s32, s32, 0x400
206
207204 ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
208 ; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s5 offset:8
205 ; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
209206 ; GCN-NEXT: s_mov_b64 exec
210 ; GCN-DAG: s_getpc_b64
211
212 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
213 ; GCN: buffer_store_dword v33, off, s[0:3], s5 ; 4-byte Folded Spill
214 ; GCN-DAG: v_writelane_b32 v34, s34, 0
215 ; GCN-DAG: v_writelane_b32 v34, s35, 1
207 ; GCN: s_mov_b32 s34, s32
208 ; GCN-DAG: s_add_u32 s32, s32, 0x400
209
210 ; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill
211 ; GCN-DAG: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill
212 ; GCN-DAG: v_writelane_b32 v34, s36, 0
213 ; GCN-DAG: v_writelane_b32 v34, s37, 1
214
215 ; GCN-DAG: s_getpc_b64 s[4:5]
216 ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
217 ; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+4
218
216219
217220 ; GCN: s_swappc_b64
218221
219 ; GCN-DAG: v_readlane_b32 s34, v34, 0
220 ; GCN-DAG: v_readlane_b32 s35, v34, 1
221
222 ; GCN: buffer_load_dword v33, off, s[0:3], s5 ; 4-byte Folded Reload
223 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
224 ; GCN: s_getpc_b64 s[6:7]
225 ; GCN: s_add_u32 s6, s6, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
226 ; GCN: s_addc_u32 s7, s7, sibling_call_i32_fastcc_i32_i32@rel32@hi+4
227 ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
228 ; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s5 offset:8
229 ; GCN-NEXT: s_mov_b64 exec
222 ; GCN-DAG: v_readlane_b32 s36, v34, 0
223 ; GCN-DAG: v_readlane_b32 s37, v34, 1
224
225 ; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload
226 ; GCN: buffer_load_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload
227
228 ; GCN: s_getpc_b64 s[4:5]
229 ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
230 ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+4
230231
231232 ; GCN: s_sub_u32 s32, s32, 0x400
232 ; GCN: s_setpc_b64 s[6:7]
233 ; GCN-NEXT: v_readlane_b32 s34,
234 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
235 ; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
236 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
237 ; GCN-NEXT: s_setpc_b64 s[4:5]
233238 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
234239 entry:
235240 %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
247252 ; GCN-NOT: s33
248253
249254 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
250 ; GCN: s_setpc_b64 s[6:7]
255 ; GCN: s_setpc_b64 s[4:5]
251256 define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
252257 entry:
253258 %alloca = alloca [16 x i32], align 4, addrspace(5)
262267 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44
263268
264269 ; GCN-NOT: s33
265 ; GCN: s_setpc_b64 s[6:7]
270 ; GCN: s_setpc_b64 s[4:5]
266271 define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
267272 entry:
268273 %alloca = alloca [16 x i32], align 4, addrspace(5)
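
The v_writelane_b32 / v_readlane_b32 pairs checked above park SGPR values (s36 and s37 here, and the s34 frame pointer elsewhere in this patch) in individual lanes of a callee-saved VGPR across the call. A minimal C++ model of those two instructions, assuming a wave64 target; the register names and lane indices mirror the checks and are illustrative only:

#include <array>
#include <cstdint>

using VGPR = std::array<uint32_t, 64>; // one 32-bit value per lane (wave64)

// v_writelane_b32 vdst, ssrc, lane: writes a single lane, regardless of EXEC.
void writelane(VGPR &VDst, uint32_t SSrc, unsigned Lane) { VDst[Lane] = SSrc; }

// v_readlane_b32 sdst, vsrc, lane: reads a single lane, regardless of EXEC.
uint32_t readlane(const VGPR &VSrc, unsigned Lane) { return VSrc[Lane]; }

int main() {
  VGPR V34{};             // the spill VGPR from the checks above
  uint32_t S36 = 0xdead, S37 = 0xbeef;
  writelane(V34, S36, 0); // v_writelane_b32 v34, s36, 0
  writelane(V34, S37, 1); // v_writelane_b32 v34, s37, 1
  // ... s_swappc_b64: the callee may clobber s36/s37 ...
  S36 = readlane(V34, 0); // v_readlane_b32 s36, v34, 0
  S37 = readlane(V34, 1); // v_readlane_b32 s37, v34, 1
  return (S36 == 0xdead && S37 == 0xbeef) ? 0 : 1;
}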
0 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=1 < %s | FileCheck -check-prefix=GCN %s
11
2 ; For the CSR copy of s5, it may be possible to see it in
3 ; storeRegToStackSlot.
2 ; GCN-LABEL: {{^}}spill_csr_s5_copy:
3 ; GCN: s_or_saveexec_b64
4 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
5 ; GCN-NEXT: s_mov_b64 exec
6 ; GCN: v_writelane_b32 v32, s34, 2
7 ; GCN: s_swappc_b64
48
5 ; GCN-LABEL: {{^}}spill_csr_s5_copy:
6 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
7 ; GCN: v_writelane_b32 v32, s5, 2
8 ; GCN: s_swappc_b64
9 ; GCN: v_readlane_b32 s5, v32, 2
109 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9
11 ; GCN: buffer_store_dword [[K]], off, s[0:3], s5{{$}}
12 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
10 ; GCN: buffer_store_dword [[K]], off, s[0:3], s34{{$}}
11
12 ; GCN: v_readlane_b32 s34, v32, 2
13 ; GCN: s_or_saveexec_b64
14 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
15 ; GCN: s_mov_b64 exec
1316 ; GCN: s_setpc_b64
1417 define void @spill_csr_s5_copy() #0 {
1518 bb:
291291 }
292292
293293 attributes #0 = { nounwind }
294 attributes #1 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" }
295 attributes #2 = { nounwind "amdgpu-num-sgpr"="15" "amdgpu-num-vgpr"="8" }
294 attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
295 attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" }
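
The s_or_saveexec_b64 ... -1 / buffer_store_dword v32 / s_mov_b64 exec bracket expected above is there because a buffer store only writes lanes whose EXEC bit is set, and the spill VGPR holds SGPR values that must survive even for currently inactive lanes. A small C++ sketch of that masking behavior, as a model rather than the LLVM implementation:

#include <array>
#include <cstdint>

constexpr int NumLanes = 64; // assumes wave64
using VGPR = std::array<uint32_t, NumLanes>;

// An EXEC-masked store: only active lanes reach memory.
void bufferStoreDword(VGPR &Slot, const VGPR &Src, uint64_t Exec) {
  for (int Lane = 0; Lane < NumLanes; ++Lane)
    if (Exec & (1ull << Lane))
      Slot[Lane] = Src[Lane]; // inactive lanes are skipped
}

// Spill every lane of the CSR VGPR by forcing all lanes active first.
void spillAllLanes(VGPR &Slot, const VGPR &V32, uint64_t &Exec) {
  uint64_t Saved = Exec;             // s_or_saveexec_b64 s[...:...], -1
  Exec = ~0ull;
  bufferStoreDword(Slot, V32, Exec); // buffer_store_dword v32, off, ...
  Exec = Saved;                      // s_mov_b64 exec, s[...:...]
}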
3333
3434 ; GCN-LABEL: {{^}}needs_align16_stack_align4:
3535 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}}
36 ; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xfffffc00
36 ; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffffc00
3737 ; GCN: s_add_u32 s32, s32, 0x2800{{$}}
3838
3939 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
5454
5555 ; GCN-LABEL: {{^}}needs_align32:
5656 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}}
57 ; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xfffff800
57 ; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffff800
5858 ; GCN: s_add_u32 s32, s32, 0x3000{{$}}
5959
6060 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
7575
7676 ; GCN-LABEL: {{^}}force_realign4:
7777 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}}
78 ; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xffffff00
78 ; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xffffff00
7979 ; GCN: s_add_u32 s32, s32, 0xd00{{$}}
8080
8181 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
128128
129129 ; GCN-LABEL: {{^}}default_realign_align128:
130130 ; GCN: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0
131 ; GCN-NEXT: s_and_b32 s5, [[TMP]], 0xffffe000
131 ; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s34
132 ; GCN-NEXT: s_and_b32 s34, [[TMP]], 0xffffe000
132133 ; GCN-NEXT: s_add_u32 s32, s32, 0x4000
133 ; GCN-NOT: s5
134 ; GCN: buffer_store_dword v0, off, s[0:3], s5{{$}}
134 ; GCN-NOT: s34
135 ; GCN: buffer_store_dword v0, off, s[0:3], s34{{$}}
135136 ; GCN: s_sub_u32 s32, s32, 0x4000
137 ; GCN: s_mov_b32 s34, [[FP_COPY]]
136138 define void @default_realign_align128(i32 %idx) #0 {
137139 %alloca.align = alloca i32, align 128, addrspace(5)
138140 store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128
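
Each realignment sequence above follows one pattern: add a bias to s32, then mask off the low bits, e.g. s_add_u32 tmp, s32, 0x3c0 followed by s_and_b32 s34, tmp, 0xfffffc00. The bias is Align - 0x40 rather than Align - 1, which only rounds up correctly if the incoming SP is already 0x40-aligned; that granule is an inference from the constants in these checks, not something the tests state. A minimal C++ sketch of the arithmetic:

#include <cassert>
#include <cstdint>

// FP = (SP + (Align - Granule)) & ~(Align - 1), valid when SP % Granule == 0.
uint32_t realignFP(uint32_t SP, uint32_t Align, uint32_t Granule) {
  return (SP + (Align - Granule)) & ~(Align - 1);
}

int main() {
  // needs_align16_stack_align4: bias 0x3c0, mask 0xfffffc00 -> Align 0x400.
  assert(realignFP(0x0040, 0x400, 0x40) == 0x400);  // rounds up
  assert(realignFP(0x0400, 0x400, 0x40) == 0x400);  // already aligned
  // needs_align32: bias 0x7c0, mask 0xfffff800 -> Align 0x800.
  assert(realignFP(0x0840, 0x800, 0x40) == 0x1000);
  // default_realign_align128: bias 0x1fc0, mask 0xffffe000 -> Align 0x2000.
  assert(realignFP(0x2040, 0x2000, 0x40) == 0x4000);
  return 0;
}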
10671067
10681068 ; GCN-LABEL: {{^}}callee_no_stack_with_call:
10691069 ; GCN: s_waitcnt
1070 ; GCN: s_mov_b32 s5, s32
1070 ; GCN-NEXT: s_waitcnt_vscnt
1071
1072 ; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
1073 ; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}}
1074 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
1075 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
1076 ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]]
1077
1078 ; GCN-NEXT: v_writelane_b32 v32, s34, 2
1079 ; GCN: s_mov_b32 s34, s32
10711080 ; GFX1064: s_add_u32 s32, s32, 0x400
10721081 ; GFX1032: s_add_u32 s32, s32, 0x200
10731082
1074 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
1075 ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}}
1076
1077 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
1078
1079 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
1080 ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]]
1081
1082 ; GCN-DAG: v_writelane_b32 v32, s34, 0
1083 ; GCN-DAG: v_writelane_b32 v32, s35, 1
1084 ; GCN-DAG: s_mov_b32 [[COPY_FP:s[0-9]+]], s5
1083
1084 ; GCN-DAG: v_writelane_b32 v32, s36, 0
1085 ; GCN-DAG: v_writelane_b32 v32, s37, 1
10851086 ; GCN: s_swappc_b64
1086 ; GCN-DAG: s_mov_b32 s5, [[COPY_FP]]
1087 ; GCN-DAG: v_readlane_b32 s35, v32, 1
1088 ; GCN-DAG: v_readlane_b32 s34, v32, 0
1089
1087 ; GCN-DAG: v_readlane_b32 s36, v32, 0
1088 ; GCN-DAG: v_readlane_b32 s37, v32, 1
1089
1090
1091 ; GFX1064: s_sub_u32 s32, s32, 0x400
1092 ; GFX1032: s_sub_u32 s32, s32, 0x200
1093 ; GCN: v_readlane_b32 s34, v32, 2
10901094 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
10911095 ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}}
1092 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
1096 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
10931097 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
10941098 ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]]
1095
1096 ; GFX1064: s_sub_u32 s32, s32, 0x400
1097 ; GFX1032: s_sub_u32 s32, s32, 0x200
1098 ; GCN: s_setpc_b64
1099 ; GCN-NEXT: s_waitcnt vmcnt(0)
1100 ; GCN-NEXT: s_setpc_b64
10991101 define void @callee_no_stack_with_call() #1 {
11001102 call void @external_void_func_void()
11011103 ret void
6363 ; CHECK-NEXT: waveLimiter: false
6464 ; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
6565 ; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr33'
66 ; CHECK-NEXT: frameOffsetReg: '$sgpr5'
66 ; CHECK-NEXT: frameOffsetReg: '$sgpr34'
6767 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
6868 ; CHECK-NEXT: argumentInfo:
6969 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
8484 ; CHECK-NEXT: waveLimiter: false
8585 ; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
8686 ; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr33'
87 ; CHECK-NEXT: frameOffsetReg: '$sgpr5'
87 ; CHECK-NEXT: frameOffsetReg: '$sgpr34'
8888 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
8989 ; CHECK-NEXT: argumentInfo:
9090 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }