llvm.org GIT mirror llvm / 2a24827
AMDGPU/SI: Add back reverted SGPR spilling code, but disable it (suggested as a better solution by Matt).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@287942 91177308-0d34-0410-b5e6-96231b3b80d8
Marek Olsak, 3 years ago
21 changed file(s) with 674 addition(s) and 188 deletion(s).
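A minimal standalone sketch of the decision this patch introduces, using hypothetical helper names (pickSpillKind and its boolean parameters are illustrative, not part of the patch): SGPR spills go to scalar (SMEM) stores only when the subtarget supports scalar stores and the new off-by-default amdgpu-spill-sgpr-to-smem flag is set; otherwise the existing VGPR-lane and scratch-buffer paths are used, matching the spillSGPR/restoreSGPR changes below.

    #include <cstdio>

    enum class SGPRSpillKind { VGPRLane, ScratchBuffer, ScalarStore };

    // Hypothetical helper; the real gating in the patch is the expression
    // "ST.hasScalarStores() && EnableSpillSGPRToSMEM" in SIRegisterInfo::spillSGPR.
    static SGPRSpillKind pickSpillKind(bool HasScalarStores,
                                       bool EnableSpillSGPRToSMEM,
                                       bool HasFreeVGPRLane) {
      if (HasScalarStores && EnableSpillSGPRToSMEM)
        return SGPRSpillKind::ScalarStore;   // new path, off by default
      return HasFreeVGPRLane ? SGPRSpillKind::VGPRLane
                             : SGPRSpillKind::ScratchBuffer;
    }

    int main() {
      // With the flag left at its default (false), behavior is unchanged.
      printf("%d\n", static_cast<int>(pickSpillKind(true, false, true)));
      return 0;
    }

The updated tests below exercise both settings by passing -amdgpu-spill-sgpr-to-smem=0 and =1 to llc.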
252252 static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
253253 switch (NumVectorElts) {
254254 case 1:
255 return AMDGPU::SReg_32RegClassID;
255 return AMDGPU::SReg_32_XM0RegClassID;
256256 case 2:
257257 return AMDGPU::SReg_64RegClassID;
258258 case 4:
5858 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
5959 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
6060
61 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
61 addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
6262 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
6363
6464 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
7878 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
7979
8080 if (Subtarget->has16BitInsts()) {
81 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
82 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
81 addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
82 addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
8383 }
8484
8585 computeRegisterProperties(STI.getRegisterInfo());
940940 // Start adding system SGPRs.
941941 if (Info->hasWorkGroupIDX()) {
942942 unsigned Reg = Info->addWorkGroupIDX();
943 MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
943 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
944944 CCInfo.AllocateReg(Reg);
945945 }
946946
947947 if (Info->hasWorkGroupIDY()) {
948948 unsigned Reg = Info->addWorkGroupIDY();
949 MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
949 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
950950 CCInfo.AllocateReg(Reg);
951951 }
952952
953953 if (Info->hasWorkGroupIDZ()) {
954954 unsigned Reg = Info->addWorkGroupIDZ();
955 MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
955 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
956956 CCInfo.AllocateReg(Reg);
957957 }
958958
959959 if (Info->hasWorkGroupInfo()) {
960960 unsigned Reg = Info->addWorkGroupInfo();
961 MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
961 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
962962 CCInfo.AllocateReg(Reg);
963963 }
964964
24132413 SI::KernelInputOffsets::LOCAL_SIZE_Z);
24142414 case Intrinsic::amdgcn_workgroup_id_x:
24152415 case Intrinsic::r600_read_tgid_x:
2416 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
2416 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
24172417 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
24182418 case Intrinsic::amdgcn_workgroup_id_y:
24192419 case Intrinsic::r600_read_tgid_y:
2420 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
2420 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
24212421 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
24222422 case Intrinsic::amdgcn_workgroup_id_z:
24232423 case Intrinsic::r600_read_tgid_z:
2424 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
2424 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
24252425 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
24262426 case Intrinsic::amdgcn_workitem_id_x:
24272427 case Intrinsic::r600_read_tidig_x:
41814181 default:
41824182 return std::make_pair(0U, nullptr);
41834183 case 32:
4184 return std::make_pair(0U, &AMDGPU::SReg_32RegClass);
4184 return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
41854185 case 64:
41864186 return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
41874187 case 128:
531531 TRI = &TII->getRegisterInfo();
532532 MRI = &MF.getRegInfo();
533533 IV = getIsaVersion(ST->getFeatureBits());
534 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
534535
535536 HardwareLimits.Named.VM = getVmcntBitMask(IV);
536537 HardwareLimits.Named.EXP = getExpcntBitMask(IV);
542543 LastOpcodeType = OTHER;
543544 LastInstWritesM0 = false;
544545 IsFlatOutstanding = false;
545 ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid();
546 ReturnsVoid = MFI->returnsVoid();
546547
547548 memset(&UsedRegs, 0, sizeof(UsedRegs));
548549 memset(&DefinedRegs, 0, sizeof(DefinedRegs));
549550
550551 SmallVector<MachineInstr *, 4> RemoveMI;
552 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
553
554 bool HaveScalarStores = false;
551555
552556 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
553557 BI != BE; ++BI) {
554558
555559 MachineBasicBlock &MBB = *BI;
560
556561 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
557562 I != E; ++I) {
563
564 if (!HaveScalarStores && TII->isScalarStore(*I))
565 HaveScalarStores = true;
558566
559567 if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
560568 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
624632
625633 pushInstruction(MBB, I, Increment);
626634 handleSendMsg(MBB, I);
635
636 if (I->getOpcode() == AMDGPU::S_ENDPGM ||
637 I->getOpcode() == AMDGPU::SI_RETURN)
638 EndPgmBlocks.push_back(&MBB);
627639 }
628640
629641 // Wait for everything at the end of the MBB
630642 Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
631643 }
632644
645 if (HaveScalarStores) {
646 // If scalar writes are used, the cache must be flushed or else the next
647 // wave to reuse the same scratch memory can be clobbered.
648 //
649 // Insert s_dcache_wb at wave termination points if there were any scalar
650 // stores, and only if the cache hasn't already been flushed. This could be
651 // improved by looking across blocks for flushes in postdominating blocks
652 // from the stores but an explicitly requested flush is probably very rare.
653 for (MachineBasicBlock *MBB : EndPgmBlocks) {
654 bool SeenDCacheWB = false;
655
656 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
657 I != E; ++I) {
658
659 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
660 SeenDCacheWB = true;
661 else if (TII->isScalarStore(*I))
662 SeenDCacheWB = false;
663
664 // FIXME: It would be better to insert this before a waitcnt if any.
665 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
666 I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) {
667 Changes = true;
668 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
669 }
670 }
671 }
672 }
673
633674 for (MachineInstr *I : RemoveMI)
634675 I->eraseFromParent();
635676
363363 return;
364364 }
365365
366 if (RC == &AMDGPU::SReg_32RegClass) {
366 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
367 RC == &AMDGPU::SReg_32RegClass) {
367368 if (SrcReg == AMDGPU::SCC) {
368369 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
369370 .addImm(-1)
543544 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
544545 }
545546
546 BuildMI(MBB, MI, DL, OpDesc)
547 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
547548 .addReg(SrcReg, getKillRegState(isKill)) // data
548549 .addFrameIndex(FrameIndex) // addr
549550 .addMemOperand(MMO)
552553 // Add the scratch resource registers as implicit uses because we may end up
553554 // needing them, and need to ensure that the reserved registers are
554555 // correctly handled.
556
557 if (ST.hasScalarStores()) {
558 // m0 is used for offset to scalar stores if used to spill.
559 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
560 }
555561
556562 return;
557563 }
642648 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
643649 }
644650
645 BuildMI(MBB, MI, DL, OpDesc, DestReg)
651 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
646652 .addFrameIndex(FrameIndex) // addr
647653 .addMemOperand(MMO)
648654 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
649655 .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
656
657 if (ST.hasScalarStores()) {
658 // m0 is used for offset to scalar stores if used to spill.
659 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
660 }
650661
651662 return;
652663 }
2323
2424 using namespace llvm;
2525
26 static cl::opt<bool> EnableSpillSGPRToSMEM(
27 "amdgpu-spill-sgpr-to-smem",
28 cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
29 cl::init(false));
30
31
2632 static bool hasPressureSet(const int *PSets, unsigned PSetID) {
2733 for (unsigned i = 0; PSets[i] != -1; ++i) {
2834 if (PSets[i] == (int)PSetID)
236242
237243 MachineRegisterInfo &MRI = MF->getRegInfo();
238244 unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
239 unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
245 unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
240246
241247 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
242248 .addImm(Offset);
400406
401407 void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
402408 unsigned LoadStoreOp,
403 const MachineOperand *SrcDst,
409 int Index,
410 unsigned ValueReg,
411 bool IsKill,
404412 unsigned ScratchRsrcReg,
405 unsigned ScratchOffset,
406 int64_t Offset,
413 unsigned ScratchOffsetReg,
414 int64_t InstOffset,
415 MachineMemOperand *MMO,
407416 RegScavenger *RS) const {
408 unsigned Value = SrcDst->getReg();
409 bool IsKill = SrcDst->isKill();
410417 MachineBasicBlock *MBB = MI->getParent();
411418 MachineFunction *MF = MI->getParent()->getParent();
412419 const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
413420 const SIInstrInfo *TII = ST.getInstrInfo();
414
415 DebugLoc DL = MI->getDebugLoc();
416 bool IsStore = MI->mayStore();
421 const MachineFrameInfo &MFI = MF->getFrameInfo();
422
423 const MCInstrDesc &Desc = TII->get(LoadStoreOp);
424 const DebugLoc &DL = MI->getDebugLoc();
425 bool IsStore = Desc.mayStore();
417426
418427 bool RanOutOfSGPRs = false;
419428 bool Scavenged = false;
420 unsigned SOffset = ScratchOffset;
421 unsigned OriginalImmOffset = Offset;
422
423 unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
429 unsigned SOffset = ScratchOffsetReg;
430
431 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
432 unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32;
424433 unsigned Size = NumSubRegs * 4;
434 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
435 const int64_t OriginalImmOffset = Offset;
436
437 unsigned Align = MFI.getObjectAlignment(Index);
438 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
425439
426440 if (!isUInt<12>(Offset + Size)) {
427441 SOffset = AMDGPU::NoRegister;
440454 // subtract the offset after the spill to return ScratchOffset to its
441455 // original value.
442456 RanOutOfSGPRs = true;
443 SOffset = ScratchOffset;
457 SOffset = ScratchOffsetReg;
444458 } else {
445459 Scavenged = true;
446460 }
461
447462 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
448 .addReg(ScratchOffset)
449 .addImm(Offset);
463 .addReg(ScratchOffsetReg)
464 .addImm(Offset);
465
450466 Offset = 0;
451467 }
452468
453 for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
469 const unsigned EltSize = 4;
470
471 for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
454472 unsigned SubReg = NumSubRegs == 1 ?
455 Value : getSubReg(Value, getSubRegFromChannel(i));
473 ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
456474
457475 unsigned SOffsetRegState = 0;
458476 unsigned SrcDstRegState = getDefRegState(!IsStore);
462480 SrcDstRegState |= getKillRegState(IsKill);
463481 }
464482
465 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
466 .addReg(SubReg, getDefRegState(!IsStore))
483 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
484 MachineMemOperand *NewMMO
485 = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
486 EltSize, MinAlign(Align, EltSize * i));
487
488 auto MIB = BuildMI(*MBB, MI, DL, Desc)
489 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
467490 .addReg(ScratchRsrcReg)
468491 .addReg(SOffset, SOffsetRegState)
469492 .addImm(Offset)
470493 .addImm(0) // glc
471494 .addImm(0) // slc
472495 .addImm(0) // tfe
473 .addReg(Value, RegState::Implicit | SrcDstRegState)
474 .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
475 }
496 .addMemOperand(NewMMO);
497
498 if (NumSubRegs > 1)
499 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
500 }
501
476502 if (RanOutOfSGPRs) {
477503 // Subtract the offset we added to the ScratchOffset register.
478 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
479 .addReg(ScratchOffset)
480 .addImm(OriginalImmOffset);
504 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
505 .addReg(ScratchOffsetReg)
506 .addImm(OriginalImmOffset);
481507 }
482508 }
483509
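As a side note on the buildSpillLoadStore rewrite above: each 32-bit sub-register now gets its own 4-byte MachineMemOperand at byte offset EltSize * i with alignment MinAlign(Align, EltSize * i), instead of one memory operand covering the whole object, which is why the folded spill/reload annotations in the tests below shrink from "8-byte"/"16-byte" to "4-byte". A small standalone sketch of that per-lane bookkeeping, assuming a 128-bit tuple in a 16-byte-aligned stack slot (both values are illustrative, not taken from the patch):

    #include <cstdint>
    #include <cstdio>

    // Mirrors llvm::MinAlign: the lowest set bit of (A | B).
    static uint64_t MinAlign(uint64_t A, uint64_t B) {
      return (A | B) & (1 + ~(A | B));
    }

    int main() {
      const uint64_t EltSize = 4;      // one 32-bit lane per access
      const unsigned NumSubRegs = 4;   // e.g. a 128-bit SGPR tuple
      const uint64_t ObjAlign = 16;    // assumed stack-object alignment
      for (unsigned i = 0; i != NumSubRegs; ++i)
        printf("lane %u: size %llu, offset %llu, align %llu\n", i,
               (unsigned long long)EltSize,
               (unsigned long long)(EltSize * i),
               (unsigned long long)MinAlign(ObjAlign, EltSize * i));
      return 0;
    }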
484510 void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
485511 int Index,
486512 RegScavenger *RS) const {
487 MachineFunction *MF = MI->getParent()->getParent();
513 MachineBasicBlock *MBB = MI->getParent();
514 MachineFunction *MF = MBB->getParent();
488515 MachineRegisterInfo &MRI = MF->getRegInfo();
489 MachineBasicBlock *MBB = MI->getParent();
490 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
491 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
492516 const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
493517 const SIInstrInfo *TII = ST.getInstrInfo();
494 const DebugLoc &DL = MI->getDebugLoc();
495518
496519 unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
497520 unsigned SuperReg = MI->getOperand(0).getReg();
498521 bool IsKill = MI->getOperand(0).isKill();
522 const DebugLoc &DL = MI->getDebugLoc();
523
524 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
525 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
526
527 bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
528
529 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
530
531 const unsigned EltSize = 4;
532 unsigned OffsetReg = AMDGPU::M0;
533 unsigned M0CopyReg = AMDGPU::NoRegister;
534
535 if (SpillToSMEM) {
536 if (RS->isRegUsed(AMDGPU::M0)) {
537 M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
538 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
539 .addReg(AMDGPU::M0);
540 }
541 }
499542
500543 // SubReg carries the "Kill" flag when SubReg == SuperReg.
501544 unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
503546 unsigned SubReg = NumSubRegs == 1 ?
504547 SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
505548
549 if (SpillToSMEM) {
550 int64_t FrOffset = FrameInfo.getObjectOffset(Index);
551 unsigned Align = FrameInfo.getObjectAlignment(Index);
552 MachinePointerInfo PtrInfo
553 = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
554 MachineMemOperand *MMO
555 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
556 EltSize, MinAlign(Align, EltSize * i));
557
558 // Add i * 4 wave offset.
559 //
560 // SMEM instructions only support a single offset, so increment the wave
561 // offset.
562
563 int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
564 if (Offset != 0) {
565 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
566 .addReg(MFI->getScratchWaveOffsetReg())
567 .addImm(Offset);
568 } else {
569 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
570 .addReg(MFI->getScratchWaveOffsetReg());
571 }
572
573 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_STORE_DWORD_SGPR))
574 .addReg(SubReg, getKillRegState(IsKill)) // sdata
575 .addReg(MFI->getScratchRSrcReg()) // sbase
576 .addReg(OffsetReg, RegState::Kill) // soff
577 .addImm(0) // glc
578 .addMemOperand(MMO);
579
580 continue;
581 }
582
506583 struct SIMachineFunctionInfo::SpilledReg Spill =
507584 MFI->getSpilledReg(MF, Index, i);
508585 if (Spill.hasReg()) {
509 if (SuperReg == AMDGPU::M0) {
510 assert(NumSubRegs == 1);
511 unsigned CopyM0
512 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
513 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), CopyM0)
514 .addReg(SuperReg, getKillRegState(IsKill));
515
516 // The real spill now kills the temp copy.
517 SubReg = SuperReg = CopyM0;
518 IsKill = true;
519 }
520
521586 BuildMI(*MBB, MI, DL,
522587 TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
523588 Spill.VGPR)
529594 // it are fixed.
530595 } else {
531596 // Spill SGPR to a frame index.
532 // FIXME we should use S_STORE_DWORD here for VI.
533
534597 // TODO: Should VI try to spill to VGPR and then spill to SMEM?
535598 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
599 // TODO: Should VI try to spill to VGPR and then spill to SMEM?
536600
537601 MachineInstrBuilder Mov
538602 = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
549613 Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
550614 }
551615
552 unsigned Size = FrameInfo.getObjectSize(Index);
553616 unsigned Align = FrameInfo.getObjectAlignment(Index);
554617 MachinePointerInfo PtrInfo
555 = MachinePointerInfo::getFixedStack(*MF, Index);
618 = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
556619 MachineMemOperand *MMO
557620 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
558 Size, Align);
621 EltSize, MinAlign(Align, EltSize * i));
559622 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
560623 .addReg(TmpReg, RegState::Kill) // src
561624 .addFrameIndex(Index) // vaddr
564627 .addImm(i * 4) // offset
565628 .addMemOperand(MMO);
566629 }
630 }
631
632 if (M0CopyReg != AMDGPU::NoRegister) {
633 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
634 .addReg(M0CopyReg, RegState::Kill);
567635 }
568636
569637 MI->eraseFromParent();
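A worked example of the SMEM offset computation used in the SpillToSMEM path above, Offset = wavefront size * (frame offset + 4 * i), assuming the usual 64-lane wavefront and a spill slot at frame offset 0 (both assumptions are for illustration only). The resulting 0x100 stride is what the updated si-sgpr-spill checks below (s_add_u32 m0, s91, 0x100/0x200/0x300) expect.

    #include <cassert>
    #include <cstdint>

    // Hypothetical standalone helper mirroring the Offset expression in the
    // spillSGPR/restoreSGPR SMEM paths; not part of the patch itself.
    static int64_t smemSpillOffset(unsigned i, int64_t FrOffset = 0,
                                   int64_t WavefrontSize = 64) {
      return WavefrontSize * (FrOffset + 4 * i);
    }

    int main() {
      assert(smemSpillOffset(0) == 0x0);    // lane 0: plain s_mov_b32 m0, <wave offset>
      assert(smemSpillOffset(1) == 0x100);  // lane 1: s_add_u32 m0, <wave offset>, 0x100
      assert(smemSpillOffset(2) == 0x200);
      assert(smemSpillOffset(3) == 0x300);
      return 0;
    }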
584652
585653 unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
586654 unsigned SuperReg = MI->getOperand(0).getReg();
587
588 // m0 is not allowed as with readlane/writelane, so a temporary SGPR and
589 // extra copy is needed.
590 bool IsM0 = (SuperReg == AMDGPU::M0);
591 if (IsM0) {
592 assert(NumSubRegs == 1);
593 SuperReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
594 }
655 bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
656
657 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
658
659 unsigned OffsetReg = AMDGPU::M0;
660 unsigned M0CopyReg = AMDGPU::NoRegister;
661
662 if (SpillToSMEM) {
663 if (RS->isRegUsed(AMDGPU::M0)) {
664 M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
665 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
666 .addReg(AMDGPU::M0);
667 }
668 }
669
670 // SubReg carries the "Kill" flag when SubReg == SuperReg.
671 int64_t FrOffset = FrameInfo.getObjectOffset(Index);
672
673 const unsigned EltSize = 4;
595674
596675 for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
597676 unsigned SubReg = NumSubRegs == 1 ?
598677 SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
599678
679 if (SpillToSMEM) {
680 unsigned Align = FrameInfo.getObjectAlignment(Index);
681 MachinePointerInfo PtrInfo
682 = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
683 MachineMemOperand *MMO
684 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
685 EltSize, MinAlign(Align, EltSize * i));
686
687 // Add i * 4 offset
688 int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
689 if (Offset != 0) {
690 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
691 .addReg(MFI->getScratchWaveOffsetReg())
692 .addImm(Offset);
693 } else {
694 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
695 .addReg(MFI->getScratchWaveOffsetReg());
696 }
697
698 auto MIB =
699 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), SubReg)
700 .addReg(MFI->getScratchRSrcReg()) // sbase
701 .addReg(OffsetReg, RegState::Kill) // soff
702 .addImm(0) // glc
703 .addMemOperand(MMO);
704
705 if (NumSubRegs > 1)
706 MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
707
708 continue;
709 }
710
600711 SIMachineFunctionInfo::SpilledReg Spill
601712 = MFI->getSpilledReg(MF, Index, i);
602713
603714 if (Spill.hasReg()) {
604 BuildMI(*MBB, MI, DL,
605 TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
606 SubReg)
715 auto MIB =
716 BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
717 SubReg)
607718 .addReg(Spill.VGPR)
608 .addImm(Spill.Lane)
609 .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
719 .addImm(Spill.Lane);
720
721 if (NumSubRegs > 1)
722 MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
610723 } else {
611724 // Restore SGPR from a stack slot.
612725 // FIXME: We should use S_LOAD_DWORD here for VI.
613
614726 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
615727 unsigned Align = FrameInfo.getObjectAlignment(Index);
616 unsigned Size = FrameInfo.getObjectSize(Index);
617728
618729 MachinePointerInfo PtrInfo
619 = MachinePointerInfo::getFixedStack(*MF, Index);
620
621 MachineMemOperand *MMO = MF->getMachineMemOperand(
622 PtrInfo, MachineMemOperand::MOLoad, Size, Align);
730 = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
731
732 MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
733 MachineMemOperand::MOLoad, EltSize,
734 MinAlign(Align, EltSize * i));
623735
624736 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
625737 .addFrameIndex(Index) // vaddr
627739 .addReg(MFI->getScratchWaveOffsetReg()) // soffset
628740 .addImm(i * 4) // offset
629741 .addMemOperand(MMO);
630 BuildMI(*MBB, MI, DL,
631 TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
632 .addReg(TmpReg, RegState::Kill)
633 .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
634 }
635 }
636
637 if (IsM0 && SuperReg != AMDGPU::M0) {
638 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
639 .addReg(SuperReg);
742
743 auto MIB =
744 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
745 .addReg(TmpReg, RegState::Kill);
746
747 if (NumSubRegs > 1)
748 MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
749 }
750 }
751
752 if (M0CopyReg != AMDGPU::NoRegister) {
753 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
754 .addReg(M0CopyReg, RegState::Kill);
640755 }
641756
642757 MI->eraseFromParent();
684799 case AMDGPU::SI_SPILL_V128_SAVE:
685800 case AMDGPU::SI_SPILL_V96_SAVE:
686801 case AMDGPU::SI_SPILL_V64_SAVE:
687 case AMDGPU::SI_SPILL_V32_SAVE:
802 case AMDGPU::SI_SPILL_V32_SAVE: {
803 const MachineOperand *VData = TII->getNamedOperand(*MI,
804 AMDGPU::OpName::vdata);
688805 buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
689 TII->getNamedOperand(*MI, AMDGPU::OpName::vdata),
806 Index,
807 VData->getReg(), VData->isKill(),
690808 TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
691809 TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
692 FrameInfo.getObjectOffset(Index) +
693 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
810 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
811 *MI->memoperands_begin(),
812 RS);
694813 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
695814 MI->eraseFromParent();
696815 break;
816 }
697817 case AMDGPU::SI_SPILL_V32_RESTORE:
698818 case AMDGPU::SI_SPILL_V64_RESTORE:
699819 case AMDGPU::SI_SPILL_V96_RESTORE:
700820 case AMDGPU::SI_SPILL_V128_RESTORE:
701821 case AMDGPU::SI_SPILL_V256_RESTORE:
702822 case AMDGPU::SI_SPILL_V512_RESTORE: {
823 const MachineOperand *VData = TII->getNamedOperand(*MI,
824 AMDGPU::OpName::vdata);
825
703826 buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
704 TII->getNamedOperand(*MI, AMDGPU::OpName::vdata),
827 Index,
828 VData->getReg(), VData->isKill(),
705829 TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
706830 TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
707 FrameInfo.getObjectOffset(Index) +
708 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
831 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
832 *MI->memoperands_begin(),
833 RS);
709834 MI->eraseFromParent();
710835 break;
711836 }
252252
253253 private:
254254 void buildSpillLoadStore(MachineBasicBlock::iterator MI,
255 unsigned LoadStoreOp, const MachineOperand *SrcDst,
256 unsigned ScratchRsrcReg, unsigned ScratchOffset,
257 int64_t Offset,
255 unsigned LoadStoreOp,
256 int Index,
257 unsigned ValueReg,
258 bool ValueIsKill,
259 unsigned ScratchRsrcReg,
260 unsigned ScratchOffsetReg,
261 int64_t InstrOffset,
262 MachineMemOperand *MMO,
258263 RegScavenger *RS) const;
259264 };
260265
116116
117117 def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> {
118118 let CopyCost = -1;
119 let isAllocatable = 0;
120 }
121
122 def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
123 let CopyCost = 1;
119124 let isAllocatable = 0;
120125 }
121126
258263
259264 // Register class for all scalar registers (SGPRs + Special Registers)
260265 def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
261 (add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> {
266 (add SReg_32_XM0, M0_CLASS)> {
262267 let AllocationPriority = 1;
268 let isAllocatable = 0;
263269 }
264270
265271 def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
436436 MachineBasicBlock::iterator
437437 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
438438 MachineBasicBlock::iterator Before) {
439 unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
439 unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
440440
441441 MachineInstr *Save =
442442 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
None ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
0 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s
12
2 ; CHECK-LABEL: {{^}}max_14_sgprs:
3 ; If spilling to smem, additional registers are used for the resource
4 ; descriptor.
5
6 ; ALL-LABEL: {{^}}max_14_sgprs:
37
48 ; FIXME: Should be able to skip this copying of the private segment
59 ; buffer because all the SGPR spills are to VGPRs.
610
7 ; CHECK: s_mov_b64 s[6:7], s[2:3]
8 ; CHECK: s_mov_b64 s[4:5], s[0:1]
11 ; ALL: s_mov_b64 s[6:7], s[2:3]
12 ; ALL: s_mov_b64 s[4:5], s[0:1]
13 ; ALL: SGPRBlocks: 1
14 ; ALL: NumSGPRsForWavesPerEU: 14
15 define void @max_14_sgprs(i32 addrspace(1)* %out1,
916
10 ; CHECK: SGPRBlocks: 1
11 ; CHECK: NumSGPRsForWavesPerEU: 14
12 define void @max_14_sgprs(i32 addrspace(1)* %out1,
1317 i32 addrspace(1)* %out2,
1418 i32 addrspace(1)* %out3,
1519 i32 addrspace(1)* %out4,
3034 ; ---------------------
3135 ; total: 14
3236
33 ; + reserved vcc, flat_scratch = 18
37 ; + reserved vcc, xnack, flat_scratch = 20
3438
3539 ; Because we can't handle re-using the last few input registers as the
3640 ; special vcc etc. registers (as well as decide to not use the unused
3943
4044 ; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
4145 ; TOSGPR: SGPRBlocks: 2
42 ; TOSGPR: NumSGPRsForWavesPerEU: 18
46 ; TOSGPR: NumSGPRsForWavesPerEU: 20
4347
4448 ; TOSMEM: s_mov_b64 s[6:7], s[2:3]
45 ; TOSMEM: s_mov_b32 s9, s13
4649 ; TOSMEM: s_mov_b64 s[4:5], s[0:1]
50 ; TOSMEM: s_mov_b32 s3, s13
4751
4852 ; TOSMEM: SGPRBlocks: 2
49 ; TOSMEM: NumSGPRsForWavesPerEU: 18
53 ; TOSMEM: NumSGPRsForWavesPerEU: 20
5054 define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
5155 i32 addrspace(1)* %out2,
5256 i32 addrspace(1)* %out3,
7882 ; ; swapping the order the registers are copied from what normally
7983 ; ; happens.
8084
81 ; TOSMEM: s_mov_b64 s[6:7], s[2:3]
82 ; TOSMEM: s_mov_b64 s[4:5], s[0:1]
83 ; TOSMEM: s_mov_b32 s3, s11
85 ; TOSMEM: s_mov_b32 s5, s11
86 ; TOSMEM: s_add_u32 m0, s5,
87 ; TOSMEM: s_buffer_store_dword vcc_lo, s[0:3], m0
8488
85 ; ALL: SGPRBlocks: 1
86 ; ALL: NumSGPRsForWavesPerEU: 16
89 ; ALL: SGPRBlocks: 2
90 ; ALL: NumSGPRsForWavesPerEU: 18
8791 define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
8892 i32 addrspace(1)* %out2,
8993 i32 addrspace(1)* %out3,
0 ; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
1 ; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
1 ; RUN: llc -O0 -march=amdgcn -mcpu=tonga -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
22 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
33 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
44
2525
2626
2727 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
28 ; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 8-byte Folded Spill
28 ; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 4-byte Folded Spill
2929 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
30 ; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 8-byte Folded Spill
30 ; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
3131
3232 ; Spill load
3333 ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
5454
5555
5656
57 ; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 8-byte Folded Reload
57 ; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
5858 ; VMEM: s_waitcnt vmcnt(0)
5959 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
6060
61 ; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 8-byte Folded Reload
61 ; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
6262 ; VMEM: s_waitcnt vmcnt(0)
6363 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
6464
107107
108108
109109 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
110 ; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 8-byte Folded Spill
110 ; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 4-byte Folded Spill
111111 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
112 ; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 8-byte Folded Spill
112 ; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
113113
114114 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
115115
132132 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
133133 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
134134
135 ; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 8-byte Folded Reload
135 ; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 4-byte Folded Reload
136136 ; VMEM: s_waitcnt vmcnt(0)
137137 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
138138
139 ; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 8-byte Folded Reload
139 ; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
140140 ; VMEM: s_waitcnt vmcnt(0)
141141 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
142142
186186 ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
187187
188188 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
189 ; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 8-byte Folded Spill
189 ; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
190190 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
191 ; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 8-byte Folded Spill
191 ; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
192192
193193 ; GCN: s_mov_b64 exec, [[CMP0]]
194194 ; GCN: s_waitcnt vmcnt(0) expcnt(0)
207207 ; VMEM: s_waitcnt vmcnt(0)
208208 ; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_LO]]
209209
210 ; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 8-byte Folded Reload
210 ; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
211211 ; VMEM: s_waitcnt vmcnt(0)
212212 ; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_HI]]
213213
223223
224224
225225 ; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_LO:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]]
226 ; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 8-byte Folded Spill
226 ; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
227227 ; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_HI:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]]
228 ; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 8-byte Folded Spill
228 ; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
229229
230230 ; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
231231 ; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
254254 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]]
255255
256256
257 ; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 8-byte Folded Reload
257 ; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload
258258 ; VMEM: s_waitcnt vmcnt(0)
259259 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
260260
261 ; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 8-byte Folded Reload
261 ; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
262262 ; VMEM: s_waitcnt vmcnt(0)
263263 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
264264
2626 # CHECK: S_NOP 0, implicit undef %5.sub0
2727 name: test0
2828 registers:
29 - { id: 0, class: sreg_32 }
30 - { id: 1, class: sreg_32 }
31 - { id: 2, class: sreg_32 }
29 - { id: 0, class: sreg_32_xm0 }
30 - { id: 1, class: sreg_32_xm0 }
31 - { id: 2, class: sreg_32_xm0 }
3232 - { id: 3, class: sreg_128 }
3333 - { id: 4, class: sreg_64 }
3434 - { id: 5, class: sreg_64 }
8686 - { id: 0, class: sreg_128 }
8787 - { id: 1, class: sreg_128 }
8888 - { id: 2, class: sreg_64 }
89 - { id: 3, class: sreg_32 }
89 - { id: 3, class: sreg_32_xm0 }
9090 - { id: 4, class: sreg_128 }
9191 - { id: 5, class: sreg_64 }
92 - { id: 6, class: sreg_32 }
93 - { id: 7, class: sreg_32 }
92 - { id: 6, class: sreg_32_xm0 }
93 - { id: 7, class: sreg_32_xm0 }
9494 - { id: 8, class: sreg_64 }
95 - { id: 9, class: sreg_32 }
95 - { id: 9, class: sreg_32_xm0 }
9696 - { id: 10, class: sreg_128 }
9797 body: |
9898 bb.0:
161161
162162 name: test2
163163 registers:
164 - { id: 0, class: sreg_32 }
165 - { id: 1, class: sreg_32 }
164 - { id: 0, class: sreg_32_xm0 }
165 - { id: 1, class: sreg_32_xm0 }
166166 - { id: 2, class: sreg_64 }
167167 - { id: 3, class: sreg_128 }
168 - { id: 4, class: sreg_32 }
169 - { id: 5, class: sreg_32 }
168 - { id: 4, class: sreg_32_xm0 }
169 - { id: 5, class: sreg_32_xm0 }
170170 - { id: 6, class: sreg_64 }
171171 - { id: 7, class: sreg_128 }
172172 - { id: 8, class: sreg_64 }
259259 name: test5
260260 tracksRegLiveness: true
261261 registers:
262 - { id: 0, class: sreg_32 }
262 - { id: 0, class: sreg_32_xm0 }
263263 - { id: 1, class: sreg_64 }
264264 body: |
265265 bb.0:
285285 name: loop0
286286 tracksRegLiveness: true
287287 registers:
288 - { id: 0, class: sreg_32 }
289 - { id: 1, class: sreg_32 }
290 - { id: 2, class: sreg_32 }
288 - { id: 0, class: sreg_32_xm0 }
289 - { id: 1, class: sreg_32_xm0 }
290 - { id: 2, class: sreg_32_xm0 }
291291 - { id: 3, class: sreg_128 }
292292 - { id: 4, class: sreg_128 }
293293 - { id: 5, class: sreg_128 }
338338 name: loop1
339339 tracksRegLiveness: true
340340 registers:
341 - { id: 0, class: sreg_32 }
342 - { id: 1, class: sreg_32 }
343 - { id: 2, class: sreg_32 }
344 - { id: 3, class: sreg_32 }
341 - { id: 0, class: sreg_32_xm0 }
342 - { id: 1, class: sreg_32_xm0 }
343 - { id: 2, class: sreg_32_xm0 }
344 - { id: 3, class: sreg_32_xm0 }
345345 - { id: 4, class: sreg_128 }
346346 - { id: 5, class: sreg_128 }
347347 - { id: 6, class: sreg_128 }
389389 name: loop2
390390 tracksRegLiveness: true
391391 registers:
392 - { id: 0, class: sreg_32 }
392 - { id: 0, class: sreg_32_xm0 }
393393 - { id: 1, class: sreg_128 }
394394 - { id: 2, class: sreg_128 }
395395 - { id: 3, class: sreg_128 }
2121 ret void
2222 }
2323
24 ; FIXME: Should be able to avoid copy
2425 ; GCN-LABEL: {{^}}inline_sreg_constraint_m0:
2526 ; GCN: s_mov_b32 m0, -1
26 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, m0
27 ; GCN: ; use m0
27 ; GCN: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
28 ; GCN: ; use [[COPY_M0]]
2829 define void @inline_sreg_constraint_m0() {
2930 %m0 = tail call i32 asm sideeffect "s_mov_b32 m0, -1", "={M0}"()
3031 tail call void asm sideeffect "; use $0", "s"(i32 %m0)
2121 ; TODO: m0 should be folded.
2222 ; CHECK-LABEL: {{^}}test_readfirstlane_m0:
2323 ; CHECK: s_mov_b32 m0, -1
24 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
24 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
25 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]
2526 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]]
2627 define void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
2728 %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
2121 ; TODO: m0 should be folded.
2222 ; CHECK-LABEL: {{^}}test_readlane_m0_sreg:
2323 ; CHECK: s_mov_b32 m0, -1
24 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
24 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
25 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]
2526 ; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}
2627 define void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
2728 %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
22 declare i32 @llvm.read_register.i32(metadata) #0
33 declare i64 @llvm.read_register.i64(metadata) #0
44
5 ; FIXME: Should be able to eliminate copy
56 ; CHECK-LABEL: {{^}}test_read_m0:
67 ; CHECK: s_mov_b32 m0, -1
7 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], m0
8 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
9 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], [[COPY_M0]]
810 ; CHECK: buffer_store_dword [[COPY]]
911 define void @test_read_m0(i32 addrspace(1)* %out) #0 {
1012 store volatile i32 0, i32 addrspace(3)* undef
None ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
0 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SGPR %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s
12
23 ; Make sure this doesn't crash.
3 ; CHECK: {{^}}test:
4 ; ALL-LABEL: {{^}}test:
5 ; ALL: s_mov_b32 s92, SCRATCH_RSRC_DWORD0
6 ; ALL: s_mov_b32 s91, s3
7
48 ; Make sure we are handling hazards correctly.
5 ; CHECK: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12
6 ; CHECK-NEXT: s_waitcnt vmcnt(0)
7 ; CHECK-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
8 ; CHECK-NEXT: s_nop 4
9 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
10 ; CHECK: s_endpgm
9 ; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12
10 ; SGPR-NEXT: s_waitcnt vmcnt(0)
11 ; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
12 ; SGPR-NEXT: s_nop 4
13 ; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
14
15
16 ; Make sure scratch wave offset register is correctly incremented and
17 ; then restored.
18 ; SMEM: s_mov_b32 m0, s91{{$}}
19 ; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
20 ; SMEM: s_add_u32 m0, s91, 0x100{{$}}
21 ; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
22 ; SMEM: s_add_u32 m0, s91, 0x200{{$}}
23 ; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
24 ; SMEM: s_add_u32 m0, s91, 0x300{{$}}
25 ; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
26
27
28 ; SMEM: s_mov_b32 m0, s91{{$}}
29 ; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
30 ; SMEM: s_add_u32 m0, s91, 0x100{{$}}
31 ; SMEM: s_waitcnt lgkmcnt(0)
32 ; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
33 ; SMEM: s_add_u32 m0, s91, 0x200{{$}}
34 ; SMEM: s_waitcnt lgkmcnt(0)
35 ; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
36 ; SMEM: s_add_u32 m0, s91, 0x300{{$}}
37 ; SMEM: s_waitcnt lgkmcnt(0)
38 ; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
39
40 ; ALL: s_endpgm
1141 define void @test(i32 addrspace(1)* %out, i32 %in) {
1242 call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
1343 call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
0 ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
1 ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
1 ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
22 ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
3 ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
3 ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
4 ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=GCN %s
45
56 ; XXX - Why does it like to use vcc?
67
78 ; GCN-LABEL: {{^}}spill_m0:
8 ; TOSMEM: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
9 ; TOSMEM: s_mov_b32 s84, SCRATCH_RSRC_DWORD0
910
10 ; GCN: s_cmp_lg_u32
11 ; GCN-DAG: s_cmp_lg_u32
1112
12 ; TOVGPR: s_mov_b32 vcc_hi, m0
13 ; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], vcc_hi, 0
13 ; TOVGPR-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
14 ; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 0
1415
15 ; TOVMEM: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], m0
16 ; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
17 ; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]]
1618 ; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill
1719 ; TOVMEM: s_waitcnt vmcnt(0)
20
21 ; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
22 ; TOSMEM: s_mov_b32 m0, s3{{$}}
23 ; TOSMEM-NOT: [[M0_COPY]]
24 ; TOSMEM: s_buffer_store_dword [[M0_COPY]], s[84:87], m0 ; 4-byte Folded Spill
25 ; TOSMEM: s_waitcnt lgkmcnt(0)
26
1827 ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
1928
2029 ; GCN: [[ENDIF]]:
21 ; TOVGPR: v_readlane_b32 vcc_hi, [[SPILL_VREG]], 0
22 ; TOVGPR: s_mov_b32 m0, vcc_hi
30 ; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 0
31 ; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]]
2332
2433 ; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Reload
2534 ; TOVMEM: s_waitcnt vmcnt(0)
26 ; TOVMEM: v_readfirstlane_b32 vcc_hi, [[RELOAD_VREG]]
27 ; TOVMEM: s_mov_b32 m0, vcc_hi
35 ; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]]
36 ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]]
2837
29 ; GCN: s_add_i32 m0, m0, 1
38 ; TOSMEM: s_mov_b32 m0, s3{{$}}
39 ; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s[84:87], m0 ; 4-byte Folded Reload
40 ; TOSMEM-NOT: [[M0_RESTORE]]
41 ; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]]
42
43 ; GCN: s_add_i32 s{{[0-9]+}}, m0, 1
3044 define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 {
3145 entry:
3246 %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
4660 @lds = internal addrspace(3) global [64 x float] undef
4761
4862 ; GCN-LABEL: {{^}}spill_m0_lds:
63 ; GCN: s_mov_b32 m0, s6
64 ; GCN: v_interp_mov_f32
65
66 ; TOSMEM: s_mov_b32 vcc_hi, m0
67 ; TOSMEM: s_mov_b32 m0, s7
68 ; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
69 ; TOSMEM: s_mov_b32 m0, vcc_hi
70
71 ; TOSMEM: s_mov_b32 vcc_hi, m0
72 ; TOSMEM: s_add_u32 m0, s7, 0x100
73 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
74 ; TOSMEM: s_add_u32 m0, s7, 0x200
75 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
76 ; TOSMEM: s_mov_b32 m0, vcc_hi
77
78 ; TOSMEM: s_mov_b64 exec,
79 ; TOSMEM: s_cbranch_execz
80 ; TOSMEM: s_branch
81
82 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
83 ; TOSMEM-NEXT: s_add_u32 m0, s7, 0x100
84 ; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload
85
86
4987 ; GCN-NOT: v_readlane_b32 m0
88 ; GCN-NOT: s_buffer_store_dword m0
89 ; GCN-NOT: s_buffer_load_dword m0
5090 define amdgpu_ps void @spill_m0_lds(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) #0 {
5191 main_body:
5292 %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
70110 ret void
71111 }
72112
113 ; GCN-LABEL: {{^}}restore_m0_lds:
114 ; TOSMEM: s_cmp_eq_u32
115 ; TOSMEM: s_mov_b32 vcc_hi, m0
116 ; TOSMEM: s_mov_b32 m0, s3
117 ; TOSMEM: s_buffer_store_dword s4, s[84:87], m0 ; 4-byte Folded Spill
118 ; TOSMEM: s_mov_b32 m0, vcc_hi
119 ; TOSMEM: s_cbranch_scc1
120
121 ; TOSMEM: s_mov_b32 m0, -1
122
123 ; TOSMEM: s_mov_b32 vcc_hi, m0
124 ; TOSMEM: s_mov_b32 m0, s3
125 ; TOSMEM: s_buffer_load_dword s4, s[84:87], m0 ; 4-byte Folded Reload
126 ; TOSMEM: s_add_u32 m0, s3, 0x100
127 ; TOSMEM: s_waitcnt lgkmcnt(0)
128 ; TOSMEM: s_buffer_load_dword s5, s[84:87], m0 ; 4-byte Folded Reload
129 ; TOSMEM: s_mov_b32 m0, vcc_hi
130 ; TOSMEM: s_waitcnt lgkmcnt(0)
131
132 ; TOSMEM: ds_write_b64
133
134 ; TOSMEM: s_mov_b32 vcc_hi, m0
135 ; TOSMEM: s_add_u32 m0, s3, 0x200
136 ; TOSMEM: s_buffer_load_dword s0, s[84:87], m0 ; 4-byte Folded Reload
137 ; TOSMEM: s_mov_b32 m0, vcc_hi
138 ; TOSMEM: s_waitcnt lgkmcnt(0)
139 ; TOSMEM: s_mov_b32 m0, s0
140 ; TOSMEM: ; use m0
141
142 ; TOSMEM: s_dcache_wb
143 ; TOSMEM: s_endpgm
144 define void @restore_m0_lds(i32 %arg) {
145 %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
146 %sval = load volatile i64, i64 addrspace(2)* undef
147 %cmp = icmp eq i32 %arg, 0
148 br i1 %cmp, label %ret, label %bb
149
150 bb:
151 store volatile i64 %sval, i64 addrspace(3)* undef
152 call void asm sideeffect "; use $0", "{M0}"(i32 %m0) #0
153 br label %ret
154
155 ret:
156 ret void
157 }
158
73159 declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
74160
75161 declare i32 @llvm.SI.packf16(float, float) readnone
1919 ; VI-DAG: s_mov_b32 s15, 0xe80000
2020
2121 ; s11 is offset system SGPR
22 ; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill
23 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload
22 ; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
23 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Reload
2424
2525 ; GCN: NumVgprs: 256
2626 ; GCN: ScratchSize: 1024
0 # RUN: llc -march=amdgcn -run-pass si-insert-waits %s -o - | FileCheck %s
1
2 --- |
3 define void @basic_insert_dcache_wb() {
4 ret void
5 }
6
7 define void @explicit_flush_after() {
8 ret void
9 }
10
11 define void @explicit_flush_before() {
12 ret void
13 }
14
15 define void @no_scalar_store() {
16 ret void
17 }
18
19 define void @multi_block_store() {
20 bb0:
21 br i1 undef, label %bb1, label %bb2
22
23 bb1:
24 ret void
25
26 bb2:
27 ret void
28 }
29
30 define void @one_block_store() {
31 bb0:
32 br i1 undef, label %bb1, label %bb2
33
34 bb1:
35 ret void
36
37 bb2:
38 ret void
39 }
40
41 define amdgpu_ps float @si_return() {
42 ret float undef
43 }
44
45 ...
46 ---
47 # CHECK-LABEL: name: basic_insert_dcache_wb
48 # CHECK: bb.0:
49 # CHECK-NEXT: S_STORE_DWORD
50 # CHECK-NEXT: S_DCACHE_WB
51 # CHECK-NEXT: S_ENDPGM
52
53 name: basic_insert_dcache_wb
54 tracksRegLiveness: false
55
56 body: |
57 bb.0:
58 S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
59 S_ENDPGM
60 ...
61 ---
62 # Already has an explicitly requested flush after the last store.
63 # CHECK-LABEL: name: explicit_flush_after
64 # CHECK: bb.0:
65 # CHECK-NEXT: S_STORE_DWORD
66 # CHECK-NEXT: S_DCACHE_WB
67 # CHECK-NEXT: S_ENDPGM
68
69 name: explicit_flush_after
70 tracksRegLiveness: false
71
72 body: |
73 bb.0:
74 S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
75 S_DCACHE_WB
76 S_ENDPGM
77 ...
78 ---
79 # Already has an explicitly requested flush before the last store.
80 # CHECK-LABEL: name: explicit_flush_before
81 # CHECK: bb.0:
82 # CHECK-NEXT: S_DCACHE_WB
83 # CHECK-NEXT: S_STORE_DWORD
84 # CHECK-NEXT: S_DCACHE_WB
85 # CHECK-NEXT: S_ENDPGM
86
87 name: explicit_flush_before
88 tracksRegLiveness: false
89
90 body: |
91 bb.0:
92 S_DCACHE_WB
93 S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
94 S_ENDPGM
95 ...
96 ---
97 # CHECK-LABEL: no_scalar_store
98 # CHECK: bb.0
99 # CHECK-NEXT: S_ENDPGM
100 name: no_scalar_store
101 tracksRegLiveness: false
102
103 body: |
104 bb.0:
105 S_ENDPGM
106 ...
107
108 # CHECK-LABEL: name: multi_block_store
109 # CHECK: bb.0:
110 # CHECK-NEXT: S_STORE_DWORD
111 # CHECK-NEXT: S_DCACHE_WB
112 # CHECK-NEXT: S_ENDPGM
113
114 # CHECK: bb.1:
115 # CHECK-NEXT: S_STORE_DWORD
116 # CHECK-NEXT: S_DCACHE_WB
117 # CHECK-NEXT: S_ENDPGM
118
119 name: multi_block_store
120 tracksRegLiveness: false
121
122 body: |
123 bb.0:
124 S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
125 S_ENDPGM
126
127 bb.1:
128 S_STORE_DWORD_SGPR undef %sgpr4, undef %sgpr6_sgpr7, undef %m0, 0
129 S_ENDPGM
130 ...
131 ...
132
133 # This one should be able to omit the flush in the storeless block but
134 # this isn't handled now.
135
136 # CHECK-LABEL: name: one_block_store
137 # CHECK: bb.0:
138 # CHECK-NEXT: S_DCACHE_WB
139 # CHECK-NEXT: S_ENDPGM
140
141 # CHECK: bb.1:
142 # CHECK-NEXT: S_STORE_DWORD
143 # CHECK-NEXT: S_DCACHE_WB
144 # CHECK-NEXT: S_ENDPGM
145
146 name: one_block_store
147 tracksRegLiveness: false
148
149 body: |
150 bb.0:
151 S_ENDPGM
152
153 bb.1:
154 S_STORE_DWORD_SGPR undef %sgpr4, undef %sgpr6_sgpr7, undef %m0, 0
155 S_ENDPGM
156 ...
157 ---
158 # CHECK-LABEL: name: si_return
159 # CHECK: bb.0:
160 # CHECK-NEXT: S_STORE_DWORD
161 # CHECK-NEXT: S_WAITCNT
162 # CHECK-NEXT: S_DCACHE_WB
163 # CHECK-NEXT: SI_RETURN
164
165 name: si_return
166 tracksRegLiveness: false
167
168 body: |
169 bb.0:
170 S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
171 SI_RETURN undef %vgpr0
172 ...
55 name: phi_visit_order
66 tracksRegLiveness: true
77 registers:
8 - { id: 0, class: sreg_32 }
8 - { id: 0, class: sreg_32_xm0 }
99 - { id: 1, class: sreg_64 }
10 - { id: 2, class: sreg_32 }
10 - { id: 2, class: sreg_32_xm0 }
1111 - { id: 7, class: vgpr_32 }
12 - { id: 8, class: sreg_32 }
12 - { id: 8, class: sreg_32_xm0 }
1313 - { id: 9, class: vgpr_32 }
1414 - { id: 10, class: sreg_64 }
15 - { id: 11, class: sreg_32 }
15 - { id: 11, class: sreg_32_xm0 }
1616
1717 body: |
1818 ; GCN-LABEL: name: phi_visit_order