llvm / 6b1290a

[AMDGPU] gfx1010 loop alignment

Differential Revision: https://reviews.llvm.org/D61529

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359935 91177308-0d34-0410-b5e6-96231b3b80d8

Stanislav Mekhanoshin, 1 year, 5 months ago
2 changed file(s) with 78 addition(s) and 0 deletion(s).
lib/Target/AMDGPU/SIISelLowering.cpp

  cl::desc("High bits of frame index assumed to be zero"),
  cl::init(5),
  cl::ReallyHidden);

static cl::opt<bool> DisableLoopAlignment(
  "amdgpu-disable-loop-alignment",
  cl::desc("Do not align and prefetch loops"),
  cl::init(false));

static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
...

  Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
}

unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const unsigned CacheLineAlign = 6; // log2(64)

  // Pre-GFX10 targets do not benefit from loop alignment.
  if (!ML || DisableLoopAlignment ||
      (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 the I$ consists of 4 cache lines of 64 bytes each.
  // By default the prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH so that larger loops have two lines
  // behind and one ahead.
  // Therefore we can benefit from aligning loop headers if the loop fits into
  // 192 bytes:
  // If the loop fits into 64 bytes it always spans no more than two cache
  // lines and does not need alignment.
  // Otherwise, if the loop is at most 128 bytes we do not need to modify the
  // prefetch settings; if it is at most 192 bytes we need two lines behind.

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If an inner loop block is aligned, assume on average half of the
    // alignment size is added as nops.
    if (MBB != Header)
      LoopSize += (1 << MBB->getAlignment()) / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of the parent loops is already surrounded by prefetch instructions,
  // do not insert new ones for the inner loop; that would reset the parent's
  // settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
            TII->get(AMDGPU::S_INST_PREFETCH))
      .addImm(1); // prefetch 2 lines behind PC

    BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
            TII->get(AMDGPU::S_INST_PREFETCH))
      .addImm(2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}

LLVM_ATTRIBUTE_UNUSED
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
  assert(N->getOpcode() == ISD::CopyFromReg);
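
The override above only computes a preferred alignment (and, as a side effect, builds S_INST_PREFETCH around loops of 129-192 bytes); applying the returned value to the loop header is done by the generic block-placement code. The sketch below is a minimal, illustrative consumer of the hook, not the in-tree pass: the helper name alignLoopHeaders and its parameter list are assumptions made for the example, and alignments are log2 values as in this revision's API.

// A minimal sketch, assuming a machine-IR context where MachineLoopInfo and
// the target lowering object are already available. Illustrative only.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/TargetLowering.h"

using namespace llvm;

static void alignLoopHeaders(MachineFunction &MF, MachineLoopInfo &MLI,
                             const TargetLowering &TLI) {
  for (MachineBasicBlock &MBB : MF) {
    MachineLoop *L = MLI.getLoopFor(&MBB);
    if (!L || L->getHeader() != &MBB)
      continue; // Only loop headers are candidates for extra alignment.

    // On GFX10 the SITargetLowering override above may also insert
    // S_INST_PREFETCH in the preheader and exit block while answering
    // this query.
    unsigned Log2Align = TLI.getPrefLoopAlignment(L);
    if (Log2Align)
      MBB.setAlignment(Log2Align); // log2 alignment in this revision's API
  }
}

The new cl::opt added earlier in this diff provides an escape hatch: passing -amdgpu-disable-loop-alignment (for example to llc) makes the override fall back to the default preferred alignment, as does running on a subtarget where hasInstFwdPrefetchBug() is set.
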
lib/Target/AMDGPU/SIISelLowering.h

                                    bool SNaN = false,
                                    unsigned Depth = 0) const override;
  AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;

  unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
};

} // End namespace llvm