llvm.org GIT mirror llvm / c313a17 lib / CodeGen / HardwareLoops.cpp
c313a17

Tree @c313a17 (Download .tar.gz)

HardwareLoops.cpp @c313a17raw · history · blame

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
//===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// Insert hardware loop intrinsics into loops which are deemed profitable by
/// the target, by querying TargetTransformInfo. A hardware loop comprises of
/// two intrinsics: one, outside the loop, to set the loop iteration count and
/// another, in the exit block, to decrement the counter. The decremented value
/// can either be carried through the loop via a phi or handled in some opaque
/// way by the target.
///
//===----------------------------------------------------------------------===//

#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

#define DEBUG_TYPE "hardware-loops"

#define HW_LOOPS_NAME "Hardware Loop Insertion"

using namespace llvm;

static cl::opt<bool>
ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false),
                   cl::desc("Force hardware loops intrinsics to be inserted"));

static cl::opt<bool>
ForceHardwareLoopPHI(
  "force-hardware-loop-phi", cl::Hidden, cl::init(false),
  cl::desc("Force hardware loop counter to be updated through a phi"));

static cl::opt<bool>
ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false),
                cl::desc("Force allowance of nested hardware loops"));

static cl::opt<unsigned>
LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1),
            cl::desc("Set the loop decrement value"));

static cl::opt<unsigned>
CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
                cl::desc("Set the loop counter bitwidth"));

STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");

namespace {

  using TTI = TargetTransformInfo;

  class HardwareLoops : public FunctionPass {
  public:
    static char ID;

    HardwareLoops() : FunctionPass(ID) {
      initializeHardwareLoopsPass(*PassRegistry::getPassRegistry());
    }

    bool runOnFunction(Function &F) override;

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<LoopInfoWrapperPass>();
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
      AU.addRequired<ScalarEvolutionWrapperPass>();
      AU.addRequired<AssumptionCacheTracker>();
      AU.addRequired<TargetTransformInfoWrapperPass>();
    }

    // Try to convert the given Loop into a hardware loop.
    bool TryConvertLoop(Loop *L);

    // Given that the target believes the loop to be profitable, try to
    // convert it.
    bool TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo);

  private:
    ScalarEvolution *SE = nullptr;
    LoopInfo *LI = nullptr;
    const DataLayout *DL = nullptr;
    const TargetTransformInfo *TTI = nullptr;
    DominatorTree *DT = nullptr;
    bool PreserveLCSSA = false;
    AssumptionCache *AC = nullptr;
    TargetLibraryInfo *LibInfo = nullptr;
    Module *M = nullptr;
    bool MadeChange = false;
  };

  class HardwareLoop {
    // Expand the trip count scev into a value that we can use.
    Value *InitLoopCount(BasicBlock *BB);

    // Insert the set_loop_iteration intrinsic.
    void InsertIterationSetup(Value *LoopCountInit, BasicBlock *BB);

    // Insert the loop_decrement intrinsic.
    void InsertLoopDec();

    // Insert the loop_decrement_reg intrinsic.
    Instruction *InsertLoopRegDec(Value *EltsRem);

    // If the target requires the counter value to be updated in the loop,
    // insert a phi to hold the value. The intended purpose is for use by
    // loop_decrement_reg.
    PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem);

    // Create a new cmp, that checks the returned value of loop_decrement*,
    // and update the exit branch to use it.
    void UpdateBranch(Value *EltsRem);

  public:
    HardwareLoop(TTI::HardwareLoopInfo &Info, ScalarEvolution &SE,
                 const DataLayout &DL) :
      SE(SE), DL(DL), L(Info.L), M(L->getHeader()->getModule()),
      ExitCount(Info.ExitCount),
      CountType(Info.CountType),
      ExitBranch(Info.ExitBranch),
      LoopDecrement(Info.LoopDecrement),
      UsePHICounter(Info.CounterInReg) { }

    void Create();

  private:
    ScalarEvolution &SE;
    const DataLayout &DL;
    Loop *L                 = nullptr;
    Module *M               = nullptr;
    const SCEV *ExitCount   = nullptr;
    Type *CountType         = nullptr;
    BranchInst *ExitBranch  = nullptr;
    Value *LoopDecrement      = nullptr;
    bool UsePHICounter      = false;
  };
}

char HardwareLoops::ID = 0;

bool HardwareLoops::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n");

  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  DL = &F.getParent()->getDataLayout();
  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
  LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  M = F.getParent();

  for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) {
    Loop *L = *I;
    if (!L->getParentLoop())
      TryConvertLoop(L);
  }

  return MadeChange;
}

// Return true if the search should stop, which will be when an inner loop is
// converted and the parent loop doesn't support containing a hardware loop.
bool HardwareLoops::TryConvertLoop(Loop *L) {
  // Process nested loops first.
  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
    if (TryConvertLoop(*I))
      return true; // Stop search.

  // Bail out if the loop has irreducible control flow.
  LoopBlocksRPO RPOT(L);
  RPOT.perform(LI);
  if (containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI))
    return false;

  TTI::HardwareLoopInfo HWLoopInfo(L);
  if (TTI->isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo) ||
      ForceHardwareLoops) {

    // Allow overriding of the counter width and loop decrement value.
    if (CounterBitWidth.getNumOccurrences())
      HWLoopInfo.CountType =
        IntegerType::get(M->getContext(), CounterBitWidth);

    if (LoopDecrement.getNumOccurrences())
      HWLoopInfo.LoopDecrement =
        ConstantInt::get(HWLoopInfo.CountType, LoopDecrement);

    MadeChange |= TryConvertLoop(HWLoopInfo);
    return MadeChange && (!HWLoopInfo.IsNestingLegal && !ForceNestedLoop);
  }

  return false;
}

bool HardwareLoops::TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo) {

  Loop *L = HWLoopInfo.L;
  LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);

  SmallVector<BasicBlock*, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
       IE = ExitingBlocks.end(); I != IE; ++I) {
    const SCEV *EC = SE->getExitCount(L, *I);
    if (isa<SCEVCouldNotCompute>(EC))
      continue;
    if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
      if (ConstEC->getValue()->isZero())
        continue;
    } else if (!SE->isLoopInvariant(EC, L))
      continue;

    if (SE->getTypeSizeInBits(EC->getType()) >
        HWLoopInfo.CountType->getBitWidth())
      continue;

    // If this exiting block is contained in a nested loop, it is not eligible
    // for insertion of the branch-and-decrement since the inner loop would
    // end up messing up the value in the CTR.
    if (!HWLoopInfo.IsNestingLegal && LI->getLoopFor(*I) != L &&
        !ForceNestedLoop)
      continue;

    // We now have a loop-invariant count of loop iterations (which is not the
    // constant zero) for which we know that this loop will not exit via this
    // existing block.

    // We need to make sure that this block will run on every loop iteration.
    // For this to be true, we must dominate all blocks with backedges. Such
    // blocks are in-loop predecessors to the header block.
    bool NotAlways = false;
    for (pred_iterator PI = pred_begin(L->getHeader()),
         PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
      if (!L->contains(*PI))
        continue;

      if (!DT->dominates(*I, *PI)) {
        NotAlways = true;
        break;
      }
    }

    if (NotAlways)
      continue;

    // Make sure this blocks ends with a conditional branch.
    Instruction *TI = (*I)->getTerminator();
    if (!TI)
      continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (!BI->isConditional())
        continue;

      HWLoopInfo.ExitBranch = BI;
    } else
      continue;

    // Note that this block may not be the loop latch block, even if the loop
    // has a latch block.
    HWLoopInfo.ExitBlock = *I;
    HWLoopInfo.ExitCount = EC;
    break;
  }

  if (!HWLoopInfo.ExitBlock)
    return false;

  BasicBlock *Preheader = L->getLoopPreheader();

  // If we don't have a preheader, then insert one.
  if (!Preheader)
    Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
  if (!Preheader)
    return false;

  HardwareLoop HWLoop(HWLoopInfo, *SE, *DL);
  HWLoop.Create();
  ++NumHWLoops;
  return true;
}

void HardwareLoop::Create() {
  LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
  BasicBlock *BeginBB = L->getLoopPreheader();
  Value *LoopCountInit = InitLoopCount(BeginBB);
  if (!LoopCountInit)
    return;

  InsertIterationSetup(LoopCountInit, BeginBB);

  if (UsePHICounter || ForceHardwareLoopPHI) {
    Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
    Value *EltsRem = InsertPHICounter(LoopCountInit, LoopDec);
    LoopDec->setOperand(0, EltsRem);
    UpdateBranch(LoopDec);
  } else
    InsertLoopDec();

  // Run through the basic blocks of the loop and see if any of them have dead
  // PHIs that can be removed.
  for (auto I : L->blocks())
    DeleteDeadPHIs(I);
}

Value *HardwareLoop::InitLoopCount(BasicBlock *BB) {
  SCEVExpander SCEVE(SE, DL, "loopcnt");
  if (!ExitCount->getType()->isPointerTy() &&
      ExitCount->getType() != CountType)
    ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);

  ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));

  if (!isSafeToExpandAt(ExitCount, BB->getTerminator(), SE)) {
    LLVM_DEBUG(dbgs() << "HWLoops: Bailing, unsafe to expand ExitCount "
               << *ExitCount << "\n");
    return nullptr;
  }

  Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
                                     BB->getTerminator());
  LLVM_DEBUG(dbgs() << "HWLoops: Loop Count: " << *Count << "\n");
  return Count;
}

void HardwareLoop::InsertIterationSetup(Value *LoopCountInit,
                                        BasicBlock *BB) {
  IRBuilder<> Builder(BB->getTerminator());
  Type *Ty = LoopCountInit->getType();
  Function *LoopIter =
    Intrinsic::getDeclaration(M, Intrinsic::set_loop_iterations, Ty);
  Value *Call = Builder.CreateCall(LoopIter, LoopCountInit);
  LLVM_DEBUG(dbgs() << "HWLoops: Iteration set: " << *Call << "\n");
}

void HardwareLoop::InsertLoopDec() {
  IRBuilder<> CondBuilder(ExitBranch);

  Function *DecFunc =
    Intrinsic::getDeclaration(M, Intrinsic::loop_decrement,
                              LoopDecrement->getType());
  Value *Ops[] = { LoopDecrement };
  Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
  Value *OldCond = ExitBranch->getCondition();
  ExitBranch->setCondition(NewCond);

  // The false branch must exit the loop.
  if (!L->contains(ExitBranch->getSuccessor(0)))
    ExitBranch->swapSuccessors();

  // The old condition may be dead now, and may have even created a dead PHI
  // (the original induction variable).
  RecursivelyDeleteTriviallyDeadInstructions(OldCond);

  LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n");
}

Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
  IRBuilder<> CondBuilder(ExitBranch);

  Function *DecFunc =
      Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg,
                                { EltsRem->getType(), EltsRem->getType(),
                                  LoopDecrement->getType()
                                });
  Value *Ops[] = { EltsRem, LoopDecrement };
  Value *Call = CondBuilder.CreateCall(DecFunc, Ops);

  LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n");
  return cast<Instruction>(Call);
}

PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = ExitBranch->getParent();
  IRBuilder<> Builder(Header->getFirstNonPHI());
  PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2);
  Index->addIncoming(NumElts, Preheader);
  Index->addIncoming(EltsRem, Latch);
  LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Index << "\n");
  return Index;
}

void HardwareLoop::UpdateBranch(Value *EltsRem) {
  IRBuilder<> CondBuilder(ExitBranch);
  Value *NewCond =
    CondBuilder.CreateICmpNE(EltsRem, ConstantInt::get(EltsRem->getType(), 0));
  Value *OldCond = ExitBranch->getCondition();
  ExitBranch->setCondition(NewCond);

  // The false branch must exit the loop.
  if (!L->contains(ExitBranch->getSuccessor(0)))
    ExitBranch->swapSuccessors();

  // The old condition may be dead now, and may have even created a dead PHI
  // (the original induction variable).
  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
}

INITIALIZE_PASS_BEGIN(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)

FunctionPass *llvm::createHardwareLoopsPass() { return new HardwareLoops(); }