llvm.org GIT mirror llvm / 4855d2d
Reland rL312315: [MergeICmps] MergeICmps is a new optimization pass that turns chains of integer Add missing header. This reverts commit 86dd6335cf7607af22f383a9a8e072ba929848cf. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@312322 91177308-0d34-0410-b5e6-96231b3b80d8 Clement Courbet 2 years ago
10 changed file(s) with 853 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
255255 void initializeMemorySSAWrapperPassPass(PassRegistry&);
256256 void initializeMemorySanitizerPass(PassRegistry&);
257257 void initializeMergeFunctionsPass(PassRegistry&);
258 void initializeMergeICmpsPass(PassRegistry&);
258259 void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&);
259260 void initializeMetaRenamerPass(PassRegistry&);
260261 void initializeModuleDebugInfoPrinterPass(PassRegistry&);
178178 (void) llvm::createPostOrderFunctionAttrsLegacyPass();
179179 (void) llvm::createReversePostOrderFunctionAttrsPass();
180180 (void) llvm::createMergeFunctionsPass();
181 (void) llvm::createMergeICmpsPass();
181182 std::string buf;
182183 llvm::raw_string_ostream os(buf);
183184 (void) llvm::createPrintModulePass(os);
421421
422422 //===----------------------------------------------------------------------===//
423423 //
424 // MergeICmps - Merge integer comparison chains
425 //
426 Pass *createMergeICmpsPass();
427
428 //===----------------------------------------------------------------------===//
429 //
424430 // ValuePropagation - Propagate CFG-derived value information
425431 //
426432 Pass *createCorrelatedValuePropagationPass();
9393 "enable-implicit-null-checks",
9494 cl::desc("Fold null checks into faulting memory operations"),
9595 cl::init(false));
96 static cl::opt EnableMergeICmps(
97 "enable-mergeicmps",
98 cl::desc("Merge ICmp chains into a single memcmp"),
99 cl::init(false));
96100 static cl::opt PrintLSR("print-lsr-output", cl::Hidden,
97101 cl::desc("Print LLVM IR produced by the loop-reduce pass"));
98102 static cl::opt PrintISelInput("print-isel-input", cl::Hidden,
588592 addPass(createLoopStrengthReducePass());
589593 if (PrintLSR)
590594 addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
595 }
596
597 if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) {
598 addPass(createMergeICmpsPass());
591599 }
592600
593601 // Run GC lowering passes for builtin collectors
4141 LowerExpectIntrinsic.cpp
4242 LowerGuardIntrinsic.cpp
4343 MemCpyOptimizer.cpp
44 MergeICmps.cpp
4445 MergedLoadStoreMotion.cpp
4546 NaryReassociate.cpp
4647 NewGVN.cpp
0 //===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass turns chains of integer comparisons into memcmp (the memcmp is
10 // later typically inlined as a chain of efficient hardware comparisons). This
11 // typically benefits c++ member or nonmember operator==().
12 //
13 // The basic idea is to replace a larger chain of integer comparisons loaded
14 // from contiguous memory locations into a smaller chain of such integer
15 // comparisons. Benefits are double:
16 // - There are fewer jumps, and therefore fewer opportunities for mispredictions
17 // and I-cache misses.
18 // - Code size is smaller, both because jumps are removed and because the
19 // encoding of a 2*n byte compare is smaller than that of two n-byte
20 // compares.
21
22 //===----------------------------------------------------------------------===//
23
24 #include "llvm/ADT/APSInt.h"
25 #include "llvm/Analysis/Loads.h"
26 #include "llvm/IR/Function.h"
27 #include "llvm/IR/IRBuilder.h"
28 #include "llvm/IR/IntrinsicInst.h"
29 #include "llvm/Pass.h"
30 #include "llvm/Transforms/Scalar.h"
31 #include "llvm/Transforms/Utils/BuildLibCalls.h"
32 #include <algorithm>
33 #include <numeric>
34 #include <utility>
35 #include <vector>
36
37 using namespace llvm;
38
39 namespace {
40
41 #define DEBUG_TYPE "mergeicmps"
42
43 #define MERGEICMPS_DOT_ON
44
45 // A BCE atom.
46 struct BCEAtom {
47 const Value *Base() const { return GEP ? GEP->getPointerOperand() : nullptr; }
48
49 bool operator<(const BCEAtom &O) const {
50 return Base() == O.Base() ? Offset.slt(O.Offset) : Base() < O.Base();
51 }
52
53 GetElementPtrInst *GEP = nullptr;
54 LoadInst *LoadI = nullptr;
55 APInt Offset;
56 };
57
58 // If this value is a load from a constant offset w.r.t. a base address, and
59 // there are no othe rusers of the load or address, returns the base address and
60 // the offset.
61 BCEAtom visitICmpLoadOperand(Value *const Val) {
62 BCEAtom Result;
63 if (auto *const LoadI = dyn_cast(Val)) {
64 DEBUG(dbgs() << "load\n");
65 if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
66 DEBUG(dbgs() << "used outside of block\n");
67 return {};
68 }
69 if (LoadI->isVolatile()) {
70 DEBUG(dbgs() << "volatile\n");
71 return {};
72 }
73 Value *const Addr = LoadI->getOperand(0);
74 if (auto *const GEP = dyn_cast(Addr)) {
75 DEBUG(dbgs() << "GEP\n");
76 if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
77 DEBUG(dbgs() << "used outside of block\n");
78 return {};
79 }
80 const auto &DL = GEP->getModule()->getDataLayout();
81 if (!isDereferenceablePointer(GEP, DL)) {
82 DEBUG(dbgs() << "not dereferenceable\n");
83 // We need to make sure that we can do comparison in any order, so we
84 // require memory to be unconditionnally dereferencable.
85 return {};
86 }
87 Result.Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
88 if (GEP->accumulateConstantOffset(DL, Result.Offset)) {
89 Result.GEP = GEP;
90 Result.LoadI = LoadI;
91 }
92 }
93 }
94 return Result;
95 }
96
97 // A basic block with a comparison between two BCE atoms.
98 // Note: the terminology is misleading: the comparison is symmetric, so there
99 // is no real {l/r}hs. To break the symmetry, we use the smallest atom as Lhs.
100 class BCECmpBlock {
101 public:
102 BCECmpBlock() {}
103
104 BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
105 : Lhs_(L), Rhs_(R), SizeBits_(SizeBits) {
106 if (Rhs_ < Lhs_)
107 std::swap(Rhs_, Lhs_);
108 }
109
110 bool IsValid() const {
111 return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr;
112 }
113
114 // Assert the the block is consistent: If valid, it should also have
115 // non-null members besides Lhs_ and Rhs_.
116 void AssertConsistent() const {
117 if (IsValid()) {
118 assert(BB);
119 assert(CmpI);
120 assert(BranchI);
121 }
122 }
123
124 const BCEAtom &Lhs() const { return Lhs_; }
125 const BCEAtom &Rhs() const { return Rhs_; }
126 int SizeBits() const { return SizeBits_; }
127
128 // Returns true if the block does other works besides comparison.
129 bool doesOtherWork() const;
130
131 // The basic block where this comparison happens.
132 BasicBlock *BB = nullptr;
133 // The ICMP for this comparison.
134 ICmpInst *CmpI = nullptr;
135 // The terminating branch.
136 BranchInst *BranchI = nullptr;
137
138 private:
139 BCEAtom Lhs_;
140 BCEAtom Rhs_;
141 int SizeBits_ = 0;
142 };
143
144 bool BCECmpBlock::doesOtherWork() const {
145 AssertConsistent();
146 // TODO(courbet): Can we allow some other things ? This is very conservative.
147 // We might be able to get away with anything does does not have any side
148 // effects outside of the basic block.
149 // Note: The GEPs and/or loads are not necessarily in the same block.
150 for (const Instruction &Inst : *BB) {
151 if (const auto *const GEP = dyn_cast(&Inst)) {
152 if (!(Lhs_.GEP == GEP || Rhs_.GEP == GEP))
153 return true;
154 } else if (const auto *const L = dyn_cast(&Inst)) {
155 if (!(Lhs_.LoadI == L || Rhs_.LoadI == L))
156 return true;
157 } else if (const auto *const C = dyn_cast(&Inst)) {
158 if (C != CmpI)
159 return true;
160 } else if (const auto *const Br = dyn_cast(&Inst)) {
161 if (Br != BranchI)
162 return true;
163 } else {
164 return true;
165 }
166 }
167 return false;
168 }
169
170 // Visit the given comparison. If this is a comparison between two valid
171 // BCE atoms, returns the comparison.
172 BCECmpBlock visitICmp(const ICmpInst *const CmpI,
173 const ICmpInst::Predicate ExpectedPredicate) {
174 if (CmpI->getPredicate() == ExpectedPredicate) {
175 DEBUG(dbgs() << "cmp "
176 << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
177 << "\n");
178 auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0));
179 if (!Lhs.Base())
180 return {};
181 auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
182 if (!Rhs.Base())
183 return {};
184 return BCECmpBlock(std::move(Lhs), std::move(Rhs),
185 CmpI->getOperand(0)->getType()->getScalarSizeInBits());
186 }
187 return {};
188 }
189
190 // Visit the given comparison block. If this is a comparison between two valid
191 // BCE atoms, returns the comparison.
192 BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
193 const BasicBlock *const PhiBlock) {
194 if (Block->empty())
195 return {};
196 auto *const BranchI = dyn_cast(Block->getTerminator());
197 if (!BranchI)
198 return {};
199 DEBUG(dbgs() << "branch\n");
200 if (BranchI->isUnconditional()) {
201 // In this case, we expect an incoming value which is the result of the
202 // comparison. This is the last link in the chain of comparisons (note
203 // that this does not mean that this is the last incoming value, blocks
204 // can be reordered).
205 auto *const CmpI = dyn_cast(Val);
206 if (!CmpI)
207 return {};
208 DEBUG(dbgs() << "icmp\n");
209 auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ);
210 Result.CmpI = CmpI;
211 Result.BranchI = BranchI;
212 return Result;
213 } else {
214 // In this case, we expect a constant incoming value (the comparison is
215 // chained).
216 const auto *const Const = dyn_cast(Val);
217 DEBUG(dbgs() << "const\n");
218 if (!Const->isZero())
219 return {};
220 DEBUG(dbgs() << "false\n");
221 auto *const CmpI = dyn_cast(BranchI->getCondition());
222 if (!CmpI)
223 return {};
224 DEBUG(dbgs() << "icmp\n");
225 assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
226 BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
227 auto Result = visitICmp(
228 CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE);
229 Result.CmpI = CmpI;
230 Result.BranchI = BranchI;
231 return Result;
232 }
233 return {};
234 }
235
236 // A chain of comparisons.
237 class BCECmpChain {
238 public:
239 BCECmpChain(const std::vector &Blocks, PHINode &Phi);
240
241 int size() const { return Comparisons_.size(); }
242
243 #ifdef MERGEICMPS_DOT_ON
244 void dump() const;
245 #endif // MERGEICMPS_DOT_ON
246
247 bool simplify(const TargetLibraryInfo *const TLI);
248
249 private:
250 static bool IsContiguous(const BCECmpBlock &First,
251 const BCECmpBlock &Second) {
252 return First.Lhs().Base() == Second.Lhs().Base() &&
253 First.Rhs().Base() == Second.Rhs().Base() &&
254 First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
255 First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
256 }
257
258 // Merges the given comparison blocks into one memcmp block and update
259 // branches. Comparisons are assumed to be continguous. If NextBBInChain is
260 // null, the merged block will link to the phi block.
261 static void mergeComparisons(ArrayRef Comparisons,
262 BasicBlock *const NextBBInChain, PHINode &Phi,
263 const TargetLibraryInfo *const TLI);
264
265 PHINode &Phi_;
266 std::vector Comparisons_;
267 // The original entry block (before sorting);
268 BasicBlock *EntryBlock_;
269 };
270
271 BCECmpChain::BCECmpChain(const std::vector &Blocks, PHINode &Phi)
272 : Phi_(Phi) {
273 // Now look inside blocks to check for BCE comparisons.
274 std::vector Comparisons;
275 for (BasicBlock *Block : Blocks) {
276 BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
277 Block, Phi.getParent());
278 Comparison.BB = Block;
279 if (!Comparison.IsValid()) {
280 DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n");
281 return;
282 }
283 if (Comparison.doesOtherWork()) {
284 DEBUG(dbgs() << "block does extra work besides compare\n");
285 if (Comparisons.empty()) { // First block.
286 // TODO(courbet): The first block can do other things, and we should
287 // split them apart in a separate block before the comparison chain.
288 // Right now we just discard it and make the chain shorter.
289 DEBUG(dbgs()
290 << "ignoring first block that does extra work besides compare\n");
291 continue;
292 }
293 // TODO(courbet): Right now we abort the whole chain. We could be
294 // merging only the blocks that don't do other work and resume the
295 // chain from there. For example:
296 // if (a[0] == b[0]) { // bb1
297 // if (a[1] == b[1]) { // bb2
298 // some_value = 3; //bb3
299 // if (a[2] == b[2]) { //bb3
300 // do a ton of stuff //bb4
301 // }
302 // }
303 // }
304 //
305 // This is:
306 //
307 // bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
308 // \ \ \ \
309 // ne ne ne \
310 // \ \ \ v
311 // +------------+-----------+----------> bb_phi
312 //
313 // We can only merge the first two comparisons, because bb3* does
314 // "other work" (setting some_value to 3).
315 // We could still merge bb1 and bb2 though.
316 return;
317 }
318 DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits()
319 << " bits between " << Comparison.Lhs().Base() << " + "
320 << Comparison.Lhs().Offset << " and "
321 << Comparison.Rhs().Base() << " + " << Comparison.Rhs().Offset
322 << "\n");
323 DEBUG(dbgs() << "\n");
324 Comparisons.push_back(Comparison);
325 }
326 EntryBlock_ = Comparisons[0].BB;
327 Comparisons_ = std::move(Comparisons);
328 #ifdef MERGEICMPS_DOT_ON
329 errs() << "BEFORE REORDERING:\n\n";
330 dump();
331 #endif // MERGEICMPS_DOT_ON
332 // Reorder blocks by LHS. We can do that without changing the
333 // semantics because we are only accessing dereferencable memory.
334 std::sort(Comparisons_.begin(), Comparisons_.end(),
335 [](const BCECmpBlock &a, const BCECmpBlock &b) {
336 return a.Lhs() < b.Lhs();
337 });
338 #ifdef MERGEICMPS_DOT_ON
339 errs() << "AFTER REORDERING:\n\n";
340 dump();
341 #endif // MERGEICMPS_DOT_ON
342 }
343
#ifdef MERGEICMPS_DOT_ON
// Writes the chain to errs() as a graphviz digraph: one node per comparison
// (in the current order of Comparisons_), an edge from each comparison to the
// next, and an edge from each comparison to the phi labelled with the value
// the phi receives from that block.
void BCECmpChain::dump() const {
  raw_ostream &OS = errs();
  OS << "digraph dag {\n";
  OS << " graph [bgcolor=transparent];\n";
  OS << " node [color=black,style=filled,fillcolor=lightyellow];\n";
  OS << " edge [color=black];\n";
  size_t Index = 0;
  for (const BCECmpBlock &Cmp : Comparisons_) {
    const BCEAtom &Left = Cmp.Lhs();
    const BCEAtom &Right = Cmp.Rhs();
    OS << " \"" << Index << "\" [label=\"%" << Left.Base()->getName() << " + "
       << Left.Offset << " == %" << Right.Base()->getName() << " + "
       << Right.Offset << " (" << (Cmp.SizeBits() / 8) << " bytes)\"];\n";
    const Value *const Incoming = Phi_.getIncomingValueForBlock(Cmp.BB);
    if (Index > 0)
      OS << " \"" << (Index - 1) << "\" -> \"" << Index << "\";\n";
    OS << " \"" << Index << "\" -> \"Phi\" [label=\"" << *Incoming << "\"];\n";
    ++Index;
  }
  OS << " \"Phi\" [label=\"Phi\"];\n";
  OS << "}\n\n";
}
#endif  // MERGEICMPS_DOT_ON
367
368 bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
369 // First pass to check if there is at least one merge. If not, we don't do
370 // anything and we keep analysis passes intact.
371 {
372 bool AtLeastOneMerged = false;
373 for (size_t I = 1; I < Comparisons_.size(); ++I) {
374 if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
375 AtLeastOneMerged = true;
376 break;
377 }
378 }
379 if (!AtLeastOneMerged)
380 return false;
381 }
382
383 // Remove phi references to comparison blocks, they will be rebuilt as we
384 // merge the blocks.
385 for (const auto &Comparison : Comparisons_) {
386 Phi_.removeIncomingValue(Comparison.BB, false);
387 }
388
389 // Point the predecessors of the chain to the first comparison block (which is
390 // the new entry point).
391 if (EntryBlock_ != Comparisons_[0].BB)
392 EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB);
393
394 // Effectively merge blocks.
395 int NumMerged = 1;
396 for (size_t I = 1; I < Comparisons_.size(); ++I) {
397 if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
398 ++NumMerged;
399 } else {
400 // Merge all previous comparisons and start a new merge block.
401 mergeComparisons(
402 makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged),
403 Comparisons_[I].BB, Phi_, TLI);
404 NumMerged = 1;
405 }
406 }
407 mergeComparisons(makeArrayRef(Comparisons_)
408 .slice(Comparisons_.size() - NumMerged, NumMerged),
409 nullptr, Phi_, TLI);
410
411 return true;
412 }
413
414 void BCECmpChain::mergeComparisons(ArrayRef Comparisons,
415 BasicBlock *const NextBBInChain,
416 PHINode &Phi,
417 const TargetLibraryInfo *const TLI) {
418 assert(!Comparisons.empty());
419 const auto &FirstComparison = *Comparisons.begin();
420 BasicBlock *const BB = FirstComparison.BB;
421 LLVMContext &Context = BB->getContext();
422
423 if (Comparisons.size() >= 2) {
424 DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
425 const auto TotalSize =
426 std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
427 [](int Size, const BCECmpBlock &C) {
428 return Size + C.SizeBits();
429 }) /
430 8;
431
432 // Incoming edges do not need to be updated, and both GEPs are already
433 // computing the right address, we just need to:
434 // - replace the two loads and the icmp with the memcmp
435 // - update the branch
436 // - update the incoming values in the phi.
437 FirstComparison.BranchI->eraseFromParent();
438 FirstComparison.CmpI->eraseFromParent();
439 FirstComparison.Lhs().LoadI->eraseFromParent();
440 FirstComparison.Rhs().LoadI->eraseFromParent();
441
442 IRBuilder<> Builder(BB);
443 const auto &DL = Phi.getModule()->getDataLayout();
444 Value *const MemCmpCall =
445 emitMemCmp(FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP,
446 ConstantInt::get(DL.getIntPtrType(Context), TotalSize),
447 Builder, DL, TLI);
448 Value *const MemCmpIsZero = Builder.CreateICmpEQ(
449 MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
450
451 // Add a branch to the next basic block in the chain.
452 if (NextBBInChain) {
453 Builder.CreateCondBr(MemCmpIsZero, NextBBInChain, Phi.getParent());
454 Phi.addIncoming(ConstantInt::getFalse(Context), BB);
455 } else {
456 Builder.CreateBr(Phi.getParent());
457 Phi.addIncoming(MemCmpIsZero, BB);
458 }
459
460 // Delete merged blocks.
461 for (size_t I = 1; I < Comparisons.size(); ++I) {
462 BasicBlock *CBB = Comparisons[I].BB;
463 CBB->replaceAllUsesWith(BB);
464 CBB->eraseFromParent();
465 }
466 } else {
467 assert(Comparisons.size() == 1);
468 // There are no blocks to merge, but we still need to update the branches.
469 DEBUG(dbgs() << "Only one comparison, updating branches\n");
470 if (NextBBInChain) {
471 if (FirstComparison.BranchI->isConditional()) {
472 DEBUG(dbgs() << "conditional -> conditional\n");
473 // Just update the "true" target, the "false" target should already be
474 // the phi block.
475 assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent());
476 FirstComparison.BranchI->setSuccessor(0, NextBBInChain);
477 Phi.addIncoming(ConstantInt::getFalse(Context), BB);
478 } else {
479 DEBUG(dbgs() << "unconditional -> conditional\n");
480 // Replace the unconditional branch by a conditional one.
481 FirstComparison.BranchI->eraseFromParent();
482 IRBuilder<> Builder(BB);
483 Builder.CreateCondBr(FirstComparison.CmpI, NextBBInChain,
484 Phi.getParent());
485 Phi.addIncoming(FirstComparison.CmpI, BB);
486 }
487 } else {
488 if (FirstComparison.BranchI->isConditional()) {
489 DEBUG(dbgs() << "conditional -> unconditional\n");
490 // Replace the conditional branch by an unconditional one.
491 FirstComparison.BranchI->eraseFromParent();
492 IRBuilder<> Builder(BB);
493 Builder.CreateBr(Phi.getParent());
494 Phi.addIncoming(FirstComparison.CmpI, BB);
495 } else {
496 DEBUG(dbgs() << "unconditional -> unconditional\n");
497 Phi.addIncoming(FirstComparison.CmpI, BB);
498 }
499 }
500 }
501 }
502
503 std::vector getOrderedBlocks(PHINode &Phi,
504 BasicBlock *const LastBlock,
505 int NumBlocks) {
506 // Walk up from the last block to find other blocks.
507 std::vector Blocks(NumBlocks);
508 BasicBlock *CurBlock = LastBlock;
509 for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
510 if (CurBlock->hasAddressTaken()) {
511 // Somebody is jumping to the block through an address, all bets are
512 // off.
513 DEBUG(dbgs() << "skip: block " << BlockIndex
514 << " has its address taken\n");
515 return {};
516 }
517 Blocks[BlockIndex] = CurBlock;
518 auto *SinglePredecessor = CurBlock->getSinglePredecessor();
519 if (!SinglePredecessor) {
520 // The block has two or more predecessors.
521 DEBUG(dbgs() << "skip: block " << BlockIndex
522 << " has two or more predecessors\n");
523 return {};
524 }
525 if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
526 // The block does not link back to the phi.
527 DEBUG(dbgs() << "skip: block " << BlockIndex
528 << " does not link back to the phi\n");
529 return {};
530 }
531 CurBlock = SinglePredecessor;
532 }
533 Blocks[0] = CurBlock;
534 return Blocks;
535 }
536
537 bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
538 DEBUG(dbgs() << "processPhi()\n");
539 if (Phi.getNumIncomingValues() <= 1) {
540 DEBUG(dbgs() << "skip: only one incoming value in phi\n");
541 return false;
542 }
543 // We are looking for something that has the following structure:
544 // bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
545 // \ \ \ \
546 // ne ne ne \
547 // \ \ \ v
548 // +------------+-----------+----------> bb_phi
549 //
550 // - The last basic block (bb4 here) must branch unconditionally to bb_phi.
551 // It's the only block that contributes a non-constant value to the Phi.
552 // - All other blocks (b1, b2, b3) must have exactly two successors, one of
553 // them being the the phi block.
554 // - All intermediate blocks (bb2, bb3) must have only one predecessor.
555 // - Blocks cannot do other work besides the comparison, see doesOtherWork()
556
557 // The blocks are not necessarily ordered in the phi, so we start from the
558 // last block and reconstruct the order.
559 BasicBlock *LastBlock = nullptr;
560 for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) {
561 if (isa(Phi.getIncomingValue(I)))
562 continue;
563 if (LastBlock) {
564 // There are several non-constant values.
565 DEBUG(dbgs() << "skip: several non-constant values\n");
566 return false;
567 }
568 LastBlock = Phi.getIncomingBlock(I);
569 }
570 if (!LastBlock) {
571 // There is no non-constant block.
572 DEBUG(dbgs() << "skip: no non-constant block\n");
573 return false;
574 }
575 if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
576 DEBUG(dbgs() << "skip: last block non-phi successor\n");
577 return false;
578 }
579
580 const auto Blocks =
581 getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
582 if (Blocks.empty())
583 return false;
584 BCECmpChain CmpChain(Blocks, Phi);
585
586 if (CmpChain.size() < 2) {
587 DEBUG(dbgs() << "skip: only one compare block\n");
588 return false;
589 }
590
591 return CmpChain.simplify(TLI);
592 }
593
594 class MergeICmps : public FunctionPass {
595 public:
596 static char ID;
597
598 MergeICmps() : FunctionPass(ID) {
599 initializeMergeICmpsPass(*PassRegistry::getPassRegistry());
600 }
601
602 bool runOnFunction(Function &F) override {
603 if (skipFunction(F)) return false;
604 const auto &TLI = getAnalysis().getTLI();
605 auto PA = runImpl(F, &TLI);
606 return !PA.areAllPreserved();
607 }
608
609 private:
610 void getAnalysisUsage(AnalysisUsage &AU) const override {
611 AU.addRequired();
612 }
613
614 PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI);
615 };
616
617 PreservedAnalyses MergeICmps::runImpl(Function &F,
618 const TargetLibraryInfo *TLI) {
619 DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
620
621 bool MadeChange = false;
622
623 for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
624 // A Phi operation is always first in a basic block.
625 if (auto *const Phi = dyn_cast(&*BBIt->begin()))
626 MadeChange |= processPhi(*Phi, TLI);
627 }
628
629 if (MadeChange)
630 return PreservedAnalyses::none();
631 return PreservedAnalyses::all();
632 }
633
634 } // namespace
635
636 char MergeICmps::ID = 0;
637 INITIALIZE_PASS_BEGIN(MergeICmps, "mergeicmps",
638 "Merge contiguous icmps into a memcmp", false, false)
639 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
640 INITIALIZE_PASS_END(MergeICmps, "mergeicmps",
641 "Merge contiguous icmps into a memcmp", false, false)
642
643 Pass *llvm::createMergeICmpsPass() { return new MergeICmps(); }
644
7272 initializeLowerExpectIntrinsicPass(Registry);
7373 initializeLowerGuardIntrinsicLegacyPassPass(Registry);
7474 initializeMemCpyOptLegacyPassPass(Registry);
75 initializeMergeICmpsPass(Registry);
7576 initializeMergedLoadStoreMotionLegacyPassPass(Registry);
7677 initializeNaryReassociateLegacyPassPass(Registry);
7778 initializePartiallyInlineLibCallsLegacyPassPass(Registry);
0 ; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
1
2 %"struct.std::pair" = type { i32, i32 }
3
4 define zeroext i1 @opeq1(
5 %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
6 %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
7 entry:
8 %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
9 %0 = load i32, i32* %first.i, align 4
10 %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
11 %1 = load i32, i32* %first1.i, align 4
12 %cmp.i = icmp eq i32 %0, %1
13 br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
14
15 land.rhs.i:
16 %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
17 %2 = load i32, i32* %second.i, align 4
18 %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
19 %3 = load i32, i32* %second2.i, align 4
20 %cmp3.i = icmp eq i32 %2, %3
21 br label %opeq1.exit
22
23 opeq1.exit:
24 %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
25 ret i1 %4
26 ; CHECK-LABEL: @opeq1(
27 ; The entry block with zero-offset GEPs is kept, loads are removed.
28 ; CHECK: entry
29 ; CHECK: getelementptr {{.*}} i32 0
30 ; CHECK-NOT: load
31 ; CHECK: getelementptr {{.*}} i32 0
32 ; CHECK-NOT: load
33 ; The two 4 byte loads and compares are replaced with a single 8-byte memcmp.
34 ; CHECK: @memcmp({{.*}}8)
35 ; CHECK: icmp eq {{.*}} 0
36 ; The branch is now a direct branch; the other block has been removed.
37 ; CHECK: br label %opeq1.exit
38 ; CHECK-NOT: br
39 ; The phi is updated.
40 ; CHECK: phi i1 [ %{{[^,]*}}, %entry ]
41 ; CHECK-NEXT: ret
42 }
43
44 ; Same as above, but the two blocks are in inverse order.
45 define zeroext i1 @opeq1_inverse(
46 %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
47 %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
48 entry:
49 %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
50 %0 = load i32, i32* %first.i, align 4
51 %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
52 %1 = load i32, i32* %first1.i, align 4
53 %cmp.i = icmp eq i32 %0, %1
54 br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
55
56 land.rhs.i:
57 %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
58 %2 = load i32, i32* %second.i, align 4
59 %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
60 %3 = load i32, i32* %second2.i, align 4
61 %cmp3.i = icmp eq i32 %2, %3
62 br label %opeq1.exit
63
64 opeq1.exit:
65 %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
66 ret i1 %4
67 ; CHECK-LABEL: @opeq1_inverse(
68 ; The second block with zero-offset GEPs is kept, loads are removed.
69 ; CHECK: land.rhs.i
70 ; CHECK: getelementptr {{.*}} i32 0
71 ; CHECK-NOT: load
72 ; CHECK: getelementptr {{.*}} i32 0
73 ; CHECK-NOT: load
74 ; The two 4 byte loads and compares are replaced with a single 8-byte memcmp.
75 ; CHECK: @memcmp({{.*}}8)
76 ; CHECK: icmp eq {{.*}} 0
77 ; The branch is now a direct branch; the other block has been removed.
78 ; CHECK: br label %opeq1.exit
79 ; CHECK-NOT: br
80 ; The phi is updated.
81 ; CHECK: phi i1 [ %{{[^,]*}}, %land.rhs.i ]
82 ; CHECK-NEXT: ret
83 }
84
85
86
0 ; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
1
2 ; This is a more involved test: clang generates this weird pattern for
3 ; tuple<uint8_t, uint8_t, uint8_t, uint8_t>. Right now we skip the entry block
4 ; (which defines the base pointer for other blocks) and the last one (which
5 ; does not have the expected structure). Only middle blocks (bytes [1,2]) are
6 ; merged.
7
8 %"class.std::tuple" = type { %"struct.std::_Tuple_impl" }
9 %"struct.std::_Tuple_impl" = type { %"struct.std::_Tuple_impl.0", %"struct.std::_Head_base.6" }
10 %"struct.std::_Tuple_impl.0" = type { %"struct.std::_Tuple_impl.1", %"struct.std::_Head_base.5" }
11 %"struct.std::_Tuple_impl.1" = type { %"struct.std::_Tuple_impl.2", %"struct.std::_Head_base.4" }
12 %"struct.std::_Tuple_impl.2" = type { %"struct.std::_Head_base" }
13 %"struct.std::_Head_base" = type { i8 }
14 %"struct.std::_Head_base.4" = type { i8 }
15 %"struct.std::_Head_base.5" = type { i8 }
16 %"struct.std::_Head_base.6" = type { i8 }
17
18 define zeroext i1 @opeq(
19 %"class.std::tuple"* nocapture readonly dereferenceable(4) %a,
20 %"class.std::tuple"* nocapture readonly dereferenceable(4) %b) local_unnamed_addr #1 {
21 entry:
22 %0 = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* %a, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
23 %add.ptr.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 3
24 %1 = load i8, i8* %add.ptr.i.i.i.i.i, align 1
25 %2 = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* %b, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
26 %add.ptr.i.i.i6.i.i = getelementptr inbounds i8, i8* %2, i64 3
27 %3 = load i8, i8* %add.ptr.i.i.i6.i.i, align 1
28 %cmp.i.i = icmp eq i8 %1, %3
29 br i1 %cmp.i.i, label %land.rhs.i.i, label %opeq.exit
30
31 land.rhs.i.i:
32 %add.ptr.i.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 2
33 %4 = load i8, i8* %add.ptr.i.i.i.i.i.i, align 1
34 %add.ptr.i.i.i6.i.i.i = getelementptr inbounds i8, i8* %2, i64 2
35 %5 = load i8, i8* %add.ptr.i.i.i6.i.i.i, align 1
36 %cmp.i.i.i = icmp eq i8 %4, %5
37 br i1 %cmp.i.i.i, label %land.rhs.i.i.i, label %opeq.exit
38
39 land.rhs.i.i.i:
40 %add.ptr.i.i.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 1
41 %6 = load i8, i8* %add.ptr.i.i.i.i.i.i.i, align 1
42 %add.ptr.i.i.i6.i.i.i.i = getelementptr inbounds i8, i8* %2, i64 1
43 %7 = load i8, i8* %add.ptr.i.i.i6.i.i.i.i, align 1
44 %cmp.i.i.i.i = icmp eq i8 %6, %7
45 br i1 %cmp.i.i.i.i, label %land.rhs.i.i.i.i, label %opeq.exit
46
47 land.rhs.i.i.i.i:
48 %8 = load i8, i8* %0, align 1
49 %9 = load i8, i8* %2, align 1
50 %cmp.i.i.i.i.i = icmp eq i8 %8, %9
51 br label %opeq.exit
52
53 opeq.exit:
54 %10 = phi i1 [ false, %entry ], [ false, %land.rhs.i.i ], [ false, %land.rhs.i.i.i ], [ %cmp.i.i.i.i.i, %land.rhs.i.i.i.i ]
55 ret i1 %10
56 ; CHECK-LABEL: @opeq(
57 ; The entry block is kept as is, but the next block is now the merged comparison
58 ; block for bytes [1,2] or the block for the head.
59 ; CHECK: entry
60 ; CHECK: br i1 %cmp.i.i, label %land.rhs.i.i.i{{(.i)?}}, label %opeq.exit
61 ; The two 1 byte loads and compares at offset 1 are replaced with a single
62 ; 2-byte memcmp.
63 ; CHECK: land.rhs.i.i.i
64 ; CHECK: @memcmp({{.*}}2)
65 ; CHECK: icmp eq {{.*}} 0
66 ; In the end we have three blocks.
67 ; CHECK: phi i1
68 ; CHECK-SAME %entry
69 ; CHECK-SAME %land.rhs.i.i.i.i
70 ; CHECK-SAME %land.rhs.i.i.i
71 }
72
0 ; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
1
2 %"struct.std::pair" = type { i32, i32 }
3
4 define zeroext i1 @opeq(
5 %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
6 %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
7 entry:
8 %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
9 %0 = load i32, i32* %first.i, align 4
10 %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
11 %1 = load i32, i32* %first1.i, align 4
12 %cmp.i = icmp eq i32 %0, %1
13 br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
14
15 land.rhs.i:
16 %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
17 %2 = load volatile i32, i32* %second.i, align 4
18 %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
19 %3 = load i32, i32* %second2.i, align 4
20 %cmp3.i = icmp eq i32 %2, %3
21 br label %opeq1.exit
22
23 opeq1.exit:
24 %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
25 ret i1 %4
26 ; CHECK-LABEL: @opeq(
27 ; CHECK-NOT: memcmp
28 }
29