[MergeICmps] MergeICmps is a new optimization pass that turns chains of integer comparisons into memcmp.

Thanks to recent improvements in the LLVM codegen, the memcmp is typically inlined as a chain of efficient hardware comparisons. This typically benefits C++ member or nonmember operator==().

For now this is disabled by default until:
 - https://bugs.llvm.org/show_bug.cgi?id=33329 is complete
 - benchmarks show that this is always useful

Differential Revision: https://reviews.llvm.org/D33987

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@312315 91177308-0d34-0410-b5e6-96231b3b80d8

Clement Courbet, 2 years ago
10 changed file(s) with 849 addition(s) and 0 deletion(s).
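To make the transformation concrete, here is a minimal C++ sketch (the Point type and function names are hypothetical, not taken from the commit): eqChain shows the kind of comparison chain the pass targets, and eqMerged shows what the merged result is morally equivalent to, assuming a padding-free layout.

#include <cstring>

// Hypothetical aggregate used only for illustration: two ints, no padding,
// laid out contiguously in memory.
struct Point {
  int x;
  int y;
};

// The kind of chain the pass targets: two loads, two integer compares, and an
// extra conditional branch between them.
bool eqChain(const Point &a, const Point &b) {
  return a.x == b.x && a.y == b.y;
}

// What the merged form is morally equivalent to: a single 8-byte comparison,
// which the backend can then expand into one wide hardware compare.
bool eqMerged(const Point &a, const Point &b) {
  return std::memcmp(&a, &b, sizeof(Point)) == 0;
}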
255255 void initializeMemorySSAWrapperPassPass(PassRegistry&);
256256 void initializeMemorySanitizerPass(PassRegistry&);
257257 void initializeMergeFunctionsPass(PassRegistry&);
258 void initializeMergeICmpsPass(PassRegistry&);
258259 void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&);
259260 void initializeMetaRenamerPass(PassRegistry&);
260261 void initializeModuleDebugInfoPrinterPass(PassRegistry&);
178178 (void) llvm::createPostOrderFunctionAttrsLegacyPass();
179179 (void) llvm::createReversePostOrderFunctionAttrsPass();
180180 (void) llvm::createMergeFunctionsPass();
181 (void) llvm::createMergeICmpsPass();
181182 std::string buf;
182183 llvm::raw_string_ostream os(buf);
183184 (void) llvm::createPrintModulePass(os);
421421
422422 //===----------------------------------------------------------------------===//
423423 //
424 // MergeICmps - Merge integer comparison chains
425 //
426 Pass *createMergeICmpsPass();
427
428 //===----------------------------------------------------------------------===//
429 //
424430 // ValuePropagation - Propagate CFG-derived value information
425431 //
426432 Pass *createCorrelatedValuePropagationPass();
9393 "enable-implicit-null-checks",
9494 cl::desc("Fold null checks into faulting memory operations"),
9595 cl::init(false));
96 static cl::opt<bool> EnableMergeICmps(
97 "enable-mergeicmps",
98 cl::desc("Merge ICmp chains into a single memcmp"),
99 cl::init(false));
96100 static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
97101 cl::desc("Print LLVM IR produced by the loop-reduce pass"));
98102 static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
588592 addPass(createLoopStrengthReducePass());
589593 if (PrintLSR)
590594 addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
595 }
596
597 if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) {
598 addPass(createMergeICmpsPass());
591599 }
592600
593601 // Run GC lowering passes for builtin collectors
4141 LowerExpectIntrinsic.cpp
4242 LowerGuardIntrinsic.cpp
4343 MemCpyOptimizer.cpp
44 MergeICmps.cpp
4445 MergedLoadStoreMotion.cpp
4546 NaryReassociate.cpp
4647 NewGVN.cpp
0 //===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass turns chains of integer comparisons into memcmp (the memcmp is
10 // later typically inlined as a chain of efficient hardware comparisons). This
11 // typically benefits C++ member or nonmember operator==().
12 //
13 // The basic idea is to replace a larger chain of integer comparisons loaded
14 // from contiguous memory locations with a smaller chain of such integer
15 // comparisons. The benefits are twofold:
16 //  - There are fewer jumps, and therefore fewer opportunities for
17 //    mispredictions and I-cache misses.
18 //  - Code size is smaller, both because jumps are removed and because the
19 //    encoding of a single 2*n-byte compare is smaller than that of two
20 //    n-byte compares.
21
22 //===----------------------------------------------------------------------===//
23
24 #include "llvm/ADT/APSInt.h"
25 #include "llvm/Analysis/Loads.h"
26 #include "llvm/IR/Function.h"
27 #include "llvm/IR/IRBuilder.h"
28 #include "llvm/IR/IntrinsicInst.h"
29 #include "llvm/Pass.h"
30 #include "llvm/Transforms/Scalar.h"
31 #include "llvm/Transforms/Utils/BuildLibCalls.h"
32
33 using namespace llvm;
34
35 namespace {
36
37 #define DEBUG_TYPE "mergeicmps"
38
39 #define MERGEICMPS_DOT_ON
40
41 // A BCE atom.
42 struct BCEAtom {
43 const Value *Base() const { return GEP ? GEP->getPointerOperand() : nullptr; }
44
45 bool operator<(const BCEAtom &O) const {
46 return Base() == O.Base() ? Offset.slt(O.Offset) : Base() < O.Base();
47 }
48
49 GetElementPtrInst *GEP = nullptr;
50 LoadInst *LoadI = nullptr;
51 APInt Offset;
52 };
53
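As a standalone illustration of the ordering this struct defines (SimpleAtom is a hypothetical simplified type, not the BCEAtom above): atoms compare by base pointer first and by offset second, so after sorting, loads of adjacent offsets of the same object end up next to each other, which is what later makes contiguous comparisons easy to detect.

#include <algorithm>
#include <cstdint>
#include <vector>

// Simplified stand-in for BCEAtom: just a base pointer and a constant offset.
struct SimpleAtom {
  const void *Base;  // stands in for GEP->getPointerOperand()
  int64_t Offset;    // stands in for the accumulated constant offset
  bool operator<(const SimpleAtom &O) const {
    // Same shape as BCEAtom::operator<: base first, then offset.
    return Base == O.Base ? Offset < O.Offset : Base < O.Base;
  }
};

// Sorting {B,4}, {A,0}, {B,0}, {A,4} yields {A,0}, {A,4}, {B,0}, {B,4}:
// adjacent offsets of the same base become neighbors.
inline void sortAtoms(std::vector<SimpleAtom> &Atoms) {
  std::sort(Atoms.begin(), Atoms.end());
}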
54 // If this value is a load from a constant offset w.r.t. a base address, and
55 // there are no other users of the load or address, returns the base address and
56 // the offset.
57 BCEAtom visitICmpLoadOperand(Value *const Val) {
58 BCEAtom Result;
59 if (auto *const LoadI = dyn_cast<LoadInst>(Val)) {
60 DEBUG(dbgs() << "load\n");
61 if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
62 DEBUG(dbgs() << "used outside of block\n");
63 return {};
64 }
65 if (LoadI->isVolatile()) {
66 DEBUG(dbgs() << "volatile\n");
67 return {};
68 }
69 Value *const Addr = LoadI->getOperand(0);
70 if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
71 DEBUG(dbgs() << "GEP\n");
72 if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
73 DEBUG(dbgs() << "used outside of block\n");
74 return {};
75 }
76 const auto &DL = GEP->getModule()->getDataLayout();
77 if (!isDereferenceablePointer(GEP, DL)) {
78 DEBUG(dbgs() << "not dereferenceable\n");
79 // We need to make sure that we can do the comparisons in any order, so
80 // we require the memory to be unconditionally dereferenceable.
81 return {};
82 }
83 Result.Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
84 if (GEP->accumulateConstantOffset(DL, Result.Offset)) {
85 Result.GEP = GEP;
86 Result.LoadI = LoadI;
87 }
88 }
89 }
90 return Result;
91 }
92
93 // A basic block with a comparison between two BCE atoms.
94 // Note: the terminology is misleading: the comparison is symmetric, so there
95 // is no real {l/r}hs. To break the symmetry, we use the smallest atom as Lhs.
96 class BCECmpBlock {
97 public:
98 BCECmpBlock() {}
99
100 BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
101 : Lhs_(L), Rhs_(R), SizeBits_(SizeBits) {
102 if (Rhs_ < Lhs_)
103 std::swap(Rhs_, Lhs_);
104 }
105
106 bool IsValid() const {
107 return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr;
108 }
109
110 // Assert that the block is consistent: if valid, it should also have
111 // non-null members besides Lhs_ and Rhs_.
112 void AssertConsistent() const {
113 if (IsValid()) {
114 assert(BB);
115 assert(CmpI);
116 assert(BranchI);
117 }
118 }
119
120 const BCEAtom &Lhs() const { return Lhs_; }
121 const BCEAtom &Rhs() const { return Rhs_; }
122 int SizeBits() const { return SizeBits_; }
123
124 // Returns true if the block does other work besides the comparison.
125 bool doesOtherWork() const;
126
127 // The basic block where this comparison happens.
128 BasicBlock *BB = nullptr;
129 // The ICMP for this comparison.
130 ICmpInst *CmpI = nullptr;
131 // The terminating branch.
132 BranchInst *BranchI = nullptr;
133
134 private:
135 BCEAtom Lhs_;
136 BCEAtom Rhs_;
137 int SizeBits_ = 0;
138 };
139
140 bool BCECmpBlock::doesOtherWork() const {
141 AssertConsistent();
142 // TODO(courbet): Can we allow some other things? This is very conservative.
143 // We might be able to get away with anything that does not have any side
144 // effects outside of the basic block.
145 // Note: The GEPs and/or loads are not necessarily in the same block.
146 for (const Instruction &Inst : *BB) {
147 if (const auto *const GEP = dyn_cast<GetElementPtrInst>(&Inst)) {
148 if (!(Lhs_.GEP == GEP || Rhs_.GEP == GEP))
149 return true;
150 } else if (const auto *const L = dyn_cast<LoadInst>(&Inst)) {
151 if (!(Lhs_.LoadI == L || Rhs_.LoadI == L))
152 return true;
153 } else if (const auto *const C = dyn_cast<ICmpInst>(&Inst)) {
154 if (C != CmpI)
155 return true;
156 } else if (const auto *const Br = dyn_cast<BranchInst>(&Inst)) {
157 if (Br != BranchI)
158 return true;
159 } else {
160 return true;
161 }
162 }
163 return false;
164 }
165
166 // Visit the given comparison. If this is a comparison between two valid
167 // BCE atoms, returns the comparison.
168 BCECmpBlock visitICmp(const ICmpInst *const CmpI,
169 const ICmpInst::Predicate ExpectedPredicate) {
170 if (CmpI->getPredicate() == ExpectedPredicate) {
171 DEBUG(dbgs() << "cmp "
172 << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
173 << "\n");
174 auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0));
175 if (!Lhs.Base())
176 return {};
177 auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
178 if (!Rhs.Base())
179 return {};
180 return BCECmpBlock(std::move(Lhs), std::move(Rhs),
181 CmpI->getOperand(0)->getType()->getScalarSizeInBits());
182 }
183 return {};
184 }
185
186 // Visit the given comparison block. If this is a comparison between two valid
187 // BCE atoms, returns the comparison.
188 BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
189 const BasicBlock *const PhiBlock) {
190 if (Block->empty())
191 return {};
192 auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
193 if (!BranchI)
194 return {};
195 DEBUG(dbgs() << "branch\n");
196 if (BranchI->isUnconditional()) {
197 // In this case, we expect an incoming value which is the result of the
198 // comparison. This is the last link in the chain of comparisons (note
199 // that this does not mean that this is the last incoming value, blocks
200 // can be reordered).
201 auto *const CmpI = dyn_cast<ICmpInst>(Val);
202 if (!CmpI)
203 return {};
204 DEBUG(dbgs() << "icmp\n");
205 auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ);
206 Result.CmpI = CmpI;
207 Result.BranchI = BranchI;
208 return Result;
209 } else {
210 // In this case, we expect a constant incoming value (the comparison is
211 // chained).
212 const auto *const Const = dyn_cast<ConstantInt>(Val);
213 DEBUG(dbgs() << "const\n");
214 if (!Const->isZero())
215 return {};
216 DEBUG(dbgs() << "false\n");
217 auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
218 if (!CmpI)
219 return {};
220 DEBUG(dbgs() << "icmp\n");
221 assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
222 BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
223 auto Result = visitICmp(
224 CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE);
225 Result.CmpI = CmpI;
226 Result.BranchI = BranchI;
227 return Result;
228 }
229 return {};
230 }
231
232 // A chain of comparisons.
233 class BCECmpChain {
234 public:
235 BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi);
236
237 int size() const { return Comparisons_.size(); }
238
239 #ifdef MERGEICMPS_DOT_ON
240 void dump() const;
241 #endif // MERGEICMPS_DOT_ON
242
243 bool simplify(const TargetLibraryInfo *const TLI);
244
245 private:
246 static bool IsContiguous(const BCECmpBlock &First,
247 const BCECmpBlock &Second) {
248 return First.Lhs().Base() == Second.Lhs().Base() &&
249 First.Rhs().Base() == Second.Rhs().Base() &&
250 First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
251 First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
252 }
253
254 // Merges the given comparison blocks into one memcmp block and updates the
255 // branches. Comparisons are assumed to be contiguous. If NextBBInChain is
256 // null, the merged block will link to the phi block.
257 static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
258 BasicBlock *const NextBBInChain, PHINode &Phi,
259 const TargetLibraryInfo *const TLI);
260
261 PHINode &Phi_;
262 std::vector<BCECmpBlock> Comparisons_;
263 // The original entry block (before sorting).
264 BasicBlock *EntryBlock_;
265 };
266
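To make the IsContiguous() check above concrete, here is a hedged standalone sketch (SimpleCmp is a simplified stand-in, not the BCECmpBlock class) of the same byte arithmetic, assuming both comparisons use the same base pointers on each side.

#include <cstdint>

// One comparison in the chain, reduced to its starting offsets and its width.
struct SimpleCmp {
  int64_t LhsOffset;
  int64_t RhsOffset;
  int SizeBits;
};

// Mirrors the arithmetic of BCECmpChain::IsContiguous(): the second comparison
// must start exactly where the first one ends, on both sides.
inline bool isContiguous(const SimpleCmp &First, const SimpleCmp &Second) {
  return First.LhsOffset + First.SizeBits / 8 == Second.LhsOffset &&
         First.RhsOffset + First.SizeBits / 8 == Second.RhsOffset;
}

// isContiguous({0, 0, 32}, {4, 4, 32})   -> true:  mergeable into 8 bytes.
// isContiguous({4, 4, 32}, {12, 12, 32}) -> false: starts a new merge group.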
267 BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
268 : Phi_(Phi) {
269 // Now look inside blocks to check for BCE comparisons.
270 std::vector<BCECmpBlock> Comparisons;
271 for (BasicBlock *Block : Blocks) {
272 BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
273 Block, Phi.getParent());
274 Comparison.BB = Block;
275 if (!Comparison.IsValid()) {
276 DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n");
277 return;
278 }
279 if (Comparison.doesOtherWork()) {
280 DEBUG(dbgs() << "block does extra work besides compare\n");
281 if (Comparisons.empty()) { // First block.
282 // TODO(courbet): The first block can do other things, and we should
283 // split them apart in a separate block before the comparison chain.
284 // Right now we just discard it and make the chain shorter.
285 DEBUG(dbgs()
286 << "ignoring first block that does extra work besides compare\n");
287 continue;
288 }
289 // TODO(courbet): Right now we abort the whole chain. We could be
290 // merging only the blocks that don't do other work and resume the
291 // chain from there. For example:
292 // if (a[0] == b[0]) { // bb1
293 // if (a[1] == b[1]) { // bb2
294 // some_value = 3; //bb3
295 // if (a[2] == b[2]) { //bb3
296 // do a ton of stuff //bb4
297 // }
298 // }
299 // }
300 //
301 // This is:
302 //
303 //  bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
304 //    \            \            \              \
305 //     ne           ne           ne             \
306 //      \            \            \              v
307 //       +------------+------------+----------> bb_phi
308 //
309 // We can only merge the first two comparisons, because bb3* does
310 // "other work" (setting some_value to 3).
311 // We could still merge bb1 and bb2 though.
312 return;
313 }
314 DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits()
315 << " bits between " << Comparison.Lhs().Base() << " + "
316 << Comparison.Lhs().Offset << " and "
317 << Comparison.Rhs().Base() << " + " << Comparison.Rhs().Offset
318 << "\n");
319 DEBUG(dbgs() << "\n");
320 Comparisons.push_back(Comparison);
321 }
322 EntryBlock_ = Comparisons[0].BB;
323 Comparisons_ = std::move(Comparisons);
324 #ifdef MERGEICMPS_DOT_ON
325 errs() << "BEFORE REORDERING:\n\n";
326 dump();
327 #endif // MERGEICMPS_DOT_ON
328 // Reorder blocks by LHS. We can do that without changing the
329 // semantics because we are only accessing dereferenceable memory.
330 std::sort(Comparisons_.begin(), Comparisons_.end(),
331 [](const BCECmpBlock &a, const BCECmpBlock &b) {
332 return a.Lhs() < b.Lhs();
333 });
334 #ifdef MERGEICMPS_DOT_ON
335 errs() << "AFTER REORDERING:\n\n";
336 dump();
337 #endif // MERGEICMPS_DOT_ON
338 }
339
340 #ifdef MERGEICMPS_DOT_ON
341 void BCECmpChain::dump() const {
342 errs() << "digraph dag {\n";
343 errs() << " graph [bgcolor=transparent];\n";
344 errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
345 errs() << " edge [color=black];\n";
346 for (size_t I = 0; I < Comparisons_.size(); ++I) {
347 const auto &Comparison = Comparisons_[I];
348 errs() << " \"" << I << "\" [label=\"%"
349 << Comparison.Lhs().Base()->getName() << " + "
350 << Comparison.Lhs().Offset << " == %"
351 << Comparison.Rhs().Base()->getName() << " + "
352 << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
353 << " bytes)\"];\n";
354 const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
355 if (I > 0)
356 errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
357 errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
358 }
359 errs() << " \"Phi\" [label=\"Phi\"];\n";
360 errs() << "}\n\n";
361 }
362 #endif // MERGEICMPS_DOT_ON
363
364 bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
365 // First pass to check if there is at least one merge. If not, we don't do
366 // anything and we keep analysis passes intact.
367 {
368 bool AtLeastOneMerged = false;
369 for (size_t I = 1; I < Comparisons_.size(); ++I) {
370 if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
371 AtLeastOneMerged = true;
372 break;
373 }
374 }
375 if (!AtLeastOneMerged)
376 return false;
377 }
378
379 // Remove phi references to comparison blocks, they will be rebuilt as we
380 // merge the blocks.
381 for (const auto &Comparison : Comparisons_) {
382 Phi_.removeIncomingValue(Comparison.BB, false);
383 }
384
385 // Point the predecessors of the chain to the first comparison block (which is
386 // the new entry point).
387 if (EntryBlock_ != Comparisons_[0].BB)
388 EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB);
389
390 // Effectively merge blocks.
391 int NumMerged = 1;
392 for (size_t I = 1; I < Comparisons_.size(); ++I) {
393 if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
394 ++NumMerged;
395 } else {
396 // Merge all previous comparisons and start a new merge block.
397 mergeComparisons(
398 makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged),
399 Comparisons_[I].BB, Phi_, TLI);
400 NumMerged = 1;
401 }
402 }
403 mergeComparisons(makeArrayRef(Comparisons_)
404 .slice(Comparisons_.size() - NumMerged, NumMerged),
405 nullptr, Phi_, TLI);
406
407 return true;
408 }
409
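The merging loop in simplify() above groups maximal runs of contiguous comparisons and merges each run separately. A hedged standalone sketch of that grouping logic follows (simplified Cmp type, no IR rewriting; groupRuns is a hypothetical helper, not part of the pass).

#include <cstddef>
#include <vector>

// One comparison, reduced to its starting byte offset and width in bytes;
// the list is assumed to be sorted by offset, as after the constructor's sort.
struct Cmp {
  long Offset;
  int Bytes;
};

// Returns the size of each merge group. For {{0,4}, {4,4}, {16,4}} the result
// is {2, 1}: the first two comparisons become one memcmp, the third one stays
// a plain integer compare, mirroring the NumMerged loop in simplify().
inline std::vector<std::size_t> groupRuns(const std::vector<Cmp> &Cmps) {
  std::vector<std::size_t> Groups;
  if (Cmps.empty())
    return Groups;
  std::size_t NumMerged = 1;
  for (std::size_t I = 1; I < Cmps.size(); ++I) {
    if (Cmps[I - 1].Offset + Cmps[I - 1].Bytes == Cmps[I].Offset) {
      ++NumMerged;                 // still contiguous: extend the current run
    } else {
      Groups.push_back(NumMerged); // run broken: flush the previous group
      NumMerged = 1;
    }
  }
  Groups.push_back(NumMerged);     // flush the last group
  return Groups;
}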
410 void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
411 BasicBlock *const NextBBInChain,
412 PHINode &Phi,
413 const TargetLibraryInfo *const TLI) {
414 assert(!Comparisons.empty());
415 const auto &FirstComparison = *Comparisons.begin();
416 BasicBlock *const BB = FirstComparison.BB;
417 LLVMContext &Context = BB->getContext();
418
419 if (Comparisons.size() >= 2) {
420 DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
421 const auto TotalSize =
422 std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
423 [](int Size, const BCECmpBlock &C) {
424 return Size + C.SizeBits();
425 }) /
426 8;
427
428 // Incoming edges do not need to be updated, and both GEPs are already
429 // computing the right address; we just need to:
430 // - replace the two loads and the icmp with the memcmp
431 // - update the branch
432 // - update the incoming values in the phi.
433 FirstComparison.BranchI->eraseFromParent();
434 FirstComparison.CmpI->eraseFromParent();
435 FirstComparison.Lhs().LoadI->eraseFromParent();
436 FirstComparison.Rhs().LoadI->eraseFromParent();
437
438 IRBuilder<> Builder(BB);
439 const auto &DL = Phi.getModule()->getDataLayout();
440 Value *const MemCmpCall =
441 emitMemCmp(FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP,
442 ConstantInt::get(DL.getIntPtrType(Context), TotalSize),
443 Builder, DL, TLI);
444 Value *const MemCmpIsZero = Builder.CreateICmpEQ(
445 MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
446
447 // Add a branch to the next basic block in the chain.
448 if (NextBBInChain) {
449 Builder.CreateCondBr(MemCmpIsZero, NextBBInChain, Phi.getParent());
450 Phi.addIncoming(ConstantInt::getFalse(Context), BB);
451 } else {
452 Builder.CreateBr(Phi.getParent());
453 Phi.addIncoming(MemCmpIsZero, BB);
454 }
455
456 // Delete merged blocks.
457 for (size_t I = 1; I < Comparisons.size(); ++I) {
458 BasicBlock *CBB = Comparisons[I].BB;
459 CBB->replaceAllUsesWith(BB);
460 CBB->eraseFromParent();
461 }
462 } else {
463 assert(Comparisons.size() == 1);
464 // There are no blocks to merge, but we still need to update the branches.
465 DEBUG(dbgs() << "Only one comparison, updating branches\n");
466 if (NextBBInChain) {
467 if (FirstComparison.BranchI->isConditional()) {
468 DEBUG(dbgs() << "conditional -> conditional\n");
469 // Just update the "true" target, the "false" target should already be
470 // the phi block.
471 assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent());
472 FirstComparison.BranchI->setSuccessor(0, NextBBInChain);
473 Phi.addIncoming(ConstantInt::getFalse(Context), BB);
474 } else {
475 DEBUG(dbgs() << "unconditional -> conditional\n");
476 // Replace the unconditional branch by a conditional one.
477 FirstComparison.BranchI->eraseFromParent();
478 IRBuilder<> Builder(BB);
479 Builder.CreateCondBr(FirstComparison.CmpI, NextBBInChain,
480 Phi.getParent());
481 Phi.addIncoming(FirstComparison.CmpI, BB);
482 }
483 } else {
484 if (FirstComparison.BranchI->isConditional()) {
485 DEBUG(dbgs() << "conditional -> unconditional\n");
486 // Replace the conditional branch by an unconditional one.
487 FirstComparison.BranchI->eraseFromParent();
488 IRBuilder<> Builder(BB);
489 Builder.CreateBr(Phi.getParent());
490 Phi.addIncoming(FirstComparison.CmpI, BB);
491 } else {
492 DEBUG(dbgs() << "unconditional -> unconditional\n");
493 Phi.addIncoming(FirstComparison.CmpI, BB);
494 }
495 }
496 }
497 }
498
499 std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
500 BasicBlock *const LastBlock,
501 int NumBlocks) {
502 // Walk up from the last block to find other blocks.
503 std::vector<BasicBlock *> Blocks(NumBlocks);
504 BasicBlock *CurBlock = LastBlock;
505 for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
506 if (CurBlock->hasAddressTaken()) {
507 // Somebody is jumping to the block through an address, all bets are
508 // off.
509 DEBUG(dbgs() << "skip: block " << BlockIndex
510 << " has its address taken\n");
511 return {};
512 }
513 Blocks[BlockIndex] = CurBlock;
514 auto *SinglePredecessor = CurBlock->getSinglePredecessor();
515 if (!SinglePredecessor) {
516 // The block has two or more predecessors.
517 DEBUG(dbgs() << "skip: block " << BlockIndex
518 << " has two or more predecessors\n");
519 return {};
520 }
521 if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
522 // The block does not link back to the phi.
523 DEBUG(dbgs() << "skip: block " << BlockIndex
524 << " does not link back to the phi\n");
525 return {};
526 }
527 CurBlock = SinglePredecessor;
528 }
529 Blocks[0] = CurBlock;
530 return Blocks;
531 }
532
533 bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
534 DEBUG(dbgs() << "processPhi()\n");
535 if (Phi.getNumIncomingValues() <= 1) {
536 DEBUG(dbgs() << "skip: only one incoming value in phi\n");
537 return false;
538 }
539 // We are looking for something that has the following structure:
540 //  bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
541 //    \            \           \              \
542 //     ne           ne          ne             \
543 //      \            \           \              v
544 //       +------------+-----------+----------> bb_phi
545 //
546 // - The last basic block (bb4 here) must branch unconditionally to bb_phi.
547 // It's the only block that contributes a non-constant value to the Phi.
548 // - All other blocks (bb1, bb2, bb3) must have exactly two successors, one of
549 //   them being the phi block.
550 // - All intermediate blocks (bb2, bb3) must have only one predecessor.
551 // - Blocks cannot do other work besides the comparison, see doesOtherWork()
552
553 // The blocks are not necessarily ordered in the phi, so we start from the
554 // last block and reconstruct the order.
555 BasicBlock *LastBlock = nullptr;
556 for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) {
557 if (isa<ConstantInt>(Phi.getIncomingValue(I)))
558 continue;
559 if (LastBlock) {
560 // There are several non-constant values.
561 DEBUG(dbgs() << "skip: several non-constant values\n");
562 return false;
563 }
564 LastBlock = Phi.getIncomingBlock(I);
565 }
566 if (!LastBlock) {
567 // There is no non-constant block.
568 DEBUG(dbgs() << "skip: no non-constant block\n");
569 return false;
570 }
571 if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
572 DEBUG(dbgs() << "skip: last block non-phi successor\n");
573 return false;
574 }
575
576 const auto Blocks =
577 getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
578 if (Blocks.empty())
579 return false;
580 BCECmpChain CmpChain(Blocks, Phi);
581
582 if (CmpChain.size() < 2) {
583 DEBUG(dbgs() << "skip: only one compare block\n");
584 return false;
585 }
586
587 return CmpChain.simplify(TLI);
588 }
589
590 class MergeICmps : public FunctionPass {
591 public:
592 static char ID;
593
594 MergeICmps() : FunctionPass(ID) {
595 initializeMergeICmpsPass(*PassRegistry::getPassRegistry());
596 }
597
598 bool runOnFunction(Function &F) override {
599 if (skipFunction(F)) return false;
600 const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
601 auto PA = runImpl(F, &TLI);
602 return !PA.areAllPreserved();
603 }
604
605 private:
606 void getAnalysisUsage(AnalysisUsage &AU) const override {
607 AU.addRequired<TargetLibraryInfoWrapperPass>();
608 }
609
610 PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI);
611 };
612
613 PreservedAnalyses MergeICmps::runImpl(Function &F,
614 const TargetLibraryInfo *TLI) {
615 DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
616
617 bool MadeChange = false;
618
619 for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
620 // A Phi operation is always first in a basic block.
621 if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
622 MadeChange |= processPhi(*Phi, TLI);
623 }
624
625 if (MadeChange)
626 return PreservedAnalyses::none();
627 return PreservedAnalyses::all();
628 }
629
630 } // namespace
631
632 char MergeICmps::ID = 0;
633 INITIALIZE_PASS_BEGIN(MergeICmps, "mergeicmps",
634 "Merge contiguous icmps into a memcmp", false, false)
635 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
636 INITIALIZE_PASS_END(MergeICmps, "mergeicmps",
637 "Merge contiguous icmps into a memcmp", false, false)
638
639 Pass *llvm::createMergeICmpsPass() { return new MergeICmps(); }
640
7272 initializeLowerExpectIntrinsicPass(Registry);
7373 initializeLowerGuardIntrinsicLegacyPassPass(Registry);
7474 initializeMemCpyOptLegacyPassPass(Registry);
75 initializeMergeICmpsPass(Registry);
7576 initializeMergedLoadStoreMotionLegacyPassPass(Registry);
7677 initializeNaryReassociateLegacyPassPass(Registry);
7778 initializePartiallyInlineLibCallsLegacyPassPass(Registry);
0 ; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
1
2 %"struct.std::pair" = type { i32, i32 }
3
4 define zeroext i1 @opeq1(
5 %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
6 %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
7 entry:
8 %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
9 %0 = load i32, i32* %first.i, align 4
10 %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
11 %1 = load i32, i32* %first1.i, align 4
12 %cmp.i = icmp eq i32 %0, %1
13 br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
14
15 land.rhs.i:
16 %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
17 %2 = load i32, i32* %second.i, align 4
18 %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
19 %3 = load i32, i32* %second2.i, align 4
20 %cmp3.i = icmp eq i32 %2, %3
21 br label %opeq1.exit
22
23 opeq1.exit:
24 %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
25 ret i1 %4
26 ; CHECK-LABEL: @opeq1(
27 ; The entry block with zero-offset GEPs is kept, loads are removed.
28 ; CHECK: entry
29 ; CHECK: getelementptr {{.*}} i32 0
30 ; CHECK-NOT: load
31 ; CHECK: getelementptr {{.*}} i32 0
32 ; CHECK-NOT: load
33 ; The two 4 byte loads and compares are replaced with a single 8-byte memcmp.
34 ; CHECK: @memcmp({{.*}}8)
35 ; CHECK: icmp eq {{.*}} 0
36 ; The branch is now a direct branch; the other block has been removed.
37 ; CHECK: br label %opeq1.exit
38 ; CHECK-NOT: br
39 ; The phi is updated.
40 ; CHECK: phi i1 [ %{{[^,]*}}, %entry ]
41 ; CHECK-NEXT: ret
42 }
43
44 ; Same as above, but the two blocks are in inverse order.
45 define zeroext i1 @opeq1_inverse(
46 %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
47 %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
48 entry:
49 %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
50 %0 = load i32, i32* %first.i, align 4
51 %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
52 %1 = load i32, i32* %first1.i, align 4
53 %cmp.i = icmp eq i32 %0, %1
54 br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
55
56 land.rhs.i:
57 %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
58 %2 = load i32, i32* %second.i, align 4
59 %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
60 %3 = load i32, i32* %second2.i, align 4
61 %cmp3.i = icmp eq i32 %2, %3
62 br label %opeq1.exit
63
64 opeq1.exit:
65 %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
66 ret i1 %4
67 ; CHECK-LABEL: @opeq1_inverse(
68 ; The second block with zero-offset GEPs is kept, loads are removed.
69 ; CHECK: land.rhs.i
70 ; CHECK: getelementptr {{.*}} i32 0
71 ; CHECK-NOT: load
72 ; CHECK: getelementptr {{.*}} i32 0
73 ; CHECK-NOT: load
74 ; The two 4 byte loads and compares are replaced with a single 8-byte memcmp.
75 ; CHECK: @memcmp({{.*}}8)
76 ; CHECK: icmp eq {{.*}} 0
77 ; The branch is now a direct branch; the other block has been removed.
78 ; CHECK: br label %opeq1.exit
79 ; CHECK-NOT: br
80 ; The phi is updated.
81 ; CHECK: phi i1 [ %{{[^,]*}}, %land.rhs.i ]
82 ; CHECK-NEXT: ret
83 }
84
85
86
0 ; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
1
2 ; This is a more involved test: clang generates this weird pattern for a
3 ; tuple of four i8 values. Right now we skip the entry block
4 ; (which defines the base pointer for other blocks) and the last one (which
5 ; does not have the expected structure). Only middle blocks (bytes [1,2]) are
6 ; merged.
7
8 %"class.std::tuple" = type { %"struct.std::_Tuple_impl" }
9 %"struct.std::_Tuple_impl" = type { %"struct.std::_Tuple_impl.0", %"struct.std::_Head_base.6" }
10 %"struct.std::_Tuple_impl.0" = type { %"struct.std::_Tuple_impl.1", %"struct.std::_Head_base.5" }
11 %"struct.std::_Tuple_impl.1" = type { %"struct.std::_Tuple_impl.2", %"struct.std::_Head_base.4" }
12 %"struct.std::_Tuple_impl.2" = type { %"struct.std::_Head_base" }
13 %"struct.std::_Head_base" = type { i8 }
14 %"struct.std::_Head_base.4" = type { i8 }
15 %"struct.std::_Head_base.5" = type { i8 }
16 %"struct.std::_Head_base.6" = type { i8 }
17
18 define zeroext i1 @opeq(
19 %"class.std::tuple"* nocapture readonly dereferenceable(4) %a,
20 %"class.std::tuple"* nocapture readonly dereferenceable(4) %b) local_unnamed_addr #1 {
21 entry:
22 %0 = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* %a, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
23 %add.ptr.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 3
24 %1 = load i8, i8* %add.ptr.i.i.i.i.i, align 1
25 %2 = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* %b, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
26 %add.ptr.i.i.i6.i.i = getelementptr inbounds i8, i8* %2, i64 3
27 %3 = load i8, i8* %add.ptr.i.i.i6.i.i, align 1
28 %cmp.i.i = icmp eq i8 %1, %3
29 br i1 %cmp.i.i, label %land.rhs.i.i, label %opeq.exit
30
31 land.rhs.i.i:
32 %add.ptr.i.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 2
33 %4 = load i8, i8* %add.ptr.i.i.i.i.i.i, align 1
34 %add.ptr.i.i.i6.i.i.i = getelementptr inbounds i8, i8* %2, i64 2
35 %5 = load i8, i8* %add.ptr.i.i.i6.i.i.i, align 1
36 %cmp.i.i.i = icmp eq i8 %4, %5
37 br i1 %cmp.i.i.i, label %land.rhs.i.i.i, label %opeq.exit
38
39 land.rhs.i.i.i:
40 %add.ptr.i.i.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 1
41 %6 = load i8, i8* %add.ptr.i.i.i.i.i.i.i, align 1
42 %add.ptr.i.i.i6.i.i.i.i = getelementptr inbounds i8, i8* %2, i64 1
43 %7 = load i8, i8* %add.ptr.i.i.i6.i.i.i.i, align 1
44 %cmp.i.i.i.i = icmp eq i8 %6, %7
45 br i1 %cmp.i.i.i.i, label %land.rhs.i.i.i.i, label %opeq.exit
46
47 land.rhs.i.i.i.i:
48 %8 = load i8, i8* %0, align 1
49 %9 = load i8, i8* %2, align 1
50 %cmp.i.i.i.i.i = icmp eq i8 %8, %9
51 br label %opeq.exit
52
53 opeq.exit:
54 %10 = phi i1 [ false, %entry ], [ false, %land.rhs.i.i ], [ false, %land.rhs.i.i.i ], [ %cmp.i.i.i.i.i, %land.rhs.i.i.i.i ]
55 ret i1 %10
56 ; CHECK-LABEL: @opeq(
57 ; The entry block is kept as is, but the next block is now the merged comparison
58 ; block for bytes [1,2] or the block for the head.
59 ; CHECK: entry
60 ; CHECK: br i1 %cmp.i.i, label %land.rhs.i.i.i{{(.i)?}}, label %opeq.exit
61 ; The two 1 byte loads and compares at offset 1 are replaced with a single
62 ; 2-byte memcmp.
63 ; CHECK: land.rhs.i.i.i
64 ; CHECK: @memcmp({{.*}}2)
65 ; CHECK: icmp eq {{.*}} 0
66 ; In the end we have three blocks.
67 ; CHECK: phi i1
68 ; CHECK-SAME %entry
69 ; CHECK-SAME %land.rhs.i.i.i.i
70 ; CHECK-SAME %land.rhs.i.i.i
71 }
72
0 ; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
1
2 %"struct.std::pair" = type { i32, i32 }
3
4 define zeroext i1 @opeq(
5 %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
6 %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
7 entry:
8 %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
9 %0 = load i32, i32* %first.i, align 4
10 %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
11 %1 = load i32, i32* %first1.i, align 4
12 %cmp.i = icmp eq i32 %0, %1
13 br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
14
15 land.rhs.i:
16 %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
17 %2 = load volatile i32, i32* %second.i, align 4
18 %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
19 %3 = load i32, i32* %second2.i, align 4
20 %cmp3.i = icmp eq i32 %2, %3
21 br label %opeq1.exit
22
23 opeq1.exit:
24 %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
25 ret i1 %4
26 ; CHECK-LABEL: @opeq(
27 ; CHECK-NOT: memcmp
28 }
29
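The test above checks that a chain containing a volatile load is left alone (CHECK-NOT: memcmp), matching the volatile bail-out in visitICmpLoadOperand(). A hedged C++ sketch of the situation at the source level follows (the Pair type and function name are hypothetical, not taken from the test): one operand of the comparison chain is read through a volatile access, so widening the two compares into one memcmp would drop an observable side effect.

struct Pair {
  int first;
  volatile int second;  // volatile: each read is an observable side effect
};

bool eq(const Pair &a, const Pair &b) {
  // The loads of .second are volatile; folding the two compares into one
  // 8-byte memcmp would drop that side effect, so the pass must leave this
  // comparison chain untouched.
  return a.first == b.first && a.second == b.second;
}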