llvm.org GIT mirror llvm / 9e59c64
Implement initial support for PHI translation in memdep. This means that memdep keeps track of how PHIs affect the pointer in dep queries, which allows it to eliminate the load in cases like rle-phi-translate.ll, which basically end up being: BB1: X = load P br BB3 BB2: Y = load Q br BB3 BB3: R = phi [P] [Q] load R turning "load R" into a phi of X/Y. In addition to additional exposed opportunities, this makes memdep safe in many cases that it wasn't before (which is required for load PRE) and also makes it substantially more efficient. For example, consider: bb1: // has many predecessors. P = some_operator() load P In this example, previously memdep would scan all the predecessors of BB1 to see if they had something that would mustalias P. In some cases (e.g. test/Transforms/GVN/rle-must-alias.ll) it would actually find them and end up eliminating something. In many other cases though, it would scan and not find anything useful. MemDep now stops at a block if the pointer is defined in that block and cannot be phi translated to predecessors. This causes it to miss the (rare) cases like rle-must-alias.ll, but makes it faster by not scanning tons of stuff that is unlikely to be useful. For example, this speeds up GVN as a whole from 3.928s to 2.448s (60%)!. IMO, scalar GVN should be enhanced to simplify the rle-must-alias pointer base anyway, which would allow the loads to be eliminated. In the future, this should be enhanced to phi translate through geps and bitcasts as well (as indicated by FIXMEs) making memdep even more powerful. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@61022 91177308-0d34-0410-b5e6-96231b3b80d8 Chris Lattner 10 years ago
5 changed file(s) with 218 addition(s) and 44 deletion(s). Raw diff Collapse all Expand all
155155 /// ValueIsLoadPair - This is a pair where the bool is true if
156156 /// the dependence is a read only dependence, false if read/write.
157157 typedef PointerIntPair<Value*, bool> ValueIsLoadPair;
158
158
159 /// BBSkipFirstBlockPair - This pair is used when caching information for a
160 /// block. If the pointer is null, the cache value is not a full query that
161 /// starts at the specified block. If non-null, the bool indicates whether
162 /// or not the contents of the block was skipped.
163 typedef PointerIntPair<BasicBlock*, bool> BBSkipFirstBlockPair;
164
159165 /// CachedNonLocalPointerInfo - This map stores the cached results of doing
160166 /// a pointer lookup at the bottom of a block. The key of this map is the
161167 /// pointer+isload bit, the value is a list of <bb->result> mappings.
162 typedef DenseMap<ValueIsLoadPair,
163 std::pair<BasicBlock*, NonLocalDepInfo> > CachedNonLocalPointerInfo;
168 typedef DenseMap<ValueIsLoadPair, std::pair<BBSkipFirstBlockPair,
169 NonLocalDepInfo> > CachedNonLocalPointerInfo;
164170 CachedNonLocalPointerInfo NonLocalPointerDeps;
165171
166172 // A map from instructions to their non-local pointer dependencies.
258264 MemDepResult getCallSiteDependencyFrom(CallSite C, bool isReadOnlyCall,
259265 BasicBlock::iterator ScanIt,
260266 BasicBlock *BB);
261 void getNonLocalPointerDepFromBB(Value *Pointer, uint64_t Size,
267 bool getNonLocalPointerDepFromBB(Value *Pointer, uint64_t Size,
262268 bool isLoad, BasicBlock *BB,
263269 SmallVectorImpl<NonLocalDepEntry> &Result,
264 SmallPtrSet<BasicBlock*, 64> &Visited);
270 DenseMap<BasicBlock*, Value*> &Visited,
271 bool SkipFirstBlock = false);
265272 MemDepResult GetNonLocalInfoForBlock(Value *Pointer, uint64_t PointeeSize,
266273 bool isLoad, BasicBlock *BB,
267274 NonLocalDepInfo *Cache,
381381
382382 // isReadonlyCall - If this is a read-only call, we can be more aggressive.
383383 bool isReadonlyCall = AA->onlyReadsMemory(QueryCS);
384
385 // Visited checked first, vector in sorted order.
384
386385 SmallPtrSet<BasicBlock*, 64> Visited;
387386
388387 unsigned NumSortedEntries = Cache.size();
486485 const Type *EltTy = cast<PointerType>(Pointer->getType())->getElementType();
487486 uint64_t PointeeSize = TD->getTypeStoreSize(EltTy);
488487
489 // While we have blocks to analyze, get their values.
490 SmallPtrSet<BasicBlock*, 64> Visited;
491 getNonLocalPointerDepFromBB(Pointer, PointeeSize, isLoad, FromBB,
492 Result, Visited);
488 // This is the set of blocks we've inspected, and the pointer we consider in
489 // each block. Because of critical edges, we currently bail out if querying
490 // a block with multiple different pointers. This can happen during PHI
491 // translation.
492 DenseMap<BasicBlock*, Value*> Visited;
493 if (!getNonLocalPointerDepFromBB(Pointer, PointeeSize, isLoad, FromBB,
494 Result, Visited, true))
495 return;
496 Result.push_back(std::make_pair(FromBB,
497 MemDepResult::getClobber(FromBB->begin())));
493498 }
494499
495500 /// GetNonLocalInfoForBlock - Compute the memdep value for BB with
565570 }
566571
567572
568 /// getNonLocalPointerDepFromBB -
569 void MemoryDependenceAnalysis::
573 /// getNonLocalPointerDepFromBB - Perform a dependency query based on
574 /// pointer/pointeesize starting at the end of StartBB. Add any clobber/def
575 /// results to the results vector and keep track of which blocks are visited in
576 /// 'Visited'.
577 ///
578 /// This has special behavior for the first block queries (when SkipFirstBlock
579 /// is true). In this special case, it ignores the contents of the specified
580 /// block and starts returning dependence info for its predecessors.
581 ///
582 /// This function returns false on success, or true to indicate that it could
583 /// not compute dependence information for some reason. This should be treated
584 /// as a clobber dependence on the first instruction in the predecessor block.
585 bool MemoryDependenceAnalysis::
570586 getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
571587 bool isLoad, BasicBlock *StartBB,
572588 SmallVectorImpl<NonLocalDepEntry> &Result,
573 SmallPtrSet<BasicBlock*, 64> &Visited) {
589 DenseMap<BasicBlock*, Value*> &Visited,
590 bool SkipFirstBlock) {
591
574592 // Look up the cached info for Pointer.
575593 ValueIsLoadPair CacheKey(Pointer, isLoad);
576594
577 std::pair<BasicBlock*, NonLocalDepInfo> &CacheInfo =
578 NonLocalPointerDeps[CacheKey];
579 NonLocalDepInfo *Cache = &CacheInfo.second;
595 std::pair<BBSkipFirstBlockPair, NonLocalDepInfo> *CacheInfo =
596 &NonLocalPointerDeps[CacheKey];
597 NonLocalDepInfo *Cache = &CacheInfo->second;
580598
581599 // If we have valid cached information for exactly the block we are
582600 // investigating, just return it with no recomputation.
583 if (CacheInfo.first == StartBB) {
601 if (CacheInfo->first == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) {
584602 for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end();
585603 I != E; ++I)
586604 if (!I->second.isNonLocal())
587605 Result.push_back(*I);
588606 ++NumCacheCompleteNonLocalPtr;
589 return;
607 return false;
590608 }
591609
592610 // Otherwise, either this is a new block, a block with an invalid cache
593611 // pointer or one that we're about to invalidate by putting more info into it
594612 // than its valid cache info. If empty, the result will be valid cache info,
595613 // otherwise it isn't.
596 CacheInfo.first = Cache->empty() ? StartBB : 0;
614 if (Cache->empty())
615 CacheInfo->first = BBSkipFirstBlockPair(StartBB, SkipFirstBlock);
616 else
617 CacheInfo->first = BBSkipFirstBlockPair();
597618
598619 SmallVector<BasicBlock*, 32> Worklist;
599620 Worklist.push_back(StartBB);
605626 // revisit blocks after we insert info for them.
606627 unsigned NumSortedEntries = Cache->size();
607628
608 // SkipFirstBlock - If this is the very first block that we're processing, we
609 // don't want to scan or think about its body, because the client was supposed
610 // to do a local dependence query. Instead, just start processing it by
611 // adding its predecessors to the worklist and iterating.
612 bool SkipFirstBlock = Visited.empty();
613
614629 while (!Worklist.empty()) {
615630 BasicBlock *BB = Worklist.pop_back_val();
616631
617632 // Skip the first block if we have it.
618 if (SkipFirstBlock) {
619 SkipFirstBlock = false;
620 } else {
633 if (!SkipFirstBlock) {
621634 // Analyze the dependency of *Pointer in FromBB. See if we already have
622635 // been here.
623 if (!Visited.insert(BB))
624 continue;
636 assert(Visited.count(BB) && "Should check 'visited' before adding to WL");
625637
626638 // Get the dependency info for Pointer in BB. If we have cached
627639 // information, we will use it, otherwise we compute it.
635647 }
636648 }
637649
638 // Otherwise, we have to process all the predecessors of this block to scan
639 // them as well.
640 for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
641 // TODO: PHI TRANSLATE.
642 Worklist.push_back(*PI);
650 // If 'Pointer' is an instruction defined in this block, then we need to do
651 // phi translation to change it into a value live in the predecessor block.
652 // If phi translation fails, then we can't continue dependence analysis.
653 Instruction *PtrInst = dyn_cast<Instruction>(Pointer);
654 bool NeedsPHITranslation = PtrInst && PtrInst->getParent() == BB;
655
656 // If no PHI translation is needed, just add all the predecessors of this
657 // block to scan them as well.
658 if (!NeedsPHITranslation) {
659 SkipFirstBlock = false;
660 for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
661 // Verify that we haven't looked at this block yet.
662 std::pair<DenseMap<BasicBlock*, Value*>::iterator, bool>
663 InsertRes = Visited.insert(std::make_pair(*PI, Pointer));
664 if (InsertRes.second) {
665 // First time we've looked at *PI.
666 Worklist.push_back(*PI);
667 continue;
668 }
669
670 // If we have seen this block before, but it was with a different
671 // pointer then we have a phi translation failure and we have to treat
672 // this as a clobber.
673 if (InsertRes.first->second != Pointer)
674 goto PredTranslationFailure;
675 }
676 continue;
677 }
678
679 // If we do need to do phi translation, then there are a bunch of different
680 // cases, because we have to find a Value* live in the predecessor block. We
681 // know that PtrInst is defined in this block at least.
682
683 // If this is directly a PHI node, just use the incoming values for each
684 // pred as the phi translated version.
685 if (PHINode *PtrPHI = dyn_cast<PHINode>(PtrInst)) {
686 for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI){
687 BasicBlock *Pred = *PI;
688 Value *PredPtr = PtrPHI->getIncomingValueForBlock(Pred);
689
690 // Check to see if we have already visited this pred block with another
691 // pointer. If so, we can't do this lookup. This failure can occur
692 // with PHI translation when a critical edge exists and the PHI node in
693 // the successor translates to a pointer value different than the
694 // pointer the block was first analyzed with.
695 std::pair<DenseMap<BasicBlock*, Value*>::iterator, bool>
696 InsertRes = Visited.insert(std::make_pair(Pred, PredPtr));
697
698 if (!InsertRes.second) {
699 // If the predecessor was visited with PredPtr, then we already did
700 // the analysis and can ignore it.
701 if (InsertRes.first->second == PredPtr)
702 continue;
703
704 // Otherwise, the block was previously analyzed with a different
705 // pointer. We can't represent the result of this case, so we just
706 // treat this as a phi translation failure.
707 goto PredTranslationFailure;
708 }
709
710 // If we have a problem phi translating, fall through to the code below
711 // to handle the failure condition.
712 if (getNonLocalPointerDepFromBB(PredPtr, PointeeSize, isLoad, Pred,
713 Result, Visited))
714 goto PredTranslationFailure;
715 }
716
717 // Refresh the CacheInfo/Cache pointer so that it isn't invalidated.
718 CacheInfo = &NonLocalPointerDeps[CacheKey];
719 Cache = &CacheInfo->second;
720
721 // Since we did phi translation, the "Cache" set won't contain all of the
722 // results for the query. This is ok (we can still use it to accelerate
723 // specific block queries) but we can't do the fastpath "return all
724 // results from the set" Clear out the indicator for this.
725 CacheInfo->first = BBSkipFirstBlockPair();
726 SkipFirstBlock = false;
727 continue;
728 }
729
730 // TODO: BITCAST, GEP.
731
732 // cerr << "MEMDEP: Could not PHI translate: " << *Pointer;
733 // if (isa<BitCastInst>(PtrInst) || isa<GetElementPtrInst>(PtrInst))
734 // cerr << "OP:\t\t\t\t" << *PtrInst->getOperand(0);
735 PredTranslationFailure:
736
737 // Refresh the CacheInfo/Cache pointer so that it isn't invalidated.
738 CacheInfo = &NonLocalPointerDeps[CacheKey];
739 Cache = &CacheInfo->second;
740
741 // Since we did phi translation, the "Cache" set won't contain all of the
742 // results for the query. This is ok (we can still use it to accelerate
743 // specific block queries) but we can't do the fastpath "return all
744 // results from the set" Clear out the indicator for this.
745 CacheInfo->first = BBSkipFirstBlockPair();
746
747 // If *nothing* works, mark the pointer as being clobbered by the first
748 // instruction in this block.
749 //
750 // If this is the magic first block, return this as a clobber of the whole
751 // incoming value. Since we can't phi translate to one of the predecessors,
752 // we have to bail out.
753 if (SkipFirstBlock)
754 return true;
755
756 for (NonLocalDepInfo::reverse_iterator I = Cache->rbegin(); ; ++I) {
757 assert(I != Cache->rend() && "Didn't find current block??");
758 if (I->first != BB)
759 continue;
760
761 assert(I->second.isNonLocal() &&
762 "Should only be here with transparent block");
763 I->second = MemDepResult::getClobber(BB->begin());
764 ReverseNonLocalPtrDeps[BB->begin()].insert(CacheKey.getOpaqueValue());
765 Result.push_back(*I);
766 break;
643767 }
644768 }
645769
657781 Cache->insert(Entry, Val);
658782 // FALL THROUGH.
659783 }
660 case 1: {
784 case 1:
661785 // One new entry, Just insert the new value at the appropriate position.
662 NonLocalDepEntry Val = Cache->back();
663 Cache->pop_back();
664 NonLocalDepInfo::iterator Entry =
665 std::upper_bound(Cache->begin(), Cache->end(), Val);
666 Cache->insert(Entry, Val);
786 if (Cache->size() != 1) {
787 NonLocalDepEntry Val = Cache->back();
788 Cache->pop_back();
789 NonLocalDepInfo::iterator Entry =
790 std::upper_bound(Cache->begin(), Cache->end(), Val);
791 Cache->insert(Entry, Val);
792 }
667793 break;
668 }
669794 default:
670795 // Added many values, do a full scale sort.
671796 std::sort(Cache->begin(), Cache->end());
672797 }
798
799 return false;
673800 }
674801
675802 /// RemoveCachedNonLocalPointerDependencies - If P exists in
850977 NonLocalDepInfo &NLPDI = NonLocalPointerDeps[P].second;
851978
852979 // The cache is not valid for any specific block anymore.
853 NonLocalPointerDeps[P].first = 0;
980 NonLocalPointerDeps[P].first = BBSkipFirstBlockPair();
854981
855982 // Update any entries for RemInst to use the instruction after it.
856983 for (NonLocalDepInfo::iterator DI = NLPDI.begin(), DE = NLPDI.end();
0 ; RUN: llvm-as < %s | opt -gvn | llvm-dis | grep {DEAD.rle = phi i32}
1 ; XFAIL: *
2
3 ; FIXME: GVN should eliminate the fully redundant %9 GEP which
4 ; allows DEAD to be removed. This is PR3198.
5
16 ; The %7 and %4 loads combine to make %DEAD unneeded.
27 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
38 target triple = "i386-apple-darwin7"
0 ; RUN: llvm-as < %s | opt -gvn | llvm-dis | grep load
1 ; FIXME: This should be promotable, but memdep/gvn don't track values
2 ; path/edge sensitively enough.
3
14 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
25 target triple = "i386-apple-darwin7"
36
0 ; RUN: llvm-as < %s | opt -gvn | llvm-dis | grep {%cv.rle = phi i32}
1 ; RUN: llvm-as < %s | opt -gvn | llvm-dis | grep {%bv.rle = phi i32}
2 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
3 target triple = "i386-apple-darwin7"
4
; Both loads in bb2 go through phi'd pointers. On each incoming edge the
; phi translates to a pointer that was just stored through in that
; predecessor (%g in one pred, the original argument in the other), so
; memdep's PHI translation should let GVN replace %cv and %bv with phis
; of the stored values — the %cv.rle/%bv.rle phis the RUN lines grep for.
5 define i32 @g(i32* %b, i32* %c) nounwind {
6 entry:
7 %g = alloca i32 ; [#uses=4]
8 %t1 = icmp eq i32* %b, null ; [#uses=1]
9 br i1 %t1, label %bb, label %bb1
10
11 bb: ; preds = %entry
12 %t2 = load i32* %c, align 4 ; [#uses=1]
13 %t3 = add i32 %t2, 1 ; [#uses=1]
14 store i32 %t3, i32* %g, align 4 ; value available for %cv via %c_addr.0 -> %g on this edge... wait, %c_addr.0 gets %c from %bb
15 br label %bb2
16
17 bb1: ; preds = %entry
18 %t5 = load i32* %b, align 4 ; [#uses=1]
19 %t6 = add i32 %t5, 1 ; [#uses=1]
20 store i32 %t6, i32* %g, align 4 ; feeds %cv on this edge (%c_addr.0 is %g from %bb1)
21 br label %bb2
22
23 bb2: ; preds = %bb1, %bb
24 %c_addr.0 = phi i32* [ %g, %bb1 ], [ %c, %bb ] ; [#uses=1]
25 %b_addr.0 = phi i32* [ %b, %bb1 ], [ %g, %bb ] ; [#uses=1]
26 %cv = load i32* %c_addr.0, align 4 ; [#uses=1]  ; redundant after phi translation
27 %bv = load i32* %b_addr.0, align 4 ; [#uses=1]  ; redundant after phi translation
28 %ret = add i32 %cv, %bv ; [#uses=1]
29 ret i32 %ret
30 }
31