llvm / ea83530 (llvm.org GIT mirror)

[EarlyCSE] Exploit open-ended invariant.start scopes

If we have an invariant.start with no corresponding invariant.end, then the
memory location becomes invariant indefinitely after the invariant.start. As a
result, anything dominated by the start is guaranteed to see the value the
memory location had when the invariant.start executed.

This patch adds an AvailableInvariants table which tracks the generation at
which a particular memory location became invariant, and then uses that
information to allow value forwarding that would otherwise be disallowed by
potentially aliasing stores. (Reminder: in EarlyCSE, everything clobbers
everything by default.)

This should be compatible with the MemorySSA variant, but the design is
generational. We can and should add first-class support for invariant.start
within MemorySSA at a later time; I took a quick look at doing so, but
probably need some input from a MemorySSA expert.

Differential Revision: https://reviews.llvm.org/D43716

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@327577 91177308-0d34-0410-b5e6-96231b3b80d8

Philip Reames · 1 year, 6 months ago
3 changed files with 304 additions and 32 deletions.
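To make the change concrete, here is a minimal sketch of the forwarding this patch enables, mirroring the test_before_clobber test added at the bottom of this commit (the @example name is illustrative; @clobber stands for any call that may write memory). The call would normally bump EarlyCSE's memory generation and block load-load forwarding, but since %p became invariant in the same generation in which %v1 was recorded, %v2 is forwarded from %v1 and the function folds to ret i32 0.

define i32 @example(i32* %p) {
  %v1 = load i32, i32* %p                               ; available value recorded
  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)  ; %p invariant from here on
  call void @clobber()                                  ; bumps the generation
  %v2 = load i32, i32* %p                               ; forwarded from %v1
  %sub = sub i32 %v1, %v2                               ; folds to 0
  ret i32 %sub
}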
     unsigned Generation = 0;
     int MatchingId = -1;
     bool IsAtomic = false;
+
+    // TODO: Remove this flag.  It would be strictly stronger to add a record
+    // to the AvailableInvariant table when passing the invariant load instead.
     bool IsInvariant = false;
 
     LoadValue() = default;
                       LoadMapAllocator>;
 
   LoadHTType AvailableLoads;
+
+  // A scoped hash table mapping memory locations (represented as typed
+  // addresses) to generation numbers at which that memory location became
+  // (henceforth indefinitely) invariant.
+  using InvariantMapAllocator =
+      RecyclingAllocator<BumpPtrAllocator,
+                         ScopedHashTableVal<MemoryLocation, unsigned>>;
+  using InvariantHTType =
+      ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>,
+                      InvariantMapAllocator>;
+  InvariantHTType AvailableInvariants;
 
   /// \brief A scoped hash table of the current values of read-only call
   /// values.
   class NodeScope {
   public:
     NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
-              CallHTType &AvailableCalls)
-        : Scope(AvailableValues), LoadScope(AvailableLoads),
-          CallScope(AvailableCalls) {}
+              InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
+        : Scope(AvailableValues), LoadScope(AvailableLoads),
+          InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
     NodeScope(const NodeScope &) = delete;
     NodeScope &operator=(const NodeScope &) = delete;
 
   private:
     ScopedHTType::ScopeTy Scope;
     LoadHTType::ScopeTy LoadScope;
+    InvariantHTType::ScopeTy InvariantScope;
     CallHTType::ScopeTy CallScope;
   };
   class StackNode {
   public:
     StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
-              CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
-              DomTreeNode::iterator child, DomTreeNode::iterator end)
+              InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
+              unsigned cg, DomTreeNode *n, DomTreeNode::iterator child,
+              DomTreeNode::iterator end)
         : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
-          EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls)
+          EndIter(end),
+          Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
+                 AvailableCalls)
       {}
     StackNode(const StackNode &) = delete;
     StackNode &operator=(const StackNode &) = delete;
     return TTI.getOrCreateResultFromMemIntrinsic(cast<MemIntrinsic>(Inst),
                                                  ExpectedType);
   }
+
+  /// Return true if the instruction is known to only operate on memory
+  /// provably invariant in the given "generation".
+  bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt);
 
   bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
                            Instruction *EarlierInst, Instruction *LaterInst);
   MemoryAccess *LaterDef =
       MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
   return MSSA->dominates(LaterDef, EarlierMA);
+}
+
+bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
+  // A location loaded from with an invariant_load is assumed to *never* change
+  // within the visible scope of the compilation.
+  if (auto *LI = dyn_cast<LoadInst>(I))
+    if (LI->getMetadata(LLVMContext::MD_invariant_load))
+      return true;
+
+  auto MemLocOpt = MemoryLocation::getOrNone(I);
+  if (!MemLocOpt)
+    // "target" intrinsic forms of loads aren't currently known to
+    // MemoryLocation::get.  TODO
+    return false;
+  MemoryLocation MemLoc = *MemLocOpt;
+  if (!AvailableInvariants.count(MemLoc))
+    return false;
+
+  // Is the generation at which this became invariant older than the
+  // current one?
+  return AvailableInvariants.lookup(MemLoc) <= GenAt;
 }
 
 bool EarlyCSE::processNode(DomTreeNode *Node) {
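The trailing <= GenAt comparison is what makes the scope one-directional: a value recorded before the location became invariant must not be forwarded across a later clobber. An illustrative sketch (this is the test_negative_after_clobber case in the tests below; the generation numbers are invented for the example):

  %v1 = load i32, i32* %p                               ; recorded at generation 1
  call void @clobber()                                  ; generation bumps to 2
  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)  ; %p invariant from generation 2
  %v2 = load i32, i32* %p                               ; lookup = 2 > 1, no forwarding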
       continue;
     }
 
-    // Skip invariant.start intrinsics since they only read memory, and we can
-    // forward values across it. Also, we dont need to consume the last store
-    // since the semantics of invariant.start allow us to perform DSE of the
-    // last store, if there was a store following invariant.start. Consider:
+    // We can skip all invariant.start intrinsics since they only read memory,
+    // and we can forward values across it. For invariant starts without
+    // invariant ends, we can use the fact that the invariantness never ends to
+    // start a scope in the current generation which is true for all future
+    // generations. Also, we don't need to consume the last store since the
+    // semantics of invariant.start allow us to perform DSE of the last
+    // store, if there was a store following invariant.start. Consider:
     //
     //   store 30, i8* p
     //   invariant.start(p)
     //   store 40, i8* p
     // We can DSE the store to 30, since the store 40 to invariant location p
     // causes undefined behaviour.
-    if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>()))
+    if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
+      // If there are any uses, the scope might end.
+      if (!Inst->use_empty())
+        continue;
+      auto *CI = cast<CallInst>(Inst);
+      MemoryLocation MemLoc = MemoryLocation::getForArgument(CI, 1, TLI);
+      AvailableInvariants.insert(MemLoc, CurrentGeneration);
       continue;
+    }
 
     if (match(Inst, m_Intrinsic<Intrinsic::assume>())) {
       if (auto *CondI =
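A note on the use_empty() check above: a use of the returned token would typically feed an invariant.end, which terminates the scope, so the open-ended reasoning only fires when the token is never consumed. Compare the test_negative_scope and test_false_negative_scope tests below; sketched:

  %scope = call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
  call void @llvm.invariant.end.p0i32({}* %scope, i64 4, i32* %p)
  ; %scope has a use, so nothing is added to AvailableInvariants and later
  ; loads of %p are not forwarded across clobbers.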
             !MemInst.isVolatile() && MemInst.isUnordered() &&
             // We can't replace an atomic load with one which isn't also atomic.
             InVal.IsAtomic >= MemInst.isAtomic() &&
-            (InVal.IsInvariant || MemInst.isInvariantLoad() ||
+            (InVal.IsInvariant ||
+             isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
              isSameMemGeneration(InVal.Generation, CurrentGeneration,
                                  InVal.DefInst, Inst))) {
           Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
             InVal.MatchingId == MemInst.getMatchingId() &&
             // We don't yet handle removing stores with ordering of any kind.
             !MemInst.isVolatile() && MemInst.isUnordered() &&
-            isSameMemGeneration(InVal.Generation, CurrentGeneration,
-                                InVal.DefInst, Inst)) {
+            (isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
+             isSameMemGeneration(InVal.Generation, CurrentGeneration,
+                                 InVal.DefInst, Inst))) {
           // It is okay to have a LastStore to a different pointer here if MemorySSA
           // tells us that the load and store are from the same memory generation.
           // In that case, LastStore should keep its present value since we're
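The same reasoning applies on the store path: a store of a value that provably still sits in the (invariant) location is a no-op and can be deleted even across an intervening clobber. A sketch of the DSE this enables (this is the test_dse_after_load case below):

  %v1 = load i32, i32* %p
  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
  call void @clobber()
  store i32 %v1, i32* %p    ; %p still holds %v1, so the store is removed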
 
   // Process the root node.
   nodesToProcess.push_back(new StackNode(
-      AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
-      DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
+      AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
+      CurrentGeneration, DT.getRootNode(),
+      DT.getRootNode()->begin(), DT.getRootNode()->end()));
 
   // Save the current generation.
   unsigned LiveOutGeneration = CurrentGeneration;
       // Push the next child onto the stack.
       DomTreeNode *child = NodeToProcess->nextChild();
       nodesToProcess.push_back(
-          new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
-                        NodeToProcess->childGeneration(), child, child->begin(),
-                        child->end()));
+          new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
+                        AvailableCalls, NodeToProcess->childGeneration(),
+                        child, child->begin(), child->end()));
     } else {
       // It has been processed, and there are no more children to process,
       // so delete it and pop it off the stack.
   call void @clobber_and_use(i32 %val1)
   ret void
 }
+
+define void @test_false_negative_dse(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_false_negative_dse
+; CHECK: store
+  %v1 = load i32, i32* %p, !invariant.load !{}
+  call void @clobber_and_use(i32 %v1)
+  store i32 %v1, i32* %p
+  ret void
+}
 
 ; Check that we do load-load forwarding over invariant.start, since it does not
 ; clobber memory
-define i8 @test1(i8 *%P) {
-; CHECK-LABEL: @test1(
+define i8 @test_bypass1(i8 *%P) {
+; CHECK-LABEL: @test_bypass1(
 ; CHECK-NEXT: %V1 = load i8, i8* %P
 ; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
 ; CHECK-NEXT: ret i8 0
 
-
   %V1 = load i8, i8* %P
   %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
   %V2 = load i8, i8* %P
 
 
 ; Trivial Store->load forwarding over invariant.start
-define i8 @test2(i8 *%P) {
-; CHECK-LABEL: @test2(
+define i8 @test_bypass2(i8 *%P) {
+; CHECK-LABEL: @test_bypass2(
 ; CHECK-NEXT: store i8 42, i8* %P
 ; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
 ; CHECK-NEXT: ret i8 42
 
-
   store i8 42, i8* %P
   %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
   %V1 = load i8, i8* %P
 ; We can DSE over invariant.start calls, since the first store to
 ; %P is valid, and the second store is actually unreachable based on semantics
 ; of invariant.start.
-define void @test3(i8* %P) {
-
-; CHECK-LABEL: @test3(
+define void @test_bypass3(i8* %P) {
+; CHECK-LABEL: @test_bypass3(
 ; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
 ; CHECK-NEXT: store i8 60, i8* %P
 
-
   store i8 50, i8* %P
   %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
   store i8 60, i8* %P
 
 ; FIXME: Now the first store can actually be eliminated, since there is no read within
 ; the invariant region, between start and end.
-define void @test4(i8* %P) {
-
-; CHECK-LABEL: @test4(
+define void @test_bypass4(i8* %P) {
+
+; CHECK-LABEL: @test_bypass4(
 ; CHECK-NEXT: store i8 50, i8* %P
 ; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
 ; CHECK-NEXT: call void @llvm.invariant.end.p0i8({}* %i, i64 1, i8* %P)
   store i8 60, i8* %P
   ret void
 }
+
+
+declare void @clobber()
+declare {}* @llvm.invariant.start.p0i32(i64 %size, i32* nocapture %ptr)
+declare void @llvm.invariant.end.p0i32({}*, i64, i32* nocapture) nounwind
+
+define i32 @test_before_load(i32* %p) {
+; CHECK-LABEL: @test_before_load
+; CHECK: ret i32 0
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_before_clobber(i32* %p) {
+; CHECK-LABEL: @test_before_clobber
+; CHECK: ret i32 0
+  %v1 = load i32, i32* %p
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_unanalzyable_load(i32* %p) {
+; CHECK-LABEL: @test_unanalzyable_load
+; CHECK: ret i32 0
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_after_clobber(i32* %p) {
+; CHECK-LABEL: @test_negative_after_clobber
+; CHECK: ret i32 %sub
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_merge(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_merge
+; CHECK: ret i32 0
+  %v1 = load i32, i32* %p
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  br i1 %cnd, label %merge, label %taken
+
+taken:
+  call void @clobber()
+  br label %merge
+merge:
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_after_mergeclobber(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_negative_after_mergeclobber
+; CHECK: ret i32 %sub
+  %v1 = load i32, i32* %p
+  br i1 %cnd, label %merge, label %taken
+
+taken:
+  call void @clobber()
+  br label %merge
+merge:
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+; In theory, this version could work, but EarlyCSE is incapable of
+; merging facts along distinct paths.
+define i32 @test_false_negative_merge(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_false_negative_merge
+; CHECK: ret i32 %sub
+  %v1 = load i32, i32* %p
+  br i1 %cnd, label %merge, label %taken
+
+taken:
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  br label %merge
+merge:
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_merge_unanalyzable_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_merge_unanalyzable_load
+; CHECK: ret i32 0
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  %v1 = load i32, i32* %p
+  br i1 %cnd, label %merge, label %taken
+
+taken:
+  call void @clobber()
+  br label %merge
+merge:
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define void @test_dse_before_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_dse_before_load
+; CHECK-NOT: store
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  store i32 %v1, i32* %p
+  ret void
+}
+
+define void @test_dse_after_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_dse_after_load
+; CHECK-NOT: store
+  %v1 = load i32, i32* %p
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  store i32 %v1, i32* %p
+  ret void
+}
+
+
+; In this case, we have a false negative since MemoryLocation is implicitly
+; typed due to the use of a Value to represent the address.  Note that other
+; passes will canonicalize away the bitcasts in this example.
+define i32 @test_false_negative_types(i32* %p) {
+; CHECK-LABEL: @test_false_negative_types
+; CHECK: ret i32 %sub
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %pf = bitcast i32* %p to float*
+  %v2f = load float, float* %pf
+  %v2 = bitcast float %v2f to i32
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_size1(i32* %p) {
+; CHECK-LABEL: @test_negative_size1
+; CHECK: ret i32 %sub
+  call {}* @llvm.invariant.start.p0i32(i64 3, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_size2(i32* %p) {
+; CHECK-LABEL: @test_negative_size2
+; CHECK: ret i32 %sub
+  call {}* @llvm.invariant.start.p0i32(i64 0, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_scope(i32* %p) {
+; CHECK-LABEL: @test_negative_scope
+; CHECK: ret i32 %sub
+  %scope = call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @llvm.invariant.end.p0i32({}* %scope, i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_false_negative_scope(i32* %p) {
+; CHECK-LABEL: @test_false_negative_scope
+; CHECK: ret i32 %sub
+  %scope = call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  call void @llvm.invariant.end.p0i32({}* %scope, i64 4, i32* %p)
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+; An invariant load de facto starts an invariant.start scope of the
+; appropriate size.
+define i32 @test_invariant_load_scope(i32* %p) {
+; CHECK-LABEL: @test_invariant_load_scope
+; CHECK: ret i32 0
+  %v1 = load i32, i32* %p, !invariant.load !{}
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}