llvm.org GIT mirror llvm / 05da4dd
Improve 'tail' call marking in TRE. A bootstrap of clang goes from 375k calls marked tail in the IR to 470k, however this improvement does not carry into an improvement of the call/jmp ratio on x86. The most common pattern is a tail call + br to a block with nothing but a 'ret'. The number of tail call to loop conversions remains the same (1618 by my count). The new algorithm does a local scan over the use-def chains to identify local "alloca-derived" values, as well as points where the alloca could escape. Then, a visit over the CFG marks blocks as being before or after the allocas have escaped, and annotates the calls accordingly. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208017 91177308-0d34-0410-b5e6-96231b3b80d8 Nick Lewycky 5 years ago
3 changed file(s) with 280 addition(s) and 84 deletion(s). Raw diff Collapse all Expand all
165165 return isCall() && cast(getInstruction())->isMustTailCall();
166166 }
167167
168 /// \brief Tests if this call site is marked as a tail call.
169 bool isTailCall() const {
170 return isCall() && cast(getInstruction())->isTailCall();
171 }
172
168173 #define CALLSITE_DELEGATE_GETTER(METHOD) \
169174 InstrTy *II = getInstruction(); \
170175 return isCall() \
5454 #include "llvm/ADT/SmallPtrSet.h"
5555 #include "llvm/ADT/Statistic.h"
5656 #include "llvm/Analysis/CaptureTracking.h"
57 #include "llvm/Analysis/CFG.h"
5758 #include "llvm/Analysis/InlineCost.h"
5859 #include "llvm/Analysis/InstructionSimplify.h"
5960 #include "llvm/Analysis/Loads.h"
9495 bool runOnFunction(Function &F) override;
9596
9697 private:
98 bool runTRE(Function &F);
99 bool markTails(Function &F, bool &AllCallsAreTailCalls);
100
97101 CallInst *FindTRECandidate(Instruction *I,
98102 bool CannotTailCallElimCallsMarkedTail);
99103 bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
145149 isa(AI->getArraySize());
146150 }
147151
148 namespace {
149 struct AllocaCaptureTracker : public CaptureTracker {
150 AllocaCaptureTracker() : Captured(false) {}
151
152 void tooManyUses() override { Captured = true; }
153
154 bool shouldExplore(const Use *U) override {
155 Value *V = U->getUser();
156 if (isa(V) || isa(V))
157 UsesAlloca.insert(V);
158 return true;
159 }
160
161 bool captured(const Use *U) override {
162 if (isa(U->getUser()))
163 return false;
164 Captured = true;
165 return true;
166 }
167
168 bool Captured;
169 SmallPtrSet UsesAlloca;
170 };
171 } // end anonymous namespace
172
173152 bool TailCallElim::runOnFunction(Function &F) {
174153 if (skipOptnoneFunction(F))
175154 return false;
176155
156 bool AllCallsAreTailCalls = false;
157 bool Modified = markTails(F, AllCallsAreTailCalls);
158 if (AllCallsAreTailCalls)
159 Modified |= runTRE(F);
160 return Modified;
161 }
162
163 namespace {
164 struct AllocaDerivedValueTracker {
165 // Start at a root value and walk its use-def chain to mark calls that use the
166 // value or a derived value in AllocaUsers, and places where it may escape in
167 // EscapePoints.
168 void walk(Value *Root) {
169 SmallVector Worklist;
170 SmallPtrSet Visited;
171
172 auto AddUsesToWorklist = [&](Value *V) {
173 for (auto &U : V->uses()) {
174 if (!Visited.insert(&U))
175 continue;
176 Worklist.push_back(&U);
177 }
178 };
179
180 AddUsesToWorklist(Root);
181
182 while (!Worklist.empty()) {
183 Use *U = Worklist.pop_back_val();
184 Instruction *I = cast(U->getUser());
185
186 switch (I->getOpcode()) {
187 case Instruction::Call:
188 case Instruction::Invoke: {
189 CallSite CS(I);
190 bool IsNocapture = !CS.isCallee(U) &&
191 CS.doesNotCapture(CS.getArgumentNo(U));
192 callUsesLocalStack(CS, IsNocapture);
193 if (IsNocapture) {
194 // If the alloca-derived argument is passed in as nocapture, then it
195 // can't propagate to the call's return. That would be capturing.
196 continue;
197 }
198 break;
199 }
200 case Instruction::Load: {
201 // The result of a load is not alloca-derived (unless an alloca has
202 // otherwise escaped, but this is a local analysis).
203 continue;
204 }
205 case Instruction::Store: {
206 if (U->getOperandNo() == 0)
207 EscapePoints.insert(I);
208 continue; // Stores have no users to analyze.
209 }
210 case Instruction::BitCast:
211 case Instruction::GetElementPtr:
212 case Instruction::PHI:
213 case Instruction::Select:
214 case Instruction::AddrSpaceCast:
215 break;
216 default:
217 EscapePoints.insert(I);
218 break;
219 }
220
221 AddUsesToWorklist(I);
222 }
223 }
224
225 void callUsesLocalStack(CallSite CS, bool IsNocapture) {
226 // Add it to the list of alloca users. If it's already there, skip further
227 // processing.
228 if (!AllocaUsers.insert(CS.getInstruction()))
229 return;
230
231 // If it's nocapture then it can't capture the alloca.
232 if (IsNocapture)
233 return;
234
235 // If it can write to memory, it can leak the alloca value.
236 if (!CS.onlyReadsMemory())
237 EscapePoints.insert(CS.getInstruction());
238 }
239
240 SmallPtrSet AllocaUsers;
241 SmallPtrSet EscapePoints;
242 };
243 }
244
245 bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) {
246 if (F.callsFunctionThatReturnsTwice())
247 return false;
248 AllCallsAreTailCalls = true;
249
250 // The local stack holds all alloca instructions and all byval arguments.
251 AllocaDerivedValueTracker Tracker;
252 for (Argument &Arg : F.args()) {
253 if (Arg.hasByValAttr())
254 Tracker.walk(&Arg);
255 }
256 for (auto &BB : F) {
257 for (auto &I : BB)
258 if (AllocaInst *AI = dyn_cast(&I))
259 Tracker.walk(AI);
260 }
261
262 bool Modified = false;
263
264 // Track whether a block is reachable after an alloca has escaped. Blocks that
265 // contain the escaping instruction will be marked as being visited without an
266 // escaped alloca, since that is how the block began.
267 enum VisitType {
268 UNVISITED,
269 UNESCAPED,
270 ESCAPED
271 };
272 DenseMap Visited;
273
274 // We propagate the fact that an alloca has escaped from block to successor.
275 // Visit the blocks that are propagating the escapedness first. To do this, we
276 // maintain two worklists.
277 SmallVector WorklistUnescaped, WorklistEscaped;
278
279 // We may enter a block and visit it thinking that no alloca has escaped yet,
280 // then see an escape point and go back around a loop edge and come back to
281 // the same block twice. Because of this, we defer setting tail on calls when
282 // we first encounter them in a block. Every entry in this list does not
283 // statically use an alloca via use-def chain analysis, but may find an alloca
284 // through other means if the block turns out to be reachable after an escape
285 // point.
286 SmallVector DeferredTails;
287
288 BasicBlock *BB = &F.getEntryBlock();
289 VisitType Escaped = UNESCAPED;
290 do {
291 for (auto &I : *BB) {
292 if (Tracker.EscapePoints.count(&I))
293 Escaped = ESCAPED;
294
295 CallInst *CI = dyn_cast(&I);
296 if (!CI || CI->isTailCall())
297 continue;
298
299 if (CI->doesNotAccessMemory()) {
300 // A call to a readnone function whose arguments are all things computed
301 // outside this function can be marked tail. Even if you stored the
302 // alloca address into a global, a readnone function can't load the
303 // global anyhow.
304 //
305 // Note that this runs whether we know an alloca has escaped or not. If
306 // it has, then we can't trust Tracker.AllocaUsers to be accurate.
307 bool SafeToTail = true;
308 for (auto &Arg : CI->arg_operands()) {
309 if (isa(Arg.getUser()))
310 continue;
311 if (Argument *A = dyn_cast(Arg.getUser()))
312 if (!A->hasByValAttr())
313 continue;
314 SafeToTail = false;
315 break;
316 }
317 if (SafeToTail) {
318 F.getContext().emitOptimizationRemark(
319 "tailcallelim", F, CI->getDebugLoc(),
320 "found readnone tail call candidate");
321 CI->setTailCall();
322 Modified = true;
323 continue;
324 }
325 }
326
327 if (Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
328 DeferredTails.push_back(CI);
329 } else {
330 AllCallsAreTailCalls = false;
331 }
332 }
333
334 for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) {
335 auto &State = Visited[SuccBB];
336 if (State < Escaped) {
337 State = Escaped;
338 if (State == ESCAPED)
339 WorklistEscaped.push_back(SuccBB);
340 else
341 WorklistUnescaped.push_back(SuccBB);
342 }
343 }
344
345 if (!WorklistEscaped.empty()) {
346 BB = WorklistEscaped.pop_back_val();
347 Escaped = ESCAPED;
348 } else {
349 BB = nullptr;
350 while (!WorklistUnescaped.empty()) {
351 auto *NextBB = WorklistUnescaped.pop_back_val();
352 if (Visited[NextBB] == UNESCAPED) {
353 BB = NextBB;
354 Escaped = UNESCAPED;
355 break;
356 }
357 }
358 }
359 } while (BB);
360
361 for (CallInst *CI : DeferredTails) {
362 if (Visited[CI->getParent()] != ESCAPED) {
363 // If the escape point was part way through the block, calls after the
364 // escape point wouldn't have been put into DeferredTails.
365 F.getContext().emitOptimizationRemark(
366 "tailcallelim", F, CI->getDebugLoc(), "found tail call candidate");
367 CI->setTailCall();
368 Modified = true;
369 } else {
370 AllCallsAreTailCalls = false;
371 }
372 }
373
374 return Modified;
375 }
376
377 bool TailCallElim::runTRE(Function &F) {
177378 // If this function is a varargs function, we won't be able to PHI the args
178379 // right, so don't even try to convert it...
179380 if (F.getFunctionType()->isVarArg()) return false;
190391 // doesn't).
191392 bool CanTRETailMarkedCall = true;
192393
193 // Find calls that can be marked tail.
194 AllocaCaptureTracker ACT;
394 // Find dynamic allocas.
195395 for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) {
196396 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
197397 if (AllocaInst *AI = dyn_cast(I)) {
198398 CanTRETailMarkedCall &= CanTRE(AI);
199 PointerMayBeCaptured(AI, &ACT);
200 // If any allocas are captured, exit.
201 if (ACT.Captured)
202 return false;
203 }
204 }
205 }
206
207 // If any byval or inalloca args are captured, exit. They are also allocated
208 // in our stack frame.
209 for (Argument &Arg : F.args()) {
210 if (Arg.hasByValOrInAllocaAttr())
211 PointerMayBeCaptured(&Arg, &ACT);
212 if (ACT.Captured)
213 return false;
214 }
215
216 // Second pass, change any tail recursive calls to loops.
399 }
400 }
401 }
402
403 // Change any tail recursive calls to loops.
217404 //
218405 // FIXME: The code generator produces really bad code when an 'escaping
219406 // alloca' is changed from being a static alloca to being a dynamic alloca.
220407 // Until this is resolved, disable this transformation if that would ever
221408 // happen. This bug is PR962.
222 if (ACT.UsesAlloca.empty()) {
223 for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
224 if (ReturnInst *Ret = dyn_cast(BB->getTerminator())) {
225 bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
226 ArgumentPHIs, !CanTRETailMarkedCall);
227 if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
228 Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
229 TailCallsAreMarkedTail, ArgumentPHIs,
230 !CanTRETailMarkedCall);
231 MadeChange |= Change;
232 }
409 for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
410 if (ReturnInst *Ret = dyn_cast(BB->getTerminator())) {
411 bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
412 ArgumentPHIs, !CanTRETailMarkedCall);
413 if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
414 Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
415 TailCallsAreMarkedTail, ArgumentPHIs,
416 !CanTRETailMarkedCall);
417 MadeChange |= Change;
233418 }
234419 }
235420
238423 // with themselves. Check to see if we did and clean up our mess if so. This
239424 // occurs when a function passes an argument straight through to its tail
240425 // call.
241 if (!ArgumentPHIs.empty()) {
242 for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) {
243 PHINode *PN = ArgumentPHIs[i];
244
245 // If the PHI Node is a dynamic constant, replace it with the value it is.
246 if (Value *PNV = SimplifyInstruction(PN)) {
247 PN->replaceAllUsesWith(PNV);
248 PN->eraseFromParent();
249 }
250 }
251 }
252
253 // At this point, we know that the function does not have any captured
254 // allocas. If additionally the function does not call setjmp, mark all calls
255 // in the function that do not access stack memory with the tail keyword. This
256 // implies ensuring that there does not exist any path from a call that takes
257 // in an alloca but does not capture it and the call which we wish to mark
258 // with "tail".
259 if (!F.callsFunctionThatReturnsTwice()) {
260 for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
261 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
262 if (CallInst *CI = dyn_cast(I)) {
263 if (!ACT.UsesAlloca.count(CI)) {
264 CI->setTailCall();
265 MadeChange = true;
266 }
267 }
268 }
426 for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) {
427 PHINode *PN = ArgumentPHIs[i];
428
429 // If the PHI Node is a dynamic constant, replace it with the value it is.
430 if (Value *PNV = SimplifyInstruction(PN)) {
431 PN->replaceAllUsesWith(PNV);
432 PN->eraseFromParent();
269433 }
270434 }
271435
519683 BasicBlock *BB = Ret->getParent();
520684 Function *F = BB->getParent();
521685
686 F->getContext().emitOptimizationRemark(
687 "tailcallelim", *F, CI->getDebugLoc(),
688 "transforming tail recursion to loop");
689
522690 // OK! We can transform this tail call. If this is the first one found,
523691 // create the new entry block, allowing us to branch back to the old entry.
524692 if (!OldEntry) {
150150 call void @use(i32* %a)
151151 ret void
152152 }
153
%struct.X = type { i8* }

declare void @ctor(%struct.X*)

; The sret argument lives in the caller's frame, so calling through it may be
; marked tail; the call taking the local alloca %x must not be.
; Fix: CHECK-LABEL requires a trailing colon, otherwise FileCheck silently
; ignores the directive.
define void @test10(%struct.X* noalias sret %agg.result, i1 zeroext %b) {
; CHECK-LABEL: @test10
entry:
  %x = alloca %struct.X, align 8
  br i1 %b, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  call void @ctor(%struct.X* %agg.result)
; CHECK: tail call void @ctor
  br label %return

if.end:                                           ; preds = %entry
  call void @ctor(%struct.X* %x)
; CHECK: call void @ctor
  br label %return

return:                                           ; preds = %if.end, %if.then
  ret void
}