llvm.org GIT mirror llvm / 8a189ea
[PartiallyInlineLibCalls][x86] add TTI hook to allow sqrt inlining to depend on arg rather than result
This should fix PR31455: https://bugs.llvm.org/show_bug.cgi?id=31455
Differential Revision: https://reviews.llvm.org/D28314
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319094 91177308-0d34-0410-b5e6-96231b3b80d8
Sanjay Patel 1 year, 9 months ago
9 changed file(s) with 58 addition(s) and 25 deletion(s). Raw diff Collapse all Expand all
585585 /// \brief Return true if the hardware has a fast square-root instruction.
586586 bool haveFastSqrt(Type *Ty) const;
587587
588 /// Return true if it is faster to check if a floating-point value is NaN
589 /// (or not-NaN) versus a comparison against a constant FP zero value.
590 /// Targets should override this if materializing a 0.0 for comparison is
591 /// generally as cheap as checking for ordered/unordered.
592 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const;
593
588594 /// \brief Return the expected cost of supporting the floating point operation
589595 /// of the specified type.
590596 int getFPOpCost(Type *Ty) const;
10081014 bool *Fast) = 0;
10091015 virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
10101016 virtual bool haveFastSqrt(Type *Ty) = 0;
1017 virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
10111018 virtual int getFPOpCost(Type *Ty) = 0;
10121019 virtual int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
10131020 Type *Ty) = 0;
12721279 }
12731280 bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
12741281
1282 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) override {
1283 return Impl.isFCmpOrdCheaperThanFCmpZero(Ty);
1284 }
1285
12751286 int getFPOpCost(Type *Ty) override { return Impl.getFPOpCost(Ty); }
12761287
12771288 int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
315315
316316 bool haveFastSqrt(Type *Ty) { return false; }
317317
318 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { return true; }
319
318320 unsigned getFPOpCost(Type *Ty) { return TargetTransformInfo::TCC_Basic; }
319321
320322 int getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
294294 EVT VT = TLI->getValueType(DL, Ty);
295295 return TLI->isTypeLegal(VT) &&
296296 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
297 }
298
299 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
300 return true;
297301 }
298302
299303 unsigned getFPOpCost(Type *Ty) {
278278
279279 bool TargetTransformInfo::haveFastSqrt(Type *Ty) const {
280280 return TTIImpl->haveFastSqrt(Ty);
281 }
282
283 bool TargetTransformInfo::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const {
284 return TTIImpl->isFCmpOrdCheaperThanFCmpZero(Ty);
281285 }
282286
283287 int TargetTransformInfo::getFPOpCost(Type *Ty) const {
25362536 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
25372537 }
25382538
2539 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
2540 return false;
2541 }
2542
25392543 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
25402544 const Function *Callee) const {
25412545 const TargetMachine &TM = getTLI()->getTargetMachine();
124124 bool isLegalMaskedGather(Type *DataType);
125125 bool isLegalMaskedScatter(Type *DataType);
126126 bool hasDivRemOp(Type *DataType, bool IsSigned);
127 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
127128 bool areInlineCompatible(const Function *Caller,
128129 const Function *Callee) const;
129130 const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
2525
2626
2727 static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
28 BasicBlock &CurrBB, Function::iterator &BB) {
28 BasicBlock &CurrBB, Function::iterator &BB,
29 const TargetTransformInfo *TTI) {
2930 // There is no need to change the IR, since backend will emit sqrt
3031 // instruction if the call has already been marked read-only.
3132 if (Call->onlyReadsMemory())
3839 //
3940 // (after)
4041 // v0 = sqrt_noreadmem(src) # native sqrt instruction.
41 // if (v0 is a NaN)
42 // [if (v0 is a NaN) || if (src < 0)]
4243 // v1 = sqrt(src) # library call.
4344 // dst = phi(v0, v1)
4445 //
4748 // Create phi and replace all uses.
4849 BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
4950 IRBuilder<> Builder(JoinBB, JoinBB->begin());
50 PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
51 Type *Ty = Call->getType();
52 PHINode *Phi = Builder.CreatePHI(Ty, 2);
5153 Call->replaceAllUsesWith(Phi);
5254
5355 // Create basic block LibCallBB and insert a call to library function sqrt.
6466 Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
6567 CurrBB.getTerminator()->eraseFromParent();
6668 Builder.SetInsertPoint(&CurrBB);
67 Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
69 Value *FCmp = TTI->isFCmpOrdCheaperThanFCmpZero(Ty)
70 ? Builder.CreateFCmpORD(Call, Call)
71 : Builder.CreateFCmpOGE(Call->getOperand(0),
72 ConstantFP::get(Ty, 0.0));
6873 Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
6974
7075 // Add phi operands.
105110 case LibFunc_sqrtf:
106111 case LibFunc_sqrt:
107112 if (TTI->haveFastSqrt(Call->getType()) &&
108 optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
113 optimizeSQRT(Call, CalledFunc, *CurrBB, BB, TTI))
109114 break;
110115 continue;
111116 default:
22
33 ; PR31455 - https://bugs.llvm.org/show_bug.cgi?id=31455
44 ; We have to assume that errno can be set, so we have to make a libcall in that case.
5 ; But it's better for perf to check that the argument is valid rather than the result of
5 ; But it's better for perf to check that the argument is valid rather than the result of
66 ; sqrtss/sqrtsd.
77 ; Note: This is really a test of the -partially-inline-libcalls IR pass (and we have an IR test
88 ; for that), but we're checking the final asm to make sure that comes out as expected too.
1010 define float @f(float %val) nounwind {
1111 ; CHECK-LABEL: f:
1212 ; CHECK: # BB#0:
13 ; CHECK-NEXT: sqrtss %xmm0, %xmm1
14 ; CHECK-NEXT: ucomiss %xmm1, %xmm1
15 ; CHECK-NEXT: jp .LBB0_2
13 ; CHECK-NEXT: xorps %xmm1, %xmm1
14 ; CHECK-NEXT: ucomiss %xmm1, %xmm0
15 ; CHECK-NEXT: jb .LBB0_2
1616 ; CHECK-NEXT: # BB#1: # %.split
17 ; CHECK-NEXT: movaps %xmm1, %xmm0
17 ; CHECK-NEXT: sqrtss %xmm0, %xmm0
1818 ; CHECK-NEXT: retq
1919 ; CHECK-NEXT: .LBB0_2: # %call.sqrt
2020 ; CHECK-NEXT: jmp sqrtf # TAILCALL
2525 define double @d(double %val) nounwind {
2626 ; CHECK-LABEL: d:
2727 ; CHECK: # BB#0:
28 ; CHECK-NEXT: sqrtsd %xmm0, %xmm1
29 ; CHECK-NEXT: ucomisd %xmm1, %xmm1
30 ; CHECK-NEXT: jp .LBB1_2
28 ; CHECK-NEXT: xorps %xmm1, %xmm1
29 ; CHECK-NEXT: ucomisd %xmm1, %xmm0
30 ; CHECK-NEXT: jb .LBB1_2
3131 ; CHECK-NEXT: # BB#1: # %.split
32 ; CHECK-NEXT: movapd %xmm1, %xmm0
32 ; CHECK-NEXT: sqrtsd %xmm0, %xmm0
3333 ; CHECK-NEXT: retq
3434 ; CHECK-NEXT: .LBB1_2: # %call.sqrt
3535 ; CHECK-NEXT: jmp sqrt # TAILCALL
0 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
1 ; RUN: opt -S -partially-inline-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
12 ; RUN: opt -S -passes=partially-inline-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
23
34 define float @f(float %val) {
4 ; CHECK: @f
5 ; CHECK: entry:
6 ; CHECK-NEXT: %[[RES:.+]] = tail call float @sqrtf(float %val) #0
7 ; CHECK-NEXT: %[[CMP:.+]] = fcmp oeq float %[[RES]], %[[RES]]
8 ; CHECK-NEXT: br i1 %[[CMP]], label %[[EXIT:.+]], label %[[CALL:.+]]
9 ; CHECK: [[CALL]]:
10 ; CHECK-NEXT: %[[RES2:.+]] = tail call float @sqrtf(float %val){{$}}
11 ; CHECK-NEXT: br label %[[EXIT]]
12 ; CHECK: [[EXIT]]:
13 ; CHECK-NEXT: %[[RET:.+]] = phi float [ %[[RES]], %entry ], [ %[[RES2]], %[[CALL]] ]
14 ; CHECK-NEXT: ret float %[[RET]]
5 ; CHECK-LABEL: @f(
6 ; CHECK-NEXT: entry:
7 ; CHECK-NEXT: [[RES:%.*]] = tail call float @sqrtf(float [[VAL:%.*]]) #0
8 ; CHECK-NEXT: [[TMP0:%.*]] = fcmp oge float [[VAL]], 0.000000e+00
9 ; CHECK-NEXT: br i1 [[TMP0]], label [[ENTRY_SPLIT:%.*]], label [[CALL_SQRT:%.*]]
10 ; CHECK: call.sqrt:
11 ; CHECK-NEXT: [[TMP1:%.*]] = tail call float @sqrtf(float [[VAL]])
12 ; CHECK-NEXT: br label [[ENTRY_SPLIT]]
13 ; CHECK: entry.split:
14 ; CHECK-NEXT: [[TMP2:%.*]] = phi float [ [[RES]], [[ENTRY:%.*]] ], [ [[TMP1]], [[CALL_SQRT]] ]
15 ; CHECK-NEXT: ret float [[TMP2]]
16 ;
1517 entry:
1618 %res = tail call float @sqrtf(float %val)
1719 ret float %res