llvm.org GIT mirror llvm / 486ad62
Vectorize intrinsic math function calls in SLPVectorizer. This patch adds support for recognizing and vectorizing intrinsic math functions in the SLPVectorizer. Review: http://reviews.llvm.org/D3560 and http://reviews.llvm.org/D3559 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207901 91177308-0d34-0410-b5e6-96231b3b80d8 Karthik Bhat 5 years ago
4 changed file(s) with 274 addition(s) and 143 deletion(s). Raw diff Collapse all Expand all
1414 #define LLVM_TRANSFORMS_UTILS_VECTORUTILS_H
1515
1616 #include "llvm/IR/Intrinsics.h"
17 #include "llvm/Target/TargetLibraryInfo.h"
1718
1819 namespace llvm {
1920
5051 }
5152 }
5253
54
55 static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I,
56 Intrinsic::ID ValidIntrinsicID) {
57 if (I.getNumArgOperands() != 1 ||
58 !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
59 I.getType() != I.getArgOperand(0)->getType() ||
60 !I.onlyReadsMemory())
61 return Intrinsic::not_intrinsic;
62
63 return ValidIntrinsicID;
64 }
65
66 static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I,
67 Intrinsic::ID ValidIntrinsicID) {
68 if (I.getNumArgOperands() != 2 ||
69 !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
70 !I.getArgOperand(1)->getType()->isFloatingPointTy() ||
71 I.getType() != I.getArgOperand(0)->getType() ||
72 I.getType() != I.getArgOperand(1)->getType() ||
73 !I.onlyReadsMemory())
74 return Intrinsic::not_intrinsic;
75
76 return ValidIntrinsicID;
77 }
78
79 static Intrinsic::ID
80 getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
81 // If we have an intrinsic call, check if it is trivially vectorizable.
82 if (IntrinsicInst *II = dyn_cast(CI)) {
83 Intrinsic::ID ID = II->getIntrinsicID();
84 if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start ||
85 ID == Intrinsic::lifetime_end)
86 return ID;
87 else
88 return Intrinsic::not_intrinsic;
89 }
90
91 if (!TLI)
92 return Intrinsic::not_intrinsic;
93
94 LibFunc::Func Func;
95 Function *F = CI->getCalledFunction();
96 // We're going to make assumptions on the semantics of the functions, check
97 // that the target knows that it's available in this environment and it does
98 // not have local linkage.
99 if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func))
100 return Intrinsic::not_intrinsic;
101
102 // Otherwise check if we have a call to a function that can be turned into a
103 // vector intrinsic.
104 switch (Func) {
105 default:
106 break;
107 case LibFunc::sin:
108 case LibFunc::sinf:
109 case LibFunc::sinl:
110 return checkUnaryFloatSignature(*CI, Intrinsic::sin);
111 case LibFunc::cos:
112 case LibFunc::cosf:
113 case LibFunc::cosl:
114 return checkUnaryFloatSignature(*CI, Intrinsic::cos);
115 case LibFunc::exp:
116 case LibFunc::expf:
117 case LibFunc::expl:
118 return checkUnaryFloatSignature(*CI, Intrinsic::exp);
119 case LibFunc::exp2:
120 case LibFunc::exp2f:
121 case LibFunc::exp2l:
122 return checkUnaryFloatSignature(*CI, Intrinsic::exp2);
123 case LibFunc::log:
124 case LibFunc::logf:
125 case LibFunc::logl:
126 return checkUnaryFloatSignature(*CI, Intrinsic::log);
127 case LibFunc::log10:
128 case LibFunc::log10f:
129 case LibFunc::log10l:
130 return checkUnaryFloatSignature(*CI, Intrinsic::log10);
131 case LibFunc::log2:
132 case LibFunc::log2f:
133 case LibFunc::log2l:
134 return checkUnaryFloatSignature(*CI, Intrinsic::log2);
135 case LibFunc::fabs:
136 case LibFunc::fabsf:
137 case LibFunc::fabsl:
138 return checkUnaryFloatSignature(*CI, Intrinsic::fabs);
139 case LibFunc::copysign:
140 case LibFunc::copysignf:
141 case LibFunc::copysignl:
142 return checkBinaryFloatSignature(*CI, Intrinsic::copysign);
143 case LibFunc::floor:
144 case LibFunc::floorf:
145 case LibFunc::floorl:
146 return checkUnaryFloatSignature(*CI, Intrinsic::floor);
147 case LibFunc::ceil:
148 case LibFunc::ceilf:
149 case LibFunc::ceill:
150 return checkUnaryFloatSignature(*CI, Intrinsic::ceil);
151 case LibFunc::trunc:
152 case LibFunc::truncf:
153 case LibFunc::truncl:
154 return checkUnaryFloatSignature(*CI, Intrinsic::trunc);
155 case LibFunc::rint:
156 case LibFunc::rintf:
157 case LibFunc::rintl:
158 return checkUnaryFloatSignature(*CI, Intrinsic::rint);
159 case LibFunc::nearbyint:
160 case LibFunc::nearbyintf:
161 case LibFunc::nearbyintl:
162 return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint);
163 case LibFunc::round:
164 case LibFunc::roundf:
165 case LibFunc::roundl:
166 return checkUnaryFloatSignature(*CI, Intrinsic::round);
167 case LibFunc::pow:
168 case LibFunc::powf:
169 case LibFunc::powl:
170 return checkBinaryFloatSignature(*CI, Intrinsic::pow);
171 }
172
173 return Intrinsic::not_intrinsic;
174 }
175
176
53177 } // llvm namespace
54178
55179 #endif
8484 #include "llvm/Support/Debug.h"
8585 #include "llvm/Support/Format.h"
8686 #include "llvm/Support/raw_ostream.h"
87 #include "llvm/Target/TargetLibraryInfo.h"
8887 #include "llvm/Transforms/Scalar.h"
8988 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
9089 #include "llvm/Transforms/Utils/Local.h"
22972296 default:
22982297 llvm_unreachable("Unknown reduction kind");
22992298 }
2300 }
2301
2302 static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I,
2303 Intrinsic::ID ValidIntrinsicID) {
2304 if (I.getNumArgOperands() != 1 ||
2305 !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
2306 I.getType() != I.getArgOperand(0)->getType() ||
2307 !I.onlyReadsMemory())
2308 return Intrinsic::not_intrinsic;
2309
2310 return ValidIntrinsicID;
2311 }
2312
2313 static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I,
2314 Intrinsic::ID ValidIntrinsicID) {
2315 if (I.getNumArgOperands() != 2 ||
2316 !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
2317 !I.getArgOperand(1)->getType()->isFloatingPointTy() ||
2318 I.getType() != I.getArgOperand(0)->getType() ||
2319 I.getType() != I.getArgOperand(1)->getType() ||
2320 !I.onlyReadsMemory())
2321 return Intrinsic::not_intrinsic;
2322
2323 return ValidIntrinsicID;
2324 }
2325
2326
2327 static Intrinsic::ID
2328 getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
2329 // If we have an intrinsic call, check if it is trivially vectorizable.
2330 if (IntrinsicInst *II = dyn_cast(CI)) {
2331 Intrinsic::ID ID = II->getIntrinsicID();
2332 if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start ||
2333 ID == Intrinsic::lifetime_end)
2334 return ID;
2335 else
2336 return Intrinsic::not_intrinsic;
2337 }
2338
2339 if (!TLI)
2340 return Intrinsic::not_intrinsic;
2341
2342 LibFunc::Func Func;
2343 Function *F = CI->getCalledFunction();
2344 // We're going to make assumptions on the semantics of the functions, check
2345 // that the target knows that it's available in this environment and it does
2346 // not have local linkage.
2347 if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func))
2348 return Intrinsic::not_intrinsic;
2349
2350 // Otherwise check if we have a call to a function that can be turned into a
2351 // vector intrinsic.
2352 switch (Func) {
2353 default:
2354 break;
2355 case LibFunc::sin:
2356 case LibFunc::sinf:
2357 case LibFunc::sinl:
2358 return checkUnaryFloatSignature(*CI, Intrinsic::sin);
2359 case LibFunc::cos:
2360 case LibFunc::cosf:
2361 case LibFunc::cosl:
2362 return checkUnaryFloatSignature(*CI, Intrinsic::cos);
2363 case LibFunc::exp:
2364 case LibFunc::expf:
2365 case LibFunc::expl:
2366 return checkUnaryFloatSignature(*CI, Intrinsic::exp);
2367 case LibFunc::exp2:
2368 case LibFunc::exp2f:
2369 case LibFunc::exp2l:
2370 return checkUnaryFloatSignature(*CI, Intrinsic::exp2);
2371 case LibFunc::log:
2372 case LibFunc::logf:
2373 case LibFunc::logl:
2374 return checkUnaryFloatSignature(*CI, Intrinsic::log);
2375 case LibFunc::log10:
2376 case LibFunc::log10f:
2377 case LibFunc::log10l:
2378 return checkUnaryFloatSignature(*CI, Intrinsic::log10);
2379 case LibFunc::log2:
2380 case LibFunc::log2f:
2381 case LibFunc::log2l:
2382 return checkUnaryFloatSignature(*CI, Intrinsic::log2);
2383 case LibFunc::fabs:
2384 case LibFunc::fabsf:
2385 case LibFunc::fabsl:
2386 return checkUnaryFloatSignature(*CI, Intrinsic::fabs);
2387 case LibFunc::copysign:
2388 case LibFunc::copysignf:
2389 case LibFunc::copysignl:
2390 return checkBinaryFloatSignature(*CI, Intrinsic::copysign);
2391 case LibFunc::floor:
2392 case LibFunc::floorf:
2393 case LibFunc::floorl:
2394 return checkUnaryFloatSignature(*CI, Intrinsic::floor);
2395 case LibFunc::ceil:
2396 case LibFunc::ceilf:
2397 case LibFunc::ceill:
2398 return checkUnaryFloatSignature(*CI, Intrinsic::ceil);
2399 case LibFunc::trunc:
2400 case LibFunc::truncf:
2401 case LibFunc::truncl:
2402 return checkUnaryFloatSignature(*CI, Intrinsic::trunc);
2403 case LibFunc::rint:
2404 case LibFunc::rintf:
2405 case LibFunc::rintl:
2406 return checkUnaryFloatSignature(*CI, Intrinsic::rint);
2407 case LibFunc::nearbyint:
2408 case LibFunc::nearbyintf:
2409 case LibFunc::nearbyintl:
2410 return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint);
2411 case LibFunc::round:
2412 case LibFunc::roundf:
2413 case LibFunc::roundl:
2414 return checkUnaryFloatSignature(*CI, Intrinsic::round);
2415 case LibFunc::pow:
2416 case LibFunc::powf:
2417 case LibFunc::powl:
2418 return checkBinaryFloatSignature(*CI, Intrinsic::pow);
2419 }
2420
2421 return Intrinsic::not_intrinsic;
24222299 }
24232300
24242301 /// This function translates the reduction kind to an LLVM binary operator.
345345 typedef SmallVector StoreList;
346346
347347 BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
348 TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li,
348 TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
349349 DominatorTree *Dt) :
350 F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt),
350 F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
351351 Builder(Se->getContext()) {
352352 // Setup the block numbering utility for all of the blocks in the
353353 // function.
535535 ScalarEvolution *SE;
536536 const DataLayout *DL;
537537 TargetTransformInfo *TTI;
538 TargetLibraryInfo *TLI;
538539 AliasAnalysis *AA;
539540 LoopInfo *LI;
540541 DominatorTree *DT;
948949 }
949950 case Instruction::Call: {
950951 // Check if the calls are all to the same vectorizable intrinsic.
951 IntrinsicInst *II = dyn_cast(VL[0]);
952 Intrinsic::ID ID = II ? II->getIntrinsicID() : Intrinsic::not_intrinsic;
953
952 CallInst *CI = cast(VL[0]);
953 // Check if this is an Intrinsic call or something that can be
954 // represented by an intrinsic call
955 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
954956 if (!isTriviallyVectorizable(ID)) {
955957 newTreeEntry(VL, false);
956958 DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
957959 return;
958960 }
959961
960 Function *Int = II->getCalledFunction();
962 Function *Int = CI->getCalledFunction();
961963
962964 for (unsigned i = 1, e = VL.size(); i != e; ++i) {
963 IntrinsicInst *II2 = dyn_cast(VL[i]);
964 if (!II2 || II2->getCalledFunction() != Int) {
965 CallInst *CI2 = dyn_cast(VL[i]);
966 if (!CI2 || CI2->getCalledFunction() != Int ||
967 getIntrinsicIDForCall(CI2, TLI) != ID) {
965968 newTreeEntry(VL, false);
966 DEBUG(dbgs() << "SLP: mismatched calls:" << *II << "!=" << *VL[i]
969 DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
967970 << "\n");
968971 return;
969972 }
970973 }
971974
972975 newTreeEntry(VL, true);
973 for (unsigned i = 0, e = II->getNumArgOperands(); i != e; ++i) {
976 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
974977 ValueList Operands;
975978 // Prepare the operand vector.
976979 for (unsigned j = 0; j < VL.size(); ++j) {
977 IntrinsicInst *II2 = dyn_cast(VL[j]);
978 Operands.push_back(II2->getArgOperand(i));
980 CallInst *CI2 = dyn_cast(VL[j]);
981 Operands.push_back(CI2->getArgOperand(i));
979982 }
980983 buildTree_rec(Operands, Depth + 1);
981984 }
11311134 }
11321135 case Instruction::Call: {
11331136 CallInst *CI = cast(VL0);
1134 IntrinsicInst *II = cast(CI);
1135 Intrinsic::ID ID = II->getIntrinsicID();
1137 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
11361138
11371139 // Calculate the cost of the scalar and vector calls.
11381140 SmallVector ScalarTys, VecTys;
1139 for (unsigned op = 0, opc = II->getNumArgOperands(); op!= opc; ++op) {
1141 for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
11401142 ScalarTys.push_back(CI->getArgOperand(op)->getType());
11411143 VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
11421144 VecTy->getNumElements()));
11491151
11501152 DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
11511153 << " (" << VecCallCost << "-" << ScalarCallCost << ")"
1152 << " for " << *II << "\n");
1154 << " for " << *CI << "\n");
11531155
11541156 return VecCallCost - ScalarCallCost;
11551157 }
16421644 }
16431645 case Instruction::Call: {
16441646 CallInst *CI = cast(VL0);
1645
16461647 setInsertPointAfterBundle(E->Scalars);
16471648 std::vector OpVecs;
16481649 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
16581659 }
16591660
16601661 Module *M = F->getParent();
1661 IntrinsicInst *II = cast(CI);
1662 Intrinsic::ID ID = II->getIntrinsicID();
1662 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
16631663 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
16641664 Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
16651665 Value *V = Builder.CreateCall(CF, OpVecs);
18661866 ScalarEvolution *SE;
18671867 const DataLayout *DL;
18681868 TargetTransformInfo *TTI;
1869 TargetLibraryInfo *TLI;
18691870 AliasAnalysis *AA;
18701871 LoopInfo *LI;
18711872 DominatorTree *DT;
18781879 DataLayoutPass *DLP = getAnalysisIfAvailable();
18791880 DL = DLP ? &DLP->getDataLayout() : nullptr;
18801881 TTI = &getAnalysis();
1882 TLI = getAnalysisIfAvailable();
18811883 AA = &getAnalysis();
18821884 LI = &getAnalysis();
18831885 DT = &getAnalysis().getDomTree();
19031905
19041906 // Use the bottom up slp vectorizer to construct chains that start with
19051907 // he store instructions.
1906 BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT);
1908 BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT);
19071909
19081910 // Scan the blocks in the function in post order.
19091911 for (po_iterator it = po_begin(&F.getEntryBlock()),
0 ; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-999 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
1
2 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
3 target triple = "x86_64-apple-macosx10.8.0"
4
5 declare double @sin(double)
6 declare double @cos(double)
7 declare double @pow(double, double)
8 declare double @exp2(double)
9 declare i64 @round(i64)
10
11
12 ; CHECK: sin_libm
13 ; CHECK: call <2 x double> @llvm.sin.v2f64
14 ; CHECK: ret void
; Loads a[0..1] and b[0..1], multiplies pairwise, calls readnone sin() on
; each product, and stores both results to c[0..1].  The two independent
; scalar sin() calls should be SLP-vectorized into a single
; <2 x double> @llvm.sin.v2f64 call (see the CHECK lines above).
15 define void @sin_libm(double* %a, double* %b, double* %c) {
16 entry:
17 %i0 = load double* %a, align 8
18 %i1 = load double* %b, align 8
19 %mul = fmul double %i0, %i1
20 %call = tail call double @sin(double %mul) nounwind readnone
21 %arrayidx3 = getelementptr inbounds double* %a, i64 1
22 %i3 = load double* %arrayidx3, align 8
23 %arrayidx4 = getelementptr inbounds double* %b, i64 1
24 %i4 = load double* %arrayidx4, align 8
25 %mul5 = fmul double %i3, %i4
26 %call5 = tail call double @sin(double %mul5) nounwind readnone
27 store double %call, double* %c, align 8
28 %arrayidx5 = getelementptr inbounds double* %c, i64 1
29 store double %call5, double* %arrayidx5, align 8
30 ret void
31 }
32
33 ; CHECK: cos_libm
34 ; CHECK: call <2 x double> @llvm.cos.v2f64
35 ; CHECK: ret void
; Same shape as @sin_libm but for cos(): two independent readnone cos()
; calls on adjacent elements, expected to become one
; <2 x double> @llvm.cos.v2f64 call (see the CHECK lines above).
36 define void @cos_libm(double* %a, double* %b, double* %c) {
37 entry:
38 %i0 = load double* %a, align 8
39 %i1 = load double* %b, align 8
40 %mul = fmul double %i0, %i1
41 %call = tail call double @cos(double %mul) nounwind readnone
42 %arrayidx3 = getelementptr inbounds double* %a, i64 1
43 %i3 = load double* %arrayidx3, align 8
44 %arrayidx4 = getelementptr inbounds double* %b, i64 1
45 %i4 = load double* %arrayidx4, align 8
46 %mul5 = fmul double %i3, %i4
47 %call5 = tail call double @cos(double %mul5) nounwind readnone
48 store double %call, double* %c, align 8
49 %arrayidx5 = getelementptr inbounds double* %c, i64 1
50 store double %call5, double* %arrayidx5, align 8
51 ret void
52 }
53
54 ; CHECK: pow_libm
55 ; CHECK: call <2 x double> @llvm.pow.v2f64
56 ; CHECK: ret void
; Binary-intrinsic case: pow(x, x) called on two adjacent products.
; Both readnone pow() calls should be combined into a single
; <2 x double> @llvm.pow.v2f64 call (see the CHECK lines above).
57 define void @pow_libm(double* %a, double* %b, double* %c) {
58 entry:
59 %i0 = load double* %a, align 8
60 %i1 = load double* %b, align 8
61 %mul = fmul double %i0, %i1
62 %call = tail call double @pow(double %mul,double %mul) nounwind readnone
63 %arrayidx3 = getelementptr inbounds double* %a, i64 1
64 %i3 = load double* %arrayidx3, align 8
65 %arrayidx4 = getelementptr inbounds double* %b, i64 1
66 %i4 = load double* %arrayidx4, align 8
67 %mul5 = fmul double %i3, %i4
68 %call5 = tail call double @pow(double %mul5,double %mul5) nounwind readnone
69 store double %call, double* %c, align 8
70 %arrayidx5 = getelementptr inbounds double* %c, i64 1
71 store double %call5, double* %arrayidx5, align 8
72 ret void
73 }
74
75
76 ; CHECK: exp2_libm
77 ; CHECK: call <2 x double> @llvm.exp2.v2f64
78 ; CHECK: ret void
; Same shape as @sin_libm but for exp2(): two independent readnone exp2()
; calls on adjacent elements, expected to become one
; <2 x double> @llvm.exp2.v2f64 call (see the CHECK lines above).
79 define void @exp2_libm(double* %a, double* %b, double* %c) {
80 entry:
81 %i0 = load double* %a, align 8
82 %i1 = load double* %b, align 8
83 %mul = fmul double %i0, %i1
84 %call = tail call double @exp2(double %mul) nounwind readnone
85 %arrayidx3 = getelementptr inbounds double* %a, i64 1
86 %i3 = load double* %arrayidx3, align 8
87 %arrayidx4 = getelementptr inbounds double* %b, i64 1
88 %i4 = load double* %arrayidx4, align 8
89 %mul5 = fmul double %i3, %i4
90 %call5 = tail call double @exp2(double %mul5) nounwind readnone
91 store double %call, double* %c, align 8
92 %arrayidx5 = getelementptr inbounds double* %c, i64 1
93 store double %call5, double* %arrayidx5, align 8
94 ret void
95 }
96
97
98 ; Negative test case
99 ; CHECK: round_custom
100 ; CHECK-NOT: load <4 x i64>
101 ; CHECK: ret void
; Negative test: @round here is declared as i64 (i64), which does NOT match
; the floating-point signature of libm round(double), so
; getIntrinsicIDForCall must reject it and the calls must stay scalar
; (the CHECK-NOT above verifies no vector load is emitted).
102 define void @round_custom(i64* %a, i64* %b, i64* %c) {
103 entry:
104 %i0 = load i64* %a, align 8
105 %i1 = load i64* %b, align 8
106 %mul = mul i64 %i0, %i1
107 %call = tail call i64 @round(i64 %mul) nounwind readnone
108 %arrayidx3 = getelementptr inbounds i64* %a, i64 1
109 %i3 = load i64* %arrayidx3, align 8
110 %arrayidx4 = getelementptr inbounds i64* %b, i64 1
111 %i4 = load i64* %arrayidx4, align 8
112 %mul5 = mul i64 %i3, %i4
113 %call5 = tail call i64 @round(i64 %mul5) nounwind readnone
114 store i64 %call, i64* %c, align 8
115 %arrayidx5 = getelementptr inbounds i64* %c, i64 1
116 store i64 %call5, i64* %arrayidx5, align 8
117 ret void
118 }
119
120
121 ; CHECK: declare <2 x double> @llvm.sin.v2f64(<2 x double>) #0
122 ; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) #0
123 ; CHECK: declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>) #0
124 ; CHECK: declare <2 x double> @llvm.exp2.v2f64(<2 x double>) #0
125
126 ; CHECK: attributes #0 = { nounwind readnone }
127