llvm.org GIT mirror, llvm @ 093b041
[SLPV] Recognize vectorizable intrinsics during SLP vectorization and transform accordingly. Based on similar code from loop vectorization. Subsequent commits will include vectorization of function calls to vector intrinsics and will form function calls into vector library calls. Patch by Raul Silvera! (Much delayed due to my not running dcommit.)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@200576 91177308-0d34-0410-b5e6-96231b3b80d8
Chandler Carruth
2 changed files with 162 additions and 4 deletions.
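In effect, the patch teaches the SLP vectorizer to roll a bundle of isomorphic scalar intrinsic calls into a single call of the matching vector intrinsic. A minimal sketch of the transform, modeled on the llvm.fabs case in the new test below (value names are illustrative):

  ; before: two independent scalar calls on adjacent data
  %r0 = call double @llvm.fabs.f64(double %x0)
  %r1 = call double @llvm.fabs.f64(double %x1)

  ; after SLP vectorization: %x0 and %x1 packed into %x, one vector call
  %r = call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)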
@@ -944,6 +944,39 @@
       // We can ignore these values because we are sinking them down.
       MemBarrierIgnoreList.insert(VL.begin(), VL.end());
       buildTree_rec(Operands, Depth + 1);
+      return;
+    }
+    case Instruction::Call: {
+      // Check if the calls are all to the same vectorizable intrinsic.
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(VL[0]);
+      if (II == NULL) {
+        newTreeEntry(VL, false);
+        DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+        return;
+      }
+
+      Intrinsic::ID ID = II->getIntrinsicID();
+
+      for (unsigned i = 1, e = VL.size(); i != e; ++i) {
+        IntrinsicInst *II2 = dyn_cast<IntrinsicInst>(VL[i]);
+        if (!II2 || II2->getIntrinsicID() != ID) {
+          newTreeEntry(VL, false);
+          DEBUG(dbgs() << "SLP: mismatched calls:" << *II << "!=" << *VL[i]
+                       << "\n");
+          return;
+        }
+      }
+
+      newTreeEntry(VL, true);
+      for (unsigned i = 0, e = II->getNumArgOperands(); i != e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j) {
+          IntrinsicInst *II2 = dyn_cast<IntrinsicInst>(VL[j]);
+          Operands.push_back(II2->getArgOperand(i));
+        }
+        buildTree_rec(Operands, Depth + 1);
+      }
       return;
     }
     default:
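For each argument position, the new case gathers that argument from every call in the bundle and recurses on the resulting operand list. A small illustration (values hypothetical): for the bundle { fabs(%a0), fabs(%a1) }, argument position 0 produces the operand bundle { %a0, %a1 }, which buildTree_rec then tries to vectorize at Depth + 1. Any lane that is not a call to the same intrinsic sends the whole bundle down the gather path via newTreeEntry(VL, false).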
@@ -1071,6 +1104,30 @@
       int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
       return VecStCost - ScalarStCost;
     }
+    case Instruction::Call: {
+      CallInst *CI = cast<CallInst>(VL0);
+      IntrinsicInst *II = cast<IntrinsicInst>(CI);
+      Intrinsic::ID ID = II->getIntrinsicID();
+
+      // Calculate the cost of the scalar and vector calls.
+      SmallVector<Type*, 4> ScalarTys, VecTys;
+      for (unsigned op = 0, opc = II->getNumArgOperands(); op != opc; ++op) {
+        ScalarTys.push_back(CI->getArgOperand(op)->getType());
+        VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
+                                         VecTy->getNumElements()));
+      }
+
+      int ScalarCallCost = VecTy->getNumElements() *
+          TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
+
+      int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
+
+      DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
+                   << " (" << VecCallCost << "-" << ScalarCallCost << ")"
+                   << " for " << *II << "\n");
+
+      return VecCallCost - ScalarCallCost;
+    }
     default:
       llvm_unreachable("Unknown instruction");
   }
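A worked example with hypothetical TTI numbers: for a two-lane fabs bundle, if getIntrinsicInstrCost returns 1 for the scalar f64 call and 1 for the <2 x double> call, the case returns 1 - 2 * 1 = -1, a net win for vectorization. On a target where the vector form is expensive, VecCallCost dominates and the bundle stays scalar.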
@@ -1085,10 +1142,10 @@
     return false;

   // Gathering cost would be too much for tiny trees.
-  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
-    return false;
-
-  return true;
+  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
+    return false;
+
+  return true;
 }

 int BoUpSLP::getTreeCost() {
@@ -1553,6 +1610,32 @@
       S->setAlignment(Alignment);
       E->VectorizedValue = S;
       return propagateMetadata(S, E->Scalars);
+    }
+    case Instruction::Call: {
+      CallInst *CI = cast<CallInst>(VL0);
+
+      setInsertPointAfterBundle(E->Scalars);
+      std::vector<Value *> OpVecs;
+      for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
+        ValueList OpVL;
+        for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+          CallInst *CEI = cast<CallInst>(E->Scalars[i]);
+          OpVL.push_back(CEI->getArgOperand(j));
+        }
+
+        Value *OpVec = vectorizeTree(OpVL);
+        DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+        OpVecs.push_back(OpVec);
+      }
+
+      Module *M = F->getParent();
+      IntrinsicInst *II = cast<IntrinsicInst>(CI);
+      Intrinsic::ID ID = II->getIntrinsicID();
+      Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
+      Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
+      Value *V = Builder.CreateCall(CF, OpVecs);
+      E->VectorizedValue = V;
+      return V;
     }
     default:
       llvm_unreachable("unknown inst");
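Intrinsic::getDeclaration with the vector overload type is what produces the name-mangled vector variant; for the fabs case in the test below, the declaration materialized in the module would be:

  declare <2 x double> @llvm.fabs.v2f64(<2 x double>)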
The second changed file is the new regression test:

; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-999 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

declare double @llvm.fabs.f64(double) nounwind readnone

;CHECK-LABEL: @vec_fabs_f64(
;CHECK: load <2 x double>
;CHECK: load <2 x double>
;CHECK: call <2 x double> @llvm.fabs.v2f64
;CHECK: store <2 x double>
;CHECK: ret
define void @vec_fabs_f64(double* %a, double* %b, double* %c) {
entry:
  %i0 = load double* %a, align 8
  %i1 = load double* %b, align 8
  %mul = fmul double %i0, %i1
  %call = tail call double @llvm.fabs.f64(double %mul) nounwind readnone
  %arrayidx3 = getelementptr inbounds double* %a, i64 1
  %i3 = load double* %arrayidx3, align 8
  %arrayidx4 = getelementptr inbounds double* %b, i64 1
  %i4 = load double* %arrayidx4, align 8
  %mul5 = fmul double %i3, %i4
  %call5 = tail call double @llvm.fabs.f64(double %mul5) nounwind readnone
  store double %call, double* %c, align 8
  %arrayidx5 = getelementptr inbounds double* %c, i64 1
  store double %call5, double* %arrayidx5, align 8
  ret void
}

declare float @llvm.copysign.f32(float, float) nounwind readnone

;CHECK-LABEL: @vec_copysign_f32(
;CHECK: load <4 x float>
;CHECK: load <4 x float>
;CHECK: call <4 x float> @llvm.copysign.v4f32
;CHECK: store <4 x float>
;CHECK: ret
define void @vec_copysign_f32(float* %a, float* %b, float* noalias %c) {
entry:
  %0 = load float* %a, align 4
  %1 = load float* %b, align 4
  %call0 = tail call float @llvm.copysign.f32(float %0, float %1) nounwind readnone
  store float %call0, float* %c, align 4

  %ix2 = getelementptr inbounds float* %a, i64 1
  %2 = load float* %ix2, align 4
  %ix3 = getelementptr inbounds float* %b, i64 1
  %3 = load float* %ix3, align 4
  %call1 = tail call float @llvm.copysign.f32(float %2, float %3) nounwind readnone
  %c1 = getelementptr inbounds float* %c, i64 1
  store float %call1, float* %c1, align 4

  %ix4 = getelementptr inbounds float* %a, i64 2
  %4 = load float* %ix4, align 4
  %ix5 = getelementptr inbounds float* %b, i64 2
  %5 = load float* %ix5, align 4
  %call2 = tail call float @llvm.copysign.f32(float %4, float %5) nounwind readnone
  %c2 = getelementptr inbounds float* %c, i64 2
  store float %call2, float* %c2, align 4

  %ix6 = getelementptr inbounds float* %a, i64 3
  %6 = load float* %ix6, align 4
  %ix7 = getelementptr inbounds float* %b, i64 3
  %7 = load float* %ix7, align 4
  %call3 = tail call float @llvm.copysign.f32(float %6, float %7) nounwind readnone
  %c3 = getelementptr inbounds float* %c, i64 3
  store float %call3, float* %c3, align 4

  ret void
}
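A note on the RUN line: -slp-threshold=-999 effectively forces vectorization whenever it is legal, regardless of the computed cost, so the test pins down recognition and codegen of intrinsic bundles rather than the cost model; -dce then cleans up the dead scalar leftovers before FileCheck runs.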