llvm.org GIT mirror llvm / 0316f02
Merging r279125 and r278343: ------------------------------------------------------------------------ r279125 | mssimpso | 2016-08-18 12:50:32 -0700 (Thu, 18 Aug 2016) | 14 lines [SLP] Initialize VectorizedValue when gathering We abort building vectorizable trees in some cases (e.g., if the maximum recursion depth is reached, if the region size is too large, etc.). If this happens for a reduction, we can be left with a root entry that needs to be gathered. For these cases, we need to make sure we actually set VectorizedValue to the resulting vector. This patch ensures we properly set VectorizedValue, and it also ensures the insertelement sequence generated for the gathers is inserted at the correct location. Reference: https://llvm.org/bugs/show_bug.cgi?id=28330 Differential Revision: https://reviews.llvm.org/D23410 ------------------------------------------------------------------------ ------------------------------------------------------------------------ r278343 | mssimpso | 2016-08-11 08:28:45 -0700 (Thu, 11 Aug 2016) | 1 line [SLP] Make RecursionMaxDepth a command line option (NFC) ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_39@279174 91177308-0d34-0410-b5e6-96231b3b80d8 Hans Wennborg 4 years ago
2 changed file(s) with 164 addition(s) and 10 deletion(s). Raw diff Collapse all Expand all
8181 "slp-min-reg-size", cl::init(128), cl::Hidden,
8282 cl::desc("Attempt to vectorize for this register size in bits"));
8383
84 // FIXME: Set this via cl::opt to allow overriding.
85 static const unsigned RecursionMaxDepth = 12;
84 static cl::opt<unsigned> RecursionMaxDepth(
85 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
86 cl::desc("Limit the recursion depth when building a vectorizable tree"));
8687
8788 // Limit the number of alias checks. The limit is chosen so that
8889 // it has no negative effect on the llvm benchmarks.
21232124 }
21242125
21252126 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
2126 Instruction *VL0 = cast<Instruction>(VL[0]);
2127 BasicBlock::iterator NextInst(VL0);
2128 ++NextInst;
2129 Builder.SetInsertPoint(VL0->getParent(), NextInst);
2130 Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
2127
2128 // Get the basic block this bundle is in. All instructions in the bundle
2129 // should be in this block.
2130 auto *Front = cast<Instruction>(VL.front());
2131 auto *BB = Front->getParent();
2132 assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool {
2133 return cast<Instruction>(V)->getParent() == BB;
2134 }));
2135
2136 // The last instruction in the bundle in program order.
2137 Instruction *LastInst = nullptr;
2138
2139 // Find the last instruction. The common case should be that BB has been
2140 // scheduled, and the last instruction is VL.back(). So we start with
2141 // VL.back() and iterate over schedule data until we reach the end of the
2142 // bundle. The end of the bundle is marked by null ScheduleData.
2143 if (BlocksSchedules.count(BB)) {
2144 auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back());
2145 if (Bundle && Bundle->isPartOfBundle())
2146 for (; Bundle; Bundle = Bundle->NextInBundle)
2147 LastInst = Bundle->Inst;
2148 }
2149
2150 // LastInst can still be null at this point if there's either not an entry
2151 // for BB in BlocksSchedules or there's no ScheduleData available for
2152 // VL.back(). This can be the case if buildTree_rec aborts for various
2153 // reasons (e.g., the maximum recursion depth is reached, the maximum region
2154 // size is reached, etc.). ScheduleData is initialized in the scheduling
2155 // "dry-run".
2156 //
2157 // If this happens, we can still find the last instruction by brute force. We
2158 // iterate forwards from Front (inclusive) until we either see all
2159 // instructions in the bundle or reach the end of the block. If Front is the
2160 // last instruction in program order, LastInst will be set to Front, and we
2161 // will visit all the remaining instructions in the block.
2162 //
2163 // One of the reasons we exit early from buildTree_rec is to place an upper
2164 // bound on compile-time. Thus, taking an additional compile-time hit here is
2165 // not ideal. However, this should be exceedingly rare since it requires that
2166 // we both exit early from buildTree_rec and that the bundle be out-of-order
2167 // (causing us to iterate all the way to the end of the block).
2168 if (!LastInst) {
2169 SmallPtrSet Bundle(VL.begin(), VL.end());
2170 for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
2171 if (Bundle.erase(&I))
2172 LastInst = &I;
2173 if (Bundle.empty())
2174 break;
2175 }
2176 }
2177
2178 // Set the insertion point after the last instruction in the bundle. Set the
2179 // debug location to Front.
2180 Builder.SetInsertPoint(BB, next(BasicBlock::iterator(LastInst)));
2181 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
21312182 }
21322183
21332184 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
22052256
22062257 if (E->NeedToGather) {
22072258 setInsertPointAfterBundle(E->Scalars);
2208 return Gather(E->Scalars, VecTy);
2259 auto *V = Gather(E->Scalars, VecTy);
2260 E->VectorizedValue = V;
2261 return V;
22092262 }
22102263
22112264 unsigned Opcode = getSameOpcode(E->Scalars);
22522305 E->VectorizedValue = V;
22532306 return V;
22542307 }
2255 return Gather(E->Scalars, VecTy);
2308 setInsertPointAfterBundle(E->Scalars);
2309 auto *V = Gather(E->Scalars, VecTy);
2310 E->VectorizedValue = V;
2311 return V;
22562312 }
22572313 case Instruction::ExtractValue: {
22582314 if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
22642320 E->VectorizedValue = V;
22652321 return propagateMetadata(V, E->Scalars);
22662322 }
2267 return Gather(E->Scalars, VecTy);
2323 setInsertPointAfterBundle(E->Scalars);
2324 auto *V = Gather(E->Scalars, VecTy);
2325 E->VectorizedValue = V;
2326 return V;
22682327 }
22692328 case Instruction::ZExt:
22702329 case Instruction::SExt:
0 ; RUN: opt < %s -slp-vectorizer -S | FileCheck %s --check-prefix=DEFAULT
1 ; RUN: opt < %s -slp-recursion-max-depth=0 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER
2
3 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
4 target triple = "aarch64--linux-gnu"
5
6 @a = common global [80 x i8] zeroinitializer, align 16
7
8 ; DEFAULT-LABEL: @PR28330(
9 ; DEFAULT: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ]
10 ; DEFAULT: %tmp18 = phi i32 [ %tmp35, %for.body ], [ %n, %entry ]
11 ; DEFAULT: %[[S0:.+]] = select <8 x i1> %1, <8 x i32> , <8 x i32>
12 ; DEFAULT: %[[R0:.+]] = shufflevector <8 x i32> %[[S0]], <8 x i32> undef, <8 x i32>
13 ; DEFAULT: %[[R1:.+]] = add <8 x i32> %[[S0]], %[[R0]]
14 ; DEFAULT: %[[R2:.+]] = shufflevector <8 x i32> %[[R1]], <8 x i32> undef, <8 x i32>
15 ; DEFAULT: %[[R3:.+]] = add <8 x i32> %[[R1]], %[[R2]]
16 ; DEFAULT: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32>
17 ; DEFAULT: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]]
18 ; DEFAULT: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0
19 ; DEFAULT: %tmp34 = add i32 %[[R6]], %tmp17
20 ;
21 ; GATHER-LABEL: @PR28330(
22 ; GATHER: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ]
23 ; GATHER: %tmp18 = phi i32 [ %tmp35, %for.body ], [ %n, %entry ]
24 ; GATHER: %tmp19 = select i1 %tmp1, i32 -720, i32 -80
25 ; GATHER: %tmp21 = select i1 %tmp3, i32 -720, i32 -80
26 ; GATHER: %tmp23 = select i1 %tmp5, i32 -720, i32 -80
27 ; GATHER: %tmp25 = select i1 %tmp7, i32 -720, i32 -80
28 ; GATHER: %tmp27 = select i1 %tmp9, i32 -720, i32 -80
29 ; GATHER: %tmp29 = select i1 %tmp11, i32 -720, i32 -80
30 ; GATHER: %tmp31 = select i1 %tmp13, i32 -720, i32 -80
31 ; GATHER: %tmp33 = select i1 %tmp15, i32 -720, i32 -80
32 ; GATHER: %[[I0:.+]] = insertelement <8 x i32> undef, i32 %tmp19, i32 0
33 ; GATHER: %[[I1:.+]] = insertelement <8 x i32> %[[I0]], i32 %tmp21, i32 1
34 ; GATHER: %[[I2:.+]] = insertelement <8 x i32> %[[I1]], i32 %tmp23, i32 2
35 ; GATHER: %[[I3:.+]] = insertelement <8 x i32> %[[I2]], i32 %tmp25, i32 3
36 ; GATHER: %[[I4:.+]] = insertelement <8 x i32> %[[I3]], i32 %tmp27, i32 4
37 ; GATHER: %[[I5:.+]] = insertelement <8 x i32> %[[I4]], i32 %tmp29, i32 5
38 ; GATHER: %[[I6:.+]] = insertelement <8 x i32> %[[I5]], i32 %tmp31, i32 6
39 ; GATHER: %[[I7:.+]] = insertelement <8 x i32> %[[I6]], i32 %tmp33, i32 7
40 ; GATHER: %[[R0:.+]] = shufflevector <8 x i32> %[[I7]], <8 x i32> undef, <8 x i32>
41 ; GATHER: %[[R1:.+]] = add <8 x i32> %[[I7]], %[[R0]]
42 ; GATHER: %[[R2:.+]] = shufflevector <8 x i32> %[[R1]], <8 x i32> undef, <8 x i32>
43 ; GATHER: %[[R3:.+]] = add <8 x i32> %[[R1]], %[[R2]]
44 ; GATHER: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32>
45 ; GATHER: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]]
46 ; GATHER: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0
47 ; GATHER: %tmp34 = add i32 %[[R6]], %tmp17
48
49 define void @PR28330(i32 %n) {
50 entry:
51 %tmp0 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
52 %tmp1 = icmp eq i8 %tmp0, 0
53 %tmp2 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
54 %tmp3 = icmp eq i8 %tmp2, 0
55 %tmp4 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
56 %tmp5 = icmp eq i8 %tmp4, 0
57 %tmp6 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
58 %tmp7 = icmp eq i8 %tmp6, 0
59 %tmp8 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
60 %tmp9 = icmp eq i8 %tmp8, 0
61 %tmp10 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
62 %tmp11 = icmp eq i8 %tmp10, 0
63 %tmp12 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
64 %tmp13 = icmp eq i8 %tmp12, 0
65 %tmp14 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
66 %tmp15 = icmp eq i8 %tmp14, 0
67 br label %for.body
68
69 for.body:
70 %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ]
71 %tmp18 = phi i32 [ %tmp35, %for.body ], [ %n, %entry ]
72 %tmp19 = select i1 %tmp1, i32 -720, i32 -80
73 %tmp20 = add i32 %tmp17, %tmp19
74 %tmp21 = select i1 %tmp3, i32 -720, i32 -80
75 %tmp22 = add i32 %tmp20, %tmp21
76 %tmp23 = select i1 %tmp5, i32 -720, i32 -80
77 %tmp24 = add i32 %tmp22, %tmp23
78 %tmp25 = select i1 %tmp7, i32 -720, i32 -80
79 %tmp26 = add i32 %tmp24, %tmp25
80 %tmp27 = select i1 %tmp9, i32 -720, i32 -80
81 %tmp28 = add i32 %tmp26, %tmp27
82 %tmp29 = select i1 %tmp11, i32 -720, i32 -80
83 %tmp30 = add i32 %tmp28, %tmp29
84 %tmp31 = select i1 %tmp13, i32 -720, i32 -80
85 %tmp32 = add i32 %tmp30, %tmp31
86 %tmp33 = select i1 %tmp15, i32 -720, i32 -80
87 %tmp34 = add i32 %tmp32, %tmp33
88 %tmp35 = add nsw i32 %tmp18, -1
89 %tmp36 = icmp eq i32 %tmp35, 0
90 br i1 %tmp36, label %for.end, label %for.body
91
92 for.end:
93 ret void
94 }