llvm.org GIT mirror llvm / 4438866
[SLPVectorizer] Don't ignore scalar extraction instructions of aggregate value In SLPVectorizer, the vector build instructions (insertvalue for aggregate types) are passed to BoUpSLP.buildTree, where they are treated as the UserIgnoreList, so later, in cost estimation, the cost of these instructions is not counted. For an aggregate value, later uses are more likely to be done in scalar registers, either as individual scalars or as a whole for a function call or return value. Ignoring the scalar extraction instructions may cause overly aggressive vectorization of aggregate values and slow down performance. So for vectorization of an aggregate value, the scalar extraction instructions must be included in cost estimation. Differential Revision: https://reviews.llvm.org/D41139 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@320736 91177308-0d34-0410-b5e6-96231b3b80d8 Guozhi Wei 1 year, 10 months ago
4 changed file(s) with 49 addition(s) and 5 deletion(s). Raw diff Collapse all Expand all
9595
9696 /// \brief Try to vectorize a list of operands.
9797 /// \@param BuildVector A list of users to ignore for the purpose of
98 /// scheduling and that don't need extracting.
98 /// scheduling and cost estimation when NeedExtraction
99 /// is false.
99100 /// \returns true if a value was vectorized.
100101 bool tryToVectorizeList(ArrayRef VL, slpvectorizer::BoUpSLP &R,
101102 ArrayRef BuildVector = None,
102 bool AllowReorder = false);
103 bool AllowReorder = false,
104 bool NeedExtraction = false);
103105
104106 /// \brief Try to vectorize a chain that may start at the operands of \p I.
105107 bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
45324532
45334533 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R,
45344534 ArrayRef BuildVector,
4535 bool AllowReorder) {
4535 bool AllowReorder,
4536 bool NeedExtraction) {
45364537 if (VL.size() < 2)
45374538 return false;
45384539
46264627 << "\n");
46274628 ArrayRef Ops = VL.slice(I, OpsWidth);
46284629
4630 ArrayRef EmptyArray;
46294631 ArrayRef BuildVectorSlice;
46304632 if (!BuildVector.empty())
46314633 BuildVectorSlice = BuildVector.slice(I, OpsWidth);
46324634
4633 R.buildTree(Ops, BuildVectorSlice);
4635 R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
46344636 // TODO: check if we can allow reordering for more cases.
46354637 if (AllowReorder && R.shouldReorder()) {
46364638 // Conceptually, there is nothing actually preventing us from trying to
58205822 return false;
58215823
58225824 DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
5823 return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false);
5825 // An aggregate value is unlikely to be processed in a vector register, so we
5826 // need to extract scalars into scalar registers; NeedExtraction is set true.
5827 return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
58245828 }
58255829
58265830 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
0 ; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr9 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s
1
2 %struct.S = type { i8*, i8* }
3
4 @kS0 = common global %struct.S zeroinitializer, align 8
5
6 define { i64, i64 } @getS() {
7 entry:
8 %0 = load i64, i64* bitcast (%struct.S* @kS0 to i64*), align 8
9 %1 = load i64, i64* bitcast (i8** getelementptr inbounds (%struct.S, %struct.S* @kS0, i64 0, i32 1) to i64*), align 8
10 %2 = insertvalue { i64, i64 } undef, i64 %0, 0
11 %3 = insertvalue { i64, i64 } %2, i64 %1, 1
12 ret { i64, i64 } %3
13 }
14
15 ; CHECK: load i64
16 ; CHECK-NOT: load <2 x i64>
17 ; CHECK-NOT: extractelement
18
0 ; RUN: opt -S -mtriple=x86_64-unknown-linux -mcpu=corei7 -slp-vectorizer < %s | FileCheck %s
1
2 %struct.S = type { i8*, i8* }
3
4 @kS0 = common global %struct.S zeroinitializer, align 8
5
6 define { i64, i64 } @getS() {
7 entry:
8 %0 = load i64, i64* bitcast (%struct.S* @kS0 to i64*), align 8
9 %1 = load i64, i64* bitcast (i8** getelementptr inbounds (%struct.S, %struct.S* @kS0, i64 0, i32 1) to i64*), align 8
10 %2 = insertvalue { i64, i64 } undef, i64 %0, 0
11 %3 = insertvalue { i64, i64 } %2, i64 %1, 1
12 ret { i64, i64 } %3
13 }
14
15 ; CHECK: load i64
16 ; CHECK-NOT: load <2 x i64>
17 ; CHECK-NOT: extractelement
18