[SLP] Enable 64-bit wide vectorization on AArch64

ARM Neon has native support for half-sized vector registers (64 bits). This is
beneficial for example for 2D and 3D graphics. This patch adds the option to
lower MinVecRegSize from 128 via a TTI in the SLP Vectorizer.

*** Performance Analysis

This change was motivated by some internal benchmarks but it is also beneficial
on SPEC and the LLVM testsuite.

The results are with -O3 and PGO. A negative percentage is an improvement. The
testsuite was run with a sample size of 4.

** SPEC

* CFP2006/482.sphinx3 -3.34%
A pretty hot loop is SLP vectorized resulting in nice instruction reduction.
This used to be a +22% regression before rL299482.

* CFP2000/177.mesa -3.34%

* CINT2000/256.bzip2 +6.97%
My current plan is to extend the fix in rL299482 to i16 which brings the
regression down to +2.5%. There are also other problems with the codegen in
this loop so there is further room for improvement.

** LLVM testsuite

* SingleSource/Benchmarks/Misc/ReedSolomon -10.75%
There are multiple small SLP vectorizations outside the hot code. It's a bit
surprising that it adds up to 10%. Some of this may be code-layout noise.

* MultiSource/Benchmarks/VersaBench/beamformer/beamformer -8.40%
The opt-viewer screenshot can be seen at F3218284. We start at a colder store
but the tree leads us into the hottest loop.

* MultiSource/Applications/lambda-0.1.3/lambda -2.68%

* MultiSource/Benchmarks/Bullet/bullet -2.18%
This is using 3D vectors.

* SingleSource/Benchmarks/Shootout-C++/Shootout-C++-lists +6.67%
Noise, binary is unchanged.

* MultiSource/Benchmarks/Ptrdist/anagram/anagram +4.90%
There is an additional SLP in the cold code. The test runs for ~1sec and prints
out over 2000 lines. This is most likely noise.

* MultiSource/Applications/aha/aha +1.63%

* MultiSource/Applications/JM/lencod/lencod +1.41%

* SingleSource/Benchmarks/Misc/richards_benchmark +1.15%

Differential Revision: https://reviews.llvm.org/D31965

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303116 91177308-0d34-0410-b5e6-96231b3b80d8

Adam Nemet
8 changed file(s) with 58 addition(s) and 1 deletion(s).
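To make the motivation concrete, below is a minimal sketch of the kind of 2D-graphics code the commit message alludes to; the Vec2 struct and the add function are illustrative only and are not part of the patch. With MinVecRegSize lowered from 128 to 64 on AArch64, the SLP vectorizer can fold the two adjacent scalar float additions into a single <2 x float> operation that fits a 64-bit NEON D register, instead of leaving them scalar.

// Illustrative example only -- not from this commit.
// Two independent, adjacent float additions form a 2-wide SLP tree
// (2 x 32 bits = 64 bits), which becomes profitable once the minimum
// vector register width reported by the target drops to 64.
struct Vec2 {
  float x, y;
};

Vec2 add(const Vec2 &a, const Vec2 &b) {
  Vec2 r;
  r.x = a.x + b.x;
  r.y = a.y + b.y;
  return r;
}

The new test at the bottom of the diff checks exactly this pattern at the IR level (paired loads, fadds, and stores).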
 /// \return The width of the largest scalar or vector register type.
 unsigned getRegisterBitWidth(bool Vector) const;

+/// \return The width of the smallest vector register type.
+unsigned getMinVectorRegisterBitWidth() const;
+
 /// \return True if it should be considered for address type promotion.
 /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
 /// profitable without finding other extensions fed by the same input.

                      Type *Ty) = 0;
 virtual unsigned getNumberOfRegisters(bool Vector) = 0;
 virtual unsigned getRegisterBitWidth(bool Vector) = 0;
+virtual unsigned getMinVectorRegisterBitWidth() = 0;
 virtual bool shouldConsiderAddressTypePromotion(
     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
 virtual unsigned getCacheLineSize() = 0;

 unsigned getRegisterBitWidth(bool Vector) override {
   return Impl.getRegisterBitWidth(Vector);
 }
+unsigned getMinVectorRegisterBitWidth() override {
+  return Impl.getMinVectorRegisterBitWidth();
+}
 bool shouldConsiderAddressTypePromotion(
     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
   return Impl.shouldConsiderAddressTypePromotion(
 unsigned getNumberOfRegisters(bool Vector) { return 8; }

 unsigned getRegisterBitWidth(bool Vector) { return 32; }
+
+unsigned getMinVectorRegisterBitWidth() { return 128; }

 bool
 shouldConsiderAddressTypePromotion(const Instruction &I,
   return TTIImpl->getRegisterBitWidth(Vector);
 }

+unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const {
+  return TTIImpl->getMinVectorRegisterBitWidth();
+}
+
 bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
   return TTIImpl->shouldConsiderAddressTypePromotion(
 case Falkor:
   MaxInterleaveFactor = 4;
   VectorInsertExtractBaseCost = 2;
+  // FIXME: remove this to enable 64-bit SLP if performance looks good.
+  MinVectorRegisterBitWidth = 128;
   break;
 case Kryo:
   MaxInterleaveFactor = 4;

   PrefetchDistance = 740;
   MinPrefetchStride = 1024;
   MaxPrefetchIterationsAhead = 11;
+  // FIXME: remove this to enable 64-bit SLP if performance looks good.
+  MinVectorRegisterBitWidth = 128;
   break;
 case ThunderX2T99:
   CacheLineSize = 64;

   PrefetchDistance = 128;
   MinPrefetchStride = 1024;
   MaxPrefetchIterationsAhead = 4;
+  // FIXME: remove this to enable 64-bit SLP if performance looks good.
+  MinVectorRegisterBitWidth = 128;
   break;
 case ThunderX:
 case ThunderXT88:

   CacheLineSize = 128;
   PrefFunctionAlignment = 3;
   PrefLoopAlignment = 2;
+  // FIXME: remove this to enable 64-bit SLP if performance looks good.
+  MinVectorRegisterBitWidth = 128;
   break;
 case CortexA35: break;
 case CortexA53: break;
 // NegativeImmediates - transform instructions with negative immediates
 bool NegativeImmediates = true;
+
+// Enable 64-bit vectorization in SLP.
+unsigned MinVectorRegisterBitWidth = 64;

 bool UseAA = false;
 bool PredictableSelectIsExpensive = false;

 bool isXRaySupported() const override { return true; }

+unsigned getMinVectorRegisterBitWidth() const {
+  return MinVectorRegisterBitWidth;
+}
+
 bool isX18Reserved() const { return ReserveX18; }
 bool hasFPARMv8() const { return HasFPARMv8; }
 bool hasNEON() const { return HasNEON; }
   return 64;
 }

+unsigned getMinVectorRegisterBitWidth() {
+  return ST->getMinVectorRegisterBitWidth();
+}
+
 unsigned getMaxInterleaveFactor(unsigned VF);

 int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   else
     MaxVecRegSize = TTI->getRegisterBitWidth(true);

-  MinVecRegSize = MinVectorRegSizeOption;
+  if (MinVectorRegSizeOption.getNumOccurrences())
+    MinVecRegSize = MinVectorRegSizeOption;
+  else
+    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
 }

 /// \brief Vectorize the tree that starts with the elements in \p VL.
; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s
; RUN: opt -S -slp-vectorizer -mtriple=aarch64-apple-ios -mcpu=cyclone < %s | FileCheck %s
; Currently disabled for a few subtargets (e.g. Kryo):
; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=kryo < %s | FileCheck --check-prefix=NO_SLP %s
; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -slp-min-reg-size=128 < %s | FileCheck --check-prefix=NO_SLP %s

define void @f(float* %r, float* %w) {
  %r0 = getelementptr inbounds float, float* %r, i64 0
  %r1 = getelementptr inbounds float, float* %r, i64 1
  %f0 = load float, float* %r0
  %f1 = load float, float* %r1
  %add0 = fadd float %f0, %f0
; CHECK: fadd <2 x float>
; NO_SLP: fadd float
; NO_SLP: fadd float
  %add1 = fadd float %f1, %f1
  %w0 = getelementptr inbounds float, float* %w, i64 0
  %w1 = getelementptr inbounds float, float* %w, i64 1
  store float %add0, float* %w0
  store float %add1, float* %w1
  ret void
}