llvm.org GIT mirror: llvm @ 6f328c7

Refactored X86InterleavedAccess into a class. NFCI.

Patch by Farhana Aleen
Differential Revision: https://reviews.llvm.org/D25986

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288410 91177308-0d34-0410-b5e6-96231b3b80d8

Committed by David L Kreitzer, 3 years ago.
1 changed file (X86InterleavedAccess.cpp) with 188 additions and 84 deletions.
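The case this patch handles is narrow: the refactored isSupported() below only accepts groups where the shuffle result type is 256 bits wide with 64-bit elements, the interleaving factor is 4, the wide load covers at least Factor such vectors, and the target has AVX. In the abbreviated IR notation the file's own comments use, such a group looks roughly like this sketch (the %ptr and %v* names are illustrative, not taken from the patch):

    %wide.vec = load <16 x i64>, <16 x i64>* %ptr
    %v0 = shuffle <16 x i64> %wide.vec, <16 x i64> undef, <0, 4, 8, 12>
    %v1 = shuffle <16 x i64> %wide.vec, <16 x i64> undef, <1, 5, 9, 13>
    %v2 = shuffle <16 x i64> %wide.vec, <16 x i64> undef, <2, 6, 10, 14>
    %v3 = shuffle <16 x i64> %wide.vec, <16 x i64> undef, <3, 7, 11, 15>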
-//===------- X86InterleavedAccess.cpp --------------===//
+//===--------- X86InterleavedAccess.cpp ----------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License.  See LICENSE.TXT for details.
 //
-//===----------------------------------------------------------------------===//
-//
-// This file contains the X86 implementation of the interleaved accesses
-// optimization generating X86-specific instructions/intrinsics for interleaved
-// access groups.
-//
-//===----------------------------------------------------------------------===//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the X86 implementation of the interleaved accesses
+/// optimization generating X86-specific instructions/intrinsics for
+/// interleaved access groups.
+///
+//===--------------------------------------------------------------------===//
 
 #include "X86ISelLowering.h"
 #include "X86TargetMachine.h"
 
 using namespace llvm;
 
-/// Returns true if the interleaved access group represented by the shuffles
-/// is supported for the subtarget. Returns false otherwise.
-static bool isSupported(const X86Subtarget &SubTarget,
-                        const LoadInst *LI,
-                        const ArrayRef<ShuffleVectorInst *> &Shuffles,
-                        unsigned Factor) {
-
-  const DataLayout &DL = Shuffles[0]->getModule()->getDataLayout();
+/// \brief This class holds necessary information to represent an interleaved
+/// access group and supports utilities to lower the group into
+/// X86-specific instructions/intrinsics.
+/// E.g. A group of interleaving access loads (Factor = 2; accessing every
+/// other element)
+///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
+///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
+
+class X86InterleavedAccessGroup {
+  /// \brief Reference to the wide-load instruction of an interleaved access
+  /// group.
+  Instruction *const Inst;
+
+  /// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
+  ArrayRef<ShuffleVectorInst *> Shuffles;
+
+  /// \brief Reference to the starting index of each user-shuffle.
+  ArrayRef<unsigned> Indices;
+
+  /// \brief Reference to the interleaving stride in terms of elements.
+  const unsigned Factor;
+
+  /// \brief Reference to the underlying target.
+  const X86Subtarget &Subtarget;
+
+  const DataLayout &DL;
+
+  IRBuilder<> &Builder;
+
+  /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
+  /// sub vectors of type \p T. Returns true and the sub-vectors in
+  /// \p DecomposedVectors if it decomposes the Inst, returns false otherwise.
+  bool decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
+                 SmallVectorImpl<Instruction *> &DecomposedVectors);
+
+  /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
+  /// returns the transposed-vectors in \p TransposedVectors.
+  /// E.g.
+  /// InputVectors:
+  ///   In-V0 = p1, p2, p3, p4
+  ///   In-V1 = q1, q2, q3, q4
+  ///   In-V2 = r1, r2, r3, r4
+  ///   In-V3 = s1, s2, s3, s4
+  /// OutputVectors:
+  ///   Out-V0 = p1, q1, r1, s1
+  ///   Out-V1 = p2, q2, r2, s2
+  ///   Out-V2 = p3, q3, r3, s3
+  ///   Out-V3 = p4, q4, r4, s4
+  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
+                     SmallVectorImpl<Value *> &TransposedVectors);
+
+public:
+  /// In order to form an interleaved access group X86InterleavedAccessGroup
+  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
+  /// \p Shuffs, reference to the first indices of each interleaved-vector
+  /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
+  /// X86-specific instructions/intrinsics it also requires the underlying
+  /// target information \p STarget.
+  explicit X86InterleavedAccessGroup(Instruction *I,
+                                     ArrayRef<ShuffleVectorInst *> Shuffs,
+                                     ArrayRef<unsigned> Ind,
+                                     const unsigned F,
+                                     const X86Subtarget &STarget,
+                                     IRBuilder<> &B)
+      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
+        DL(Inst->getModule()->getDataLayout()), Builder(B) {}
+
+  /// \brief Returns true if this interleaved access group can be lowered into
+  /// x86-specific instructions/intrinsics, false otherwise.
+  bool isSupported() const;
+
+  /// \brief Lowers this interleaved access group into X86-specific
+  /// instructions/intrinsics.
+  bool lowerIntoOptimizedSequence();
+};
+
+bool X86InterleavedAccessGroup::isSupported() const {
   VectorType *ShuffleVecTy = Shuffles[0]->getType();
-  unsigned ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
+  uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
   Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
 
-  if (DL.getTypeSizeInBits(LI->getType()) < Factor * ShuffleVecSize)
+  if (DL.getTypeSizeInBits(Inst->getType()) < Factor * ShuffleVecSize)
     return false;
 
   // Currently, lowering is supported for 64 bits on AVX.
-  if (!SubTarget.hasAVX() || ShuffleVecSize != 256 ||
-      DL.getTypeSizeInBits(ShuffleEltTy) != 64 ||
-      Factor != 4)
+  if (!Subtarget.hasAVX() || ShuffleVecSize != 256 ||
+      DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4)
     return false;
 
   return true;
 }
 
-/// \brief Lower interleaved load(s) into target specific instructions/
-/// intrinsics. Lowering sequence varies depending on the vector-types, factor,
-/// number of shuffles and ISA.
-/// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
+bool X86InterleavedAccessGroup::decompose(
+    Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
+    SmallVectorImpl<Instruction *> &DecomposedVectors) {
+  Type *VecTy = VecInst->getType();
+
+  assert(VecTy->isVectorTy() &&
+         DL.getTypeSizeInBits(VecTy) >=
+             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
+         "Invalid Inst-size!!!");
+  assert(VecTy->getVectorElementType() == SubVecTy->getVectorElementType() &&
+         "Element type mismatched!!!");
+
+  if (!isa<LoadInst>(VecInst))
+    return false;
+
+  LoadInst *LI = cast<LoadInst>(VecInst);
+  Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
+
+  Value *VecBasePtr =
+      Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
+
+  // Generate N loads of T type.
+  for (unsigned i = 0; i < NumSubVectors; i++) {
+    // TODO: Support inbounds GEP.
+    Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
+    Instruction *NewLoad =
+        Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
+    DecomposedVectors.push_back(NewLoad);
+  }
+
+  return true;
+}
+
+void X86InterleavedAccessGroup::transpose_4x4(
+    ArrayRef<Instruction *> Matrix,
+    SmallVectorImpl<Value *> &TransposedMatrix) {
+  assert(Matrix.size() == 4 && "Invalid matrix size");
+  TransposedMatrix.resize(4);
+
+  // dst = src1[0,1],src2[0,1]
+  uint32_t IntMask1[] = {0, 1, 4, 5};
+  ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
+  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
+  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
+
+  // dst = src1[2,3],src2[2,3]
+  uint32_t IntMask2[] = {2, 3, 6, 7};
+  Mask = makeArrayRef(IntMask2, 4);
+  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
+  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
+
+  // dst = src1[0],src2[0],src1[2],src2[2]
+  uint32_t IntMask3[] = {0, 4, 2, 6};
+  Mask = makeArrayRef(IntMask3, 4);
+  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
+  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
+
+  // dst = src1[1],src2[1],src1[3],src2[3]
+  uint32_t IntMask4[] = {1, 5, 3, 7};
+  Mask = makeArrayRef(IntMask4, 4);
+  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
+  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
+}
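Tracing the masks above on four <4 x i64> chunks A, B, C, D (the wide vector being the concatenation A ++ B ++ C ++ D) gives the following worked sketch, not part of the diff, with a0..a3 and so on denoting lanes:

    IntrVec1 = shuffle(A, C, <0, 1, 4, 5>)                     = a0 a1 c0 c1
    IntrVec2 = shuffle(B, D, <0, 1, 4, 5>)                     = b0 b1 d0 d1
    IntrVec3 = shuffle(A, C, <2, 3, 6, 7>)                     = a2 a3 c2 c3
    IntrVec4 = shuffle(B, D, <2, 3, 6, 7>)                     = b2 b3 d2 d3
    Transposed[0] = shuffle(IntrVec1, IntrVec2, <0, 4, 2, 6>)  = a0 b0 c0 d0
    Transposed[1] = shuffle(IntrVec1, IntrVec2, <1, 5, 3, 7>)  = a1 b1 c1 d1
    Transposed[2] = shuffle(IntrVec3, IntrVec4, <0, 4, 2, 6>)  = a2 b2 c2 d2
    Transposed[3] = shuffle(IntrVec3, IntrVec4, <1, 5, 3, 7>)  = a3 b3 c3 d3

Transposed[k] collects lane k of every chunk, which is exactly the strided element sequence the original user-shuffle with starting index k extracted from the wide load, so the replaceAllUsesWith in lowerIntoOptimizedSequence() below is a straight index-for-index substitution.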
+
+// Lowers this interleaved access group into X86-specific
+// instructions/intrinsics.
+bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
+  SmallVector<Instruction *, 4> DecomposedVectors;
+  VectorType *VecTy = Shuffles[0]->getType();
+  // Try to generate target-sized register(/instruction).
+  if (!decompose(Inst, Factor, VecTy, DecomposedVectors))
+    return false;
+
+  SmallVector<Value *, 4> TransposedVectors;
+  // Perform matrix-transposition in order to compute interleaved
+  // results by generating some sort of (optimized) target-specific
+  // instructions.
+  transpose_4x4(DecomposedVectors, TransposedVectors);
+
+  // Now replace the unoptimized-interleaved-vectors with the
+  // transposed-interleaved vectors.
+  for (unsigned i = 0; i < Shuffles.size(); i++)
+    Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
+
+  return true;
+}
+
+// Lower interleaved load(s) into target specific instructions/
+// intrinsics. Lowering sequence varies depending on the vector-types, factor,
+// number of shuffles and ISA.
+// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
 bool X86TargetLowering::lowerInterleavedLoad(
     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Shuffles.size() == Indices.size() &&
          "Unmatched number of shufflevectors and indices");
 
-  if (!isSupported(Subtarget, LI, Shuffles, Factor))
-    return false;
-
-  VectorType *ShuffleVecTy = Shuffles[0]->getType();
-
-  Type *VecBasePtrTy = ShuffleVecTy->getPointerTo(LI->getPointerAddressSpace());
-
+  // Create an interleaved access group.
   IRBuilder<> Builder(LI);
-  SmallVector<Instruction *, 4> NewLoads;
-  SmallVector<Value *, 4> NewShuffles;
-  NewShuffles.resize(Factor);
-
-  Value *VecBasePtr =
-      Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
-
-  // Generate 4 loads of type v4xT64
-  for (unsigned Part = 0; Part < Factor; Part++) {
-    // TODO: Support inbounds GEP
-    Value *NewBasePtr =
-        Builder.CreateGEP(VecBasePtr, Builder.getInt32(Part));
-    Instruction *NewLoad =
-        Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
-    NewLoads.push_back(NewLoad);
-  }
-
-  // dst = src1[0,1],src2[0,1]
-  uint32_t IntMask1[] = {0, 1, 4, 5};
-  ArrayRef<uint32_t> ShuffleMask = makeArrayRef(IntMask1, 4);
-  Value *IntrVec1 =
-      Builder.CreateShuffleVector(NewLoads[0], NewLoads[2], ShuffleMask);
-  Value *IntrVec2 =
-      Builder.CreateShuffleVector(NewLoads[1], NewLoads[3], ShuffleMask);
-
-  // dst = src1[2,3],src2[2,3]
-  uint32_t IntMask2[] = {2, 3, 6, 7};
-  ShuffleMask = makeArrayRef(IntMask2, 4);
-  Value *IntrVec3 =
-      Builder.CreateShuffleVector(NewLoads[0], NewLoads[2], ShuffleMask);
-  Value *IntrVec4 =
-      Builder.CreateShuffleVector(NewLoads[1], NewLoads[3], ShuffleMask);
-
-  // dst = src1[0],src2[0],src1[2],src2[2]
-  uint32_t IntMask3[] = {0, 4, 2, 6};
-  ShuffleMask = makeArrayRef(IntMask3, 4);
-  NewShuffles[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, ShuffleMask);
-  NewShuffles[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, ShuffleMask);
-
-  // dst = src1[1],src2[1],src1[3],src2[3]
-  uint32_t IntMask4[] = {1, 5, 3, 7};
-  ShuffleMask = makeArrayRef(IntMask4, 4);
-  NewShuffles[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, ShuffleMask);
-  NewShuffles[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, ShuffleMask);
-
-  for (unsigned i = 0; i < Shuffles.size(); i++) {
-    unsigned Index = Indices[i];
-    Shuffles[i]->replaceAllUsesWith(NewShuffles[Index]);
-  }
-
-  return true;
-}
+  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
+                                Builder);
+
+  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
+}
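End to end, for the <16 x i64>, Factor = 4 group sketched at the top, the optimized sequence that decompose() plus transpose_4x4() emit has roughly the following shape. This is a sketch in the same abbreviated IR notation; the value names are illustrative, and the real ones are chosen by the IRBuilder:

    %base = bitcast <16 x i64>* %ptr to <4 x i64>*
    ; decompose(): four consecutive <4 x i64> loads
    %gep0 = getelementptr <4 x i64>, <4 x i64>* %base, i32 0
    %a    = load <4 x i64>, <4 x i64>* %gep0
    %gep1 = getelementptr <4 x i64>, <4 x i64>* %base, i32 1
    %b    = load <4 x i64>, <4 x i64>* %gep1
    %gep2 = getelementptr <4 x i64>, <4 x i64>* %base, i32 2
    %c    = load <4 x i64>, <4 x i64>* %gep2
    %gep3 = getelementptr <4 x i64>, <4 x i64>* %base, i32 3
    %d    = load <4 x i64>, <4 x i64>* %gep3
    ; transpose_4x4(): first round pairs chunks 0/2 and 1/3
    %i1 = shuffle %a, %c, <0, 1, 4, 5>
    %i2 = shuffle %b, %d, <0, 1, 4, 5>
    %i3 = shuffle %a, %c, <2, 3, 6, 7>
    %i4 = shuffle %b, %d, <2, 3, 6, 7>
    ; second round produces the interleaved results
    %t0 = shuffle %i1, %i2, <0, 4, 2, 6>    ; replaces %v0
    %t1 = shuffle %i1, %i2, <1, 5, 3, 7>    ; replaces %v1
    %t2 = shuffle %i3, %i4, <0, 4, 2, 6>    ; replaces %v2
    %t3 = shuffle %i3, %i4, <1, 5, 3, 7>    ; replaces %v3

Each load and shuffle operates on a 256-bit vector, so every step fits in a single YMM register, which is presumably why isSupported() restricts the optimization to the 4x64-bit, Factor = 4 case on AVX.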