llvm.org GIT mirror: llvm / 95d397c
[X86][AVX] Match broadcast loads through a bitcast

AVX1 v8i32/v4i64 shuffles are bitcast to v8f32/v4f64, so this patch peeks through any bitcast when checking for a load node, allowing broadcast loads to be matched.

This is a re-commit of r257055, after r257264 fixed 32-bit broadcast loads of i64 scalars.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@257266 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim, 4 years ago
3 changed files with 12 additions and 21 deletions.
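Before the diff itself, the core idea is a common SelectionDAG idiom: hop over any chain of bitcast nodes before testing whether the shuffled operand is really a load. Below is a minimal, self-contained sketch of that idiom; it uses toy stand-in types (Node, Opcode) rather than LLVM's SDValue/ISD machinery, purely for illustration.

#include <cstdio>

// Toy stand-ins for SelectionDAG nodes (illustration only; not LLVM's types).
enum class Opcode { Load, Bitcast, Other };

struct Node {
  Opcode Op;
  Node *Operand0 = nullptr; // one operand suffices for this sketch
};

// The idiom the patch applies: walk past any chain of bitcasts so a load
// hidden behind them can still be recognized.
static Node *peekThroughBitcasts(Node *N) {
  while (N && N->Op == Opcode::Bitcast)
    N = N->Operand0;
  return N;
}

int main() {
  Node Ld{Opcode::Load};
  Node BC{Opcode::Bitcast, &Ld}; // e.g. a v8i32 load viewed as v8f32 on AVX1
  std::printf("load found: %s\n",
              peekThroughBitcasts(&BC)->Op == Opcode::Load ? "yes" : "no");
}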
@@ -8174,6 +8174,11 @@
   MVT BroadcastVT = VT;
 
+  // Peek through any bitcast (only useful for loads).
+  SDValue BC = V;
+  while (BC.getOpcode() == ISD::BITCAST)
+    BC = BC.getOperand(0);
+
   // Also check the simpler case, where we can directly reuse the scalar.
   if (V.getOpcode() == ISD::BUILD_VECTOR ||
       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
@@ -8183,14 +8188,14 @@
     // Only AVX2 has register broadcasts.
     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
       return SDValue();
-  } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
+  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
     // 32-bit targets need to load i64 as a f64 and then bitcast the result.
     if (!Subtarget->is64Bit() && VT.getScalarType() == MVT::i64)
       BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
 
     // If we are broadcasting a load that is only used by the shuffle
     // then we can reduce the vector load to the broadcasted scalar load.
-    LoadSDNode *Ld = cast<LoadSDNode>(V);
+    LoadSDNode *Ld = cast<LoadSDNode>(BC);
     SDValue BaseAddr = Ld->getOperand(1);
     EVT AddrVT = BaseAddr.getValueType();
     EVT SVT = BroadcastVT.getScalarType();
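The hunk stops before the code that actually materializes the scalar broadcast load, but the updated tests below make the effect concrete: the folded load reads the selected element directly, at a byte offset of BroadcastIdx times the scalar's store size from the base address. A quick sanity check of that arithmetic (hypothetical helper, not from the LLVM sources):

#include <cassert>

// Byte offset of the broadcast element inside the original vector load:
// element index times the scalar's size in bytes. The helper name is
// illustrative; it just mirrors the addressing seen in the tests below.
static unsigned broadcastByteOffset(unsigned BroadcastIdx, unsigned ScalarBytes) {
  return BroadcastIdx * ScalarBytes;
}

int main() {
  assert(broadcastByteOffset(5, 4) == 20); // <8 x i32>, idx 5 -> vbroadcastss 20(%eax)
  assert(broadcastByteOffset(2, 8) == 16); // <4 x i64>, idx 2 -> vbroadcastsd 16(%eax)
}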
@@ -2,9 +2,7 @@
 define void @endless_loop() {
 ; CHECK-LABEL: endless_loop:
 ; CHECK-NEXT:  # BB#0:
-; CHECK-NEXT:    vmovaps (%eax), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; CHECK-NEXT:    vbroadcastss (%eax), %ymm0
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
@@ -191,18 +191,12 @@
 ; X32-LABEL: load_splat_8i32_8i32_55555555:
 ; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovaps (%eax), %ymm0
-; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT:    vbroadcastss 20(%eax), %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: load_splat_8i32_8i32_55555555:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    vmovaps (%rdi), %ymm0
-; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT:    vbroadcastss 20(%rdi), %ymm0
 ; X64-NEXT:    retq
 entry:
   %ld = load <8 x i32>, <8 x i32>* %ptr
@@ -303,18 +297,12 @@
 ; X32-LABEL: load_splat_4i64_4i64_2222:
 ; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovapd (%eax), %ymm0
-; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT:    vbroadcastsd 16(%eax), %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: load_splat_4i64_4i64_2222:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    vmovapd (%rdi), %ymm0
-; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT:    vbroadcastsd 16(%rdi), %ymm0
 ; X64-NEXT:    retq
 entry:
   %ld = load <4 x i64>, <4 x i64>* %ptr
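Note that both the X32 and X64 runs of the v4i64 test use vbroadcastsd: AVX1 has only memory-source broadcasts ("Only AVX2 has register broadcasts" in the hunk above), and on 32-bit targets the hunk explicitly retypes the i64 broadcast as f64, which appears to be the r257264 fix referenced in the commit message, so the i64 splat is folded as a 64-bit floating-point broadcast load on both targets.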