llvm.org GIT mirror llvm / 860122b
X86: Don't over-align combined loads. When combining consecutive loads+inserts into a single vector load, we should keep the alignment of the base load. Doing otherwise can, and does, lead to using overly aligned instructions. In the included test case, for example, using a 32-byte vmovaps on a 16-byte aligned value. Oops. rdar://19190968 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224746 91177308-0d34-0410-b5e6-96231b3b80d8 Jim Grosbach 5 years ago
2 changed file(s) with 38 addition(s) and 8 deletion(s). Raw diff Collapse all Expand all
60416041
60426042 SDValue NewLd = SDValue();
60436043
6044 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
6045 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6046 LDBase->getPointerInfo(),
6047 LDBase->isVolatile(), LDBase->isNonTemporal(),
6048 LDBase->isInvariant(), 0);
60496044 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6050 LDBase->getPointerInfo(),
6051 LDBase->isVolatile(), LDBase->isNonTemporal(),
6052 LDBase->isInvariant(), LDBase->getAlignment());
6045 LDBase->getPointerInfo(), LDBase->isVolatile(),
6046 LDBase->isNonTemporal(), LDBase->isInvariant(),
6047 LDBase->getAlignment());
60536048
60546049 if (LDBase->hasAnyUseOfValue(1)) {
60556050 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
0 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s
1
2 @e = global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8], align 16
3 @d = global [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16
4
5 ; The global 'e' has 16 byte alignment, so make sure we don't generate an
6 ; aligned 32-byte load instruction when we combine the load+insert sequence.
7
8 define i32 @subb() nounwind ssp {
9 ; CHECK-LABEL: subb:
; @e is declared with only 16-byte alignment (see the global above), so when
; the eight scalar loads below are combined into one 32-byte vector load, the
; backend must emit the unaligned form (vmovups) rather than the 32-byte
; aligned vmovaps, which would fault on a 16-byte-aligned address.
10 ; CHECK: vmovups e(%rip), %ymm
11 entry:
; Load every i32 element of @e (indices 7 down to 0). The loads cover the
; array contiguously, which is what makes them eligible to be merged into a
; single wide vector load.
12 %0 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 7), align 4
13 %1 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 6), align 8
14 %2 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 5), align 4
15 %3 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 4), align 16
16 %4 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 3), align 4
17 %5 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 2), align 8
18 %6 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 1), align 4
19 %7 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 0), align 16
; Reassemble the loaded scalars into an <8 x i32> vector, element 0 first.
; Together with the loads above, this is the load+insert sequence that the
; combine turns into a single 32-byte load.
20 %vecinit.i = insertelement <8 x i32> undef, i32 %7, i32 0
21 %vecinit1.i = insertelement <8 x i32> %vecinit.i, i32 %6, i32 1
22 %vecinit2.i = insertelement <8 x i32> %vecinit1.i, i32 %5, i32 2
23 %vecinit3.i = insertelement <8 x i32> %vecinit2.i, i32 %4, i32 3
24 %vecinit4.i = insertelement <8 x i32> %vecinit3.i, i32 %3, i32 4
25 %vecinit5.i = insertelement <8 x i32> %vecinit4.i, i32 %2, i32 5
26 %vecinit6.i = insertelement <8 x i32> %vecinit5.i, i32 %1, i32 6
27 %vecinit7.i = insertelement <8 x i32> %vecinit6.i, i32 %0, i32 7
; Keep the vector value live by storing it to @d through the AVX
; unaligned-store intrinsic, so the combined load is not dead-code-eliminated.
28 %8 = bitcast <8 x i32> %vecinit7.i to <32 x i8>
29 tail call void @llvm.x86.avx.storeu.dq.256(i8* bitcast ([8 x i32]* @d to i8*), <32 x i8> %8)
30 ret i32 0
31 }
32
33 declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
34