llvm.org GIT mirror llvm / 2dbe929
X86: Enable SSE memory intrinsics even when stack alignment is less than 16 bytes.

The stack realignment code was fixed to work when there is stack realignment and a dynamic alloca is present, so this shouldn't cause correctness issues anymore.

Note that this also enables generation of AVX instructions for memset under the following assumptions:
- Unaligned loads/stores are always fast on CPUs supporting AVX
- AVX is not slower than SSE

We may need some tweaked heuristics if one of those assumptions turns out not to be true.

Effectively reverts r58317. Part of PR2962.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167967 91177308-0d34-0410-b5e6-96231b3b80d8

Benjamin Kramer, 7 years ago
5 changed files with 81 additions and 29 deletions.
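For context (not part of the commit), the kind of source this change affects looks roughly like the hypothetical C++ below, mirroring the new test added further down: a fixed-size local buffer that is zero-initialized next to a variable-size alloca. On targets whose default stack alignment is below 16 bytes (e.g. 32-bit MinGW), the memset previously had to be lowered with scalar stores; after this change the backend may pick aligned SSE/AVX vector stores and realign the stack in the prologue. The names test1/dummy follow the test; everything else here is illustrative only.

// Hypothetical C++ equivalent of the new test case added by this commit.
// With this patch, the 32-byte zeroing memset may be lowered to aligned
// SSE/AVX vector stores even when the target's default stack alignment is
// only 4 bytes; the function prologue then realigns the stack.
#include <cstring>

extern void dummy(char *);

void test1(int t) {
  char buf[32];
  std::memset(buf, 0, sizeof buf);                      // candidate for movaps/vmovaps
  char *x = static_cast<char *>(__builtin_alloca(t));   // dynamic alloca exercises the
                                                        // stack realignment path
  dummy(x);
}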
                                        bool IsZeroVal,
                                        bool MemcpyStrSrc,
                                        MachineFunction &MF) const {
-  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
-  // linux. This is because the stack realignment code can't handle certain
-  // cases like PR2962. This should be removed when PR2962 is fixed.
   const Function *F = MF.getFunction();
   if (IsZeroVal &&
       !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
     if (Size >= 16 &&
         (Subtarget->isUnalignedMemAccessFast() ||
          ((DstAlign == 0 || DstAlign >= 16) &&
-          (SrcAlign == 0 || SrcAlign >= 16))) &&
-        Subtarget->getStackAlignment() >= 16) {
-      if (Subtarget->getStackAlignment() >= 32) {
+          (SrcAlign == 0 || SrcAlign >= 16)))) {
+      if (Size >= 32) {
         if (Subtarget->hasAVX2())
           return MVT::v8i32;
         if (Subtarget->hasAVX())
           ...
           return MVT::v4f32;
     } else if (!MemcpyStrSrc && Size >= 8 &&
                !Subtarget->is64Bit() &&
-               Subtarget->getStackAlignment() >= 8 &&
                Subtarget->hasSSE2()) {
       // Do not use f64 to lower memcpy if source is string constant. It's
       // better to use i32 to avoid the loads.
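Taken together, the hook now keys the wide-store choice off the operation size and the available ISA instead of the incoming stack alignment. Below is a minimal sketch of the post-patch selection for a zeroing memset (so the SrcAlign clause is trivially satisfied), assuming the context lines collapsed above keep their pre-patch bodies: AVX without AVX2 yields v8f32, SSE2 yields v4i32, SSE1 yields v4f32. This is an illustration, not the literal LLVM source.

// Sketch only: effective vector-type choice for a zeroing memset after this
// patch, assuming the collapsed branches behave as described above and that
// the IsZeroVal / NoImplicitFloat gate has already been passed.
#include <cstdint>

enum class MemOpVT { v8i32, v8f32, v4i32, v4f32, Other };

MemOpVT pickMemsetVT(uint64_t Size, bool FastUnaligned, unsigned DstAlign,
                     bool HasAVX2, bool HasAVX, bool HasSSE2, bool HasSSE1) {
  // The stack-alignment checks that used to gate this path are gone.
  if (Size >= 16 && (FastUnaligned || DstAlign == 0 || DstAlign >= 16)) {
    if (Size >= 32) {            // replaces the old getStackAlignment() >= 32 test
      if (HasAVX2) return MemOpVT::v8i32;
      if (HasAVX)  return MemOpVT::v8f32;
    }
    if (HasSSE2) return MemOpVT::v4i32;
    if (HasSSE1) return MemOpVT::v4f32;
  }
  return MemOpVT::Other;         // fall back to scalar lowering
}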
test/CodeGen/X86/2008-10-27-StackRealignment.ll (deleted: 0 additions, 22 deletions)
; Linux doesn't support stack realignment for functions with allocas (PR2888).
; Until it does, we shouldn't use movaps to access the stack. On targets with
; sufficiently aligned stack (e.g. darwin) we should.
; PR8969 - make 32-bit linux have a 16-byte aligned stack
; RUN: llc < %s -mtriple=i386-pc-linux-gnu -mcpu=yonah | grep movaps | count 2
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mcpu=yonah | grep movaps | count 2


target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
target triple = "i386-pc-linux-gnu"

define void @foo(i32 %t) nounwind {
  %tmp1210 = alloca i8, i32 32, align 4
  call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 32, i32 4, i1 false)
  %x = alloca i8, i32 %t
  call void @dummy(i8* %x)
  ret void
}

declare void @dummy(i8*)

declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
(next changed file)

; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2
; RUN: llc < %s -mattr=+sse2 -mtriple=i686-pc-mingw32 -mcpu=core2 | FileCheck %s -check-prefix=SSE2
; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1
; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64
(next changed file, added by this commit)

; Make sure that we realign the stack. Mingw32 uses 4 byte stack alignment, we
; need 16 bytes for SSE and 32 bytes for AVX.

; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium2 | FileCheck %s -check-prefix=NOSSE
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium3 | FileCheck %s -check-prefix=SSE1
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=yonah | FileCheck %s -check-prefix=SSE2
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX1
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core-avx2 | FileCheck %s -check-prefix=AVX2

define void @test1(i32 %t) nounwind {
  %tmp1210 = alloca i8, i32 32, align 4
  call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 32, i32 4, i1 false)
  %x = alloca i8, i32 %t
  call void @dummy(i8* %x)
  ret void

; NOSSE: test1:
; NOSSE-NOT: and
; NOSSE: movl $0

; SSE1: test1:
; SSE1: andl $-16
; SSE1: movl %esp, %esi
; SSE1: movaps

; SSE2: test1:
; SSE2: andl $-16
; SSE2: movl %esp, %esi
; SSE2: movaps

; AVX1: test1:
; AVX1: andl $-32
; AVX1: movl %esp, %esi
; AVX1: vmovaps %ymm

; AVX2: test1:
; AVX2: andl $-32
; AVX2: movl %esp, %esi
; AVX2: vmovaps %ymm

}

define void @test2(i32 %t) nounwind {
  %tmp1210 = alloca i8, i32 16, align 4
  call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 16, i32 4, i1 false)
  %x = alloca i8, i32 %t
  call void @dummy(i8* %x)
  ret void

; NOSSE: test2:
; NOSSE-NOT: and
; NOSSE: movl $0

; SSE1: test2:
; SSE1: andl $-16
; SSE1: movl %esp, %esi
; SSE1: movaps

; SSE2: test2:
; SSE2: andl $-16
; SSE2: movl %esp, %esi
; SSE2: movaps

; AVX1: test2:
; AVX1: andl $-16
; AVX1: movl %esp, %esi
; AVX1: vmovaps %xmm

; AVX2: test2:
; AVX2: andl $-16
; AVX2: movl %esp, %esi
; AVX2: vmovaps %xmm
}

declare void @dummy(i8*)

declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
(next changed file)

; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 5
; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movl | count 20
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core2 | grep movl | count 20
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movq | count 10

define void @bork() nounwind {