llvm.org GIT mirror llvm / a5e1362
Revert r122955. It seems using movups to lower memcpy can cause massive regressions (even on Nehalem) in edge cases. I also didn't see any real performance benefit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@123015 91177308-0d34-0410-b5e6-96231b3b80d8 Evan Cheng 9 years ago
11 changed file(s) with 101 addition(s) and 79 deletion(s). Raw diff Collapse all Expand all
10621062 // linux. This is because the stack realignment code can't handle certain
10631063 // cases like PR2962. This should be removed when PR2962 is fixed.
10641064 const Function *F = MF.getFunction();
1065 if (NonScalarIntSafe && !F->hasFnAttr(Attribute::NoImplicitFloat)) {
1065 if (NonScalarIntSafe &&
1066 !F->hasFnAttr(Attribute::NoImplicitFloat)) {
10661067 if (Size >= 16 &&
1068 (Subtarget->isUnalignedMemAccessFast() ||
1069 ((DstAlign == 0 || DstAlign >= 16) &&
1070 (SrcAlign == 0 || SrcAlign >= 16))) &&
10671071 Subtarget->getStackAlignment() >= 16) {
10681072 if (Subtarget->hasSSE2())
10691073 return MVT::v4i32;
None ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
0 ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
11 ; rdar://7842028
22
33 ; Do not delete partially dead copy instructions.
88 %struct.F = type { %struct.FC*, i32, i32, i8, i32, i32, i32 }
99 %struct.FC = type { [10 x i8], [32 x i32], %struct.FC*, i32 }
1010
11 define void @t(%struct.F* %this) nounwind optsize {
11 define void @t(%struct.F* %this) nounwind {
1212 entry:
1313 ; CHECK: t:
1414 ; CHECK: addq $12, %rsi
2525 ; CHECK: rep;stosl
2626
2727 %tmp5 = bitcast i32* %tmp4 to i8*
28 call void @llvm.memset.p0i8.i64(i8* %tmp5, i8 0, i64 124, i32 4, i1 false)
28 call void @llvm.memset.p0i8.i64(i8* %tmp5, i8 0, i64 84, i32 4, i1 false)
2929 %tmp6 = getelementptr inbounds %struct.type* %s, i32 0, i32 62
3030 store i32* null, i32** %tmp6, align 8
3131 br label %bb1
1818 }
1919
2020 ; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
21 ; CHECK: movb 30(%rsp), %cl
22 ; CHECK: movb (%rsp), %dl
23 ; CHECK: movb %dl, (%rsp)
24 ; CHECK: movb %cl, 30(%rsp)
21 ; CHECK: movb 30(%rsp), %dl
22 ; CHECK: movb (%rsp), %sil
23 ; CHECK: movb %sil, (%rsp)
24 ; CHECK: movb %dl, 30(%rsp)
2525 ; CHECK: callq ___stack_chk_fail
0 ; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2
1 ; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1
12 ; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE
23 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64
34
1213 ; SSE2: movb $0
1314 ; SSE2: movl $0
1415 ; SSE2: movl $0
16
17 ; SSE1: t1:
18 ; SSE1: movaps _.str, %xmm0
19 ; SSE1: movaps %xmm0
20 ; SSE1: movb $0
21 ; SSE1: movl $0
22 ; SSE1: movl $0
1523
1624 ; NOSSE: t1:
1725 ; NOSSE: movb $0
4250 ; SSE2: movaps (%eax), %xmm0
4351 ; SSE2: movaps %xmm0, (%eax)
4452
53 ; SSE1: t2:
54 ; SSE1: movaps (%eax), %xmm0
55 ; SSE1: movaps %xmm0, (%eax)
56
4557 ; NOSSE: t2:
4658 ; NOSSE: movl
4759 ; NOSSE: movl
6678 define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
6779 entry:
6880 ; SSE2: t3:
69 ; SSE2: movups (%eax), %xmm0
70 ; SSE2: movups %xmm0, (%eax)
81 ; SSE2: movsd (%eax), %xmm0
82 ; SSE2: movsd 8(%eax), %xmm1
83 ; SSE2: movsd %xmm1, 8(%eax)
84 ; SSE2: movsd %xmm0, (%eax)
85
86 ; SSE1: t3:
87 ; SSE1: movl
88 ; SSE1: movl
89 ; SSE1: movl
90 ; SSE1: movl
91 ; SSE1: movl
92 ; SSE1: movl
93 ; SSE1: movl
94 ; SSE1: movl
95 ; SSE1: movl
96 ; SSE1: movl
7197
7298 ; NOSSE: t3:
7399 ; NOSSE: movl
82108 ; NOSSE: movl
83109
84110 ; X86-64: t3:
85 ; X86-64: movups (%rsi), %xmm0
86 ; X86-64: movups %xmm0, (%rdi)
111 ; X86-64: movq (%rsi), %rax
112 ; X86-64: movq 8(%rsi), %rcx
113 ; X86-64: movq %rcx, 8(%rdi)
114 ; X86-64: movq %rax, (%rdi)
87115 %tmp2 = bitcast %struct.s0* %a to i8* ; [#uses=1]
88116 %tmp3 = bitcast %struct.s0* %b to i8* ; [#uses=1]
89117 tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 8)
93121 define void @t4() nounwind {
94122 entry:
95123 ; SSE2: t4:
96 ; SSE2: movups _.str2, %xmm0
97 ; SSE2: movaps %xmm0, (%esp)
98 ; SSE2: movw $120, 28(%esp)
124 ; SSE2: movw $120
99125 ; SSE2: movl $2021161080
100126 ; SSE2: movl $2021161080
101127 ; SSE2: movl $2021161080
128 ; SSE2: movl $2021161080
129 ; SSE2: movl $2021161080
130 ; SSE2: movl $2021161080
131 ; SSE2: movl $2021161080
132
133 ; SSE1: t4:
134 ; SSE1: movw $120
135 ; SSE1: movl $2021161080
136 ; SSE1: movl $2021161080
137 ; SSE1: movl $2021161080
138 ; SSE1: movl $2021161080
139 ; SSE1: movl $2021161080
140 ; SSE1: movl $2021161080
141 ; SSE1: movl $2021161080
102142
103143 ; NOSSE: t4:
104144 ; NOSSE: movw $120
113153 ; X86-64: t4:
114154 ; X86-64: movabsq $8680820740569200760, %rax
115155 ; X86-64: movq %rax
116 ; X86-64: movups _.str2(%rip), %xmm0
117 ; X86-64: movaps %xmm0, -40(%rsp)
156 ; X86-64: movq %rax
157 ; X86-64: movq %rax
118158 ; X86-64: movw $120
119159 ; X86-64: movl $2021161080
120160 %tmp1 = alloca [30 x i8]
3636 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i32 1, i1 false)
3737 ret void
3838 ; LINUX: test3:
39 ; LINUX-NOT: memcpy
40 ; LINUX: movups
41 ; LINUX: movups
42 ; LINUX: movups
43 ; LINUX: movups
44 ; LINUX: movups
45 ; LINUX: movups
46 ; LINUX: movups
47 ; LINUX: movups
39 ; LINUX: memcpy
4840
4941 ; DARWIN: test3:
5042 ; DARWIN-NOT: memcpy
51 ; DARWIN: movups
52 ; DARWIN: movups
53 ; DARWIN: movups
54 ; DARWIN: movups
55 ; DARWIN: movups
56 ; DARWIN: movups
57 ; DARWIN: movups
58 ; DARWIN: movups
59 ; DARWIN: movups
60 ; DARWIN: movups
61 ; DARWIN: movups
62 ; DARWIN: movups
63 ; DARWIN: movups
64 ; DARWIN: movups
65 ; DARWIN: movups
66 ; DARWIN: movups
43 ; DARWIN: movq
44 ; DARWIN: movq
45 ; DARWIN: movq
46 ; DARWIN: movq
47 ; DARWIN: movq
48 ; DARWIN: movq
49 ; DARWIN: movq
50 ; DARWIN: movq
51 ; DARWIN: movq
52 ; DARWIN: movq
53 ; DARWIN: movq
54 ; DARWIN: movq
55 ; DARWIN: movq
56 ; DARWIN: movq
57 ; DARWIN: movq
58 ; DARWIN: movq
6759 }
6860
6961 ; Large constant memcpy's should be inlined when not optimizing for size.
44 define fastcc void @t1() nounwind {
55 entry:
66 ; CHECK: t1:
7 ; CHECK: pxor %xmm0, %xmm0
8 ; CHECK: movups %xmm0, 160
9 ; CHECK: movups %xmm0, 144
10 ; CHECK: movups %xmm0, 128
11 ; CHECK: movups %xmm0, 112
12 ; CHECK: movups %xmm0, 96
13 ; CHECK: movups %xmm0, 80
14 ; CHECK: movups %xmm0, 64
15 ; CHECK: movups %xmm0, 48
16 ; CHECK: movups %xmm0, 32
17 ; CHECK: movups %xmm0, 16
18 ; CHECK: movups %xmm0, 0
19 ; CHECK: movl $0, 184
20 ; CHECK: movl $0, 180
21 ; CHECK: movl $0, 176
7 ; CHECK: calll _memset
228 call void @llvm.memset.i32( i8* null, i8 0, i32 188, i32 1 ) nounwind
239 unreachable
2410 }
0 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 5
1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movups | count 5
1 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movl | count 20
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movq | count 10
23
34 define void @bork() nounwind {
45 entry:
None ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | FileCheck %s
0 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movsd | count 8
1 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 2
12
23 define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %z) nounwind {
34 entry:
4 ; CHECK: ccosl:
5 ; CHECK: movaps
6 ; CHECK: movaps
7 ; CHECK: movups
8 ; CHECK: movups
95 %iz = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
106 %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1 ; [#uses=1]
117 %tmp2 = load x86_fp80* %tmp1, align 16 ; [#uses=1]
99 unreachable
1010 ; CHECK: movq _c@TLVP(%rip), %rdi
1111 ; CHECK-NEXT: callq *(%rdi)
12 ; CHECK-NEXT: pxor %xmm0, %xmm0
13 ; CHECK-NEXT: movups %xmm0, 32(%rax)
14 ; CHECK-NEXT: movups %xmm0, 16(%rax)
15 ; CHECK-NEXT: movups %xmm0, (%rax)
16 ; CHECK-NEXT: movl $0, 56(%rax)
17 ; CHECK-NEXT: movq $0, 48(%rax)
12 ; CHECK-NEXT: movl $0, 56(%rax)
13 ; CHECK-NEXT: movq $0, 48(%rax)
1814 }
1915
2016 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
None ; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck %s
0 ; RUN: llc < %s -mtriple=i386-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=I386 %s
1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=CORE2 %s
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=COREI7 %s
13
24 @.str1 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 8
35 @.str3 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, 2'ND STRING\00", align 8
1012 bb:
1113 %String2Loc9 = getelementptr inbounds [31 x i8]* %String2Loc, i64 0, i64 0
1214 call void @llvm.memcpy.i64(i8* %String2Loc9, i8* getelementptr inbounds ([31 x i8]* @.str3, i64 0, i64 0), i64 31, i32 1)
13 ; CHECK: movabsq $2325069237881678925, %rax
14 ; CHECK: movups _.str3(%rip), %xmm0
15 ; I386: calll {{_?}}memcpy
16
17 ; CORE2: movabsq
18 ; CORE2: movabsq
19 ; CORE2: movabsq
20
21 ; COREI7: movups _.str3
1522 br label %bb
1623
1724 return:
2027
2128 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
2229
23 ; CHECK: .section
24 ; CHECK: .align 4
25 ; CHECK-NEXT: _.str1:
26 ; CHECK-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
27 ; CHECK: .align 4
28 ; CHECK-NEXT: _.str3:
30 ; CORE2: .section
31 ; CORE2: .align 4
32 ; CORE2-NEXT: _.str1:
33 ; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
34 ; CORE2: .align 4
35 ; CORE2-NEXT: _.str3: