llvm.org GIT mirror llvm / 461f1fc
Use movups to lower memcpy and memset even when unaligned memory access is not fast (i.e. on processors other than those like corei7). The theory is that it's still faster than a pair of movq / a quad of movl. This will probably hurt older chips like P4 but should run faster on current and future Intel processors. rdar://8817010 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@122955 91177308-0d34-0410-b5e6-96231b3b80d8 Evan Cheng 9 years ago
11 changed file(s) with 79 addition(s) and 101 deletion(s). Raw diff Collapse all Expand all
10621062 // linux. This is because the stack realignment code can't handle certain
10631063 // cases like PR2962. This should be removed when PR2962 is fixed.
10641064 const Function *F = MF.getFunction();
1065 if (NonScalarIntSafe &&
1066 !F->hasFnAttr(Attribute::NoImplicitFloat)) {
1065 if (NonScalarIntSafe && !F->hasFnAttr(Attribute::NoImplicitFloat)) {
10671066 if (Size >= 16 &&
1068 (Subtarget->isUnalignedMemAccessFast() ||
1069 ((DstAlign == 0 || DstAlign >= 16) &&
1070 (SrcAlign == 0 || SrcAlign >= 16))) &&
10711067 Subtarget->getStackAlignment() >= 16) {
10721068 if (Subtarget->hasSSE2())
10731069 return MVT::v4i32;
None ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
0 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
11 ; rdar://7842028
22
33 ; Do not delete partially dead copy instructions.
88 %struct.F = type { %struct.FC*, i32, i32, i8, i32, i32, i32 }
99 %struct.FC = type { [10 x i8], [32 x i32], %struct.FC*, i32 }
1010
11 define void @t(%struct.F* %this) nounwind {
11 define void @t(%struct.F* %this) nounwind optsize {
1212 entry:
1313 ; CHECK: t:
1414 ; CHECK: addq $12, %rsi
2525 ; CHECK: rep;stosl
2626
2727 %tmp5 = bitcast i32* %tmp4 to i8*
28 call void @llvm.memset.p0i8.i64(i8* %tmp5, i8 0, i64 84, i32 4, i1 false)
28 call void @llvm.memset.p0i8.i64(i8* %tmp5, i8 0, i64 124, i32 4, i1 false)
2929 %tmp6 = getelementptr inbounds %struct.type* %s, i32 0, i32 62
3030 store i32* null, i32** %tmp6, align 8
3131 br label %bb1
1818 }
1919
2020 ; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
21 ; CHECK: movb 30(%rsp), %dl
22 ; CHECK: movb (%rsp), %sil
23 ; CHECK: movb %sil, (%rsp)
24 ; CHECK: movb %dl, 30(%rsp)
21 ; CHECK: movb 30(%rsp), %cl
22 ; CHECK: movb (%rsp), %dl
23 ; CHECK: movb %dl, (%rsp)
24 ; CHECK: movb %cl, 30(%rsp)
2525 ; CHECK: callq ___stack_chk_fail
0 ; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2
1 ; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1
21 ; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE
32 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64
43
1312 ; SSE2: movb $0
1413 ; SSE2: movl $0
1514 ; SSE2: movl $0
16
17 ; SSE1: t1:
18 ; SSE1: movaps _.str, %xmm0
19 ; SSE1: movaps %xmm0
20 ; SSE1: movb $0
21 ; SSE1: movl $0
22 ; SSE1: movl $0
2315
2416 ; NOSSE: t1:
2517 ; NOSSE: movb $0
5042 ; SSE2: movaps (%eax), %xmm0
5143 ; SSE2: movaps %xmm0, (%eax)
5244
53 ; SSE1: t2:
54 ; SSE1: movaps (%eax), %xmm0
55 ; SSE1: movaps %xmm0, (%eax)
56
5745 ; NOSSE: t2:
5846 ; NOSSE: movl
5947 ; NOSSE: movl
7866 define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
7967 entry:
8068 ; SSE2: t3:
81 ; SSE2: movsd (%eax), %xmm0
82 ; SSE2: movsd 8(%eax), %xmm1
83 ; SSE2: movsd %xmm1, 8(%eax)
84 ; SSE2: movsd %xmm0, (%eax)
85
86 ; SSE1: t3:
87 ; SSE1: movl
88 ; SSE1: movl
89 ; SSE1: movl
90 ; SSE1: movl
91 ; SSE1: movl
92 ; SSE1: movl
93 ; SSE1: movl
94 ; SSE1: movl
95 ; SSE1: movl
96 ; SSE1: movl
69 ; SSE2: movups (%eax), %xmm0
70 ; SSE2: movups %xmm0, (%eax)
9771
9872 ; NOSSE: t3:
9973 ; NOSSE: movl
10882 ; NOSSE: movl
10983
11084 ; X86-64: t3:
111 ; X86-64: movq (%rsi), %rax
112 ; X86-64: movq 8(%rsi), %rcx
113 ; X86-64: movq %rcx, 8(%rdi)
114 ; X86-64: movq %rax, (%rdi)
85 ; X86-64: movups (%rsi), %xmm0
86 ; X86-64: movups %xmm0, (%rdi)
11587 %tmp2 = bitcast %struct.s0* %a to i8* ; [#uses=1]
11688 %tmp3 = bitcast %struct.s0* %b to i8* ; [#uses=1]
11789 tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 8)
12193 define void @t4() nounwind {
12294 entry:
12395 ; SSE2: t4:
124 ; SSE2: movw $120
96 ; SSE2: movups _.str2, %xmm0
97 ; SSE2: movaps %xmm0, (%esp)
98 ; SSE2: movw $120, 28(%esp)
12599 ; SSE2: movl $2021161080
126100 ; SSE2: movl $2021161080
127101 ; SSE2: movl $2021161080
128 ; SSE2: movl $2021161080
129 ; SSE2: movl $2021161080
130 ; SSE2: movl $2021161080
131 ; SSE2: movl $2021161080
132
133 ; SSE1: t4:
134 ; SSE1: movw $120
135 ; SSE1: movl $2021161080
136 ; SSE1: movl $2021161080
137 ; SSE1: movl $2021161080
138 ; SSE1: movl $2021161080
139 ; SSE1: movl $2021161080
140 ; SSE1: movl $2021161080
141 ; SSE1: movl $2021161080
142102
143103 ; NOSSE: t4:
144104 ; NOSSE: movw $120
153113 ; X86-64: t4:
154114 ; X86-64: movabsq $8680820740569200760, %rax
155115 ; X86-64: movq %rax
156 ; X86-64: movq %rax
157 ; X86-64: movq %rax
116 ; X86-64: movups _.str2(%rip), %xmm0
117 ; X86-64: movaps %xmm0, -40(%rsp)
158118 ; X86-64: movw $120
159119 ; X86-64: movl $2021161080
160120 %tmp1 = alloca [30 x i8]
3636 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i32 1, i1 false)
3737 ret void
3838 ; LINUX: test3:
39 ; LINUX: memcpy
39 ; LINUX-NOT: memcpy
40 ; LINUX: movups
41 ; LINUX: movups
42 ; LINUX: movups
43 ; LINUX: movups
44 ; LINUX: movups
45 ; LINUX: movups
46 ; LINUX: movups
47 ; LINUX: movups
4048
4149 ; DARWIN: test3:
4250 ; DARWIN-NOT: memcpy
43 ; DARWIN: movq
44 ; DARWIN: movq
45 ; DARWIN: movq
46 ; DARWIN: movq
47 ; DARWIN: movq
48 ; DARWIN: movq
49 ; DARWIN: movq
50 ; DARWIN: movq
51 ; DARWIN: movq
52 ; DARWIN: movq
53 ; DARWIN: movq
54 ; DARWIN: movq
55 ; DARWIN: movq
56 ; DARWIN: movq
57 ; DARWIN: movq
58 ; DARWIN: movq
51 ; DARWIN: movups
52 ; DARWIN: movups
53 ; DARWIN: movups
54 ; DARWIN: movups
55 ; DARWIN: movups
56 ; DARWIN: movups
57 ; DARWIN: movups
58 ; DARWIN: movups
59 ; DARWIN: movups
60 ; DARWIN: movups
61 ; DARWIN: movups
62 ; DARWIN: movups
63 ; DARWIN: movups
64 ; DARWIN: movups
65 ; DARWIN: movups
66 ; DARWIN: movups
5967 }
6068
6169 ; Large constant memcpy's should be inlined when not optimizing for size.
44 define fastcc void @t1() nounwind {
55 entry:
66 ; CHECK: t1:
7 ; CHECK: calll _memset
7 ; CHECK: pxor %xmm0, %xmm0
8 ; CHECK: movups %xmm0, 160
9 ; CHECK: movups %xmm0, 144
10 ; CHECK: movups %xmm0, 128
11 ; CHECK: movups %xmm0, 112
12 ; CHECK: movups %xmm0, 96
13 ; CHECK: movups %xmm0, 80
14 ; CHECK: movups %xmm0, 64
15 ; CHECK: movups %xmm0, 48
16 ; CHECK: movups %xmm0, 32
17 ; CHECK: movups %xmm0, 16
18 ; CHECK: movups %xmm0, 0
19 ; CHECK: movl $0, 184
20 ; CHECK: movl $0, 180
21 ; CHECK: movl $0, 176
822 call void @llvm.memset.i32( i8* null, i8 0, i32 188, i32 1 ) nounwind
923 unreachable
1024 }
0 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 5
1 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movl | count 20
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movq | count 10
1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movups | count 5
32
43 define void @bork() nounwind {
54 entry:
None ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movsd | count 8
1 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 2
0 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | FileCheck %s
21
32 define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %z) nounwind {
43 entry:
4 ; CHECK: ccosl:
5 ; CHECK: movaps
6 ; CHECK: movaps
7 ; CHECK: movups
8 ; CHECK: movups
59 %iz = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
610 %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1 ; [#uses=1]
711 %tmp2 = load x86_fp80* %tmp1, align 16 ; [#uses=1]
99 unreachable
1010 ; CHECK: movq _c@TLVP(%rip), %rdi
1111 ; CHECK-NEXT: callq *(%rdi)
12 ; CHECK-NEXT: movl $0, 56(%rax)
13 ; CHECK-NEXT: movq $0, 48(%rax)
12 ; CHECK-NEXT: pxor %xmm0, %xmm0
13 ; CHECK-NEXT: movups %xmm0, 32(%rax)
14 ; CHECK-NEXT: movups %xmm0, 16(%rax)
15 ; CHECK-NEXT: movups %xmm0, (%rax)
16 ; CHECK-NEXT: movl $0, 56(%rax)
17 ; CHECK-NEXT: movq $0, 48(%rax)
1418 }
1519
1620 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
None ; RUN: llc < %s -mtriple=i386-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=I386 %s
1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=CORE2 %s
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=COREI7 %s
0 ; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck %s
31
42 @.str1 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 8
53 @.str3 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, 2'ND STRING\00", align 8
1210 bb:
1311 %String2Loc9 = getelementptr inbounds [31 x i8]* %String2Loc, i64 0, i64 0
1412 call void @llvm.memcpy.i64(i8* %String2Loc9, i8* getelementptr inbounds ([31 x i8]* @.str3, i64 0, i64 0), i64 31, i32 1)
15 ; I386: calll {{_?}}memcpy
16
17 ; CORE2: movabsq
18 ; CORE2: movabsq
19 ; CORE2: movabsq
20
21 ; COREI7: movups _.str3
13 ; CHECK: movabsq $2325069237881678925, %rax
14 ; CHECK: movups _.str3(%rip), %xmm0
2215 br label %bb
2316
2417 return:
2720
2821 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
2922
30 ; CORE2: .section
31 ; CORE2: .align 4
32 ; CORE2-NEXT: _.str1:
33 ; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
34 ; CORE2: .align 4
35 ; CORE2-NEXT: _.str3:
23 ; CHECK: .section
24 ; CHECK: .align 4
25 ; CHECK-NEXT: _.str1:
26 ; CHECK-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
27 ; CHECK: .align 4
28 ; CHECK-NEXT: _.str3: