llvm.org GIT mirror llvm / f98b9a5
[x86] use SSE/AVX ops for non-zero memsets (PR27100)

Move the memset check down to the CPU-with-slow-SSE-unaligned-memops case: this allows fast targets to take advantage of SSE/AVX instructions and prevents slow targets from stepping into a codegen sinkhole while trying to splat a byte into an XMM reg.

Follow-on bugs exposed by the current codegen are:
https://llvm.org/bugs/show_bug.cgi?id=27141
https://llvm.org/bugs/show_bug.cgi?id=27143

Differential Revision: http://reviews.llvm.org/D18566

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@265029 91177308-0d34-0410-b5e6-96231b3b80d8

Sanjay Patel, 4 years ago
2 changed files with 138 additions and 56 deletions.
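At the source level, the pattern this commit changes is a fixed-size memset with a non-zero fill value (the tests below reach it through __memset_chk). A minimal C++ example of the affected shape, for illustration only:

#include <cstring>

// A fixed-size, non-zero memset: under the old check, even targets with
// fast unaligned 16-byte ops lowered this with scalar movq stores (see
// the removed ANY check lines in the test diff below); after this commit
// they can use SSE/AVX vector stores instead.
void fill64(char *p) {
  memset(p, 42, 64);
}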
@@ -2024,8 +2024,7 @@
                                                bool MemcpyStrSrc,
                                                MachineFunction &MF) const {
   const Function *F = MF.getFunction();
-  if ((!IsMemset || ZeroMemset) &&
-      !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+  if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
     if (Size >= 16 &&
         (!Subtarget.isUnalignedMem16Slow() ||
          ((DstAlign == 0 || DstAlign >= 16) &&
@@ -2041,11 +2040,14 @@
         return MVT::v4i32;
       if (Subtarget.hasSSE1())
         return MVT::v4f32;
-    } else if (!MemcpyStrSrc && Size >= 8 &&
-               !Subtarget.is64Bit() &&
-               Subtarget.hasSSE2()) {
+    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
+               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
       // Do not use f64 to lower memcpy if source is string constant. It's
       // better to use i32 to avoid the loads.
+      // Also, do not use f64 to lower memset unless this is a memset of zeros.
+      // The gymnastics of splatting a byte value into an XMM register and then
+      // only using 8-byte stores (because this is a CPU with slow unaligned
+      // 16-byte accesses) makes that a loser.
       return MVT::f64;
     }
   }
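To make the control-flow change easier to see, here is a standalone model of the post-patch decision order. This is a sketch, not LLVM code: the AVX 256-bit cases (v8i32/v8f32) are omitted, the subtarget queries are replaced with plain bools, and the scalar i64/i32 fallback is paraphrased from the surrounding function rather than shown in the hunk; only the two edited conditions are carried over verbatim.

#include <cstdint>
#include <cstdio>

enum class MemVT { v4i32, v4f32, f64, i64, i32 };

MemVT getOptimalMemOpTypeModel(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                               bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
                               bool Is64Bit, bool HasSSE1, bool HasSSE2,
                               bool UnalignedMem16Slow, bool NoImplicitFloat) {
  if (!NoImplicitFloat) {
    // The memset check no longer gates this branch, so non-zero memsets on
    // fast targets can reach the 16-byte vector types.
    if (Size >= 16 &&
        (!UnalignedMem16Slow ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16)))) {
      if (HasSSE2)
        return MemVT::v4i32;
      if (HasSSE1)
        return MemVT::v4f32;
    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
               !Is64Bit && HasSSE2) {
      // The memset check now lives only on this slow-unaligned path, where
      // splatting a byte into an XMM register just to issue 8-byte stores
      // would be a loser for non-zero memsets.
      return MemVT::f64;
    }
  }
  // Scalar integer fallback (paraphrased).
  return (Is64Bit && Size >= 8) ? MemVT::i64 : MemVT::i32;
}

int main() {
  // A 32-byte non-zero memset on a fast-unaligned SSE2 target now picks a
  // vector type instead of falling through to scalar stores.
  MemVT VT = getOptimalMemOpTypeModel(/*Size=*/32, /*DstAlign=*/0, /*SrcAlign=*/0,
                                      /*IsMemset=*/true, /*ZeroMemset=*/false,
                                      /*MemcpyStrSrc=*/false, /*Is64Bit=*/true,
                                      /*HasSSE1=*/true, /*HasSSE2=*/true,
                                      /*UnalignedMem16Slow=*/false,
                                      /*NoImplicitFloat=*/false);
  printf("chosen type: %s\n", VT == MemVT::v4i32 ? "v4i32" : "other");
  return 0;
}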
@@ -3,81 +3,161 @@
 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=AVX --check-prefix=AVX2

 define void @memset_16_nonzero_bytes(i8* %x) {
-; ANY-LABEL: memset_16_nonzero_bytes:
-; ANY: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
-; ANY-NEXT: movq %rax, 8(%rdi)
-; ANY-NEXT: movq %rax, (%rdi)
-; ANY-NEXT: retq
+; SSE2-LABEL: memset_16_nonzero_bytes:
+; SSE2: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
+; SSE2-NEXT: movq %rax, 8(%rdi)
+; SSE2-NEXT: movq %rax, (%rdi)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: memset_16_nonzero_bytes:
+; AVX1: vmovaps {{.*#+}} xmm0 = [707406378,707406378,707406378,707406378]
+; AVX1-NEXT: vmovups %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_16_nonzero_bytes:
+; AVX2: vbroadcastss {{.*}}(%rip), %xmm0
+; AVX2-NEXT: vmovups %xmm0, (%rdi)
+; AVX2-NEXT: retq
 ;
   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
   ret void
 }
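A side note on the immediates in these checks: both are byte-splats of the fill value 42 (0x2A). A quick self-contained check of the arithmetic (the multiply-by-0x01...01 trick is just a compact way to write the splat, not a claim about how the backend materializes the constant):

#include <cassert>
#include <cstdint>

int main() {
  // 8-byte splat of 0x2A: the movabsq immediate in the SSE2 checks.
  uint64_t splat8 = 0x2AULL * 0x0101010101010101ULL;
  assert(splat8 == 0x2A2A2A2A2A2A2A2AULL);
  assert(splat8 == 3038287259199220266ULL);

  // 4-byte splat of 0x2A: the per-lane value in the AVX1 vector constant.
  uint32_t splat4 = 0x2Au * 0x01010101u;
  assert(splat4 == 0x2A2A2A2Au);
  assert(splat4 == 707406378u);
  return 0;
}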

 define void @memset_32_nonzero_bytes(i8* %x) {
-; ANY-LABEL: memset_32_nonzero_bytes:
-; ANY: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
-; ANY-NEXT: movq %rax, 24(%rdi)
-; ANY-NEXT: movq %rax, 16(%rdi)
-; ANY-NEXT: movq %rax, 8(%rdi)
-; ANY-NEXT: movq %rax, (%rdi)
-; ANY-NEXT: retq
+; SSE2-LABEL: memset_32_nonzero_bytes:
+; SSE2: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
+; SSE2-NEXT: movq %rax, 24(%rdi)
+; SSE2-NEXT: movq %rax, 16(%rdi)
+; SSE2-NEXT: movq %rax, 8(%rdi)
+; SSE2-NEXT: movq %rax, (%rdi)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: memset_32_nonzero_bytes:
+; AVX1: vmovaps {{.*#+}} ymm0 = [1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13]
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_32_nonzero_bytes:
+; AVX2: vbroadcastss {{.*}}(%rip), %ymm0
+; AVX2-NEXT: vmovups %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1)
   ret void
 }

 define void @memset_64_nonzero_bytes(i8* %x) {
-; ANY-LABEL: memset_64_nonzero_bytes:
-; ANY: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
-; ANY-NEXT: movq %rax, 56(%rdi)
-; ANY-NEXT: movq %rax, 48(%rdi)
-; ANY-NEXT: movq %rax, 40(%rdi)
-; ANY-NEXT: movq %rax, 32(%rdi)
-; ANY-NEXT: movq %rax, 24(%rdi)
-; ANY-NEXT: movq %rax, 16(%rdi)
-; ANY-NEXT: movq %rax, 8(%rdi)
-; ANY-NEXT: movq %rax, (%rdi)
-; ANY-NEXT: retq
+; SSE2-LABEL: memset_64_nonzero_bytes:
+; SSE2: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
+; SSE2-NEXT: movq %rax, 56(%rdi)
+; SSE2-NEXT: movq %rax, 48(%rdi)
+; SSE2-NEXT: movq %rax, 40(%rdi)
+; SSE2-NEXT: movq %rax, 32(%rdi)
+; SSE2-NEXT: movq %rax, 24(%rdi)
+; SSE2-NEXT: movq %rax, 16(%rdi)
+; SSE2-NEXT: movq %rax, 8(%rdi)
+; SSE2-NEXT: movq %rax, (%rdi)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: memset_64_nonzero_bytes:
+; AVX1: vmovaps {{.*#+}} ymm0 = [1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13]
+; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_64_nonzero_bytes:
+; AVX2: vbroadcastss {{.*}}(%rip), %ymm0
+; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovups %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1)
   ret void
 }

 define void @memset_128_nonzero_bytes(i8* %x) {
-; ANY-LABEL: memset_128_nonzero_bytes:
-; ANY: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
-; ANY-NEXT: movq %rax, 120(%rdi)
-; ANY-NEXT: movq %rax, 112(%rdi)
-; ANY-NEXT: movq %rax, 104(%rdi)
-; ANY-NEXT: movq %rax, 96(%rdi)
-; ANY-NEXT: movq %rax, 88(%rdi)
-; ANY-NEXT: movq %rax, 80(%rdi)
-; ANY-NEXT: movq %rax, 72(%rdi)
-; ANY-NEXT: movq %rax, 64(%rdi)
-; ANY-NEXT: movq %rax, 56(%rdi)
-; ANY-NEXT: movq %rax, 48(%rdi)
-; ANY-NEXT: movq %rax, 40(%rdi)
-; ANY-NEXT: movq %rax, 32(%rdi)
-; ANY-NEXT: movq %rax, 24(%rdi)
-; ANY-NEXT: movq %rax, 16(%rdi)
-; ANY-NEXT: movq %rax, 8(%rdi)
-; ANY-NEXT: movq %rax, (%rdi)
-; ANY-NEXT: retq
+; SSE2-LABEL: memset_128_nonzero_bytes:
+; SSE2: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
+; SSE2-NEXT: movq %rax, 120(%rdi)
+; SSE2-NEXT: movq %rax, 112(%rdi)
+; SSE2-NEXT: movq %rax, 104(%rdi)
+; SSE2-NEXT: movq %rax, 96(%rdi)
+; SSE2-NEXT: movq %rax, 88(%rdi)
+; SSE2-NEXT: movq %rax, 80(%rdi)
+; SSE2-NEXT: movq %rax, 72(%rdi)
+; SSE2-NEXT: movq %rax, 64(%rdi)
+; SSE2-NEXT: movq %rax, 56(%rdi)
+; SSE2-NEXT: movq %rax, 48(%rdi)
+; SSE2-NEXT: movq %rax, 40(%rdi)
+; SSE2-NEXT: movq %rax, 32(%rdi)
+; SSE2-NEXT: movq %rax, 24(%rdi)
+; SSE2-NEXT: movq %rax, 16(%rdi)
+; SSE2-NEXT: movq %rax, 8(%rdi)
+; SSE2-NEXT: movq %rax, (%rdi)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: memset_128_nonzero_bytes:
+; AVX1: vmovaps {{.*#+}} ymm0 = [1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13]
+; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_128_nonzero_bytes:
+; AVX2: vbroadcastss {{.*}}(%rip), %ymm0
+; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX2-NEXT: vmovups %ymm0, 64(%rdi)
+; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovups %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1)
   ret void
 }

 define void @memset_256_nonzero_bytes(i8* %x) {
-; ANY-LABEL: memset_256_nonzero_bytes:
-; ANY: pushq %rax
-; ANY-NEXT: .Ltmp0:
-; ANY-NEXT: .cfi_def_cfa_offset 16
-; ANY-NEXT: movl $42, %esi
-; ANY-NEXT: movl $256, %edx # imm = 0x100
-; ANY-NEXT: callq memset
-; ANY-NEXT: popq %rax
-; ANY-NEXT: retq
+; SSE2-LABEL: memset_256_nonzero_bytes:
+; SSE2: pushq %rax
+; SSE2-NEXT: .Ltmp0:
+; SSE2-NEXT: .cfi_def_cfa_offset 16
+; SSE2-NEXT: movl $42, %esi
+; SSE2-NEXT: movl $256, %edx # imm = 0x100
+; SSE2-NEXT: callq memset
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: memset_256_nonzero_bytes:
+; AVX1: vmovaps {{.*#+}} ymm0 = [1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13]
+; AVX1-NEXT: vmovups %ymm0, 224(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 192(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 160(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 128(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_256_nonzero_bytes:
+; AVX2: vbroadcastss {{.*}}(%rip), %ymm0
+; AVX2-NEXT: vmovups %ymm0, 224(%rdi)
+; AVX2-NEXT: vmovups %ymm0, 192(%rdi)
+; AVX2-NEXT: vmovups %ymm0, 160(%rdi)
+; AVX2-NEXT: vmovups %ymm0, 128(%rdi)
+; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX2-NEXT: vmovups %ymm0, 64(%rdi)
+; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovups %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1)
   ret void
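Why the SSE2 run still calls libc memset at 256 bytes while the AVX runs expand inline: on the slow-unaligned path the expansion uses 8-byte stores, and 256 bytes would take 32 of them, which exceeds the target's inline store budget (X86's MaxStoresPerMemset; the value 16 below is an assumption consistent with the 16-store memset_128 expansion above, not quoted from this commit). A back-of-the-envelope check:

#include <cstdio>
#include <initializer_list>

int main() {
  const unsigned MaxStoresPerMemset = 16; // assumed inline store budget
  const unsigned StoreSize = 8;           // movq stores on the slow-unaligned path
  for (unsigned Bytes : {128u, 256u}) {
    unsigned Stores = Bytes / StoreSize;
    printf("%u bytes -> %u stores: %s\n", Bytes, Stores,
           Stores <= MaxStoresPerMemset ? "expand inline" : "call memset");
  }
  return 0;
}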