[X86][SSE] Regenerated nontemporal vector store tests and added extra target types

Author: Simon Pilgrim
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@271654 91177308-0d34-0410-b5e6-96231b3b80d8
1 changed file with 689 additions and 175 deletions.
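For reference, every test in the diff below follows the same pattern: a vector store tagged with !nontemporal metadata, which llc is expected to lower to a non-temporal (MOVNT*) store. Here is a minimal standalone sketch of that pattern (hypothetical function name; the !1 node is defined elsewhere in the actual test file, outside the hunk shown here, and per the LLVM LangRef must be a single i32 value 1):

; nt-example.ll - minimal nontemporal vector store
define void @nt_store_example(<4 x float>* %dst, <4 x float> %v) {
  ; The !nontemporal hint tells the backend the data will not be reused soon,
  ; so it should bypass the cache (e.g. MOVNTPS on x86 SSE targets).
  store <4 x float> %v, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}
!1 = !{i32 1}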
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s -check-prefix=CHECK -check-prefix=VLX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=SSE --check-prefix=SSE4A
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=VLX
 
 ; Make sure that we generate non-temporal stores for the test cases below.
 ; We use xorps for zeroing, so domain information isn't available anymore.
 
 define void @test_zero_v4f32(<4 x float>* %dst) {
-; CHECK-LABEL: test_zero_v4f32:
-; SSE: movntps
-; AVX: vmovntps
-; AVX2: vmovntps
-; VLX: vmovntdq
+; SSE-LABEL: test_zero_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v4f32:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v4i32(<4 x i32>* %dst) {
-; CHECK-LABEL: test_zero_v4i32:
-; SSE: movntps
-; AVX: vmovntps
-; AVX2: vmovntps
-; VLX: vmovntdq
+; SSE-LABEL: test_zero_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v4i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
   store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v2f64(<2 x double>* %dst) {
-; CHECK-LABEL: test_zero_v2f64:
-; SSE: movntps
-; AVX: vmovntps
-; AVX2: vmovntps
-; VLX: vmovntdq
+; SSE-LABEL: test_zero_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v2f64:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v2i64(<2 x i64>* %dst) {
-; CHECK-LABEL: test_zero_v2i64:
-; SSE: movntps
-; AVX: vmovntps
-; AVX2: vmovntps
-; VLX: vmovntdq
+; SSE-LABEL: test_zero_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v2i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v8i16(<8 x i16>* %dst) {
-; CHECK-LABEL: test_zero_v8i16:
-; SSE: movntps
-; AVX: vmovntps
-; AVX2: vmovntps
-; VLX: vmovntdq
+; SSE-LABEL: test_zero_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v8i16:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v16i8(<16 x i8>* %dst) {
-; CHECK-LABEL: test_zero_v16i8:
-; SSE: movntps
-; AVX: vmovntps
-; AVX2: vmovntps
-; VLX: vmovntdq
+; SSE-LABEL: test_zero_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v16i8:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 ; And now YMM versions.
 
 define void @test_zero_v8f32(<8 x float>* %dst) {
-; CHECK-LABEL: test_zero_v8f32:
-; AVX: vmovntps %ymm
-; AVX2: vmovntps %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_zero_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v8f32:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v8i32(<8 x i32>* %dst) {
-; CHECK-LABEL: test_zero_v8i32:
-; AVX: vmovntps %ymm
-; AVX2: vmovntps %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_zero_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v8i32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v8i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v4f64(<4 x double>* %dst) {
-; CHECK-LABEL: test_zero_v4f64:
-; AVX: vmovntps %ymm
-; AVX2: vmovntps %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_zero_v4f64:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v4f64:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v4i64(<4 x i64>* %dst) {
-; CHECK-LABEL: test_zero_v4i64:
-; AVX: vmovntps %ymm
-; AVX2: vmovntps %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_zero_v4i64:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v4i64:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v4i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v16i16(<16 x i16>* %dst) {
-; CHECK-LABEL: test_zero_v16i16:
-; AVX: vmovntps %ymm
-; AVX2: vmovntps %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_zero_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v16i16:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v16i16:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v32i8(<32 x i8>* %dst) {
-; CHECK-LABEL: test_zero_v32i8:
-; AVX: vmovntps %ymm
-; AVX2: vmovntps %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_zero_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v32i8:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v32i8:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 ; Check that we also handle arguments. Here the type survives longer.
 
 define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
-; CHECK-LABEL: test_arg_v4f32:
-; SSE: movntps
-; AVX: vmovntps
-; AVX2: vmovntps
-; VLX: vmovntps
+; SSE-LABEL: test_arg_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v4f32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
+; VLX-NEXT: retq
   store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
-; CHECK-LABEL: test_arg_v4i32:
-; SSE: movntps
-; AVX: vmovntps
-; AVX2: vmovntps
-; VLX: vmovntdq
+; SSE-LABEL: test_arg_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v4i32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
-; CHECK-LABEL: test_arg_v2f64:
-; SSE: movntps
-; AVX: vmovntps
-; AVX2: vmovntps
-; VLX: vmovntpd
+; SSE-LABEL: test_arg_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v2f64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntpd %xmm0, (%rdi)
+; VLX-NEXT: retq
   store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
-; CHECK-LABEL: test_arg_v2i64:
-; SSE: movntps
-; AVX: vmovntps
-; AVX2: vmovntps
-; VLX: vmovntdq
+; SSE-LABEL: test_arg_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v2i64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
-; CHECK-LABEL: test_arg_v8i16:
-; SSE: movntps
-; AVX: vmovntps
-; AVX2: vmovntps
-; VLX: vmovntdq
+; SSE-LABEL: test_arg_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v8i16:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
-; CHECK-LABEL: test_arg_v16i8:
-; SSE: movntps
-; AVX: vmovntps
-; AVX2: vmovntps
-; VLX: vmovntdq
+; SSE-LABEL: test_arg_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v16i8:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 ; And now YMM versions.
 
 define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
-; CHECK-LABEL: test_arg_v8f32:
-; AVX: vmovntps %ymm
-; AVX2: vmovntps %ymm
-; VLX: vmovntps %ymm
+; SSE-LABEL: test_arg_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v8f32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: retq
   store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
-; CHECK-LABEL: test_arg_v8i32:
-; AVX: vmovntps %ymm
-; AVX2: vmovntps %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_arg_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v8i32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
-; CHECK-LABEL: test_arg_v4f64:
-; AVX: vmovntps %ymm
-; AVX2: vmovntps %ymm
-; VLX: vmovntpd %ymm
+; SSE-LABEL: test_arg_v4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v4f64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntpd %ymm0, (%rdi)
+; VLX-NEXT: retq
   store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
-; CHECK-LABEL: test_arg_v4i64:
-; AVX: vmovntps %ymm
-; AVX2: vmovntps %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_arg_v4i64:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v4i64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
-; CHECK-LABEL: test_arg_v16i16:
-; AVX: vmovntps %ymm
-; AVX2: vmovntps %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_arg_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v16i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v16i16:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
-; CHECK-LABEL: test_arg_v32i8:
-; AVX: vmovntps %ymm
-; AVX2: vmovntps %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_arg_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v32i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v32i8:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 ; We use an add to make the type survive all the way to the MOVNT.
 
 define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
-; CHECK-LABEL: test_op_v4f32:
-; SSE: movntps
-; AVX: vmovntps
-; AVX2: vmovntps
-; VLX: vmovntps
+; SSE-LABEL: test_op_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps %xmm1, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v4f32:
+; VLX: # BB#0:
+; VLX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
+; VLX-NEXT: retq
   %r = fadd <4 x float> %a, %b
   store <4 x float> %r, <4 x float>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
-; CHECK-LABEL: test_op_v4i32:
-; SSE: movntdq
-; AVX: vmovntdq
-; AVX2: vmovntdq
-; VLX: vmovntdq
+; SSE-LABEL: test_op_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v4i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   %r = add <4 x i32> %a, %b
   store <4 x i32> %r, <4 x i32>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst) {
-; CHECK-LABEL: test_op_v2f64:
-; SSE: movntpd
-; AVX: vmovntpd
-; AVX2: vmovntpd
-; VLX: vmovntpd
+; SSE-LABEL: test_op_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movntpd %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntpd %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v2f64:
+; VLX: # BB#0:
+; VLX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntpd %xmm0, (%rdi)
+; VLX-NEXT: retq
   %r = fadd <2 x double> %a, %b
   store <2 x double> %r, <2 x double>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
-; CHECK-LABEL: test_op_v2i64:
-; SSE: movntdq
-; AVX: vmovntdq
-; AVX2: vmovntdq
-; VLX: vmovntdq
+; SSE-LABEL: test_op_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: paddq %xmm1, %xmm0
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v2i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   %r = add <2 x i64> %a, %b
   store <2 x i64> %r, <2 x i64>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
-; CHECK-LABEL: test_op_v8i16:
-; SSE: movntdq
-; AVX: vmovntdq
-; AVX2: vmovntdq
-; VLX: vmovntdq
+; SSE-LABEL: test_op_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v8i16:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   %r = add <8 x i16> %a, %b
   store <8 x i16> %r, <8 x i16>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
-; CHECK-LABEL: test_op_v16i8:
-; SSE: movntdq
-; AVX: vmovntdq
-; AVX2: vmovntdq
-; VLX: vmovntdq
+; SSE-LABEL: test_op_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v16i8:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
   %r = add <16 x i8> %a, %b
   store <16 x i8> %r, <16 x i8>* %dst, align 16, !nontemporal !1
   ret void
 }
 
 ; And now YMM versions.
 
 define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
-; CHECK-LABEL: test_op_v8f32:
-; AVX: vmovntps %ymm
-; AVX2: vmovntps %ymm
-; VLX: vmovntps %ymm
+; SSE-LABEL: test_op_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps %xmm2, %xmm0
+; SSE-NEXT: addps %xmm3, %xmm1
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v8f32:
+; VLX: # BB#0:
+; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: retq
   %r = fadd <8 x float> %a, %b
   store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
-; CHECK-LABEL: test_op_v8i32:
-; AVX: vmovntps %ymm
-; AVX2: vmovntdq %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_op_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_op_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovntps %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_op_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; VLX-LABEL: test_op_v8i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   %r = add <8 x i32> %a, %b
   store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst) {
-; CHECK-LABEL: test_op_v4f64:
-; AVX: vmovntpd %ymm
-; AVX2: vmovntpd %ymm
-; VLX: vmovntpd %ymm
+; SSE-LABEL: test_op_v4f64:
+; SSE: # BB#0:
+; SSE-NEXT: addpd %xmm2, %xmm0
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: movntpd %xmm1, 16(%rdi)
+; SSE-NEXT: movntpd %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovntpd %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v4f64:
+; VLX: # BB#0:
+; VLX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntpd %ymm0, (%rdi)
+; VLX-NEXT: retq
   %r = fadd <4 x double> %a, %b
   store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
-; CHECK-LABEL: test_op_v4i64:
-; AVX: vmovntps %ymm
-; AVX2: vmovntdq %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_op_v4i64:
+; SSE: # BB#0:
+; SSE-NEXT: paddq %xmm2, %xmm0
+; SSE-NEXT: paddq %xmm3, %xmm1
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_op_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovntps %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_op_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; VLX-LABEL: test_op_v4i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   %r = add <4 x i64> %a, %b
   store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
-; CHECK-LABEL: test_op_v16i16:
-; AVX: vmovntps %ymm
-; AVX2: vmovntdq %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_op_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddw %xmm2, %xmm0
+; SSE-NEXT: paddw %xmm3, %xmm1
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_op_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovntps %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_op_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; VLX-LABEL: test_op_v16i16:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   %r = add <16 x i16> %a, %b
   store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
-; CHECK-LABEL: test_op_v32i8:
-; AVX: vmovntps %ymm
-; AVX2: vmovntdq %ymm
-; VLX: vmovntdq %ymm
+; SSE-LABEL: test_op_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddb %xmm2, %xmm0
+; SSE-NEXT: paddb %xmm3, %xmm1
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_op_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovntps %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_op_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; VLX-LABEL: test_op_v32i8:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
   %r = add <32 x i8> %a, %b
   store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
   ret void
 }
 
 ; could even scalarize to movnti when we have 1-alignment: nontemporal is
 ; probably always worth even some 20 instruction scalarization.
 define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
-; CHECK-LABEL: test_unaligned_v8f32:
-; SSE: movntps %xmm
-; SSE: movntps %xmm
-; AVX-NOT: movnt
-; AVX: vmovups %ymm
-; AVX2-NOT: movnt
-; AVX2: vmovups %ymm
-; VLX-NOT: movnt
-; VLX: vmovups %ymm
+; SSE-LABEL: test_unaligned_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps %xmm2, %xmm0
+; SSE-NEXT: addps %xmm3, %xmm1
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_unaligned_v8f32:
+; VLX: # BB#0:
+; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovups %ymm0, (%rdi)
+; VLX-NEXT: retq
   %r = fadd <8 x float> %a, %b
   store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
   ret void