llvm.org GIT mirror: llvm @ f7528fd

[X86] Add narrow vector test cases to vector-reduce* tests.

Add copies of the tests with -x86-experimental-vector-widening-legalization.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@348334 91177308-0d34-0410-b5e6-96231b3b80d8

Craig Topper, 1 year, 2 months ago
18 changed files with 20352 additions and 0 deletions.
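The effect of the flag is easiest to see in the narrow cases. Under the default legalization a narrow vector such as <2 x i32> has its elements promoted to i64, so the reduction is lowered through 64-bit adds; with -x86-experimental-vector-widening-legalization the vector is instead widened to <4 x i32> and the reduction stays on 32-bit adds. A minimal sketch of the v2i32 case (the IR and the assembly in the comments are lifted from the tests in this diff; the comments paraphrase the SSE check lines):

define i32 @test_v2i32(<2 x i32> %a0) {
; Default legalization (second changed file below): <2 x i32> is promoted
; to <2 x i64>, so the reduction uses a 64-bit add:
;   pshufd          # xmm1 = xmm0[2,3,0,1]
;   paddq %xmm0, %xmm1
;   movd  %xmm1, %eax
; With the widening flag (first changed file below): <2 x i32> is widened
; to <4 x i32> and a 32-bit add suffices:
;   pshufd          # xmm1 = xmm0[1,1,2,3]
;   paddd %xmm0, %xmm1
;   movd  %xmm1, %eax
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> %a0)
  ret i32 %1
}

declare i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32>)

The first of the changed files, the widened copy of the add-reduction test, follows.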
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
2 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
3 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
4 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
5 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
6 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
7
8 ;
9 ; vXi64
10 ;
11
12 define i64 @test_v2i64(<2 x i64> %a0) {
13 ; SSE-LABEL: test_v2i64:
14 ; SSE: # %bb.0:
15 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
16 ; SSE-NEXT: paddq %xmm0, %xmm1
17 ; SSE-NEXT: movq %xmm1, %rax
18 ; SSE-NEXT: retq
19 ;
20 ; AVX-LABEL: test_v2i64:
21 ; AVX: # %bb.0:
22 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
23 ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
24 ; AVX-NEXT: vmovq %xmm0, %rax
25 ; AVX-NEXT: retq
26 ;
27 ; AVX512-LABEL: test_v2i64:
28 ; AVX512: # %bb.0:
29 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
30 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
31 ; AVX512-NEXT: vmovq %xmm0, %rax
32 ; AVX512-NEXT: retq
33 %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %a0)
34 ret i64 %1
35 }
36
37 define i64 @test_v4i64(<4 x i64> %a0) {
38 ; SSE-LABEL: test_v4i64:
39 ; SSE: # %bb.0:
40 ; SSE-NEXT: paddq %xmm1, %xmm0
41 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
42 ; SSE-NEXT: paddq %xmm0, %xmm1
43 ; SSE-NEXT: movq %xmm1, %rax
44 ; SSE-NEXT: retq
45 ;
46 ; AVX1-LABEL: test_v4i64:
47 ; AVX1: # %bb.0:
48 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
49 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
50 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
51 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
52 ; AVX1-NEXT: vmovq %xmm0, %rax
53 ; AVX1-NEXT: vzeroupper
54 ; AVX1-NEXT: retq
55 ;
56 ; AVX2-LABEL: test_v4i64:
57 ; AVX2: # %bb.0:
58 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
59 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
60 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
61 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
62 ; AVX2-NEXT: vmovq %xmm0, %rax
63 ; AVX2-NEXT: vzeroupper
64 ; AVX2-NEXT: retq
65 ;
66 ; AVX512-LABEL: test_v4i64:
67 ; AVX512: # %bb.0:
68 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
69 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
70 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
71 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
72 ; AVX512-NEXT: vmovq %xmm0, %rax
73 ; AVX512-NEXT: vzeroupper
74 ; AVX512-NEXT: retq
75 %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> %a0)
76 ret i64 %1
77 }
78
79 define i64 @test_v8i64(<8 x i64> %a0) {
80 ; SSE-LABEL: test_v8i64:
81 ; SSE: # %bb.0:
82 ; SSE-NEXT: paddq %xmm3, %xmm1
83 ; SSE-NEXT: paddq %xmm2, %xmm1
84 ; SSE-NEXT: paddq %xmm0, %xmm1
85 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
86 ; SSE-NEXT: paddq %xmm1, %xmm0
87 ; SSE-NEXT: movq %xmm0, %rax
88 ; SSE-NEXT: retq
89 ;
90 ; AVX1-LABEL: test_v8i64:
91 ; AVX1: # %bb.0:
92 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
93 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
94 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
95 ; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
96 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
97 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
98 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
99 ; AVX1-NEXT: vmovq %xmm0, %rax
100 ; AVX1-NEXT: vzeroupper
101 ; AVX1-NEXT: retq
102 ;
103 ; AVX2-LABEL: test_v8i64:
104 ; AVX2: # %bb.0:
105 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
106 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
107 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
108 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
109 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
110 ; AVX2-NEXT: vmovq %xmm0, %rax
111 ; AVX2-NEXT: vzeroupper
112 ; AVX2-NEXT: retq
113 ;
114 ; AVX512-LABEL: test_v8i64:
115 ; AVX512: # %bb.0:
116 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
117 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
118 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
119 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
120 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
121 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
122 ; AVX512-NEXT: vmovq %xmm0, %rax
123 ; AVX512-NEXT: vzeroupper
124 ; AVX512-NEXT: retq
125 %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> %a0)
126 ret i64 %1
127 }
128
129 define i64 @test_v16i64(<16 x i64> %a0) {
130 ; SSE-LABEL: test_v16i64:
131 ; SSE: # %bb.0:
132 ; SSE-NEXT: paddq %xmm6, %xmm2
133 ; SSE-NEXT: paddq %xmm7, %xmm3
134 ; SSE-NEXT: paddq %xmm5, %xmm3
135 ; SSE-NEXT: paddq %xmm1, %xmm3
136 ; SSE-NEXT: paddq %xmm4, %xmm2
137 ; SSE-NEXT: paddq %xmm3, %xmm2
138 ; SSE-NEXT: paddq %xmm0, %xmm2
139 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
140 ; SSE-NEXT: paddq %xmm2, %xmm0
141 ; SSE-NEXT: movq %xmm0, %rax
142 ; SSE-NEXT: retq
143 ;
144 ; AVX1-LABEL: test_v16i64:
145 ; AVX1: # %bb.0:
146 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4
147 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
148 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
149 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
150 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
151 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
152 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
153 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
154 ; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
155 ; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
156 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
157 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
158 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
159 ; AVX1-NEXT: vmovq %xmm0, %rax
160 ; AVX1-NEXT: vzeroupper
161 ; AVX1-NEXT: retq
162 ;
163 ; AVX2-LABEL: test_v16i64:
164 ; AVX2: # %bb.0:
165 ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
166 ; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
167 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
168 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
169 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
170 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
171 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
172 ; AVX2-NEXT: vmovq %xmm0, %rax
173 ; AVX2-NEXT: vzeroupper
174 ; AVX2-NEXT: retq
175 ;
176 ; AVX512-LABEL: test_v16i64:
177 ; AVX512: # %bb.0:
178 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
179 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
180 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
181 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
182 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
183 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
184 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
185 ; AVX512-NEXT: vmovq %xmm0, %rax
186 ; AVX512-NEXT: vzeroupper
187 ; AVX512-NEXT: retq
188 %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> %a0)
189 ret i64 %1
190 }
191
192 ;
193 ; vXi32
194 ;
195
196 define i32 @test_v2i32(<2 x i32> %a0) {
197 ; SSE-LABEL: test_v2i32:
198 ; SSE: # %bb.0:
199 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
200 ; SSE-NEXT: paddd %xmm0, %xmm1
201 ; SSE-NEXT: movd %xmm1, %eax
202 ; SSE-NEXT: retq
203 ;
204 ; AVX-LABEL: test_v2i32:
205 ; AVX: # %bb.0:
206 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
207 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
208 ; AVX-NEXT: vmovd %xmm0, %eax
209 ; AVX-NEXT: retq
210 ;
211 ; AVX512-LABEL: test_v2i32:
212 ; AVX512: # %bb.0:
213 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
214 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
215 ; AVX512-NEXT: vmovd %xmm0, %eax
216 ; AVX512-NEXT: retq
217 %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> %a0)
218 ret i32 %1
219 }
220
221 define i32 @test_v4i32(<4 x i32> %a0) {
222 ; SSE-LABEL: test_v4i32:
223 ; SSE: # %bb.0:
224 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
225 ; SSE-NEXT: paddd %xmm0, %xmm1
226 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
227 ; SSE-NEXT: paddd %xmm1, %xmm0
228 ; SSE-NEXT: movd %xmm0, %eax
229 ; SSE-NEXT: retq
230 ;
231 ; AVX-LABEL: test_v4i32:
232 ; AVX: # %bb.0:
233 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
234 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
235 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
236 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
237 ; AVX-NEXT: vmovd %xmm0, %eax
238 ; AVX-NEXT: retq
239 ;
240 ; AVX512-LABEL: test_v4i32:
241 ; AVX512: # %bb.0:
242 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
243 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
244 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
245 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
246 ; AVX512-NEXT: vmovd %xmm0, %eax
247 ; AVX512-NEXT: retq
248 %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a0)
249 ret i32 %1
250 }
251
252 define i32 @test_v8i32(<8 x i32> %a0) {
253 ; SSE-LABEL: test_v8i32:
254 ; SSE: # %bb.0:
255 ; SSE-NEXT: paddd %xmm1, %xmm0
256 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
257 ; SSE-NEXT: paddd %xmm0, %xmm1
258 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
259 ; SSE-NEXT: paddd %xmm1, %xmm0
260 ; SSE-NEXT: movd %xmm0, %eax
261 ; SSE-NEXT: retq
262 ;
263 ; AVX1-LABEL: test_v8i32:
264 ; AVX1: # %bb.0:
265 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
266 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
267 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
268 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
269 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
270 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
271 ; AVX1-NEXT: vmovd %xmm0, %eax
272 ; AVX1-NEXT: vzeroupper
273 ; AVX1-NEXT: retq
274 ;
275 ; AVX2-LABEL: test_v8i32:
276 ; AVX2: # %bb.0:
277 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
278 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
279 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
280 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
281 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
282 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
283 ; AVX2-NEXT: vmovd %xmm0, %eax
284 ; AVX2-NEXT: vzeroupper
285 ; AVX2-NEXT: retq
286 ;
287 ; AVX512-LABEL: test_v8i32:
288 ; AVX512: # %bb.0:
289 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
290 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
291 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
292 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
293 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
294 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
295 ; AVX512-NEXT: vmovd %xmm0, %eax
296 ; AVX512-NEXT: vzeroupper
297 ; AVX512-NEXT: retq
298 %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %a0)
299 ret i32 %1
300 }
301
302 define i32 @test_v16i32(<16 x i32> %a0) {
303 ; SSE-LABEL: test_v16i32:
304 ; SSE: # %bb.0:
305 ; SSE-NEXT: paddd %xmm3, %xmm1
306 ; SSE-NEXT: paddd %xmm2, %xmm1
307 ; SSE-NEXT: paddd %xmm0, %xmm1
308 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
309 ; SSE-NEXT: paddd %xmm1, %xmm0
310 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
311 ; SSE-NEXT: paddd %xmm0, %xmm1
312 ; SSE-NEXT: movd %xmm1, %eax
313 ; SSE-NEXT: retq
314 ;
315 ; AVX1-LABEL: test_v16i32:
316 ; AVX1: # %bb.0:
317 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
318 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
319 ; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
320 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
321 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
322 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
323 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
324 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
325 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
326 ; AVX1-NEXT: vmovd %xmm0, %eax
327 ; AVX1-NEXT: vzeroupper
328 ; AVX1-NEXT: retq
329 ;
330 ; AVX2-LABEL: test_v16i32:
331 ; AVX2: # %bb.0:
332 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
333 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
334 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
335 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
336 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
337 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
338 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
339 ; AVX2-NEXT: vmovd %xmm0, %eax
340 ; AVX2-NEXT: vzeroupper
341 ; AVX2-NEXT: retq
342 ;
343 ; AVX512-LABEL: test_v16i32:
344 ; AVX512: # %bb.0:
345 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
346 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
347 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
348 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
349 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
350 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
351 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
352 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
353 ; AVX512-NEXT: vmovd %xmm0, %eax
354 ; AVX512-NEXT: vzeroupper
355 ; AVX512-NEXT: retq
356 %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> %a0)
357 ret i32 %1
358 }
359
360 define i32 @test_v32i32(<32 x i32> %a0) {
361 ; SSE-LABEL: test_v32i32:
362 ; SSE: # %bb.0:
363 ; SSE-NEXT: paddd %xmm6, %xmm2
364 ; SSE-NEXT: paddd %xmm7, %xmm3
365 ; SSE-NEXT: paddd %xmm5, %xmm3
366 ; SSE-NEXT: paddd %xmm1, %xmm3
367 ; SSE-NEXT: paddd %xmm4, %xmm2
368 ; SSE-NEXT: paddd %xmm3, %xmm2
369 ; SSE-NEXT: paddd %xmm0, %xmm2
370 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
371 ; SSE-NEXT: paddd %xmm2, %xmm0
372 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
373 ; SSE-NEXT: paddd %xmm0, %xmm1
374 ; SSE-NEXT: movd %xmm1, %eax
375 ; SSE-NEXT: retq
376 ;
377 ; AVX1-LABEL: test_v32i32:
378 ; AVX1: # %bb.0:
379 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm4
380 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
381 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
382 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
383 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
384 ; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
385 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
386 ; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
387 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
388 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
389 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
390 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
391 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
392 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
393 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
394 ; AVX1-NEXT: vmovd %xmm0, %eax
395 ; AVX1-NEXT: vzeroupper
396 ; AVX1-NEXT: retq
397 ;
398 ; AVX2-LABEL: test_v32i32:
399 ; AVX2: # %bb.0:
400 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
401 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
402 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
403 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
404 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
405 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
406 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
407 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
408 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
409 ; AVX2-NEXT: vmovd %xmm0, %eax
410 ; AVX2-NEXT: vzeroupper
411 ; AVX2-NEXT: retq
412 ;
413 ; AVX512-LABEL: test_v32i32:
414 ; AVX512: # %bb.0:
415 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
416 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
417 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
418 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
419 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
420 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
421 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
422 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
423 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
424 ; AVX512-NEXT: vmovd %xmm0, %eax
425 ; AVX512-NEXT: vzeroupper
426 ; AVX512-NEXT: retq
427 %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> %a0)
428 ret i32 %1
429 }
430
431 ;
432 ; vXi16
433 ;
434
435 define i16 @test_v2i16(<2 x i16> %a0) {
436 ; SSE-LABEL: test_v2i16:
437 ; SSE: # %bb.0:
438 ; SSE-NEXT: movdqa %xmm0, %xmm1
439 ; SSE-NEXT: psrld $16, %xmm1
440 ; SSE-NEXT: paddw %xmm0, %xmm1
441 ; SSE-NEXT: movd %xmm1, %eax
442 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
443 ; SSE-NEXT: retq
444 ;
445 ; AVX-LABEL: test_v2i16:
446 ; AVX: # %bb.0:
447 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
448 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
449 ; AVX-NEXT: vmovd %xmm0, %eax
450 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
451 ; AVX-NEXT: retq
452 ;
453 ; AVX512-LABEL: test_v2i16:
454 ; AVX512: # %bb.0:
455 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
456 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
457 ; AVX512-NEXT: vmovd %xmm0, %eax
458 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
459 ; AVX512-NEXT: retq
460 %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> %a0)
461 ret i16 %1
462 }
463
464 define i16 @test_v4i16(<4 x i16> %a0) {
465 ; SSE-LABEL: test_v4i16:
466 ; SSE: # %bb.0:
467 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
468 ; SSE-NEXT: paddw %xmm0, %xmm1
469 ; SSE-NEXT: movdqa %xmm1, %xmm0
470 ; SSE-NEXT: psrld $16, %xmm0
471 ; SSE-NEXT: paddw %xmm1, %xmm0
472 ; SSE-NEXT: movd %xmm0, %eax
473 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
474 ; SSE-NEXT: retq
475 ;
476 ; AVX-LABEL: test_v4i16:
477 ; AVX: # %bb.0:
478 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
479 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
480 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
481 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
482 ; AVX-NEXT: vmovd %xmm0, %eax
483 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
484 ; AVX-NEXT: retq
485 ;
486 ; AVX512-LABEL: test_v4i16:
487 ; AVX512: # %bb.0:
488 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
489 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
490 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
491 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
492 ; AVX512-NEXT: vmovd %xmm0, %eax
493 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
494 ; AVX512-NEXT: retq
495 %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> %a0)
496 ret i16 %1
497 }
498
499 define i16 @test_v8i16(<8 x i16> %a0) {
500 ; SSE-LABEL: test_v8i16:
501 ; SSE: # %bb.0:
502 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
503 ; SSE-NEXT: paddw %xmm0, %xmm1
504 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
505 ; SSE-NEXT: paddw %xmm1, %xmm0
506 ; SSE-NEXT: movdqa %xmm0, %xmm1
507 ; SSE-NEXT: psrld $16, %xmm1
508 ; SSE-NEXT: paddw %xmm0, %xmm1
509 ; SSE-NEXT: movd %xmm1, %eax
510 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
511 ; SSE-NEXT: retq
512 ;
513 ; AVX-LABEL: test_v8i16:
514 ; AVX: # %bb.0:
515 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
516 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
517 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
518 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
519 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
520 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
521 ; AVX-NEXT: vmovd %xmm0, %eax
522 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
523 ; AVX-NEXT: retq
524 ;
525 ; AVX512-LABEL: test_v8i16:
526 ; AVX512: # %bb.0:
527 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
528 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
529 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
530 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
531 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
532 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
533 ; AVX512-NEXT: vmovd %xmm0, %eax
534 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
535 ; AVX512-NEXT: retq
536 %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %a0)
537 ret i16 %1
538 }
539
540 define i16 @test_v16i16(<16 x i16> %a0) {
541 ; SSE-LABEL: test_v16i16:
542 ; SSE: # %bb.0:
543 ; SSE-NEXT: paddw %xmm1, %xmm0
544 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
545 ; SSE-NEXT: paddw %xmm0, %xmm1
546 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
547 ; SSE-NEXT: paddw %xmm1, %xmm0
548 ; SSE-NEXT: movdqa %xmm0, %xmm1
549 ; SSE-NEXT: psrld $16, %xmm1
550 ; SSE-NEXT: paddw %xmm0, %xmm1
551 ; SSE-NEXT: movd %xmm1, %eax
552 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
553 ; SSE-NEXT: retq
554 ;
555 ; AVX1-LABEL: test_v16i16:
556 ; AVX1: # %bb.0:
557 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
558 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
559 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
560 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
561 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
562 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
563 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
564 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
565 ; AVX1-NEXT: vmovd %xmm0, %eax
566 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
567 ; AVX1-NEXT: vzeroupper
568 ; AVX1-NEXT: retq
569 ;
570 ; AVX2-LABEL: test_v16i16:
571 ; AVX2: # %bb.0:
572 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
573 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
574 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
575 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
576 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
577 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
578 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
579 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
580 ; AVX2-NEXT: vmovd %xmm0, %eax
581 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
582 ; AVX2-NEXT: vzeroupper
583 ; AVX2-NEXT: retq
584 ;
585 ; AVX512-LABEL: test_v16i16:
586 ; AVX512: # %bb.0:
587 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
588 ; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0
589 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
590 ; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0
591 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
592 ; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0
593 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
594 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
595 ; AVX512-NEXT: vmovd %xmm0, %eax
596 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
597 ; AVX512-NEXT: vzeroupper
598 ; AVX512-NEXT: retq
599 %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %a0)
600 ret i16 %1
601 }
602
603 define i16 @test_v32i16(<32 x i16> %a0) {
604 ; SSE-LABEL: test_v32i16:
605 ; SSE: # %bb.0:
606 ; SSE-NEXT: paddw %xmm3, %xmm1
607 ; SSE-NEXT: paddw %xmm2, %xmm1
608 ; SSE-NEXT: paddw %xmm0, %xmm1
609 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
610 ; SSE-NEXT: paddw %xmm1, %xmm0
611 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
612 ; SSE-NEXT: paddw %xmm0, %xmm1
613 ; SSE-NEXT: movdqa %xmm1, %xmm0
614 ; SSE-NEXT: psrld $16, %xmm0
615 ; SSE-NEXT: paddw %xmm1, %xmm0
616 ; SSE-NEXT: movd %xmm0, %eax
617 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
618 ; SSE-NEXT: retq
619 ;
620 ; AVX1-LABEL: test_v32i16:
621 ; AVX1: # %bb.0:
622 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
623 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
624 ; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
625 ; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
626 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
627 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
628 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
629 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
630 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
631 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
632 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
633 ; AVX1-NEXT: vmovd %xmm0, %eax
634 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
635 ; AVX1-NEXT: vzeroupper
636 ; AVX1-NEXT: retq
637 ;
638 ; AVX2-LABEL: test_v32i16:
639 ; AVX2: # %bb.0:
640 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
641 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
642 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
643 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
644 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
645 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
646 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
647 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
648 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
649 ; AVX2-NEXT: vmovd %xmm0, %eax
650 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
651 ; AVX2-NEXT: vzeroupper
652 ; AVX2-NEXT: retq
653 ;
654 ; AVX512-LABEL: test_v32i16:
655 ; AVX512: # %bb.0:
656 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
657 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
658 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
659 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
660 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
661 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
662 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
663 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
664 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
665 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
666 ; AVX512-NEXT: vmovd %xmm0, %eax
667 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
668 ; AVX512-NEXT: vzeroupper
669 ; AVX512-NEXT: retq
670 %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> %a0)
671 ret i16 %1
672 }
673
674 define i16 @test_v64i16(<64 x i16> %a0) {
675 ; SSE-LABEL: test_v64i16:
676 ; SSE: # %bb.0:
677 ; SSE-NEXT: paddw %xmm6, %xmm2
678 ; SSE-NEXT: paddw %xmm7, %xmm3
679 ; SSE-NEXT: paddw %xmm5, %xmm3
680 ; SSE-NEXT: paddw %xmm1, %xmm3
681 ; SSE-NEXT: paddw %xmm4, %xmm2
682 ; SSE-NEXT: paddw %xmm3, %xmm2
683 ; SSE-NEXT: paddw %xmm0, %xmm2
684 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
685 ; SSE-NEXT: paddw %xmm2, %xmm0
686 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
687 ; SSE-NEXT: paddw %xmm0, %xmm1
688 ; SSE-NEXT: movdqa %xmm1, %xmm0
689 ; SSE-NEXT: psrld $16, %xmm0
690 ; SSE-NEXT: paddw %xmm1, %xmm0
691 ; SSE-NEXT: movd %xmm0, %eax
692 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
693 ; SSE-NEXT: retq
694 ;
695 ; AVX1-LABEL: test_v64i16:
696 ; AVX1: # %bb.0:
697 ; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm4
698 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
699 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
700 ; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
701 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
702 ; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
703 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
704 ; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
705 ; AVX1-NEXT: vpaddw %xmm4, %xmm2, %xmm2
706 ; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1
707 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
708 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
709 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
710 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
711 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
712 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
713 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
714 ; AVX1-NEXT: vmovd %xmm0, %eax
715 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
716 ; AVX1-NEXT: vzeroupper
717 ; AVX1-NEXT: retq
718 ;
719 ; AVX2-LABEL: test_v64i16:
720 ; AVX2: # %bb.0:
721 ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
722 ; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1
723 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
724 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
725 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
726 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
727 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
728 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
729 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
730 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
731 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
732 ; AVX2-NEXT: vmovd %xmm0, %eax
733 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
734 ; AVX2-NEXT: vzeroupper
735 ; AVX2-NEXT: retq
736 ;
737 ; AVX512-LABEL: test_v64i16:
738 ; AVX512: # %bb.0:
739 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
740 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
741 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
742 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
743 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
744 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
745 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
746 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
747 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
748 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
749 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
750 ; AVX512-NEXT: vmovd %xmm0, %eax
751 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
752 ; AVX512-NEXT: vzeroupper
753 ; AVX512-NEXT: retq
754 %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> %a0)
755 ret i16 %1
756 }
757
758 ;
759 ; vXi8
760 ;
761
762 define i8 @test_v2i8(<2 x i8> %a0) {
763 ; SSE2-LABEL: test_v2i8:
764 ; SSE2: # %bb.0:
765 ; SSE2-NEXT: movdqa %xmm0, %xmm1
766 ; SSE2-NEXT: psrlw $8, %xmm1
767 ; SSE2-NEXT: paddb %xmm0, %xmm1
768 ; SSE2-NEXT: movd %xmm1, %eax
769 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
770 ; SSE2-NEXT: retq
771 ;
772 ; SSE41-LABEL: test_v2i8:
773 ; SSE41: # %bb.0:
774 ; SSE41-NEXT: movdqa %xmm0, %xmm1
775 ; SSE41-NEXT: psrlw $8, %xmm1
776 ; SSE41-NEXT: paddb %xmm0, %xmm1
777 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
778 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
779 ; SSE41-NEXT: retq
780 ;
781 ; AVX-LABEL: test_v2i8:
782 ; AVX: # %bb.0:
783 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
784 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
785 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
786 ; AVX-NEXT: # kill: def $al killed $al killed $eax
787 ; AVX-NEXT: retq
788 ;
789 ; AVX512-LABEL: test_v2i8:
790 ; AVX512: # %bb.0:
791 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
792 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
793 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
794 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
795 ; AVX512-NEXT: retq
796 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> %a0)
797 ret i8 %1
798 }
799
800 define i8 @test_v4i8(<4 x i8> %a0) {
801 ; SSE2-LABEL: test_v4i8:
802 ; SSE2: # %bb.0:
803 ; SSE2-NEXT: movdqa %xmm0, %xmm1
804 ; SSE2-NEXT: psrld $16, %xmm1
805 ; SSE2-NEXT: paddb %xmm0, %xmm1
806 ; SSE2-NEXT: movdqa %xmm1, %xmm0
807 ; SSE2-NEXT: psrlw $8, %xmm0
808 ; SSE2-NEXT: paddb %xmm1, %xmm0
809 ; SSE2-NEXT: movd %xmm0, %eax
810 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
811 ; SSE2-NEXT: retq
812 ;
813 ; SSE41-LABEL: test_v4i8:
814 ; SSE41: # %bb.0:
815 ; SSE41-NEXT: movdqa %xmm0, %xmm1
816 ; SSE41-NEXT: psrld $16, %xmm1
817 ; SSE41-NEXT: paddb %xmm0, %xmm1
818 ; SSE41-NEXT: movdqa %xmm1, %xmm0
819 ; SSE41-NEXT: psrlw $8, %xmm0
820 ; SSE41-NEXT: paddb %xmm1, %xmm0
821 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
822 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
823 ; SSE41-NEXT: retq
824 ;
825 ; AVX-LABEL: test_v4i8:
826 ; AVX: # %bb.0:
827 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
828 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
829 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
830 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
831 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
832 ; AVX-NEXT: # kill: def $al killed $al killed $eax
833 ; AVX-NEXT: retq
834 ;
835 ; AVX512-LABEL: test_v4i8:
836 ; AVX512: # %bb.0:
837 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
838 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
839 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
840 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
841 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
842 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
843 ; AVX512-NEXT: retq
844 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> %a0)
845 ret i8 %1
846 }
847
848 define i8 @test_v8i8(<8 x i8> %a0) {
849 ; SSE2-LABEL: test_v8i8:
850 ; SSE2: # %bb.0:
851 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
852 ; SSE2-NEXT: paddb %xmm0, %xmm1
853 ; SSE2-NEXT: movdqa %xmm1, %xmm0
854 ; SSE2-NEXT: psrld $16, %xmm0
855 ; SSE2-NEXT: paddb %xmm1, %xmm0
856 ; SSE2-NEXT: movdqa %xmm0, %xmm1
857 ; SSE2-NEXT: psrlw $8, %xmm1
858 ; SSE2-NEXT: paddb %xmm0, %xmm1
859 ; SSE2-NEXT: movd %xmm1, %eax
860 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
861 ; SSE2-NEXT: retq
862 ;
863 ; SSE41-LABEL: test_v8i8:
864 ; SSE41: # %bb.0:
865 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
866 ; SSE41-NEXT: paddb %xmm0, %xmm1
867 ; SSE41-NEXT: movdqa %xmm1, %xmm0
868 ; SSE41-NEXT: psrld $16, %xmm0
869 ; SSE41-NEXT: paddb %xmm1, %xmm0
870 ; SSE41-NEXT: movdqa %xmm0, %xmm1
871 ; SSE41-NEXT: psrlw $8, %xmm1
872 ; SSE41-NEXT: paddb %xmm0, %xmm1
873 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
874 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
875 ; SSE41-NEXT: retq
876 ;
877 ; AVX-LABEL: test_v8i8:
878 ; AVX: # %bb.0:
879 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
880 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
881 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
882 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
883 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
884 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
885 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
886 ; AVX-NEXT: # kill: def $al killed $al killed $eax
887 ; AVX-NEXT: retq
888 ;
889 ; AVX512-LABEL: test_v8i8:
890 ; AVX512: # %bb.0:
891 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
892 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
893 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
894 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
895 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
896 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
897 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
898 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
899 ; AVX512-NEXT: retq
900 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> %a0)
901 ret i8 %1
902 }
903
904 define i8 @test_v16i8(<16 x i8> %a0) {
905 ; SSE2-LABEL: test_v16i8:
906 ; SSE2: # %bb.0:
907 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
908 ; SSE2-NEXT: paddb %xmm0, %xmm1
909 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
910 ; SSE2-NEXT: paddb %xmm1, %xmm0
911 ; SSE2-NEXT: movdqa %xmm0, %xmm1
912 ; SSE2-NEXT: psrld $16, %xmm1
913 ; SSE2-NEXT: paddb %xmm0, %xmm1
914 ; SSE2-NEXT: movdqa %xmm1, %xmm0
915 ; SSE2-NEXT: psrlw $8, %xmm0
916 ; SSE2-NEXT: paddb %xmm1, %xmm0
917 ; SSE2-NEXT: movd %xmm0, %eax
918 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
919 ; SSE2-NEXT: retq
920 ;
921 ; SSE41-LABEL: test_v16i8:
922 ; SSE41: # %bb.0:
923 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
924 ; SSE41-NEXT: paddb %xmm0, %xmm1
925 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
926 ; SSE41-NEXT: paddb %xmm1, %xmm0
927 ; SSE41-NEXT: movdqa %xmm0, %xmm1
928 ; SSE41-NEXT: psrld $16, %xmm1
929 ; SSE41-NEXT: paddb %xmm0, %xmm1
930 ; SSE41-NEXT: movdqa %xmm1, %xmm0
931 ; SSE41-NEXT: psrlw $8, %xmm0
932 ; SSE41-NEXT: paddb %xmm1, %xmm0
933 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
934 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
935 ; SSE41-NEXT: retq
936 ;
937 ; AVX-LABEL: test_v16i8:
938 ; AVX: # %bb.0:
939 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
940 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
941 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
942 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
943 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
944 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
945 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
946 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
947 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
948 ; AVX-NEXT: # kill: def $al killed $al killed $eax
949 ; AVX-NEXT: retq
950 ;
951 ; AVX512-LABEL: test_v16i8:
952 ; AVX512: # %bb.0:
953 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
954 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
955 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
956 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
957 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
958 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
959 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
960 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
961 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
962 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
963 ; AVX512-NEXT: retq
964 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %a0)
965 ret i8 %1
966 }
967
968 define i8 @test_v32i8(<32 x i8> %a0) {
969 ; SSE2-LABEL: test_v32i8:
970 ; SSE2: # %bb.0:
971 ; SSE2-NEXT: paddb %xmm1, %xmm0
972 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
973 ; SSE2-NEXT: paddb %xmm0, %xmm1
974 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
975 ; SSE2-NEXT: paddb %xmm1, %xmm0
976 ; SSE2-NEXT: movdqa %xmm0, %xmm1
977 ; SSE2-NEXT: psrld $16, %xmm1
978 ; SSE2-NEXT: paddb %xmm0, %xmm1
979 ; SSE2-NEXT: movdqa %xmm1, %xmm0
980 ; SSE2-NEXT: psrlw $8, %xmm0
981 ; SSE2-NEXT: paddb %xmm1, %xmm0
982 ; SSE2-NEXT: movd %xmm0, %eax
983 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
984 ; SSE2-NEXT: retq
985 ;
986 ; SSE41-LABEL: test_v32i8:
987 ; SSE41: # %bb.0:
988 ; SSE41-NEXT: paddb %xmm1, %xmm0
989 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
990 ; SSE41-NEXT: paddb %xmm0, %xmm1
991 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
992 ; SSE41-NEXT: paddb %xmm1, %xmm0
993 ; SSE41-NEXT: movdqa %xmm0, %xmm1
994 ; SSE41-NEXT: psrld $16, %xmm1
995 ; SSE41-NEXT: paddb %xmm0, %xmm1
996 ; SSE41-NEXT: movdqa %xmm1, %xmm0
997 ; SSE41-NEXT: psrlw $8, %xmm0
998 ; SSE41-NEXT: paddb %xmm1, %xmm0
999 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
1000 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1001 ; SSE41-NEXT: retq
1002 ;
1003 ; AVX1-LABEL: test_v32i8:
1004 ; AVX1: # %bb.0:
1005 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1006 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1007 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1008 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1009 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1010 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1011 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
1012 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1013 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
1014 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1015 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1016 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
1017 ; AVX1-NEXT: vzeroupper
1018 ; AVX1-NEXT: retq
1019 ;
1020 ; AVX2-LABEL: test_v32i8:
1021 ; AVX2: # %bb.0:
1022 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1023 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1024 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1025 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1026 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1027 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1028 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1029 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1030 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
1031 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1032 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1033 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
1034 ; AVX2-NEXT: vzeroupper
1035 ; AVX2-NEXT: retq
1036 ;
1037 ; AVX512-LABEL: test_v32i8:
1038 ; AVX512: # %bb.0:
1039 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1040 ; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1041 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1042 ; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1043 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1044 ; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1045 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1046 ; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1047 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1048 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1049 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1050 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1051 ; AVX512-NEXT: vzeroupper
1052 ; AVX512-NEXT: retq
1053 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> %a0)
1054 ret i8 %1
1055 }
1056
1057 define i8 @test_v64i8(<64 x i8> %a0) {
1058 ; SSE2-LABEL: test_v64i8:
1059 ; SSE2: # %bb.0:
1060 ; SSE2-NEXT: paddb %xmm3, %xmm1
1061 ; SSE2-NEXT: paddb %xmm2, %xmm1
1062 ; SSE2-NEXT: paddb %xmm0, %xmm1
1063 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1064 ; SSE2-NEXT: paddb %xmm1, %xmm0
1065 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1066 ; SSE2-NEXT: paddb %xmm0, %xmm1
1067 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1068 ; SSE2-NEXT: psrld $16, %xmm0
1069 ; SSE2-NEXT: paddb %xmm1, %xmm0
1070 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1071 ; SSE2-NEXT: psrlw $8, %xmm1
1072 ; SSE2-NEXT: paddb %xmm0, %xmm1
1073 ; SSE2-NEXT: movd %xmm1, %eax
1074 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1075 ; SSE2-NEXT: retq
1076 ;
1077 ; SSE41-LABEL: test_v64i8:
1078 ; SSE41: # %bb.0:
1079 ; SSE41-NEXT: paddb %xmm3, %xmm1
1080 ; SSE41-NEXT: paddb %xmm2, %xmm1
1081 ; SSE41-NEXT: paddb %xmm0, %xmm1
1082 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1083 ; SSE41-NEXT: paddb %xmm1, %xmm0
1084 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1085 ; SSE41-NEXT: paddb %xmm0, %xmm1
1086 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1087 ; SSE41-NEXT: psrld $16, %xmm0
1088 ; SSE41-NEXT: paddb %xmm1, %xmm0
1089 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1090 ; SSE41-NEXT: psrlw $8, %xmm1
1091 ; SSE41-NEXT: paddb %xmm0, %xmm1
1092 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
1093 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1094 ; SSE41-NEXT: retq
1095 ;
1096 ; AVX1-LABEL: test_v64i8:
1097 ; AVX1: # %bb.0:
1098 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1099 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1100 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
1101 ; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
1102 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1103 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1104 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1105 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1106 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1107 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
1108 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1109 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
1110 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1111 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1112 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
1113 ; AVX1-NEXT: vzeroupper
1114 ; AVX1-NEXT: retq
1115 ;
1116 ; AVX2-LABEL: test_v64i8:
1117 ; AVX2: # %bb.0:
1118 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1119 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1120 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1121 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1122 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1123 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1124 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1125 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1126 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1127 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
1128 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1129 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1130 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
1131 ; AVX2-NEXT: vzeroupper
1132 ; AVX2-NEXT: retq
1133 ;
1134 ; AVX512-LABEL: test_v64i8:
1135 ; AVX512: # %bb.0:
1136 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1137 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1138 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1139 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1140 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1141 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1142 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1143 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1144 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1145 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1146 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1147 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1148 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1149 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1150 ; AVX512-NEXT: vzeroupper
1151 ; AVX512-NEXT: retq
1152 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> %a0)
1153 ret i8 %1
1154 }
1155
1156 define i8 @test_v128i8(<128 x i8> %a0) {
1157 ; SSE2-LABEL: test_v128i8:
1158 ; SSE2: # %bb.0:
1159 ; SSE2-NEXT: paddb %xmm6, %xmm2
1160 ; SSE2-NEXT: paddb %xmm7, %xmm3
1161 ; SSE2-NEXT: paddb %xmm5, %xmm3
1162 ; SSE2-NEXT: paddb %xmm1, %xmm3
1163 ; SSE2-NEXT: paddb %xmm4, %xmm2
1164 ; SSE2-NEXT: paddb %xmm3, %xmm2
1165 ; SSE2-NEXT: paddb %xmm0, %xmm2
1166 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
1167 ; SSE2-NEXT: paddb %xmm2, %xmm0
1168 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1169 ; SSE2-NEXT: paddb %xmm0, %xmm1
1170 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1171 ; SSE2-NEXT: psrld $16, %xmm0
1172 ; SSE2-NEXT: paddb %xmm1, %xmm0
1173 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1174 ; SSE2-NEXT: psrlw $8, %xmm1
1175 ; SSE2-NEXT: paddb %xmm0, %xmm1
1176 ; SSE2-NEXT: movd %xmm1, %eax
1177 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1178 ; SSE2-NEXT: retq
1179 ;
1180 ; SSE41-LABEL: test_v128i8:
1181 ; SSE41: # %bb.0:
1182 ; SSE41-NEXT: paddb %xmm6, %xmm2
1183 ; SSE41-NEXT: paddb %xmm7, %xmm3
1184 ; SSE41-NEXT: paddb %xmm5, %xmm3
1185 ; SSE41-NEXT: paddb %xmm1, %xmm3
1186 ; SSE41-NEXT: paddb %xmm4, %xmm2
1187 ; SSE41-NEXT: paddb %xmm3, %xmm2
1188 ; SSE41-NEXT: paddb %xmm0, %xmm2
1189 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
1190 ; SSE41-NEXT: paddb %xmm2, %xmm0
1191 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1192 ; SSE41-NEXT: paddb %xmm0, %xmm1
1193 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1194 ; SSE41-NEXT: psrld $16, %xmm0
1195 ; SSE41-NEXT: paddb %xmm1, %xmm0
1196 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1197 ; SSE41-NEXT: psrlw $8, %xmm1
1198 ; SSE41-NEXT: paddb %xmm0, %xmm1
1199 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
1200 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1201 ; SSE41-NEXT: retq
1202 ;
1203 ; AVX1-LABEL: test_v128i8:
1204 ; AVX1: # %bb.0:
1205 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm4
1206 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1207 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1208 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
1209 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1210 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
1211 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1212 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
1213 ; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
1214 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
1215 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1216 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1217 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1218 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1219 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1220 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
1221 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1222 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
1223 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1224 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1225 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
1226 ; AVX1-NEXT: vzeroupper
1227 ; AVX1-NEXT: retq
1228 ;
1229 ; AVX2-LABEL: test_v128i8:
1230 ; AVX2: # %bb.0:
1231 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
1232 ; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
1233 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1234 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1235 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1236 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1237 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1238 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1239 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1240 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1241 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1242 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
1243 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1244 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1245 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
1246 ; AVX2-NEXT: vzeroupper
1247 ; AVX2-NEXT: retq
1248 ;
1249 ; AVX512-LABEL: test_v128i8:
1250 ; AVX512: # %bb.0:
1251 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1252 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1253 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1254 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1255 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1256 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1257 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1258 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1259 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1260 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1261 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1262 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1263 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1264 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1265 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1266 ; AVX512-NEXT: vzeroupper
1267 ; AVX512-NEXT: retq
1268 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> %a0)
1269 ret i8 %1
1270 }
1271
1272 declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
1273 declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
1274 declare i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64>)
1275 declare i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64>)
1276
1277 declare i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32>)
1278 declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
1279 declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
1280 declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
1281 declare i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32>)
1282
1283 declare i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16>)
1284 declare i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16>)
1285 declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
1286 declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
1287 declare i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16>)
1288 declare i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16>)
1289
1290 declare i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8>)
1291 declare i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8>)
1292 declare i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8>)
1293 declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
1294 declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>)
1295 declare i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8>)
1296 declare i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8>)
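(Second changed file: the hunks below add the same narrow test cases to the existing, non-widened add-reduction test. Context lines carry both the old and the new line number; the diff viewer's per-file header was not preserved in this extract.)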
192 192 ;
193 193 ; vXi32
194 194 ;
195
196 define i32 @test_v2i32(<2 x i32> %a0) {
197 ; SSE-LABEL: test_v2i32:
198 ; SSE: # %bb.0:
199 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
200 ; SSE-NEXT: paddq %xmm0, %xmm1
201 ; SSE-NEXT: movd %xmm1, %eax
202 ; SSE-NEXT: retq
203 ;
204 ; AVX-LABEL: test_v2i32:
205 ; AVX: # %bb.0:
206 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
207 ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
208 ; AVX-NEXT: vmovd %xmm0, %eax
209 ; AVX-NEXT: retq
210 ;
211 ; AVX512-LABEL: test_v2i32:
212 ; AVX512: # %bb.0:
213 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
214 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
215 ; AVX512-NEXT: vmovd %xmm0, %eax
216 ; AVX512-NEXT: retq
217 %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> %a0)
218 ret i32 %1
219 }
195 220
196 221 define i32 @test_v4i32(<4 x i32> %a0) {
197 222 ; SSE-LABEL: test_v4i32:
406 431 ;
407 432 ; vXi16
408 433 ;
434
435 define i16 @test_v2i16(<2 x i16> %a0) {
436 ; SSE-LABEL: test_v2i16:
437 ; SSE: # %bb.0:
438 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
439 ; SSE-NEXT: paddq %xmm0, %xmm1
440 ; SSE-NEXT: movd %xmm1, %eax
441 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
442 ; SSE-NEXT: retq
443 ;
444 ; AVX-LABEL: test_v2i16:
445 ; AVX: # %bb.0:
446 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
447 ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
448 ; AVX-NEXT: vmovd %xmm0, %eax
449 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
450 ; AVX-NEXT: retq
451 ;
452 ; AVX512-LABEL: test_v2i16:
453 ; AVX512: # %bb.0:
454 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
455 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
456 ; AVX512-NEXT: vmovd %xmm0, %eax
457 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
458 ; AVX512-NEXT: retq
459 %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> %a0)
460 ret i16 %1
461 }
462
463 define i16 @test_v4i16(<4 x i16> %a0) {
464 ; SSE-LABEL: test_v4i16:
465 ; SSE: # %bb.0:
466 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
467 ; SSE-NEXT: paddd %xmm0, %xmm1
468 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
469 ; SSE-NEXT: paddd %xmm1, %xmm0
470 ; SSE-NEXT: movd %xmm0, %eax
471 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
472 ; SSE-NEXT: retq
473 ;
474 ; AVX-LABEL: test_v4i16:
475 ; AVX: # %bb.0:
476 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
477 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
478 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
479 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
480 ; AVX-NEXT: vmovd %xmm0, %eax
481 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
482 ; AVX-NEXT: retq
483 ;
484 ; AVX512-LABEL: test_v4i16:
485 ; AVX512: # %bb.0:
486 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
487 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
488 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
489 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
490 ; AVX512-NEXT: vmovd %xmm0, %eax
491 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
492 ; AVX512-NEXT: retq
493 %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> %a0)
494 ret i16 %1
495 }
409 496
410 497 define i16 @test_v8i16(<8 x i16> %a0) {
411 498 ; SSE-LABEL: test_v8i16:
670 757 ; vXi8
671 758 ;
672 759
760 define i8 @test_v2i8(<2 x i8> %a0) {
761 ; SSE2-LABEL: test_v2i8:
762 ; SSE2: # %bb.0:
763 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
764 ; SSE2-NEXT: paddq %xmm0, %xmm1
765 ; SSE2-NEXT: movd %xmm1, %eax
766 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
767 ; SSE2-NEXT: retq
768 ;
769 ; SSE41-LABEL: test_v2i8:
770 ; SSE41: # %bb.0:
771 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
772 ; SSE41-NEXT: paddq %xmm0, %xmm1
773 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
774 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
775 ; SSE41-NEXT: retq
776 ;
777 ; AVX-LABEL: test_v2i8:
778 ; AVX: # %bb.0:
779 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
780 ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
781 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
782 ; AVX-NEXT: # kill: def $al killed $al killed $eax
783 ; AVX-NEXT: retq
784 ;
785 ; AVX512-LABEL: test_v2i8:
786 ; AVX512: # %bb.0:
787 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
788 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
789 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
790 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
791 ; AVX512-NEXT: retq
792 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> %a0)
793 ret i8 %1
794 }
795
796 define i8 @test_v4i8(<4 x i8> %a0) {
797 ; SSE2-LABEL: test_v4i8:
798 ; SSE2: # %bb.0:
799 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
800 ; SSE2-NEXT: paddd %xmm0, %xmm1
801 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
802 ; SSE2-NEXT: paddd %xmm1, %xmm0
803 ; SSE2-NEXT: movd %xmm0, %eax
804 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
805 ; SSE2-NEXT: retq
806 ;
807 ; SSE41-LABEL: test_v4i8:
808 ; SSE41: # %bb.0:
809 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
810 ; SSE41-NEXT: paddd %xmm0, %xmm1
811 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
812 ; SSE41-NEXT: paddd %xmm1, %xmm0
813 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
814 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
815 ; SSE41-NEXT: retq
816 ;
817 ; AVX-LABEL: test_v4i8:
818 ; AVX: # %bb.0:
819 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
820 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
821 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
822 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
823 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
824 ; AVX-NEXT: # kill: def $al killed $al killed $eax
825 ; AVX-NEXT: retq
826 ;
827 ; AVX512-LABEL: test_v4i8:
828 ; AVX512: # %bb.0:
829 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
830 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
831 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
832 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
833 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
834 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
835 ; AVX512-NEXT: retq
836 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> %a0)
837 ret i8 %1
838 }
839
840 define i8 @test_v8i8(<8 x i8> %a0) {
841 ; SSE2-LABEL: test_v8i8:
842 ; SSE2: # %bb.0:
843 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
844 ; SSE2-NEXT: paddw %xmm0, %xmm1
845 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
846 ; SSE2-NEXT: paddw %xmm1, %xmm0
847 ; SSE2-NEXT: movdqa %xmm0, %xmm1
848 ; SSE2-NEXT: psrld $16, %xmm1
849 ; SSE2-NEXT: paddw %xmm0, %xmm1
850 ; SSE2-NEXT: movd %xmm1, %eax
851 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
852 ; SSE2-NEXT: retq
853 ;
854 ; SSE41-LABEL: test_v8i8:
855 ; SSE41: # %bb.0:
856 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
857 ; SSE41-NEXT: paddw %xmm0, %xmm1
858 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
859 ; SSE41-NEXT: paddw %xmm1, %xmm0
860 ; SSE41-NEXT: movdqa %xmm0, %xmm1
861 ; SSE41-NEXT: psrld $16, %xmm1
862 ; SSE41-NEXT: paddw %xmm0, %xmm1
863 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
864 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
865 ; SSE41-NEXT: retq
866 ;
867 ; AVX-LABEL: test_v8i8:
868 ; AVX: # %bb.0:
869 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
870 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
871 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
872 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
873 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
874 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
875 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
876 ; AVX-NEXT: # kill: def $al killed $al killed $eax
877 ; AVX-NEXT: retq
878 ;
879 ; AVX512-LABEL: test_v8i8:
880 ; AVX512: # %bb.0:
881 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
882 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
883 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
884 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
885 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
886 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
887 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
888 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
889 ; AVX512-NEXT: retq
890 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> %a0)
891 ret i8 %1
892 }
893
673 894 define i8 @test_v16i8(<16 x i8> %a0) {
674 895 ; SSE2-LABEL: test_v16i8:
675 896 ; SSE2: # %bb.0:
1043 1264 declare i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64>)
1044 1265 declare i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64>)
1045 1266
1267 declare i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32>)
1046 1268 declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
1047 1269 declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
1048 1270 declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
1049 1271 declare i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32>)
1050 1272
1273 declare i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16>)
1274 declare i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16>)
1051 1275 declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
1052 1276 declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
1053 1277 declare i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16>)
1054 1278 declare i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16>)
1055 1279
1280 declare i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8>)
1281 declare i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8>)
1282 declare i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8>)
1056 1283 declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
1057 1284 declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>)
1058 1285 declare i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8>)
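; What follows is a new test file: the same narrow-vector coverage for the
; and reduction intrinsics, run with
; -x86-experimental-vector-widening-legalization.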
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
2 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
3 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
4 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
5 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
6 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
7
8 ;
9 ; vXi64
10 ;
11
12 define i64 @test_v2i64(<2 x i64> %a0) {
13 ; SSE-LABEL: test_v2i64:
14 ; SSE: # %bb.0:
15 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
16 ; SSE-NEXT: pand %xmm0, %xmm1
17 ; SSE-NEXT: movq %xmm1, %rax
18 ; SSE-NEXT: retq
19 ;
20 ; AVX-LABEL: test_v2i64:
21 ; AVX: # %bb.0:
22 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
23 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
24 ; AVX-NEXT: vmovq %xmm0, %rax
25 ; AVX-NEXT: retq
26 ;
27 ; AVX512-LABEL: test_v2i64:
28 ; AVX512: # %bb.0:
29 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
30 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
31 ; AVX512-NEXT: vmovq %xmm0, %rax
32 ; AVX512-NEXT: retq
33 %1 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> %a0)
34 ret i64 %1
35 }
36
37 define i64 @test_v4i64(<4 x i64> %a0) {
38 ; SSE-LABEL: test_v4i64:
39 ; SSE: # %bb.0:
40 ; SSE-NEXT: pand %xmm1, %xmm0
41 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
42 ; SSE-NEXT: pand %xmm0, %xmm1
43 ; SSE-NEXT: movq %xmm1, %rax
44 ; SSE-NEXT: retq
45 ;
46 ; AVX1-LABEL: test_v4i64:
47 ; AVX1: # %bb.0:
48 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
49 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
50 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
51 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
52 ; AVX1-NEXT: vmovq %xmm0, %rax
53 ; AVX1-NEXT: vzeroupper
54 ; AVX1-NEXT: retq
55 ;
56 ; AVX2-LABEL: test_v4i64:
57 ; AVX2: # %bb.0:
58 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
59 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
60 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
61 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
62 ; AVX2-NEXT: vmovq %xmm0, %rax
63 ; AVX2-NEXT: vzeroupper
64 ; AVX2-NEXT: retq
65 ;
66 ; AVX512-LABEL: test_v4i64:
67 ; AVX512: # %bb.0:
68 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
69 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
70 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
71 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
72 ; AVX512-NEXT: vmovq %xmm0, %rax
73 ; AVX512-NEXT: vzeroupper
74 ; AVX512-NEXT: retq
75 %1 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> %a0)
76 ret i64 %1
77 }
78
79 define i64 @test_v8i64(<8 x i64> %a0) {
80 ; SSE-LABEL: test_v8i64:
81 ; SSE: # %bb.0:
82 ; SSE-NEXT: pand %xmm3, %xmm1
83 ; SSE-NEXT: pand %xmm2, %xmm1
84 ; SSE-NEXT: pand %xmm0, %xmm1
85 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
86 ; SSE-NEXT: pand %xmm1, %xmm0
87 ; SSE-NEXT: movq %xmm0, %rax
88 ; SSE-NEXT: retq
89 ;
90 ; AVX1-LABEL: test_v8i64:
91 ; AVX1: # %bb.0:
92 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
93 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
94 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
95 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
96 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
97 ; AVX1-NEXT: vmovq %xmm0, %rax
98 ; AVX1-NEXT: vzeroupper
99 ; AVX1-NEXT: retq
100 ;
101 ; AVX2-LABEL: test_v8i64:
102 ; AVX2: # %bb.0:
103 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
104 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
105 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
106 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
107 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
108 ; AVX2-NEXT: vmovq %xmm0, %rax
109 ; AVX2-NEXT: vzeroupper
110 ; AVX2-NEXT: retq
111 ;
112 ; AVX512-LABEL: test_v8i64:
113 ; AVX512: # %bb.0:
114 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
115 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
116 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
117 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
119 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
120 ; AVX512-NEXT: vmovq %xmm0, %rax
121 ; AVX512-NEXT: vzeroupper
122 ; AVX512-NEXT: retq
123 %1 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> %a0)
124 ret i64 %1
125 }
126
127 define i64 @test_v16i64(<16 x i64> %a0) {
128 ; SSE-LABEL: test_v16i64:
129 ; SSE: # %bb.0:
130 ; SSE-NEXT: pand %xmm6, %xmm2
131 ; SSE-NEXT: pand %xmm7, %xmm3
132 ; SSE-NEXT: pand %xmm5, %xmm3
133 ; SSE-NEXT: pand %xmm1, %xmm3
134 ; SSE-NEXT: pand %xmm4, %xmm2
135 ; SSE-NEXT: pand %xmm3, %xmm2
136 ; SSE-NEXT: pand %xmm0, %xmm2
137 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
138 ; SSE-NEXT: pand %xmm2, %xmm0
139 ; SSE-NEXT: movq %xmm0, %rax
140 ; SSE-NEXT: retq
141 ;
142 ; AVX1-LABEL: test_v16i64:
143 ; AVX1: # %bb.0:
144 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
145 ; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
146 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
147 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
148 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
149 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
150 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
151 ; AVX1-NEXT: vmovq %xmm0, %rax
152 ; AVX1-NEXT: vzeroupper
153 ; AVX1-NEXT: retq
154 ;
155 ; AVX2-LABEL: test_v16i64:
156 ; AVX2: # %bb.0:
157 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
158 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
159 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
160 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
161 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
162 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
163 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
164 ; AVX2-NEXT: vmovq %xmm0, %rax
165 ; AVX2-NEXT: vzeroupper
166 ; AVX2-NEXT: retq
167 ;
168 ; AVX512-LABEL: test_v16i64:
169 ; AVX512: # %bb.0:
170 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
171 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
172 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
173 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
174 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
175 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
176 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
177 ; AVX512-NEXT: vmovq %xmm0, %rax
178 ; AVX512-NEXT: vzeroupper
179 ; AVX512-NEXT: retq
180 %1 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> %a0)
181 ret i64 %1
182 }
183
184 ;
185 ; vXi32
186 ;
187
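; Bitwise and has no carries to worry about, so every reduction step below
; is simply a pand of the shuffled (or shifted) high half onto the low
; half; with widening, <2 x i32> keeps its 32-bit lanes and a single
; pshufd+pand finishes the job.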
188 define i32 @test_v2i32(<2 x i32> %a0) {
189 ; SSE-LABEL: test_v2i32:
190 ; SSE: # %bb.0:
191 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
192 ; SSE-NEXT: pand %xmm0, %xmm1
193 ; SSE-NEXT: movd %xmm1, %eax
194 ; SSE-NEXT: retq
195 ;
196 ; AVX-LABEL: test_v2i32:
197 ; AVX: # %bb.0:
198 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
199 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
200 ; AVX-NEXT: vmovd %xmm0, %eax
201 ; AVX-NEXT: retq
202 ;
203 ; AVX512-LABEL: test_v2i32:
204 ; AVX512: # %bb.0:
205 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
206 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
207 ; AVX512-NEXT: vmovd %xmm0, %eax
208 ; AVX512-NEXT: retq
209 %1 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> %a0)
210 ret i32 %1
211 }
212
213 define i32 @test_v4i32(<4 x i32> %a0) {
214 ; SSE-LABEL: test_v4i32:
215 ; SSE: # %bb.0:
216 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
217 ; SSE-NEXT: pand %xmm0, %xmm1
218 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
219 ; SSE-NEXT: pand %xmm1, %xmm0
220 ; SSE-NEXT: movd %xmm0, %eax
221 ; SSE-NEXT: retq
222 ;
223 ; AVX-LABEL: test_v4i32:
224 ; AVX: # %bb.0:
225 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
226 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
227 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
228 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
229 ; AVX-NEXT: vmovd %xmm0, %eax
230 ; AVX-NEXT: retq
231 ;
232 ; AVX512-LABEL: test_v4i32:
233 ; AVX512: # %bb.0:
234 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
235 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
236 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
237 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
238 ; AVX512-NEXT: vmovd %xmm0, %eax
239 ; AVX512-NEXT: retq
240 %1 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> %a0)
241 ret i32 %1
242 }
243
244 define i32 @test_v8i32(<8 x i32> %a0) {
245 ; SSE-LABEL: test_v8i32:
246 ; SSE: # %bb.0:
247 ; SSE-NEXT: pand %xmm1, %xmm0
248 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
249 ; SSE-NEXT: pand %xmm0, %xmm1
250 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
251 ; SSE-NEXT: pand %xmm1, %xmm0
252 ; SSE-NEXT: movd %xmm0, %eax
253 ; SSE-NEXT: retq
254 ;
255 ; AVX1-LABEL: test_v8i32:
256 ; AVX1: # %bb.0:
257 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
258 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
259 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
260 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
261 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
262 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
263 ; AVX1-NEXT: vmovd %xmm0, %eax
264 ; AVX1-NEXT: vzeroupper
265 ; AVX1-NEXT: retq
266 ;
267 ; AVX2-LABEL: test_v8i32:
268 ; AVX2: # %bb.0:
269 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
270 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
271 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
272 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
273 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
274 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
275 ; AVX2-NEXT: vmovd %xmm0, %eax
276 ; AVX2-NEXT: vzeroupper
277 ; AVX2-NEXT: retq
278 ;
279 ; AVX512-LABEL: test_v8i32:
280 ; AVX512: # %bb.0:
281 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
282 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
283 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
284 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
285 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
286 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
287 ; AVX512-NEXT: vmovd %xmm0, %eax
288 ; AVX512-NEXT: vzeroupper
289 ; AVX512-NEXT: retq
290 %1 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> %a0)
291 ret i32 %1
292 }
293
294 define i32 @test_v16i32(<16 x i32> %a0) {
295 ; SSE-LABEL: test_v16i32:
296 ; SSE: # %bb.0:
297 ; SSE-NEXT: pand %xmm3, %xmm1
298 ; SSE-NEXT: pand %xmm2, %xmm1
299 ; SSE-NEXT: pand %xmm0, %xmm1
300 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
301 ; SSE-NEXT: pand %xmm1, %xmm0
302 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
303 ; SSE-NEXT: pand %xmm0, %xmm1
304 ; SSE-NEXT: movd %xmm1, %eax
305 ; SSE-NEXT: retq
306 ;
307 ; AVX1-LABEL: test_v16i32:
308 ; AVX1: # %bb.0:
309 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
310 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
311 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
312 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
313 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
314 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
315 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
316 ; AVX1-NEXT: vmovd %xmm0, %eax
317 ; AVX1-NEXT: vzeroupper
318 ; AVX1-NEXT: retq
319 ;
320 ; AVX2-LABEL: test_v16i32:
321 ; AVX2: # %bb.0:
322 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
323 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
324 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
325 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
326 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
327 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
328 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
329 ; AVX2-NEXT: vmovd %xmm0, %eax
330 ; AVX2-NEXT: vzeroupper
331 ; AVX2-NEXT: retq
332 ;
333 ; AVX512-LABEL: test_v16i32:
334 ; AVX512: # %bb.0:
335 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
336 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
337 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
338 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
340 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
341 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
342 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
343 ; AVX512-NEXT: vmovd %xmm0, %eax
344 ; AVX512-NEXT: vzeroupper
345 ; AVX512-NEXT: retq
346 %1 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> %a0)
347 ret i32 %1
348 }
349
350 define i32 @test_v32i32(<32 x i32> %a0) {
351 ; SSE-LABEL: test_v32i32:
352 ; SSE: # %bb.0:
353 ; SSE-NEXT: pand %xmm6, %xmm2
354 ; SSE-NEXT: pand %xmm7, %xmm3
355 ; SSE-NEXT: pand %xmm5, %xmm3
356 ; SSE-NEXT: pand %xmm1, %xmm3
357 ; SSE-NEXT: pand %xmm4, %xmm2
358 ; SSE-NEXT: pand %xmm3, %xmm2
359 ; SSE-NEXT: pand %xmm0, %xmm2
360 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
361 ; SSE-NEXT: pand %xmm2, %xmm0
362 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
363 ; SSE-NEXT: pand %xmm0, %xmm1
364 ; SSE-NEXT: movd %xmm1, %eax
365 ; SSE-NEXT: retq
366 ;
367 ; AVX1-LABEL: test_v32i32:
368 ; AVX1: # %bb.0:
369 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
370 ; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
371 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
372 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
373 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
374 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
375 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
376 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
377 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
378 ; AVX1-NEXT: vmovd %xmm0, %eax
379 ; AVX1-NEXT: vzeroupper
380 ; AVX1-NEXT: retq
381 ;
382 ; AVX2-LABEL: test_v32i32:
383 ; AVX2: # %bb.0:
384 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
385 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
386 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
387 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
388 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
389 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
390 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
391 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
392 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
393 ; AVX2-NEXT: vmovd %xmm0, %eax
394 ; AVX2-NEXT: vzeroupper
395 ; AVX2-NEXT: retq
396 ;
397 ; AVX512-LABEL: test_v32i32:
398 ; AVX512: # %bb.0:
399 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
400 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
401 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
402 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
403 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
406 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
407 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
408 ; AVX512-NEXT: vmovd %xmm0, %eax
409 ; AVX512-NEXT: vzeroupper
410 ; AVX512-NEXT: retq
411 %1 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> %a0)
412 ret i32 %1
413 }
414
415 ;
416 ; vXi16
417 ;
418
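; With widening the two i16 lanes stay packed in the low dword, so a
; single psrld $16 followed by pand folds lane 1 into lane 0.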
419 define i16 @test_v2i16(<2 x i16> %a0) {
420 ; SSE-LABEL: test_v2i16:
421 ; SSE: # %bb.0:
422 ; SSE-NEXT: movdqa %xmm0, %xmm1
423 ; SSE-NEXT: psrld $16, %xmm1
424 ; SSE-NEXT: pand %xmm0, %xmm1
425 ; SSE-NEXT: movd %xmm1, %eax
426 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
427 ; SSE-NEXT: retq
428 ;
429 ; AVX-LABEL: test_v2i16:
430 ; AVX: # %bb.0:
431 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
432 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
433 ; AVX-NEXT: vmovd %xmm0, %eax
434 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
435 ; AVX-NEXT: retq
436 ;
437 ; AVX512-LABEL: test_v2i16:
438 ; AVX512: # %bb.0:
439 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
440 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
441 ; AVX512-NEXT: vmovd %xmm0, %eax
442 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
443 ; AVX512-NEXT: retq
444 %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> %a0)
445 ret i16 %1
446 }
447
448 define i16 @test_v4i16(<4 x i16> %a0) {
449 ; SSE-LABEL: test_v4i16:
450 ; SSE: # %bb.0:
451 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
452 ; SSE-NEXT: pand %xmm0, %xmm1
453 ; SSE-NEXT: movdqa %xmm1, %xmm0
454 ; SSE-NEXT: psrld $16, %xmm0
455 ; SSE-NEXT: pand %xmm1, %xmm0
456 ; SSE-NEXT: movd %xmm0, %eax
457 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
458 ; SSE-NEXT: retq
459 ;
460 ; AVX-LABEL: test_v4i16:
461 ; AVX: # %bb.0:
462 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
463 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
464 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
465 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
466 ; AVX-NEXT: vmovd %xmm0, %eax
467 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
468 ; AVX-NEXT: retq
469 ;
470 ; AVX512-LABEL: test_v4i16:
471 ; AVX512: # %bb.0:
472 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
473 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
474 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
475 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
476 ; AVX512-NEXT: vmovd %xmm0, %eax
477 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
478 ; AVX512-NEXT: retq
479 %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> %a0)
480 ret i16 %1
481 }
482
483 define i16 @test_v8i16(<8 x i16> %a0) {
484 ; SSE-LABEL: test_v8i16:
485 ; SSE: # %bb.0:
486 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
487 ; SSE-NEXT: pand %xmm0, %xmm1
488 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
489 ; SSE-NEXT: pand %xmm1, %xmm0
490 ; SSE-NEXT: movdqa %xmm0, %xmm1
491 ; SSE-NEXT: psrld $16, %xmm1
492 ; SSE-NEXT: pand %xmm0, %xmm1
493 ; SSE-NEXT: movd %xmm1, %eax
494 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
495 ; SSE-NEXT: retq
496 ;
497 ; AVX-LABEL: test_v8i16:
498 ; AVX: # %bb.0:
499 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
500 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
501 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
502 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
503 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
504 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
505 ; AVX-NEXT: vmovd %xmm0, %eax
506 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
507 ; AVX-NEXT: retq
508 ;
509 ; AVX512-LABEL: test_v8i16:
510 ; AVX512: # %bb.0:
511 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
512 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
513 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
514 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
515 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
516 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
517 ; AVX512-NEXT: vmovd %xmm0, %eax
518 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
519 ; AVX512-NEXT: retq
520 %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> %a0)
521 ret i16 %1
522 }
523
524 define i16 @test_v16i16(<16 x i16> %a0) {
525 ; SSE-LABEL: test_v16i16:
526 ; SSE: # %bb.0:
527 ; SSE-NEXT: pand %xmm1, %xmm0
528 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
529 ; SSE-NEXT: pand %xmm0, %xmm1
530 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
531 ; SSE-NEXT: pand %xmm1, %xmm0
532 ; SSE-NEXT: movdqa %xmm0, %xmm1
533 ; SSE-NEXT: psrld $16, %xmm1
534 ; SSE-NEXT: pand %xmm0, %xmm1
535 ; SSE-NEXT: movd %xmm1, %eax
536 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
537 ; SSE-NEXT: retq
538 ;
539 ; AVX1-LABEL: test_v16i16:
540 ; AVX1: # %bb.0:
541 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
542 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
543 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
544 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
545 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
546 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
547 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
548 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
549 ; AVX1-NEXT: vmovd %xmm0, %eax
550 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
551 ; AVX1-NEXT: vzeroupper
552 ; AVX1-NEXT: retq
553 ;
554 ; AVX2-LABEL: test_v16i16:
555 ; AVX2: # %bb.0:
556 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
557 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
558 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
559 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
560 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
561 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
562 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
563 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
564 ; AVX2-NEXT: vmovd %xmm0, %eax
565 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
566 ; AVX2-NEXT: vzeroupper
567 ; AVX2-NEXT: retq
568 ;
569 ; AVX512-LABEL: test_v16i16:
570 ; AVX512: # %bb.0:
571 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
572 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
573 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
574 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
575 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
576 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
577 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
578 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
579 ; AVX512-NEXT: vmovd %xmm0, %eax
580 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
581 ; AVX512-NEXT: vzeroupper
582 ; AVX512-NEXT: retq
583 %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> %a0)
584 ret i16 %1
585 }
586
587 define i16 @test_v32i16(<32 x i16> %a0) {
588 ; SSE-LABEL: test_v32i16:
589 ; SSE: # %bb.0:
590 ; SSE-NEXT: pand %xmm3, %xmm1
591 ; SSE-NEXT: pand %xmm2, %xmm1
592 ; SSE-NEXT: pand %xmm0, %xmm1
593 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
594 ; SSE-NEXT: pand %xmm1, %xmm0
595 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
596 ; SSE-NEXT: pand %xmm0, %xmm1
597 ; SSE-NEXT: movdqa %xmm1, %xmm0
598 ; SSE-NEXT: psrld $16, %xmm0
599 ; SSE-NEXT: pand %xmm1, %xmm0
600 ; SSE-NEXT: movd %xmm0, %eax
601 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
602 ; SSE-NEXT: retq
603 ;
604 ; AVX1-LABEL: test_v32i16:
605 ; AVX1: # %bb.0:
606 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
607 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
608 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
609 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
610 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
611 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
612 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
613 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
614 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
615 ; AVX1-NEXT: vmovd %xmm0, %eax
616 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
617 ; AVX1-NEXT: vzeroupper
618 ; AVX1-NEXT: retq
619 ;
620 ; AVX2-LABEL: test_v32i16:
621 ; AVX2: # %bb.0:
622 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
623 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
624 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
625 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
626 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
627 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
628 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
629 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
630 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
631 ; AVX2-NEXT: vmovd %xmm0, %eax
632 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
633 ; AVX2-NEXT: vzeroupper
634 ; AVX2-NEXT: retq
635 ;
636 ; AVX512-LABEL: test_v32i16:
637 ; AVX512: # %bb.0:
638 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
639 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
640 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
641 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
642 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
643 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
644 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
645 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
646 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
647 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
648 ; AVX512-NEXT: vmovd %xmm0, %eax
649 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
650 ; AVX512-NEXT: vzeroupper
651 ; AVX512-NEXT: retq
652 %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> %a0)
653 ret i16 %1
654 }
655
656 define i16 @test_v64i16(<64 x i16> %a0) {
657 ; SSE-LABEL: test_v64i16:
658 ; SSE: # %bb.0:
659 ; SSE-NEXT: pand %xmm6, %xmm2
660 ; SSE-NEXT: pand %xmm7, %xmm3
661 ; SSE-NEXT: pand %xmm5, %xmm3
662 ; SSE-NEXT: pand %xmm1, %xmm3
663 ; SSE-NEXT: pand %xmm4, %xmm2
664 ; SSE-NEXT: pand %xmm3, %xmm2
665 ; SSE-NEXT: pand %xmm0, %xmm2
666 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
667 ; SSE-NEXT: pand %xmm2, %xmm0
668 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
669 ; SSE-NEXT: pand %xmm0, %xmm1
670 ; SSE-NEXT: movdqa %xmm1, %xmm0
671 ; SSE-NEXT: psrld $16, %xmm0
672 ; SSE-NEXT: pand %xmm1, %xmm0
673 ; SSE-NEXT: movd %xmm0, %eax
674 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
675 ; SSE-NEXT: retq
676 ;
677 ; AVX1-LABEL: test_v64i16:
678 ; AVX1: # %bb.0:
679 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
680 ; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
681 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
682 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
683 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
684 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
685 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
686 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
687 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
688 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
689 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
690 ; AVX1-NEXT: vmovd %xmm0, %eax
691 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
692 ; AVX1-NEXT: vzeroupper
693 ; AVX1-NEXT: retq
694 ;
695 ; AVX2-LABEL: test_v64i16:
696 ; AVX2: # %bb.0:
697 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
698 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
699 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
700 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
701 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
702 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
703 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
704 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
705 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
706 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
707 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
708 ; AVX2-NEXT: vmovd %xmm0, %eax
709 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
710 ; AVX2-NEXT: vzeroupper
711 ; AVX2-NEXT: retq
712 ;
713 ; AVX512-LABEL: test_v64i16:
714 ; AVX512: # %bb.0:
715 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
716 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
717 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
718 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
719 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
721 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
722 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
723 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
724 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
725 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
726 ; AVX512-NEXT: vmovd %xmm0, %eax
727 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
728 ; AVX512-NEXT: vzeroupper
729 ; AVX512-NEXT: retq
730 %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> %a0)
731 ret i16 %1
732 }
733
734 ;
735 ; vXi8
736 ;
737
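; Likewise for <2 x i8>: the pair sits in the low word, so one psrlw $8
; plus pand reduces it; SSE2 reads the result back with movd while SSE4.1
; and AVX use pextrb.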
738 define i8 @test_v2i8(<2 x i8> %a0) {
739 ; SSE2-LABEL: test_v2i8:
740 ; SSE2: # %bb.0:
741 ; SSE2-NEXT: movdqa %xmm0, %xmm1
742 ; SSE2-NEXT: psrlw $8, %xmm1
743 ; SSE2-NEXT: pand %xmm0, %xmm1
744 ; SSE2-NEXT: movd %xmm1, %eax
745 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
746 ; SSE2-NEXT: retq
747 ;
748 ; SSE41-LABEL: test_v2i8:
749 ; SSE41: # %bb.0:
750 ; SSE41-NEXT: movdqa %xmm0, %xmm1
751 ; SSE41-NEXT: psrlw $8, %xmm1
752 ; SSE41-NEXT: pand %xmm0, %xmm1
753 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
754 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
755 ; SSE41-NEXT: retq
756 ;
757 ; AVX-LABEL: test_v2i8:
758 ; AVX: # %bb.0:
759 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
760 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
761 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
762 ; AVX-NEXT: # kill: def $al killed $al killed $eax
763 ; AVX-NEXT: retq
764 ;
765 ; AVX512-LABEL: test_v2i8:
766 ; AVX512: # %bb.0:
767 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
768 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
769 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
770 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
771 ; AVX512-NEXT: retq
772 %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> %a0)
773 ret i8 %1
774 }
775
776 define i8 @test_v4i8(<4 x i8> %a0) {
777 ; SSE2-LABEL: test_v4i8:
778 ; SSE2: # %bb.0:
779 ; SSE2-NEXT: movdqa %xmm0, %xmm1
780 ; SSE2-NEXT: psrld $16, %xmm1
781 ; SSE2-NEXT: pand %xmm0, %xmm1
782 ; SSE2-NEXT: movdqa %xmm1, %xmm0
783 ; SSE2-NEXT: psrlw $8, %xmm0
784 ; SSE2-NEXT: pand %xmm1, %xmm0
785 ; SSE2-NEXT: movd %xmm0, %eax
786 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
787 ; SSE2-NEXT: retq
788 ;
789 ; SSE41-LABEL: test_v4i8:
790 ; SSE41: # %bb.0:
791 ; SSE41-NEXT: movdqa %xmm0, %xmm1
792 ; SSE41-NEXT: psrld $16, %xmm1
793 ; SSE41-NEXT: pand %xmm0, %xmm1
794 ; SSE41-NEXT: movdqa %xmm1, %xmm0
795 ; SSE41-NEXT: psrlw $8, %xmm0
796 ; SSE41-NEXT: pand %xmm1, %xmm0
797 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
798 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
799 ; SSE41-NEXT: retq
800 ;
801 ; AVX-LABEL: test_v4i8:
802 ; AVX: # %bb.0:
803 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
804 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
805 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
806 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
807 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
808 ; AVX-NEXT: # kill: def $al killed $al killed $eax
809 ; AVX-NEXT: retq
810 ;
811 ; AVX512-LABEL: test_v4i8:
812 ; AVX512: # %bb.0:
813 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
814 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
815 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
816 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
817 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
818 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
819 ; AVX512-NEXT: retq
820 %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> %a0)
821 ret i8 %1
822 }
823
824 define i8 @test_v8i8(<8 x i8> %a0) {
825 ; SSE2-LABEL: test_v8i8:
826 ; SSE2: # %bb.0:
827 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
828 ; SSE2-NEXT: pand %xmm0, %xmm1
829 ; SSE2-NEXT: movdqa %xmm1, %xmm0
830 ; SSE2-NEXT: psrld $16, %xmm0
831 ; SSE2-NEXT: pand %xmm1, %xmm0
832 ; SSE2-NEXT: movdqa %xmm0, %xmm1
833 ; SSE2-NEXT: psrlw $8, %xmm1
834 ; SSE2-NEXT: pand %xmm0, %xmm1
835 ; SSE2-NEXT: movd %xmm1, %eax
836 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
837 ; SSE2-NEXT: retq
838 ;
839 ; SSE41-LABEL: test_v8i8:
840 ; SSE41: # %bb.0:
841 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
842 ; SSE41-NEXT: pand %xmm0, %xmm1
843 ; SSE41-NEXT: movdqa %xmm1, %xmm0
844 ; SSE41-NEXT: psrld $16, %xmm0
845 ; SSE41-NEXT: pand %xmm1, %xmm0
846 ; SSE41-NEXT: movdqa %xmm0, %xmm1
847 ; SSE41-NEXT: psrlw $8, %xmm1
848 ; SSE41-NEXT: pand %xmm0, %xmm1
849 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
850 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
851 ; SSE41-NEXT: retq
852 ;
853 ; AVX-LABEL: test_v8i8:
854 ; AVX: # %bb.0:
855 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
856 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
857 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
858 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
859 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
860 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
861 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
862 ; AVX-NEXT: # kill: def $al killed $al killed $eax
863 ; AVX-NEXT: retq
864 ;
865 ; AVX512-LABEL: test_v8i8:
866 ; AVX512: # %bb.0:
867 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
868 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
869 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
870 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
871 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
872 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
873 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
874 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
875 ; AVX512-NEXT: retq
876 %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> %a0)
877 ret i8 %1
878 }
879
880 define i8 @test_v16i8(<16 x i8> %a0) {
881 ; SSE2-LABEL: test_v16i8:
882 ; SSE2: # %bb.0:
883 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
884 ; SSE2-NEXT: pand %xmm0, %xmm1
885 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
886 ; SSE2-NEXT: pand %xmm1, %xmm0
887 ; SSE2-NEXT: movdqa %xmm0, %xmm1
888 ; SSE2-NEXT: psrld $16, %xmm1
889 ; SSE2-NEXT: pand %xmm0, %xmm1
890 ; SSE2-NEXT: movdqa %xmm1, %xmm0
891 ; SSE2-NEXT: psrlw $8, %xmm0
892 ; SSE2-NEXT: pand %xmm1, %xmm0
893 ; SSE2-NEXT: movd %xmm0, %eax
894 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
895 ; SSE2-NEXT: retq
896 ;
897 ; SSE41-LABEL: test_v16i8:
898 ; SSE41: # %bb.0:
899 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
900 ; SSE41-NEXT: pand %xmm0, %xmm1
901 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
902 ; SSE41-NEXT: pand %xmm1, %xmm0
903 ; SSE41-NEXT: movdqa %xmm0, %xmm1
904 ; SSE41-NEXT: psrld $16, %xmm1
905 ; SSE41-NEXT: pand %xmm0, %xmm1
906 ; SSE41-NEXT: movdqa %xmm1, %xmm0
907 ; SSE41-NEXT: psrlw $8, %xmm0
908 ; SSE41-NEXT: pand %xmm1, %xmm0
909 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
910 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
911 ; SSE41-NEXT: retq
912 ;
913 ; AVX-LABEL: test_v16i8:
914 ; AVX: # %bb.0:
915 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
916 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
917 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
918 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
919 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
920 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
921 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
922 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
923 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
924 ; AVX-NEXT: # kill: def $al killed $al killed $eax
925 ; AVX-NEXT: retq
926 ;
927 ; AVX512-LABEL: test_v16i8:
928 ; AVX512: # %bb.0:
929 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
930 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
931 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
932 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
933 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
934 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
935 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
936 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
937 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
938 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
939 ; AVX512-NEXT: retq
940 %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> %a0)
941 ret i8 %1
942 }
943
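; From 256 bits up, the AVX1 sequences keep using ymm vandps even after
; extracting an xmm half; that is fine because only the low byte of lane 0
; is consumed, so whatever the upper lanes hold is a don't-care.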
944 define i8 @test_v32i8(<32 x i8> %a0) {
945 ; SSE2-LABEL: test_v32i8:
946 ; SSE2: # %bb.0:
947 ; SSE2-NEXT: pand %xmm1, %xmm0
948 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
949 ; SSE2-NEXT: pand %xmm0, %xmm1
950 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
951 ; SSE2-NEXT: pand %xmm1, %xmm0
952 ; SSE2-NEXT: movdqa %xmm0, %xmm1
953 ; SSE2-NEXT: psrld $16, %xmm1
954 ; SSE2-NEXT: pand %xmm0, %xmm1
955 ; SSE2-NEXT: movdqa %xmm1, %xmm0
956 ; SSE2-NEXT: psrlw $8, %xmm0
957 ; SSE2-NEXT: pand %xmm1, %xmm0
958 ; SSE2-NEXT: movd %xmm0, %eax
959 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
960 ; SSE2-NEXT: retq
961 ;
962 ; SSE41-LABEL: test_v32i8:
963 ; SSE41: # %bb.0:
964 ; SSE41-NEXT: pand %xmm1, %xmm0
965 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
966 ; SSE41-NEXT: pand %xmm0, %xmm1
967 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
968 ; SSE41-NEXT: pand %xmm1, %xmm0
969 ; SSE41-NEXT: movdqa %xmm0, %xmm1
970 ; SSE41-NEXT: psrld $16, %xmm1
971 ; SSE41-NEXT: pand %xmm0, %xmm1
972 ; SSE41-NEXT: movdqa %xmm1, %xmm0
973 ; SSE41-NEXT: psrlw $8, %xmm0
974 ; SSE41-NEXT: pand %xmm1, %xmm0
975 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
976 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
977 ; SSE41-NEXT: retq
978 ;
979 ; AVX1-LABEL: test_v32i8:
980 ; AVX1: # %bb.0:
981 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
982 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
983 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
984 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
985 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
986 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
987 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
988 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
989 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
990 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
991 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
992 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
993 ; AVX1-NEXT: vzeroupper
994 ; AVX1-NEXT: retq
995 ;
996 ; AVX2-LABEL: test_v32i8:
997 ; AVX2: # %bb.0:
998 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
999 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1000 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1001 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1002 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1003 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1004 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1005 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1006 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
1007 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
1008 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1009 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
1010 ; AVX2-NEXT: vzeroupper
1011 ; AVX2-NEXT: retq
1012 ;
1013 ; AVX512-LABEL: test_v32i8:
1014 ; AVX512: # %bb.0:
1015 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1016 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
1017 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1018 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
1019 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1020 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
1021 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1022 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
1023 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1024 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
1025 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1026 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1027 ; AVX512-NEXT: vzeroupper
1028 ; AVX512-NEXT: retq
1029 %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> %a0)
1030 ret i8 %1
1031 }
1032
1033 define i8 @test_v64i8(<64 x i8> %a0) {
1034 ; SSE2-LABEL: test_v64i8:
1035 ; SSE2: # %bb.0:
1036 ; SSE2-NEXT: pand %xmm3, %xmm1
1037 ; SSE2-NEXT: pand %xmm2, %xmm1
1038 ; SSE2-NEXT: pand %xmm0, %xmm1
1039 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1040 ; SSE2-NEXT: pand %xmm1, %xmm0
1041 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1042 ; SSE2-NEXT: pand %xmm0, %xmm1
1043 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1044 ; SSE2-NEXT: psrld $16, %xmm0
1045 ; SSE2-NEXT: pand %xmm1, %xmm0
1046 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1047 ; SSE2-NEXT: psrlw $8, %xmm1
1048 ; SSE2-NEXT: pand %xmm0, %xmm1
1049 ; SSE2-NEXT: movd %xmm1, %eax
1050 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1051 ; SSE2-NEXT: retq
1052 ;
1053 ; SSE41-LABEL: test_v64i8:
1054 ; SSE41: # %bb.0:
1055 ; SSE41-NEXT: pand %xmm3, %xmm1
1056 ; SSE41-NEXT: pand %xmm2, %xmm1
1057 ; SSE41-NEXT: pand %xmm0, %xmm1
1058 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1059 ; SSE41-NEXT: pand %xmm1, %xmm0
1060 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1061 ; SSE41-NEXT: pand %xmm0, %xmm1
1062 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1063 ; SSE41-NEXT: psrld $16, %xmm0
1064 ; SSE41-NEXT: pand %xmm1, %xmm0
1065 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1066 ; SSE41-NEXT: psrlw $8, %xmm1
1067 ; SSE41-NEXT: pand %xmm0, %xmm1
1068 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
1069 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1070 ; SSE41-NEXT: retq
1071 ;
1072 ; AVX1-LABEL: test_v64i8:
1073 ; AVX1: # %bb.0:
1074 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
1075 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1076 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
1077 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
1078 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
1079 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
1080 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
1081 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
1082 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
1083 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
1084 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1085 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1086 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
1087 ; AVX1-NEXT: vzeroupper
1088 ; AVX1-NEXT: retq
1089 ;
1090 ; AVX2-LABEL: test_v64i8:
1091 ; AVX2: # %bb.0:
1092 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1093 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1094 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1095 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1096 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1097 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1098 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1099 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1100 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1101 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
1102 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
1103 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1104 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
1105 ; AVX2-NEXT: vzeroupper
1106 ; AVX2-NEXT: retq
1107 ;
1108 ; AVX512-LABEL: test_v64i8:
1109 ; AVX512: # %bb.0:
1110 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1111 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1112 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1113 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1114 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1115 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1116 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1117 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1118 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1119 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1120 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1121 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
1122 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1123 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1124 ; AVX512-NEXT: vzeroupper
1125 ; AVX512-NEXT: retq
1126 %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> %a0)
1127 ret i8 %1
1128 }
1129
1130 define i8 @test_v128i8(<128 x i8> %a0) {
1131 ; SSE2-LABEL: test_v128i8:
1132 ; SSE2: # %bb.0:
1133 ; SSE2-NEXT: pand %xmm6, %xmm2
1134 ; SSE2-NEXT: pand %xmm7, %xmm3
1135 ; SSE2-NEXT: pand %xmm5, %xmm3
1136 ; SSE2-NEXT: pand %xmm1, %xmm3
1137 ; SSE2-NEXT: pand %xmm4, %xmm2
1138 ; SSE2-NEXT: pand %xmm3, %xmm2
1139 ; SSE2-NEXT: pand %xmm0, %xmm2
1140 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
1141 ; SSE2-NEXT: pand %xmm2, %xmm0
1142 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1143 ; SSE2-NEXT: pand %xmm0, %xmm1
1144 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1145 ; SSE2-NEXT: psrld $16, %xmm0
1146 ; SSE2-NEXT: pand %xmm1, %xmm0
1147 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1148 ; SSE2-NEXT: psrlw $8, %xmm1
1149 ; SSE2-NEXT: pand %xmm0, %xmm1
1150 ; SSE2-NEXT: movd %xmm1, %eax
1151 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1152 ; SSE2-NEXT: retq
1153 ;
1154 ; SSE41-LABEL: test_v128i8:
1155 ; SSE41: # %bb.0:
1156 ; SSE41-NEXT: pand %xmm6, %xmm2
1157 ; SSE41-NEXT: pand %xmm7, %xmm3
1158 ; SSE41-NEXT: pand %xmm5, %xmm3
1159 ; SSE41-NEXT: pand %xmm1, %xmm3
1160 ; SSE41-NEXT: pand %xmm4, %xmm2
1161 ; SSE41-NEXT: pand %xmm3, %xmm2
1162 ; SSE41-NEXT: pand %xmm0, %xmm2
1163 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
1164 ; SSE41-NEXT: pand %xmm2, %xmm0
1165 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1166 ; SSE41-NEXT: pand %xmm0, %xmm1
1167 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1168 ; SSE41-NEXT: psrld $16, %xmm0
1169 ; SSE41-NEXT: pand %xmm1, %xmm0
1170 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1171 ; SSE41-NEXT: psrlw $8, %xmm1
1172 ; SSE41-NEXT: pand %xmm0, %xmm1
1173 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
1174 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1175 ; SSE41-NEXT: retq
1176 ;
1177 ; AVX1-LABEL: test_v128i8:
1178 ; AVX1: # %bb.0:
1179 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
1180 ; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
1181 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
1182 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1183 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
1184 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
1185 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
1186 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
1187 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
1188 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
1189 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
1190 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
1191 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1192 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1193 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
1194 ; AVX1-NEXT: vzeroupper
1195 ; AVX1-NEXT: retq
1196 ;
1197 ; AVX2-LABEL: test_v128i8:
1198 ; AVX2: # %bb.0:
1199 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
1200 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
1201 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1202 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1203 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1204 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1205 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1206 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1207 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1208 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1209 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1210 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
1211 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
1212 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1213 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
1214 ; AVX2-NEXT: vzeroupper
1215 ; AVX2-NEXT: retq
1216 ;
1217 ; AVX512-LABEL: test_v128i8:
1218 ; AVX512: # %bb.0:
1219 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1220 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1221 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1222 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1223 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1224 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1225 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1226 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1227 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1228 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1229 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1230 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1231 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
1232 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1233 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1234 ; AVX512-NEXT: vzeroupper
1235 ; AVX512-NEXT: retq
1236 %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> %a0)
1237 ret i8 %1
1238 }
1239
1240 declare i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64>)
1241 declare i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64>)
1242 declare i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64>)
1243 declare i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64>)
1244
1245 declare i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32>)
1246 declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32>)
1247 declare i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32>)
1248 declare i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32>)
1249 declare i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32>)
1250
1251 declare i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16>)
1252 declare i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16>)
1253 declare i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16>)
1254 declare i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16>)
1255 declare i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16>)
1256 declare i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16>)
1257
1258 declare i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8>)
1259 declare i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8>)
1260 declare i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8>)
1261 declare i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8>)
1262 declare i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8>)
1263 declare i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8>)
1264 declare i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8>)
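; The hunk below adds the same narrow tests to the existing and-reduction
; file, where default legalization promotes the element types; compare
; test_v2i32 here, which reduces qword lanes with pshufd [2,3,0,1], with
; the [1,1,2,3] dword shuffle used in the widened copy above.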
185 185 ; vXi32
186 186 ;
187 187
188 define i32 @test_v2i32(<2 x i32> %a0) {
189 ; SSE-LABEL: test_v2i32:
190 ; SSE: # %bb.0:
191 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
192 ; SSE-NEXT: pand %xmm0, %xmm1
193 ; SSE-NEXT: movd %xmm1, %eax
194 ; SSE-NEXT: retq
195 ;
196 ; AVX-LABEL: test_v2i32:
197 ; AVX: # %bb.0:
198 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
199 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
200 ; AVX-NEXT: vmovd %xmm0, %eax
201 ; AVX-NEXT: retq
202 ;
203 ; AVX512-LABEL: test_v2i32:
204 ; AVX512: # %bb.0:
205 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
206 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
207 ; AVX512-NEXT: vmovd %xmm0, %eax
208 ; AVX512-NEXT: retq
209 %1 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> %a0)
210 ret i32 %1
211 }
212
188 213 define i32 @test_v4i32(<4 x i32> %a0) {
189 214 ; SSE-LABEL: test_v4i32:
190 215 ; SSE: # %bb.0:
391 416 ; vXi16
392 417 ;
393 418
419 define i16 @test_v2i16(<2 x i16> %a0) {
420 ; SSE-LABEL: test_v2i16:
421 ; SSE: # %bb.0:
422 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
423 ; SSE-NEXT: pand %xmm0, %xmm1
424 ; SSE-NEXT: movd %xmm1, %eax
425 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
426 ; SSE-NEXT: retq
427 ;
428 ; AVX-LABEL: test_v2i16:
429 ; AVX: # %bb.0:
430 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
431 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
432 ; AVX-NEXT: vmovd %xmm0, %eax
433 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
434 ; AVX-NEXT: retq
435 ;
436 ; AVX512-LABEL: test_v2i16:
437 ; AVX512: # %bb.0:
438 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
439 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
440 ; AVX512-NEXT: vmovd %xmm0, %eax
441 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
442 ; AVX512-NEXT: retq
443 %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> %a0)
444 ret i16 %1
445 }
446
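; <4 x i16> promotes to <4 x i32>, so this reduction mirrors the i32 one:
; two dword shuffle+pand steps, then the low 16 bits are read out of eax.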
447 define i16 @test_v4i16(<4 x i16> %a0) {
448 ; SSE-LABEL: test_v4i16:
449 ; SSE: # %bb.0:
450 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
451 ; SSE-NEXT: pand %xmm0, %xmm1
452 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
453 ; SSE-NEXT: pand %xmm1, %xmm0
454 ; SSE-NEXT: movd %xmm0, %eax
455 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
456 ; SSE-NEXT: retq
457 ;
458 ; AVX-LABEL: test_v4i16:
459 ; AVX: # %bb.0:
460 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
461 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
462 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
463 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
464 ; AVX-NEXT: vmovd %xmm0, %eax
465 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
466 ; AVX-NEXT: retq
467 ;
468 ; AVX512-LABEL: test_v4i16:
469 ; AVX512: # %bb.0:
470 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
471 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
472 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
473 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
474 ; AVX512-NEXT: vmovd %xmm0, %eax
475 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
476 ; AVX512-NEXT: retq
477 %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> %a0)
478 ret i16 %1
479 }
480
394 481 define i16 @test_v8i16(<8 x i16> %a0) {
395 482 ; SSE-LABEL: test_v8i16:
396 483 ; SSE: # %bb.0:
645 732 ;
646 733 ; vXi8
647 734 ;
735
736 define i8 @test_v2i8(<2 x i8> %a0) {
737 ; SSE2-LABEL: test_v2i8:
738 ; SSE2: # %bb.0:
739 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
740 ; SSE2-NEXT: pand %xmm0, %xmm1
741 ; SSE2-NEXT: movd %xmm1, %eax
742 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
743 ; SSE2-NEXT: retq
744 ;
745 ; SSE41-LABEL: test_v2i8:
746 ; SSE41: # %bb.0:
747 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
748 ; SSE41-NEXT: pand %xmm0, %xmm1
749 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
750 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
751 ; SSE41-NEXT: retq
752 ;
753 ; AVX-LABEL: test_v2i8:
754 ; AVX: # %bb.0:
755 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
756 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
757 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
758 ; AVX-NEXT: # kill: def $al killed $al killed $eax
759 ; AVX-NEXT: retq
760 ;
761 ; AVX512-LABEL: test_v2i8:
762 ; AVX512: # %bb.0:
763 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
764 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
765 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
766 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
767 ; AVX512-NEXT: retq
768 %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> %a0)
769 ret i8 %1
770 }
771
772 define i8 @test_v4i8(<4 x i8> %a0) {
773 ; SSE2-LABEL: test_v4i8:
774 ; SSE2: # %bb.0:
775 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
776 ; SSE2-NEXT: pand %xmm0, %xmm1
777 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
778 ; SSE2-NEXT: pand %xmm1, %xmm0
779 ; SSE2-NEXT: movd %xmm0, %eax
780 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
781 ; SSE2-NEXT: retq
782 ;
783 ; SSE41-LABEL: test_v4i8:
784 ; SSE41: # %bb.0:
785 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
786 ; SSE41-NEXT: pand %xmm0, %xmm1
787 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
788 ; SSE41-NEXT: pand %xmm1, %xmm0
789 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
790 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
791 ; SSE41-NEXT: retq
792 ;
793 ; AVX-LABEL: test_v4i8:
794 ; AVX: # %bb.0:
795 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
796 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
797 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
798 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
799 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
800 ; AVX-NEXT: # kill: def $al killed $al killed $eax
801 ; AVX-NEXT: retq
802 ;
803 ; AVX512-LABEL: test_v4i8:
804 ; AVX512: # %bb.0:
805 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
806 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
807 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
808 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
809 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
810 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
811 ; AVX512-NEXT: retq
812 %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> %a0)
813 ret i8 %1
814 }
815
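; <8 x i8> promotes to <8 x i16>: two pshufd halvings, then psrld $16 for
; the last pair of words, with only the low byte of the result used.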
816 define i8 @test_v8i8(<8 x i8> %a0) {
817 ; SSE2-LABEL: test_v8i8:
818 ; SSE2: # %bb.0:
819 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
820 ; SSE2-NEXT: pand %xmm0, %xmm1
821 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
822 ; SSE2-NEXT: pand %xmm1, %xmm0
823 ; SSE2-NEXT: movdqa %xmm0, %xmm1
824 ; SSE2-NEXT: psrld $16, %xmm1
825 ; SSE2-NEXT: pand %xmm0, %xmm1
826 ; SSE2-NEXT: movd %xmm1, %eax
827 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
828 ; SSE2-NEXT: retq
829 ;
830 ; SSE41-LABEL: test_v8i8:
831 ; SSE41: # %bb.0:
832 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
833 ; SSE41-NEXT: pand %xmm0, %xmm1
834 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
835 ; SSE41-NEXT: pand %xmm1, %xmm0
836 ; SSE41-NEXT: movdqa %xmm0, %xmm1
837 ; SSE41-NEXT: psrld $16, %xmm1
838 ; SSE41-NEXT: pand %xmm0, %xmm1
839 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
840 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
841 ; SSE41-NEXT: retq
842 ;
843 ; AVX-LABEL: test_v8i8:
844 ; AVX: # %bb.0:
845 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
846 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
847 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
848 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
849 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
850 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
851 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
852 ; AVX-NEXT: # kill: def $al killed $al killed $eax
853 ; AVX-NEXT: retq
854 ;
855 ; AVX512-LABEL: test_v8i8:
856 ; AVX512: # %bb.0:
857 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
858 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
859 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
860 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
861 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
862 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
863 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
864 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
865 ; AVX512-NEXT: retq
866 %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> %a0)
867 ret i8 %1
868 }
648 869
649 870 define i8 @test_v16i8(<16 x i8> %a0) {
650 871 ; SSE2-LABEL: test_v16i8:
10111232 declare i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64>)
10121233 declare i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64>)
10131234
1235 declare i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32>)
10141236 declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32>)
10151237 declare i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32>)
10161238 declare i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32>)
10171239 declare i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32>)
10181240
1241 declare i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16>)
1242 declare i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16>)
10191243 declare i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16>)
10201244 declare i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16>)
10211245 declare i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16>)
10221246 declare i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16>)
10231247
1248 declare i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8>)
1249 declare i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8>)
1250 declare i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8>)
10241251 declare i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8>)
10251252 declare i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8>)
10261253 declare i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8>)
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
2 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
3 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
4 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
5 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
6 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512BWVL
7 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
8 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512DQVL
9
10 ;
11 ; vXi64
12 ;
13
14 define i64 @test_v2i64(<2 x i64> %a0) {
15 ; SSE-LABEL: test_v2i64:
16 ; SSE: # %bb.0:
17 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
18 ; SSE-NEXT: movdqa %xmm0, %xmm2
19 ; SSE-NEXT: psrlq $32, %xmm2
20 ; SSE-NEXT: pmuludq %xmm1, %xmm2
21 ; SSE-NEXT: movdqa %xmm0, %xmm3
22 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
23 ; SSE-NEXT: pmuludq %xmm0, %xmm3
24 ; SSE-NEXT: paddq %xmm2, %xmm3
25 ; SSE-NEXT: psllq $32, %xmm3
26 ; SSE-NEXT: pmuludq %xmm1, %xmm0
27 ; SSE-NEXT: paddq %xmm3, %xmm0
28 ; SSE-NEXT: movq %xmm0, %rax
29 ; SSE-NEXT: retq
30 ;
31 ; AVX-LABEL: test_v2i64:
32 ; AVX: # %bb.0:
33 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
34 ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
35 ; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
36 ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
37 ; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
38 ; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
39 ; AVX-NEXT: vpsllq $32, %xmm2, %xmm2
40 ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
41 ; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
42 ; AVX-NEXT: vmovq %xmm0, %rax
43 ; AVX-NEXT: retq
44 ;
45 ; AVX512BW-LABEL: test_v2i64:
46 ; AVX512BW: # %bb.0:
47 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
48 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
49 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
50 ; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
51 ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
52 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
53 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
54 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
55 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
56 ; AVX512BW-NEXT: vmovq %xmm0, %rax
57 ; AVX512BW-NEXT: retq
58 ;
59 ; AVX512BWVL-LABEL: test_v2i64:
60 ; AVX512BWVL: # %bb.0:
61 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
62 ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
63 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
64 ; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
65 ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
66 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
67 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
68 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
69 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
70 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax
71 ; AVX512BWVL-NEXT: retq
72 ;
73 ; AVX512DQ-LABEL: test_v2i64:
74 ; AVX512DQ: # %bb.0:
75 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
76 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
77 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
78 ; AVX512DQ-NEXT: vmovq %xmm0, %rax
79 ; AVX512DQ-NEXT: vzeroupper
80 ; AVX512DQ-NEXT: retq
81 ;
82 ; AVX512DQVL-LABEL: test_v2i64:
83 ; AVX512DQVL: # %bb.0:
84 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
85 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
86 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
87 ; AVX512DQVL-NEXT: retq
88 %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a0)
89 ret i64 %1
90 }
91
92 define i64 @test_v4i64(<4 x i64> %a0) {
93 ; SSE-LABEL: test_v4i64:
94 ; SSE: # %bb.0:
95 ; SSE-NEXT: movdqa %xmm0, %xmm2
96 ; SSE-NEXT: psrlq $32, %xmm2
97 ; SSE-NEXT: pmuludq %xmm1, %xmm2
98 ; SSE-NEXT: movdqa %xmm1, %xmm3
99 ; SSE-NEXT: psrlq $32, %xmm3
100 ; SSE-NEXT: pmuludq %xmm0, %xmm3
101 ; SSE-NEXT: paddq %xmm2, %xmm3
102 ; SSE-NEXT: psllq $32, %xmm3
103 ; SSE-NEXT: pmuludq %xmm1, %xmm0
104 ; SSE-NEXT: paddq %xmm3, %xmm0
105 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
106 ; SSE-NEXT: movdqa %xmm0, %xmm2
107 ; SSE-NEXT: psrlq $32, %xmm2
108 ; SSE-NEXT: pmuludq %xmm1, %xmm2
109 ; SSE-NEXT: movdqa %xmm0, %xmm3
110 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
111 ; SSE-NEXT: pmuludq %xmm0, %xmm3
112 ; SSE-NEXT: paddq %xmm2, %xmm3
113 ; SSE-NEXT: psllq $32, %xmm3
114 ; SSE-NEXT: pmuludq %xmm1, %xmm0
115 ; SSE-NEXT: paddq %xmm3, %xmm0
116 ; SSE-NEXT: movq %xmm0, %rax
117 ; SSE-NEXT: retq
118 ;
119 ; AVX1-LABEL: test_v4i64:
120 ; AVX1: # %bb.0:
121 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
122 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
123 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
124 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
125 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
126 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
127 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
128 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
129 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
130 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
131 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
132 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
133 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
134 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
135 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
136 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
137 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
138 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
139 ; AVX1-NEXT: vmovq %xmm0, %rax
140 ; AVX1-NEXT: vzeroupper
141 ; AVX1-NEXT: retq
142 ;
143 ; AVX2-LABEL: test_v4i64:
144 ; AVX2: # %bb.0:
145 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
146 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
147 ; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
148 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
149 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
150 ; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
151 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
152 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
153 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
154 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
155 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
156 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
157 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
158 ; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
159 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
160 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
161 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
162 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
163 ; AVX2-NEXT: vmovq %xmm0, %rax
164 ; AVX2-NEXT: vzeroupper
165 ; AVX2-NEXT: retq
166 ;
167 ; AVX512BW-LABEL: test_v4i64:
168 ; AVX512BW: # %bb.0:
169 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
170 ; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
171 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
172 ; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
173 ; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
174 ; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
175 ; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
176 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
177 ; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
178 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
179 ; AVX512BW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
180 ; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
181 ; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3
182 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
183 ; AVX512BW-NEXT: vpaddq %ymm3, %ymm2, %ymm2
184 ; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
185 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
186 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
187 ; AVX512BW-NEXT: vmovq %xmm0, %rax
188 ; AVX512BW-NEXT: vzeroupper
189 ; AVX512BW-NEXT: retq
190 ;
191 ; AVX512BWVL-LABEL: test_v4i64:
192 ; AVX512BWVL: # %bb.0:
193 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
194 ; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
195 ; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
196 ; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
197 ; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
198 ; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
199 ; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
200 ; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
201 ; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
202 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
203 ; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
204 ; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
205 ; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
206 ; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
207 ; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
208 ; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
209 ; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
210 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
211 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax
212 ; AVX512BWVL-NEXT: vzeroupper
213 ; AVX512BWVL-NEXT: retq
214 ;
215 ; AVX512DQ-LABEL: test_v4i64:
216 ; AVX512DQ: # %bb.0:
217 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
218 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
219 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
220 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
221 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
222 ; AVX512DQ-NEXT: vmovq %xmm0, %rax
223 ; AVX512DQ-NEXT: vzeroupper
224 ; AVX512DQ-NEXT: retq
225 ;
226 ; AVX512DQVL-LABEL: test_v4i64:
227 ; AVX512DQVL: # %bb.0:
228 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
229 ; AVX512DQVL-NEXT: vpmullq %ymm1, %ymm0, %ymm0
230 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
231 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
232 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
233 ; AVX512DQVL-NEXT: vzeroupper
234 ; AVX512DQVL-NEXT: retq
235 %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> %a0)
236 ret i64 %1
237 }
238
239 define i64 @test_v8i64(<8 x i64> %a0) {
240 ; SSE-LABEL: test_v8i64:
241 ; SSE: # %bb.0:
242 ; SSE-NEXT: movdqa %xmm1, %xmm4
243 ; SSE-NEXT: psrlq $32, %xmm4
244 ; SSE-NEXT: pmuludq %xmm3, %xmm4
245 ; SSE-NEXT: movdqa %xmm3, %xmm5
246 ; SSE-NEXT: psrlq $32, %xmm5
247 ; SSE-NEXT: pmuludq %xmm1, %xmm5
248 ; SSE-NEXT: paddq %xmm4, %xmm5
249 ; SSE-NEXT: psllq $32, %xmm5
250 ; SSE-NEXT: pmuludq %xmm3, %xmm1
251 ; SSE-NEXT: paddq %xmm5, %xmm1
252 ; SSE-NEXT: movdqa %xmm0, %xmm3
253 ; SSE-NEXT: psrlq $32, %xmm3
254 ; SSE-NEXT: pmuludq %xmm2, %xmm3
255 ; SSE-NEXT: movdqa %xmm2, %xmm4
256 ; SSE-NEXT: psrlq $32, %xmm4
257 ; SSE-NEXT: pmuludq %xmm0, %xmm4
258 ; SSE-NEXT: paddq %xmm3, %xmm4
259 ; SSE-NEXT: psllq $32, %xmm4
260 ; SSE-NEXT: pmuludq %xmm2, %xmm0
261 ; SSE-NEXT: paddq %xmm4, %xmm0
262 ; SSE-NEXT: movdqa %xmm0, %xmm2
263 ; SSE-NEXT: psrlq $32, %xmm2
264 ; SSE-NEXT: pmuludq %xmm1, %xmm2
265 ; SSE-NEXT: movdqa %xmm1, %xmm3
266 ; SSE-NEXT: psrlq $32, %xmm3
267 ; SSE-NEXT: pmuludq %xmm0, %xmm3
268 ; SSE-NEXT: paddq %xmm2, %xmm3
269 ; SSE-NEXT: psllq $32, %xmm3
270 ; SSE-NEXT: pmuludq %xmm1, %xmm0
271 ; SSE-NEXT: paddq %xmm3, %xmm0
272 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
273 ; SSE-NEXT: movdqa %xmm0, %xmm2
274 ; SSE-NEXT: psrlq $32, %xmm2
275 ; SSE-NEXT: pmuludq %xmm1, %xmm2
276 ; SSE-NEXT: movdqa %xmm0, %xmm3
277 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
278 ; SSE-NEXT: pmuludq %xmm0, %xmm3
279 ; SSE-NEXT: paddq %xmm2, %xmm3
280 ; SSE-NEXT: psllq $32, %xmm3
281 ; SSE-NEXT: pmuludq %xmm1, %xmm0
282 ; SSE-NEXT: paddq %xmm3, %xmm0
283 ; SSE-NEXT: movq %xmm0, %rax
284 ; SSE-NEXT: retq
285 ;
286 ; AVX1-LABEL: test_v8i64:
287 ; AVX1: # %bb.0:
288 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
289 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
290 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
291 ; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4
292 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
293 ; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5
294 ; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
295 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
296 ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
297 ; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
298 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
299 ; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3
300 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
301 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
302 ; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
303 ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
304 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
305 ; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
306 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
307 ; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
308 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm3
309 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
310 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
311 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
312 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
313 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
314 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
315 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
316 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
317 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
318 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
319 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
320 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
321 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
322 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
323 ; AVX1-NEXT: vmovq %xmm0, %rax
324 ; AVX1-NEXT: vzeroupper
325 ; AVX1-NEXT: retq
326 ;
327 ; AVX2-LABEL: test_v8i64:
328 ; AVX2: # %bb.0:
329 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
330 ; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
331 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
332 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
333 ; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
334 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
335 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
336 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
337 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
338 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
339 ; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
340 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
341 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
342 ; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
343 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
344 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
345 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
346 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
347 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
348 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
349 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
350 ; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
351 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
352 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
353 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
354 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
355 ; AVX2-NEXT: vmovq %xmm0, %rax
356 ; AVX2-NEXT: vzeroupper
357 ; AVX2-NEXT: retq
358 ;
359 ; AVX512BW-LABEL: test_v8i64:
360 ; AVX512BW: # %bb.0:
361 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
362 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
363 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
364 ; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
365 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
366 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
367 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
368 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
369 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
370 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
371 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
372 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
373 ; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
374 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
375 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
376 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
377 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
378 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
379 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
380 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
381 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
382 ; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
383 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
384 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
385 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
386 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
387 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
388 ; AVX512BW-NEXT: vmovq %xmm0, %rax
389 ; AVX512BW-NEXT: vzeroupper
390 ; AVX512BW-NEXT: retq
391 ;
392 ; AVX512BWVL-LABEL: test_v8i64:
393 ; AVX512BWVL: # %bb.0:
394 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
395 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
396 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
397 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
398 ; AVX512BWVL-NE