llvm.org GIT mirror — llvm / 5fe8b23
[NFC][CodeGen][X86][AArch64] Add tests for C++ std::midpoint() pattern (PR40965)

Tests only for integers, not floating point or pointers.
The scalar 8-bit case uses a branch instead of CMOV, because there is no 8-bit CMOV.
Vector tests are included for consistency, since the pattern can be vectorized.

https://bugs.llvm.org/show_bug.cgi?id=40965

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@355436 91177308-0d34-0410-b5e6-96231b3b80d8

Roman Lebedev, 7 months ago
5 changed file(s) with 10752 addition(s) and 0 deletion(s).
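For reference, the integer pattern these tests look for can be sketched in C++ roughly as follows. This is an illustration only, not libc++'s actual std::midpoint implementation, and the helper name midpoint_pattern is hypothetical; the comparison is signed or unsigned depending on the operand type, matching the icmp sgt / icmp ugt variants in the IR below.

    // Minimal sketch of the integer midpoint pattern under test:
    // pick the smaller/larger operand, halve the difference, and add it
    // back to the first operand with the sign of the comparison.
    #include <type_traits>

    template <typename T>
    T midpoint_pattern(T a1, T a2) {
      static_assert(std::is_integral<T>::value, "integer pattern only");
      using U = typename std::make_unsigned<T>::type;
      T t4 = (a1 > a2) ? T(-1) : T(1);   // %t4 = select -1 / 1
      T t5 = (a1 > a2) ? a2 : a1;        // %t5 = smaller operand
      T t6 = (a1 > a2) ? a1 : a2;        // %t6 = larger operand
      // %t7 = sub, %t8 = lshr ..., 1 (difference is non-negative, so a
      // logical shift of the unsigned difference is safe)
      U t8 = static_cast<U>(static_cast<U>(t6) - static_cast<U>(t5)) >> 1;
      return static_cast<T>(a1 + t4 * static_cast<T>(t8));  // a1 + sign*((max-min)>>1)
    }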
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
2
3 ; These test cases are inspired by C++2a std::midpoint().
4 ; See https://bugs.llvm.org/show_bug.cgi?id=40965
5
6 ; ---------------------------------------------------------------------------- ;
7 ; 32-bit width
8 ; ---------------------------------------------------------------------------- ;
9
10 ; Values come from regs
11
12 define i32 @scalar_i32_signed_reg_reg(i32 %a1, i32 %a2) nounwind {
13 ; CHECK-LABEL: scalar_i32_signed_reg_reg:
14 ; CHECK: // %bb.0:
15 ; CHECK-NEXT: cmp w0, w1
16 ; CHECK-NEXT: csel w9, w1, w0, gt
17 ; CHECK-NEXT: csel w10, w0, w1, gt
18 ; CHECK-NEXT: mov w8, #-1
19 ; CHECK-NEXT: sub w9, w10, w9
20 ; CHECK-NEXT: cneg w8, w8, le
21 ; CHECK-NEXT: lsr w9, w9, #1
22 ; CHECK-NEXT: madd w0, w9, w8, w0
23 ; CHECK-NEXT: ret
24 %t3 = icmp sgt i32 %a1, %a2 ; signed
25 %t4 = select i1 %t3, i32 -1, i32 1
26 %t5 = select i1 %t3, i32 %a2, i32 %a1
27 %t6 = select i1 %t3, i32 %a1, i32 %a2
28 %t7 = sub i32 %t6, %t5
29 %t8 = lshr i32 %t7, 1
30 %t9 = mul nsw i32 %t8, %t4 ; signed
31 %a10 = add nsw i32 %t9, %a1 ; signed
32 ret i32 %a10
33 }
34
35 define i32 @scalar_i32_unsigned_reg_reg(i32 %a1, i32 %a2) nounwind {
36 ; CHECK-LABEL: scalar_i32_unsigned_reg_reg:
37 ; CHECK: // %bb.0:
38 ; CHECK-NEXT: cmp w0, w1
39 ; CHECK-NEXT: csel w9, w1, w0, hi
40 ; CHECK-NEXT: csel w10, w0, w1, hi
41 ; CHECK-NEXT: mov w8, #-1
42 ; CHECK-NEXT: sub w9, w10, w9
43 ; CHECK-NEXT: cneg w8, w8, ls
44 ; CHECK-NEXT: lsr w9, w9, #1
45 ; CHECK-NEXT: madd w0, w9, w8, w0
46 ; CHECK-NEXT: ret
47 %t3 = icmp ugt i32 %a1, %a2
48 %t4 = select i1 %t3, i32 -1, i32 1
49 %t5 = select i1 %t3, i32 %a2, i32 %a1
50 %t6 = select i1 %t3, i32 %a1, i32 %a2
51 %t7 = sub i32 %t6, %t5
52 %t8 = lshr i32 %t7, 1
53 %t9 = mul i32 %t8, %t4
54 %a10 = add i32 %t9, %a1
55 ret i32 %a10
56 }
57
58 ; Values are loaded. Only check signed case.
59
60 define i32 @scalar_i32_signed_mem_reg(i32* %a1_addr, i32 %a2) nounwind {
61 ; CHECK-LABEL: scalar_i32_signed_mem_reg:
62 ; CHECK: // %bb.0:
63 ; CHECK-NEXT: ldr w8, [x0]
64 ; CHECK-NEXT: mov w9, #-1
65 ; CHECK-NEXT: cmp w8, w1
66 ; CHECK-NEXT: csel w10, w1, w8, gt
67 ; CHECK-NEXT: csel w11, w8, w1, gt
68 ; CHECK-NEXT: sub w10, w11, w10
69 ; CHECK-NEXT: cneg w9, w9, le
70 ; CHECK-NEXT: lsr w10, w10, #1
71 ; CHECK-NEXT: madd w0, w10, w9, w8
72 ; CHECK-NEXT: ret
73 %a1 = load i32, i32* %a1_addr
74 %t3 = icmp sgt i32 %a1, %a2 ; signed
75 %t4 = select i1 %t3, i32 -1, i32 1
76 %t5 = select i1 %t3, i32 %a2, i32 %a1
77 %t6 = select i1 %t3, i32 %a1, i32 %a2
78 %t7 = sub i32 %t6, %t5
79 %t8 = lshr i32 %t7, 1
80 %t9 = mul nsw i32 %t8, %t4 ; signed
81 %a10 = add nsw i32 %t9, %a1 ; signed
82 ret i32 %a10
83 }
84
85 define i32 @scalar_i32_signed_reg_mem(i32 %a1, i32* %a2_addr) nounwind {
86 ; CHECK-LABEL: scalar_i32_signed_reg_mem:
87 ; CHECK: // %bb.0:
88 ; CHECK-NEXT: ldr w8, [x1]
89 ; CHECK-NEXT: mov w9, #-1
90 ; CHECK-NEXT: cmp w0, w8
91 ; CHECK-NEXT: csel w10, w8, w0, gt
92 ; CHECK-NEXT: csel w8, w0, w8, gt
93 ; CHECK-NEXT: sub w8, w8, w10
94 ; CHECK-NEXT: cneg w9, w9, le
95 ; CHECK-NEXT: lsr w8, w8, #1
96 ; CHECK-NEXT: madd w0, w8, w9, w0
97 ; CHECK-NEXT: ret
98 %a2 = load i32, i32* %a2_addr
99 %t3 = icmp sgt i32 %a1, %a2 ; signed
100 %t4 = select i1 %t3, i32 -1, i32 1
101 %t5 = select i1 %t3, i32 %a2, i32 %a1
102 %t6 = select i1 %t3, i32 %a1, i32 %a2
103 %t7 = sub i32 %t6, %t5
104 %t8 = lshr i32 %t7, 1
105 %t9 = mul nsw i32 %t8, %t4 ; signed
106 %a10 = add nsw i32 %t9, %a1 ; signed
107 ret i32 %a10
108 }
109
110 define i32 @scalar_i32_signed_mem_mem(i32* %a1_addr, i32* %a2_addr) nounwind {
111 ; CHECK-LABEL: scalar_i32_signed_mem_mem:
112 ; CHECK: // %bb.0:
113 ; CHECK-NEXT: ldr w8, [x0]
114 ; CHECK-NEXT: ldr w9, [x1]
115 ; CHECK-NEXT: mov w10, #-1
116 ; CHECK-NEXT: cmp w8, w9
117 ; CHECK-NEXT: csel w11, w9, w8, gt
118 ; CHECK-NEXT: csel w9, w8, w9, gt
119 ; CHECK-NEXT: sub w9, w9, w11
120 ; CHECK-NEXT: cneg w10, w10, le
121 ; CHECK-NEXT: lsr w9, w9, #1
122 ; CHECK-NEXT: madd w0, w9, w10, w8
123 ; CHECK-NEXT: ret
124 %a1 = load i32, i32* %a1_addr
125 %a2 = load i32, i32* %a2_addr
126 %t3 = icmp sgt i32 %a1, %a2 ; signed
127 %t4 = select i1 %t3, i32 -1, i32 1
128 %t5 = select i1 %t3, i32 %a2, i32 %a1
129 %t6 = select i1 %t3, i32 %a1, i32 %a2
130 %t7 = sub i32 %t6, %t5
131 %t8 = lshr i32 %t7, 1
132 %t9 = mul nsw i32 %t8, %t4 ; signed
133 %a10 = add nsw i32 %t9, %a1 ; signed
134 ret i32 %a10
135 }
136
137 ; ---------------------------------------------------------------------------- ;
138 ; 64-bit width
139 ; ---------------------------------------------------------------------------- ;
140
141 ; Values come from regs
142
143 define i64 @scalar_i64_signed_reg_reg(i64 %a1, i64 %a2) nounwind {
144 ; CHECK-LABEL: scalar_i64_signed_reg_reg:
145 ; CHECK: // %bb.0:
146 ; CHECK-NEXT: cmp x0, x1
147 ; CHECK-NEXT: csel x9, x1, x0, gt
148 ; CHECK-NEXT: csel x10, x0, x1, gt
149 ; CHECK-NEXT: mov x8, #-1
150 ; CHECK-NEXT: sub x9, x10, x9
151 ; CHECK-NEXT: cneg x8, x8, le
152 ; CHECK-NEXT: lsr x9, x9, #1
153 ; CHECK-NEXT: madd x0, x9, x8, x0
154 ; CHECK-NEXT: ret
155 %t3 = icmp sgt i64 %a1, %a2 ; signed
156 %t4 = select i1 %t3, i64 -1, i64 1
157 %t5 = select i1 %t3, i64 %a2, i64 %a1
158 %t6 = select i1 %t3, i64 %a1, i64 %a2
159 %t7 = sub i64 %t6, %t5
160 %t8 = lshr i64 %t7, 1
161 %t9 = mul nsw i64 %t8, %t4 ; signed
162 %a10 = add nsw i64 %t9, %a1 ; signed
163 ret i64 %a10
164 }
165
166 define i64 @scalar_i64_unsigned_reg_reg(i64 %a1, i64 %a2) nounwind {
167 ; CHECK-LABEL: scalar_i64_unsigned_reg_reg:
168 ; CHECK: // %bb.0:
169 ; CHECK-NEXT: cmp x0, x1
170 ; CHECK-NEXT: csel x9, x1, x0, hi
171 ; CHECK-NEXT: csel x10, x0, x1, hi
172 ; CHECK-NEXT: mov x8, #-1
173 ; CHECK-NEXT: sub x9, x10, x9
174 ; CHECK-NEXT: cneg x8, x8, ls
175 ; CHECK-NEXT: lsr x9, x9, #1
176 ; CHECK-NEXT: madd x0, x9, x8, x0
177 ; CHECK-NEXT: ret
178 %t3 = icmp ugt i64 %a1, %a2
179 %t4 = select i1 %t3, i64 -1, i64 1
180 %t5 = select i1 %t3, i64 %a2, i64 %a1
181 %t6 = select i1 %t3, i64 %a1, i64 %a2
182 %t7 = sub i64 %t6, %t5
183 %t8 = lshr i64 %t7, 1
184 %t9 = mul i64 %t8, %t4
185 %a10 = add i64 %t9, %a1
186 ret i64 %a10
187 }
188
189 ; Values are loaded. Only check signed case.
190
191 define i64 @scalar_i64_signed_mem_reg(i64* %a1_addr, i64 %a2) nounwind {
192 ; CHECK-LABEL: scalar_i64_signed_mem_reg:
193 ; CHECK: // %bb.0:
194 ; CHECK-NEXT: ldr x8, [x0]
195 ; CHECK-NEXT: mov x9, #-1
196 ; CHECK-NEXT: cmp x8, x1
197 ; CHECK-NEXT: csel x10, x1, x8, gt
198 ; CHECK-NEXT: csel x11, x8, x1, gt
199 ; CHECK-NEXT: sub x10, x11, x10
200 ; CHECK-NEXT: cneg x9, x9, le
201 ; CHECK-NEXT: lsr x10, x10, #1
202 ; CHECK-NEXT: madd x0, x10, x9, x8
203 ; CHECK-NEXT: ret
204 %a1 = load i64, i64* %a1_addr
205 %t3 = icmp sgt i64 %a1, %a2 ; signed
206 %t4 = select i1 %t3, i64 -1, i64 1
207 %t5 = select i1 %t3, i64 %a2, i64 %a1
208 %t6 = select i1 %t3, i64 %a1, i64 %a2
209 %t7 = sub i64 %t6, %t5
210 %t8 = lshr i64 %t7, 1
211 %t9 = mul nsw i64 %t8, %t4 ; signed
212 %a10 = add nsw i64 %t9, %a1 ; signed
213 ret i64 %a10
214 }
215
216 define i64 @scalar_i64_signed_reg_mem(i64 %a1, i64* %a2_addr) nounwind {
217 ; CHECK-LABEL: scalar_i64_signed_reg_mem:
218 ; CHECK: // %bb.0:
219 ; CHECK-NEXT: ldr x8, [x1]
220 ; CHECK-NEXT: mov x9, #-1
221 ; CHECK-NEXT: cmp x0, x8
222 ; CHECK-NEXT: csel x10, x8, x0, gt
223 ; CHECK-NEXT: csel x8, x0, x8, gt
224 ; CHECK-NEXT: sub x8, x8, x10
225 ; CHECK-NEXT: cneg x9, x9, le
226 ; CHECK-NEXT: lsr x8, x8, #1
227 ; CHECK-NEXT: madd x0, x8, x9, x0
228 ; CHECK-NEXT: ret
229 %a2 = load i64, i64* %a2_addr
230 %t3 = icmp sgt i64 %a1, %a2 ; signed
231 %t4 = select i1 %t3, i64 -1, i64 1
232 %t5 = select i1 %t3, i64 %a2, i64 %a1
233 %t6 = select i1 %t3, i64 %a1, i64 %a2
234 %t7 = sub i64 %t6, %t5
235 %t8 = lshr i64 %t7, 1
236 %t9 = mul nsw i64 %t8, %t4 ; signed
237 %a10 = add nsw i64 %t9, %a1 ; signed
238 ret i64 %a10
239 }
240
241 define i64 @scalar_i64_signed_mem_mem(i64* %a1_addr, i64* %a2_addr) nounwind {
242 ; CHECK-LABEL: scalar_i64_signed_mem_mem:
243 ; CHECK: // %bb.0:
244 ; CHECK-NEXT: ldr x8, [x0]
245 ; CHECK-NEXT: ldr x9, [x1]
246 ; CHECK-NEXT: mov x10, #-1
247 ; CHECK-NEXT: cmp x8, x9
248 ; CHECK-NEXT: csel x11, x9, x8, gt
249 ; CHECK-NEXT: csel x9, x8, x9, gt
250 ; CHECK-NEXT: sub x9, x9, x11
251 ; CHECK-NEXT: cneg x10, x10, le
252 ; CHECK-NEXT: lsr x9, x9, #1
253 ; CHECK-NEXT: madd x0, x9, x10, x8
254 ; CHECK-NEXT: ret
255 %a1 = load i64, i64* %a1_addr
256 %a2 = load i64, i64* %a2_addr
257 %t3 = icmp sgt i64 %a1, %a2 ; signed
258 %t4 = select i1 %t3, i64 -1, i64 1
259 %t5 = select i1 %t3, i64 %a2, i64 %a1
260 %t6 = select i1 %t3, i64 %a1, i64 %a2
261 %t7 = sub i64 %t6, %t5
262 %t8 = lshr i64 %t7, 1
263 %t9 = mul nsw i64 %t8, %t4 ; signed
264 %a10 = add nsw i64 %t9, %a1 ; signed
265 ret i64 %a10
266 }
267
268 ; ---------------------------------------------------------------------------- ;
269 ; 16-bit width
270 ; ---------------------------------------------------------------------------- ;
271
272 ; Values come from regs
273
274 define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind {
275 ; CHECK-LABEL: scalar_i16_signed_reg_reg:
276 ; CHECK: // %bb.0:
277 ; CHECK-NEXT: sxth w8, w0
278 ; CHECK-NEXT: mov w9, #-1
279 ; CHECK-NEXT: cmp w8, w1, sxth
280 ; CHECK-NEXT: cneg w8, w9, le
281 ; CHECK-NEXT: csel w9, w1, w0, gt
282 ; CHECK-NEXT: csel w10, w0, w1, gt
283 ; CHECK-NEXT: sub w9, w10, w9
284 ; CHECK-NEXT: ubfx w9, w9, #1, #15
285 ; CHECK-NEXT: madd w0, w9, w8, w0
286 ; CHECK-NEXT: ret
287 %t3 = icmp sgt i16 %a1, %a2 ; signed
288 %t4 = select i1 %t3, i16 -1, i16 1
289 %t5 = select i1 %t3, i16 %a2, i16 %a1
290 %t6 = select i1 %t3, i16 %a1, i16 %a2
291 %t7 = sub i16 %t6, %t5
292 %t8 = lshr i16 %t7, 1
293 %t9 = mul nsw i16 %t8, %t4 ; signed
294 %a10 = add nsw i16 %t9, %a1 ; signed
295 ret i16 %a10
296 }
297
298 define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind {
299 ; CHECK-LABEL: scalar_i16_unsigned_reg_reg:
300 ; CHECK: // %bb.0:
301 ; CHECK-NEXT: and w8, w0, #0xffff
302 ; CHECK-NEXT: mov w9, #-1
303 ; CHECK-NEXT: cmp w8, w1, uxth
304 ; CHECK-NEXT: cneg w8, w9, ls
305 ; CHECK-NEXT: csel w9, w1, w0, hi
306 ; CHECK-NEXT: csel w10, w0, w1, hi
307 ; CHECK-NEXT: sub w9, w10, w9
308 ; CHECK-NEXT: ubfx w9, w9, #1, #15
309 ; CHECK-NEXT: madd w0, w9, w8, w0
310 ; CHECK-NEXT: ret
311 %t3 = icmp ugt i16 %a1, %a2
312 %t4 = select i1 %t3, i16 -1, i16 1
313 %t5 = select i1 %t3, i16 %a2, i16 %a1
314 %t6 = select i1 %t3, i16 %a1, i16 %a2
315 %t7 = sub i16 %t6, %t5
316 %t8 = lshr i16 %t7, 1
317 %t9 = mul i16 %t8, %t4
318 %a10 = add i16 %t9, %a1
319 ret i16 %a10
320 }
321
322 ; Values are loaded. Only check signed case.
323
324 define i16 @scalar_i16_signed_mem_reg(i16* %a1_addr, i16 %a2) nounwind {
325 ; CHECK-LABEL: scalar_i16_signed_mem_reg:
326 ; CHECK: // %bb.0:
327 ; CHECK-NEXT: ldrsh w8, [x0]
328 ; CHECK-NEXT: mov w9, #-1
329 ; CHECK-NEXT: cmp w8, w1, sxth
330 ; CHECK-NEXT: csel w10, w1, w8, gt
331 ; CHECK-NEXT: csel w11, w8, w1, gt
332 ; CHECK-NEXT: sub w10, w11, w10
333 ; CHECK-NEXT: cneg w9, w9, le
334 ; CHECK-NEXT: ubfx w10, w10, #1, #15
335 ; CHECK-NEXT: madd w0, w10, w9, w8
336 ; CHECK-NEXT: ret
337 %a1 = load i16, i16* %a1_addr
338 %t3 = icmp sgt i16 %a1, %a2 ; signed
339 %t4 = select i1 %t3, i16 -1, i16 1
340 %t5 = select i1 %t3, i16 %a2, i16 %a1
341 %t6 = select i1 %t3, i16 %a1, i16 %a2
342 %t7 = sub i16 %t6, %t5
343 %t8 = lshr i16 %t7, 1
344 %t9 = mul nsw i16 %t8, %t4 ; signed
345 %a10 = add nsw i16 %t9, %a1 ; signed
346 ret i16 %a10
347 }
348
349 define i16 @scalar_i16_signed_reg_mem(i16 %a1, i16* %a2_addr) nounwind {
350 ; CHECK-LABEL: scalar_i16_signed_reg_mem:
351 ; CHECK: // %bb.0:
352 ; CHECK-NEXT: ldrsh w8, [x1]
353 ; CHECK-NEXT: sxth w9, w0
354 ; CHECK-NEXT: mov w10, #-1
355 ; CHECK-NEXT: cmp w9, w8
356 ; CHECK-NEXT: cneg w9, w10, le
357 ; CHECK-NEXT: csel w10, w8, w0, gt
358 ; CHECK-NEXT: csel w8, w0, w8, gt
359 ; CHECK-NEXT: sub w8, w8, w10
360 ; CHECK-NEXT: ubfx w8, w8, #1, #15
361 ; CHECK-NEXT: madd w0, w8, w9, w0
362 ; CHECK-NEXT: ret
363 %a2 = load i16, i16* %a2_addr
364 %t3 = icmp sgt i16 %a1, %a2 ; signed
365 %t4 = select i1 %t3, i16 -1, i16 1
366 %t5 = select i1 %t3, i16 %a2, i16 %a1
367 %t6 = select i1 %t3, i16 %a1, i16 %a2
368 %t7 = sub i16 %t6, %t5
369 %t8 = lshr i16 %t7, 1
370 %t9 = mul nsw i16 %t8, %t4 ; signed
371 %a10 = add nsw i16 %t9, %a1 ; signed
372 ret i16 %a10
373 }
374
375 define i16 @scalar_i16_signed_mem_mem(i16* %a1_addr, i16* %a2_addr) nounwind {
376 ; CHECK-LABEL: scalar_i16_signed_mem_mem:
377 ; CHECK: // %bb.0:
378 ; CHECK-NEXT: ldrsh w8, [x0]
379 ; CHECK-NEXT: ldrsh w9, [x1]
380 ; CHECK-NEXT: mov w10, #-1
381 ; CHECK-NEXT: cmp w8, w9
382 ; CHECK-NEXT: csel w11, w9, w8, gt
383 ; CHECK-NEXT: csel w9, w8, w9, gt
384 ; CHECK-NEXT: sub w9, w9, w11
385 ; CHECK-NEXT: cneg w10, w10, le
386 ; CHECK-NEXT: ubfx w9, w9, #1, #15
387 ; CHECK-NEXT: madd w0, w9, w10, w8
388 ; CHECK-NEXT: ret
389 %a1 = load i16, i16* %a1_addr
390 %a2 = load i16, i16* %a2_addr
391 %t3 = icmp sgt i16 %a1, %a2 ; signed
392 %t4 = select i1 %t3, i16 -1, i16 1
393 %t5 = select i1 %t3, i16 %a2, i16 %a1
394 %t6 = select i1 %t3, i16 %a1, i16 %a2
395 %t7 = sub i16 %t6, %t5
396 %t8 = lshr i16 %t7, 1
397 %t9 = mul nsw i16 %t8, %t4 ; signed
398 %a10 = add nsw i16 %t9, %a1 ; signed
399 ret i16 %a10
400 }
401
402 ; ---------------------------------------------------------------------------- ;
403 ; 8-bit width
404 ; ---------------------------------------------------------------------------- ;
405
406 ; Values come from regs
407
408 define i8 @scalar_i8_signed_reg_reg(i8 %a1, i8 %a2) nounwind {
409 ; CHECK-LABEL: scalar_i8_signed_reg_reg:
410 ; CHECK: // %bb.0:
411 ; CHECK-NEXT: sxtb w8, w0
412 ; CHECK-NEXT: mov w9, #-1
413 ; CHECK-NEXT: cmp w8, w1, sxtb
414 ; CHECK-NEXT: cneg w8, w9, le
415 ; CHECK-NEXT: csel w9, w1, w0, gt
416 ; CHECK-NEXT: csel w10, w0, w1, gt
417 ; CHECK-NEXT: sub w9, w10, w9
418 ; CHECK-NEXT: ubfx w9, w9, #1, #7
419 ; CHECK-NEXT: madd w0, w9, w8, w0
420 ; CHECK-NEXT: ret
421 %t3 = icmp sgt i8 %a1, %a2 ; signed
422 %t4 = select i1 %t3, i8 -1, i8 1
423 %t5 = select i1 %t3, i8 %a2, i8 %a1
424 %t6 = select i1 %t3, i8 %a1, i8 %a2
425 %t7 = sub i8 %t6, %t5
426 %t8 = lshr i8 %t7, 1
427 %t9 = mul nsw i8 %t8, %t4 ; signed
428 %a10 = add nsw i8 %t9, %a1 ; signed
429 ret i8 %a10
430 }
431
432 define i8 @scalar_i8_unsigned_reg_reg(i8 %a1, i8 %a2) nounwind {
433 ; CHECK-LABEL: scalar_i8_unsigned_reg_reg:
434 ; CHECK: // %bb.0:
435 ; CHECK-NEXT: and w8, w0, #0xff
436 ; CHECK-NEXT: mov w9, #-1
437 ; CHECK-NEXT: cmp w8, w1, uxtb
438 ; CHECK-NEXT: cneg w8, w9, ls
439 ; CHECK-NEXT: csel w9, w1, w0, hi
440 ; CHECK-NEXT: csel w10, w0, w1, hi
441 ; CHECK-NEXT: sub w9, w10, w9
442 ; CHECK-NEXT: ubfx w9, w9, #1, #7
443 ; CHECK-NEXT: madd w0, w9, w8, w0
444 ; CHECK-NEXT: ret
445 %t3 = icmp ugt i8 %a1, %a2
446 %t4 = select i1 %t3, i8 -1, i8 1
447 %t5 = select i1 %t3, i8 %a2, i8 %a1
448 %t6 = select i1 %t3, i8 %a1, i8 %a2
449 %t7 = sub i8 %t6, %t5
450 %t8 = lshr i8 %t7, 1
451 %t9 = mul i8 %t8, %t4
452 %a10 = add i8 %t9, %a1
453 ret i8 %a10
454 }
455
456 ; Values are loaded. Only check signed case.
457
458 define i8 @scalar_i8_signed_mem_reg(i8* %a1_addr, i8 %a2) nounwind {
459 ; CHECK-LABEL: scalar_i8_signed_mem_reg:
460 ; CHECK: // %bb.0:
461 ; CHECK-NEXT: ldrsb w8, [x0]
462 ; CHECK-NEXT: mov w9, #-1
463 ; CHECK-NEXT: cmp w8, w1, sxtb
464 ; CHECK-NEXT: csel w10, w1, w8, gt
465 ; CHECK-NEXT: csel w11, w8, w1, gt
466 ; CHECK-NEXT: sub w10, w11, w10
467 ; CHECK-NEXT: cneg w9, w9, le
468 ; CHECK-NEXT: ubfx w10, w10, #1, #7
469 ; CHECK-NEXT: madd w0, w10, w9, w8
470 ; CHECK-NEXT: ret
471 %a1 = load i8, i8* %a1_addr
472 %t3 = icmp sgt i8 %a1, %a2 ; signed
473 %t4 = select i1 %t3, i8 -1, i8 1
474 %t5 = select i1 %t3, i8 %a2, i8 %a1
475 %t6 = select i1 %t3, i8 %a1, i8 %a2
476 %t7 = sub i8 %t6, %t5
477 %t8 = lshr i8 %t7, 1
478 %t9 = mul nsw i8 %t8, %t4 ; signed
479 %a10 = add nsw i8 %t9, %a1 ; signed
480 ret i8 %a10
481 }
482
483 define i8 @scalar_i8_signed_reg_mem(i8 %a1, i8* %a2_addr) nounwind {
484 ; CHECK-LABEL: scalar_i8_signed_reg_mem:
485 ; CHECK: // %bb.0:
486 ; CHECK-NEXT: ldrsb w8, [x1]
487 ; CHECK-NEXT: sxtb w9, w0
488 ; CHECK-NEXT: mov w10, #-1
489 ; CHECK-NEXT: cmp w9, w8
490 ; CHECK-NEXT: cneg w9, w10, le
491 ; CHECK-NEXT: csel w10, w8, w0, gt
492 ; CHECK-NEXT: csel w8, w0, w8, gt
493 ; CHECK-NEXT: sub w8, w8, w10
494 ; CHECK-NEXT: ubfx w8, w8, #1, #7
495 ; CHECK-NEXT: madd w0, w8, w9, w0
496 ; CHECK-NEXT: ret
497 %a2 = load i8, i8* %a2_addr
498 %t3 = icmp sgt i8 %a1, %a2 ; signed
499 %t4 = select i1 %t3, i8 -1, i8 1
500 %t5 = select i1 %t3, i8 %a2, i8 %a1
501 %t6 = select i1 %t3, i8 %a1, i8 %a2
502 %t7 = sub i8 %t6, %t5
503 %t8 = lshr i8 %t7, 1
504 %t9 = mul nsw i8 %t8, %t4 ; signed
505 %a10 = add nsw i8 %t9, %a1 ; signed
506 ret i8 %a10
507 }
508
509 define i8 @scalar_i8_signed_mem_mem(i8* %a1_addr, i8* %a2_addr) nounwind {
510 ; CHECK-LABEL: scalar_i8_signed_mem_mem:
511 ; CHECK: // %bb.0:
512 ; CHECK-NEXT: ldrsb w8, [x0]
513 ; CHECK-NEXT: ldrsb w9, [x1]
514 ; CHECK-NEXT: mov w10, #-1
515 ; CHECK-NEXT: cmp w8, w9
516 ; CHECK-NEXT: csel w11, w9, w8, gt
517 ; CHECK-NEXT: csel w9, w8, w9, gt
518 ; CHECK-NEXT: sub w9, w9, w11
519 ; CHECK-NEXT: cneg w10, w10, le
520 ; CHECK-NEXT: ubfx w9, w9, #1, #7
521 ; CHECK-NEXT: madd w0, w9, w10, w8
522 ; CHECK-NEXT: ret
523 %a1 = load i8, i8* %a1_addr
524 %a2 = load i8, i8* %a2_addr
525 %t3 = icmp sgt i8 %a1, %a2 ; signed
526 %t4 = select i1 %t3, i8 -1, i8 1
527 %t5 = select i1 %t3, i8 %a2, i8 %a1
528 %t6 = select i1 %t3, i8 %a1, i8 %a2
529 %t7 = sub i8 %t6, %t5
530 %t8 = lshr i8 %t7, 1
531 %t9 = mul nsw i8 %t8, %t4 ; signed
532 %a10 = add nsw i8 %t9, %a1 ; signed
533 ret i8 %a10
534 }
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1,AVX1-FALLBACK
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FALLBACK
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=ALL,XOP,XOP-FALLBACK
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,XOP,AVX,AVX1,XOPAVX,XOPAVX1,XOPAVX1-FALLBACK
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,AVX,AVX2,XOPAVX,XOPAVX2,XOPAVX2-FALLBACK
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX512,AVX512F
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VL,AVX512VL-FALLBACK
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW,AVX512BW-FALLBACK
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VL,AVX512BW,AVX512VLBW
12
13 ; These test cases are inspired by C++2a std::midpoint().
14 ; See https://bugs.llvm.org/show_bug.cgi?id=40965
15
16 ; Using 128-bit vector regs.
17
18 ; ---------------------------------------------------------------------------- ;
19 ; 32-bit width. 128 / 32 = 4 elts.
20 ; ---------------------------------------------------------------------------- ;
21
22 ; Values come from regs
23
24 define <4 x i32> @vec128_i32_signed_reg_reg(<4 x i32> %a1, <4 x i32> %a2) nounwind {
25 ; SSE2-LABEL: vec128_i32_signed_reg_reg:
26 ; SSE2: # %bb.0:
27 ; SSE2-NEXT: movdqa %xmm0, %xmm2
28 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
29 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1]
30 ; SSE2-NEXT: por %xmm2, %xmm3
31 ; SSE2-NEXT: movdqa %xmm1, %xmm4
32 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
33 ; SSE2-NEXT: movdqa %xmm0, %xmm5
34 ; SSE2-NEXT: pand %xmm4, %xmm5
35 ; SSE2-NEXT: pandn %xmm1, %xmm4
36 ; SSE2-NEXT: por %xmm5, %xmm4
37 ; SSE2-NEXT: movdqa %xmm0, %xmm5
38 ; SSE2-NEXT: pand %xmm2, %xmm5
39 ; SSE2-NEXT: pandn %xmm1, %xmm2
40 ; SSE2-NEXT: por %xmm5, %xmm2
41 ; SSE2-NEXT: psubd %xmm4, %xmm2
42 ; SSE2-NEXT: psrld $1, %xmm2
43 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
44 ; SSE2-NEXT: pmuludq %xmm3, %xmm2
45 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
46 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
47 ; SSE2-NEXT: pmuludq %xmm1, %xmm3
48 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
49 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
50 ; SSE2-NEXT: paddd %xmm2, %xmm0
51 ; SSE2-NEXT: retq
52 ;
53 ; SSE41-LABEL: vec128_i32_signed_reg_reg:
54 ; SSE41: # %bb.0:
55 ; SSE41-NEXT: movdqa %xmm0, %xmm2
56 ; SSE41-NEXT: pcmpgtd %xmm1, %xmm2
57 ; SSE41-NEXT: por {{.*}}(%rip), %xmm2
58 ; SSE41-NEXT: movdqa %xmm0, %xmm3
59 ; SSE41-NEXT: pminsd %xmm1, %xmm3
60 ; SSE41-NEXT: pmaxsd %xmm0, %xmm1
61 ; SSE41-NEXT: psubd %xmm3, %xmm1
62 ; SSE41-NEXT: psrld $1, %xmm1
63 ; SSE41-NEXT: pmulld %xmm1, %xmm2
64 ; SSE41-NEXT: paddd %xmm0, %xmm2
65 ; SSE41-NEXT: movdqa %xmm2, %xmm0
66 ; SSE41-NEXT: retq
67 ;
68 ; AVX1-FALLBACK-LABEL: vec128_i32_signed_reg_reg:
69 ; AVX1-FALLBACK: # %bb.0:
70 ; AVX1-FALLBACK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2
71 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
72 ; AVX1-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3
73 ; AVX1-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
74 ; AVX1-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
75 ; AVX1-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
76 ; AVX1-FALLBACK-NEXT: vpmulld %xmm2, %xmm1, %xmm1
77 ; AVX1-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
78 ; AVX1-FALLBACK-NEXT: retq
79 ;
80 ; AVX2-FALLBACK-LABEL: vec128_i32_signed_reg_reg:
81 ; AVX2-FALLBACK: # %bb.0:
82 ; AVX2-FALLBACK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2
83 ; AVX2-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
84 ; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm2
85 ; AVX2-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3
86 ; AVX2-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
87 ; AVX2-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
88 ; AVX2-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
89 ; AVX2-FALLBACK-NEXT: vpmulld %xmm2, %xmm1, %xmm1
90 ; AVX2-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
91 ; AVX2-FALLBACK-NEXT: retq
92 ;
93 ; XOP-FALLBACK-LABEL: vec128_i32_signed_reg_reg:
94 ; XOP-FALLBACK: # %bb.0:
95 ; XOP-FALLBACK-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
96 ; XOP-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
97 ; XOP-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3
98 ; XOP-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
99 ; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
100 ; XOP-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
101 ; XOP-FALLBACK-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
102 ; XOP-FALLBACK-NEXT: retq
103 ;
104 ; XOPAVX1-LABEL: vec128_i32_signed_reg_reg:
105 ; XOPAVX1: # %bb.0:
106 ; XOPAVX1-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
107 ; XOPAVX1-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
108 ; XOPAVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3
109 ; XOPAVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
110 ; XOPAVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
111 ; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
112 ; XOPAVX1-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
113 ; XOPAVX1-NEXT: retq
114 ;
115 ; XOPAVX2-LABEL: vec128_i32_signed_reg_reg:
116 ; XOPAVX2: # %bb.0:
117 ; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
118 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
119 ; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
120 ; XOPAVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3
121 ; XOPAVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
122 ; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1
123 ; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1
124 ; XOPAVX2-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
125 ; XOPAVX2-NEXT: retq
126 ;
127 ; AVX512F-LABEL: vec128_i32_signed_reg_reg:
128 ; AVX512F: # %bb.0:
129 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
130 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
131 ; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
132 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
133 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
134 ; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
135 ; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm2
136 ; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
137 ; AVX512F-NEXT: vpsubd %xmm2, %xmm1, %xmm1
138 ; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
139 ; AVX512F-NEXT: vpmulld %xmm3, %xmm1, %xmm1
140 ; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm0
141 ; AVX512F-NEXT: vzeroupper
142 ; AVX512F-NEXT: retq
143 ;
144 ; AVX512VL-LABEL: vec128_i32_signed_reg_reg:
145 ; AVX512VL: # %bb.0:
146 ; AVX512VL-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
147 ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
148 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
149 ; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm3 {%k1}
150 ; AVX512VL-NEXT: vpminsd %xmm1, %xmm0, %xmm2
151 ; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
152 ; AVX512VL-NEXT: vpsubd %xmm2, %xmm1, %xmm1
153 ; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
154 ; AVX512VL-NEXT: vpmulld %xmm3, %xmm1, %xmm1
155 ; AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
156 ; AVX512VL-NEXT: retq
157 ;
158 ; AVX512BW-FALLBACK-LABEL: vec128_i32_signed_reg_reg:
159 ; AVX512BW-FALLBACK: # %bb.0:
160 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
161 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
162 ; AVX512BW-FALLBACK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
163 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
164 ; AVX512BW-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
165 ; AVX512BW-FALLBACK-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
166 ; AVX512BW-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm2
167 ; AVX512BW-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
168 ; AVX512BW-FALLBACK-NEXT: vpsubd %xmm2, %xmm1, %xmm1
169 ; AVX512BW-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
170 ; AVX512BW-FALLBACK-NEXT: vpmulld %xmm3, %xmm1, %xmm1
171 ; AVX512BW-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
172 ; AVX512BW-FALLBACK-NEXT: vzeroupper
173 ; AVX512BW-FALLBACK-NEXT: retq
174 %t3 = icmp sgt <4 x i32> %a1, %a2 ; signed
175 %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
176 %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
177 %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
178 %t7 = sub <4 x i32> %t6, %t5
179 %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
180 %t9 = mul nsw <4 x i32> %t8, %t4 ; signed
181 %a10 = add nsw <4 x i32> %t9, %a1 ; signed
182 ret <4 x i32> %a10
183 }
184
185 define <4 x i32> @vec128_i32_unsigned_reg_reg(<4 x i32> %a1, <4 x i32> %a2) nounwind {
186 ; SSE2-LABEL: vec128_i32_unsigned_reg_reg:
187 ; SSE2: # %bb.0:
188 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
189 ; SSE2-NEXT: movdqa %xmm1, %xmm3
190 ; SSE2-NEXT: pxor %xmm2, %xmm3
191 ; SSE2-NEXT: pxor %xmm0, %xmm2
192 ; SSE2-NEXT: movdqa %xmm2, %xmm4
193 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
194 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,1,1,1]
195 ; SSE2-NEXT: por %xmm4, %xmm5
196 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
197 ; SSE2-NEXT: movdqa %xmm0, %xmm2
198 ; SSE2-NEXT: pand %xmm3, %xmm2
199 ; SSE2-NEXT: pandn %xmm1, %xmm3
200 ; SSE2-NEXT: por %xmm2, %xmm3
201 ; SSE2-NEXT: movdqa %xmm0, %xmm2
202 ; SSE2-NEXT: pand %xmm4, %xmm2
203 ; SSE2-NEXT: pandn %xmm1, %xmm4
204 ; SSE2-NEXT: por %xmm2, %xmm4
205 ; SSE2-NEXT: psubd %xmm3, %xmm4
206 ; SSE2-NEXT: psrld $1, %xmm4
207 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
208 ; SSE2-NEXT: pmuludq %xmm5, %xmm4
209 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
210 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
211 ; SSE2-NEXT: pmuludq %xmm1, %xmm3
212 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
213 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
214 ; SSE2-NEXT: paddd %xmm2, %xmm0
215 ; SSE2-NEXT: retq
216 ;
217 ; SSE41-LABEL: vec128_i32_unsigned_reg_reg:
218 ; SSE41: # %bb.0:
219 ; SSE41-NEXT: movdqa %xmm0, %xmm2
220 ; SSE41-NEXT: pminud %xmm1, %xmm2
221 ; SSE41-NEXT: movdqa %xmm0, %xmm3
222 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm3
223 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
224 ; SSE41-NEXT: pxor %xmm3, %xmm4
225 ; SSE41-NEXT: por {{.*}}(%rip), %xmm4
226 ; SSE41-NEXT: pmaxud %xmm0, %xmm1
227 ; SSE41-NEXT: psubd %xmm2, %xmm1
228 ; SSE41-NEXT: psrld $1, %xmm1
229 ; SSE41-NEXT: pmulld %xmm1, %xmm4
230 ; SSE41-NEXT: paddd %xmm4, %xmm0
231 ; SSE41-NEXT: retq
232 ;
233 ; AVX1-FALLBACK-LABEL: vec128_i32_unsigned_reg_reg:
234 ; AVX1-FALLBACK: # %bb.0:
235 ; AVX1-FALLBACK-NEXT: vpminud %xmm1, %xmm0, %xmm2
236 ; AVX1-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm3
237 ; AVX1-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
238 ; AVX1-FALLBACK-NEXT: vpxor %xmm4, %xmm3, %xmm3
239 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm3, %xmm3
240 ; AVX1-FALLBACK-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
241 ; AVX1-FALLBACK-NEXT: vpsubd %xmm2, %xmm1, %xmm1
242 ; AVX1-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
243 ; AVX1-FALLBACK-NEXT: vpmulld %xmm3, %xmm1, %xmm1
244 ; AVX1-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
245 ; AVX1-FALLBACK-NEXT: retq
246 ;
247 ; AVX2-FALLBACK-LABEL: vec128_i32_unsigned_reg_reg:
248 ; AVX2-FALLBACK: # %bb.0:
249 ; AVX2-FALLBACK-NEXT: vpminud %xmm1, %xmm0, %xmm2
250 ; AVX2-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm3
251 ; AVX2-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
252 ; AVX2-FALLBACK-NEXT: vpxor %xmm4, %xmm3, %xmm3
253 ; AVX2-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1,1,1,1]
254 ; AVX2-FALLBACK-NEXT: vpor %xmm4, %xmm3, %xmm3
255 ; AVX2-FALLBACK-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
256 ; AVX2-FALLBACK-NEXT: vpsubd %xmm2, %xmm1, %xmm1
257 ; AVX2-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
258 ; AVX2-FALLBACK-NEXT: vpmulld %xmm3, %xmm1, %xmm1
259 ; AVX2-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
260 ; AVX2-FALLBACK-NEXT: retq
261 ;
262 ; XOP-FALLBACK-LABEL: vec128_i32_unsigned_reg_reg:
263 ; XOP-FALLBACK: # %bb.0:
264 ; XOP-FALLBACK-NEXT: vpcomgtud %xmm1, %xmm0, %xmm2
265 ; XOP-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
266 ; XOP-FALLBACK-NEXT: vpminud %xmm1, %xmm0, %xmm3
267 ; XOP-FALLBACK-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
268 ; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
269 ; XOP-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
270 ; XOP-FALLBACK-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
271 ; XOP-FALLBACK-NEXT: retq
272 ;
273 ; XOPAVX1-LABEL: vec128_i32_unsigned_reg_reg:
274 ; XOPAVX1: # %bb.0:
275 ; XOPAVX1-NEXT: vpcomgtud %xmm1, %xmm0, %xmm2
276 ; XOPAVX1-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
277 ; XOPAVX1-NEXT: vpminud %xmm1, %xmm0, %xmm3
278 ; XOPAVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
279 ; XOPAVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
280 ; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
281 ; XOPAVX1-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
282 ; XOPAVX1-NEXT: retq
283 ;
284 ; XOPAVX2-LABEL: vec128_i32_unsigned_reg_reg:
285 ; XOPAVX2: # %bb.0:
286 ; XOPAVX2-NEXT: vpcomgtud %xmm1, %xmm0, %xmm2
287 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
288 ; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
289 ; XOPAVX2-NEXT: vpminud %xmm1, %xmm0, %xmm3
290 ; XOPAVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
291 ; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1
292 ; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1
293 ; XOPAVX2-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
294 ; XOPAVX2-NEXT: retq
295 ;
296 ; AVX512F-LABEL: vec128_i32_unsigned_reg_reg:
297 ; AVX512F: # %bb.0:
298 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
299 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
300 ; AVX512F-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
301 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
302 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
303 ; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
304 ; AVX512F-NEXT: vpminud %xmm1, %xmm0, %xmm2
305 ; AVX512F-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
306 ; AVX512F-NEXT: vpsubd %xmm2, %xmm1, %xmm1
307 ; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
308 ; AVX512F-NEXT: vpmulld %xmm3, %xmm1, %xmm1
309 ; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm0
310 ; AVX512F-NEXT: vzeroupper
311 ; AVX512F-NEXT: retq
312 ;
313 ; AVX512VL-LABEL: vec128_i32_unsigned_reg_reg:
314 ; AVX512VL: # %bb.0:
315 ; AVX512VL-NEXT: vpcmpnleud %xmm1, %xmm0, %k1
316 ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
317 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
318 ; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm3 {%k1}
319 ; AVX512VL-NEXT: vpminud %xmm1, %xmm0, %xmm2
320 ; AVX512VL-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
321 ; AVX512VL-NEXT: vpsubd %xmm2, %xmm1, %xmm1
322 ; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
323 ; AVX512VL-NEXT: vpmulld %xmm3, %xmm1, %xmm1
324 ; AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
325 ; AVX512VL-NEXT: retq
326 ;
327 ; AVX512BW-FALLBACK-LABEL: vec128_i32_unsigned_reg_reg:
328 ; AVX512BW-FALLBACK: # %bb.0:
329 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
330 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
331 ; AVX512BW-FALLBACK-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
332 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
333 ; AVX512BW-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
334 ; AVX512BW-FALLBACK-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
335 ; AVX512BW-FALLBACK-NEXT: vpminud %xmm1, %xmm0, %xmm2
336 ; AVX512BW-FALLBACK-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
337 ; AVX512BW-FALLBACK-NEXT: vpsubd %xmm2, %xmm1, %xmm1
338 ; AVX512BW-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
339 ; AVX512BW-FALLBACK-NEXT: vpmulld %xmm3, %xmm1, %xmm1
340 ; AVX512BW-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
341 ; AVX512BW-FALLBACK-NEXT: vzeroupper
342 ; AVX512BW-FALLBACK-NEXT: retq
343 %t3 = icmp ugt <4 x i32> %a1, %a2
344 %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
345 %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
346 %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
347 %t7 = sub <4 x i32> %t6, %t5
348 %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
349 %t9 = mul <4 x i32> %t8, %t4
350 %a10 = add <4 x i32> %t9, %a1
351 ret <4 x i32> %a10
352 }
353
354 ; Values are loaded. Only check signed case.
355
356 define <4 x i32> @vec128_i32_signed_mem_reg(<4 x i32>* %a1_addr, <4 x i32> %a2) nounwind {
357 ; SSE2-LABEL: vec128_i32_signed_mem_reg:
358 ; SSE2: # %bb.0:
359 ; SSE2-NEXT: movdqa (%rdi), %xmm1
360 ; SSE2-NEXT: movdqa %xmm1, %xmm2
361 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
362 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1]
363 ; SSE2-NEXT: por %xmm2, %xmm3
364 ; SSE2-NEXT: movdqa %xmm0, %xmm4
365 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
366 ; SSE2-NEXT: movdqa %xmm1, %xmm5
367 ; SSE2-NEXT: pand %xmm4, %xmm5
368 ; SSE2-NEXT: pandn %xmm0, %xmm4
369 ; SSE2-NEXT: por %xmm5, %xmm4
370 ; SSE2-NEXT: movdqa %xmm1, %xmm5
371 ; SSE2-NEXT: pand %xmm2, %xmm5
372 ; SSE2-NEXT: pandn %xmm0, %xmm2
373 ; SSE2-NEXT: por %xmm5, %xmm2
374 ; SSE2-NEXT: psubd %xmm4, %xmm2
375 ; SSE2-NEXT: psrld $1, %xmm2
376 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
377 ; SSE2-NEXT: pmuludq %xmm3, %xmm2
378 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
379 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
380 ; SSE2-NEXT: pmuludq %xmm4, %xmm2
381 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
382 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
383 ; SSE2-NEXT: paddd %xmm1, %xmm0
384 ; SSE2-NEXT: retq
385 ;
386 ; SSE41-LABEL: vec128_i32_signed_mem_reg:
387 ; SSE41: # %bb.0:
388 ; SSE41-NEXT: movdqa (%rdi), %xmm1
389 ; SSE41-NEXT: movdqa %xmm1, %xmm2
390 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
391 ; SSE41-NEXT: por {{.*}}(%rip), %xmm2
392 ; SSE41-NEXT: movdqa %xmm1, %xmm3
393 ; SSE41-NEXT: pminsd %xmm0, %xmm3
394 ; SSE41-NEXT: pmaxsd %xmm1, %xmm0
395 ; SSE41-NEXT: psubd %xmm3, %xmm0
396 ; SSE41-NEXT: psrld $1, %xmm0
397 ; SSE41-NEXT: pmulld %xmm2, %xmm0
398 ; SSE41-NEXT: paddd %xmm1, %xmm0
399 ; SSE41-NEXT: retq
400 ;
401 ; AVX1-FALLBACK-LABEL: vec128_i32_signed_mem_reg:
402 ; AVX1-FALLBACK: # %bb.0:
403 ; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
404 ; AVX1-FALLBACK-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm2
405 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
406 ; AVX1-FALLBACK-NEXT: vpminsd %xmm0, %xmm1, %xmm3
407 ; AVX1-FALLBACK-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
408 ; AVX1-FALLBACK-NEXT: vpsubd %xmm3, %xmm0, %xmm0
409 ; AVX1-FALLBACK-NEXT: vpsrld $1, %xmm0, %xmm0
410 ; AVX1-FALLBACK-NEXT: vpmulld %xmm2, %xmm0, %xmm0
411 ; AVX1-FALLBACK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
412 ; AVX1-FALLBACK-NEXT: retq
413 ;
414 ; AVX2-FALLBACK-LABEL: vec128_i32_signed_mem_reg:
415 ; AVX2-FALLBACK: # %bb.0:
416 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
417 ; AVX2-FALLBACK-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm2
418 ; AVX2-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
419 ; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm2
420 ; AVX2-FALLBACK-NEXT: vpminsd %xmm0, %xmm1, %xmm3
421 ; AVX2-FALLBACK-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
422 ; AVX2-FALLBACK-NEXT: vpsubd %xmm3, %xmm0, %xmm0
423 ; AVX2-FALLBACK-NEXT: vpsrld $1, %xmm0, %xmm0
424 ; AVX2-FALLBACK-NEXT: vpmulld %xmm2, %xmm0, %xmm0
425 ; AVX2-FALLBACK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
426 ; AVX2-FALLBACK-NEXT: retq
427 ;
428 ; XOP-FALLBACK-LABEL: vec128_i32_signed_mem_reg:
429 ; XOP-FALLBACK: # %bb.0:
430 ; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
431 ; XOP-FALLBACK-NEXT: vpcomgtd %xmm0, %xmm1, %xmm2
432 ; XOP-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
433 ; XOP-FALLBACK-NEXT: vpminsd %xmm0, %xmm1, %xmm3
434 ; XOP-FALLBACK-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
435 ; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm0, %xmm0
436 ; XOP-FALLBACK-NEXT: vpsrld $1, %xmm0, %xmm0
437 ; XOP-FALLBACK-NEXT: vpmacsdd %xmm1, %xmm2, %xmm0, %xmm0
438 ; XOP-FALLBACK-NEXT: retq
439 ;
440 ; XOPAVX1-LABEL: vec128_i32_signed_mem_reg:
441 ; XOPAVX1: # %bb.0:
442 ; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1
443 ; XOPAVX1-NEXT: vpcomgtd %xmm0, %xmm1, %xmm2
444 ; XOPAVX1-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
445 ; XOPAVX1-NEXT: vpminsd %xmm0, %xmm1, %xmm3
446 ; XOPAVX1-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
447 ; XOPAVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
448 ; XOPAVX1-NEXT: vpsrld $1, %xmm0, %xmm0
449 ; XOPAVX1-NEXT: vpmacsdd %xmm1, %xmm2, %xmm0, %xmm0
450 ; XOPAVX1-NEXT: retq
451 ;
452 ; XOPAVX2-LABEL: vec128_i32_signed_mem_reg:
453 ; XOPAVX2: # %bb.0:
454 ; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1
455 ; XOPAVX2-NEXT: vpcomgtd %xmm0, %xmm1, %xmm2
456 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
457 ; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
458 ; XOPAVX2-NEXT: vpminsd %xmm0, %xmm1, %xmm3
459 ; XOPAVX2-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
460 ; XOPAVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0
461 ; XOPAVX2-NEXT: vpsrld $1, %xmm0, %xmm0
462 ; XOPAVX2-NEXT: vpmacsdd %xmm1, %xmm2, %xmm0, %xmm0
463 ; XOPAVX2-NEXT: retq
464 ;
465 ; AVX512F-LABEL: vec128_i32_signed_mem_reg:
466 ; AVX512F: # %bb.0:
467 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
468 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
469 ; AVX512F-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
470 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
471 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
472 ; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
473 ; AVX512F-NEXT: vpminsd %xmm0, %xmm1, %xmm2
474 ; AVX512F-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
475 ; AVX512F-NEXT: vpsubd %xmm2, %xmm0, %xmm0
476 ; AVX512F-NEXT: vpsrld $1, %xmm0, %xmm0
477 ; AVX512F-NEXT: vpmulld %xmm3, %xmm0, %xmm0
478 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
479 ; AVX512F-NEXT: vzeroupper
480 ; AVX512F-NEXT: retq
481 ;
482 ; AVX512VL-LABEL: vec128_i32_signed_mem_reg:
483 ; AVX512VL: # %bb.0:
484 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm1
485 ; AVX512VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
486 ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
487 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
488 ; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm3 {%k1}
489 ; AVX512VL-NEXT: vpminsd %xmm0, %xmm1, %xmm2
490 ; AVX512VL-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
491 ; AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0
492 ; AVX512VL-NEXT: vpsrld $1, %xmm0, %xmm0
493 ; AVX512VL-NEXT: vpmulld %xmm3, %xmm0, %xmm0
494 ; AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
495 ; AVX512VL-NEXT: retq
496 ;
497 ; AVX512BW-FALLBACK-LABEL: vec128_i32_signed_mem_reg:
498 ; AVX512BW-FALLBACK: # %bb.0:
499 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
500 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
501 ; AVX512BW-FALLBACK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
502 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
503 ; AVX512BW-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
504 ; AVX512BW-FALLBACK-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
505 ; AVX512BW-FALLBACK-NEXT: vpminsd %xmm0, %xmm1, %xmm2
506 ; AVX512BW-FALLBACK-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
507 ; AVX512BW-FALLBACK-NEXT: vpsubd %xmm2, %xmm0, %xmm0
508 ; AVX512BW-FALLBACK-NEXT: vpsrld $1, %xmm0, %xmm0
509 ; AVX512BW-FALLBACK-NEXT: vpmulld %xmm3, %xmm0, %xmm0
510 ; AVX512BW-FALLBACK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
511 ; AVX512BW-FALLBACK-NEXT: vzeroupper
512 ; AVX512BW-FALLBACK-NEXT: retq
513 %a1 = load <4 x i32>, <4 x i32>* %a1_addr
514 %t3 = icmp sgt <4 x i32> %a1, %a2 ; signed
515 %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
516 %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
517 %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
518 %t7 = sub <4 x i32> %t6, %t5
519 %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
520 %t9 = mul nsw <4 x i32> %t8, %t4 ; signed
521 %a10 = add nsw <4 x i32> %t9, %a1 ; signed
522 ret <4 x i32> %a10
523 }
524
525 define <4 x i32> @vec128_i32_signed_reg_mem(<4 x i32> %a1, <4 x i32>* %a2_addr) nounwind {
526 ; SSE2-LABEL: vec128_i32_signed_reg_mem:
527 ; SSE2: # %bb.0:
528 ; SSE2-NEXT: movdqa (%rdi), %xmm1
529 ; SSE2-NEXT: movdqa %xmm0, %xmm2
530 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
531 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1]
532 ; SSE2-NEXT: por %xmm2, %xmm3
533 ; SSE2-NEXT: movdqa %xmm1, %xmm4
534 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
535 ; SSE2-NEXT: movdqa %xmm0, %xmm5
536 ; SSE2-NEXT: pand %xmm4, %xmm5
537 ; SSE2-NEXT: pandn %xmm1, %xmm4
538 ; SSE2-NEXT: por %xmm5, %xmm4
539 ; SSE2-NEXT: movdqa %xmm0, %xmm5
540 ; SSE2-NEXT: pand %xmm2, %xmm5
541 ; SSE2-NEXT: pandn %xmm1, %xmm2
542 ; SSE2-NEXT: por %xmm5, %xmm2
543 ; SSE2-NEXT: psubd %xmm4, %xmm2
544 ; SSE2-NEXT: psrld $1, %xmm2
545 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
546 ; SSE2-NEXT: pmuludq %xmm3, %xmm2
547 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
548 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
549 ; SSE2-NEXT: pmuludq %xmm1, %xmm3
550 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
551 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
552 ; SSE2-NEXT: paddd %xmm2, %xmm0
553 ; SSE2-NEXT: retq
554 ;
555 ; SSE41-LABEL: vec128_i32_signed_reg_mem:
556 ; SSE41: # %bb.0:
557 ; SSE41-NEXT: movdqa (%rdi), %xmm2
558 ; SSE41-NEXT: movdqa %xmm0, %xmm1
559 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm1
560 ; SSE41-NEXT: por {{.*}}(%rip), %xmm1
561 ; SSE41-NEXT: movdqa %xmm0, %xmm3
562 ; SSE41-NEXT: pminsd %xmm2, %xmm3
563 ; SSE41-NEXT: pmaxsd %xmm0, %xmm2
564 ; SSE41-NEXT: psubd %xmm3, %xmm2
565 ; SSE41-NEXT: psrld $1, %xmm2
566 ; SSE41-NEXT: pmulld %xmm2, %xmm1
567 ; SSE41-NEXT: paddd %xmm0, %xmm1
568 ; SSE41-NEXT: movdqa %xmm1, %xmm0
569 ; SSE41-NEXT: retq
570 ;
571 ; AVX1-FALLBACK-LABEL: vec128_i32_signed_reg_mem:
572 ; AVX1-FALLBACK: # %bb.0:
573 ; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
574 ; AVX1-FALLBACK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2
575 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
576 ; AVX1-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3
577 ; AVX1-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
578 ; AVX1-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
579 ; AVX1-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
580 ; AVX1-FALLBACK-NEXT: vpmulld %xmm2, %xmm1, %xmm1
581 ; AVX1-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
582 ; AVX1-FALLBACK-NEXT: retq
583 ;
584 ; AVX2-FALLBACK-LABEL: vec128_i32_signed_reg_mem:
585 ; AVX2-FALLBACK: # %bb.0:
586 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
587 ; AVX2-FALLBACK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2
588 ; AVX2-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
589 ; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm2
590 ; AVX2-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3
591 ; AVX2-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
592 ; AVX2-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
593 ; AVX2-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
594 ; AVX2-FALLBACK-NEXT: vpmulld %xmm2, %xmm1, %xmm1
595 ; AVX2-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
596 ; AVX2-FALLBACK-NEXT: retq
597 ;
598 ; XOP-FALLBACK-LABEL: vec128_i32_signed_reg_mem:
599 ; XOP-FALLBACK: # %bb.0:
600 ; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
601 ; XOP-FALLBACK-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
602 ; XOP-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
603 ; XOP-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3
604 ; XOP-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
605 ; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
606 ; XOP-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
607 ; XOP-FALLBACK-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
608 ; XOP-FALLBACK-NEXT: retq
609 ;
610 ; XOPAVX1-LABEL: vec128_i32_signed_reg_mem:
611 ; XOPAVX1: # %bb.0:
612 ; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1
613 ; XOPAVX1-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
614 ; XOPAVX1-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
615 ; XOPAVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3
616 ; XOPAVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
617 ; XOPAVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
618 ; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
619 ; XOPAVX1-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
620 ; XOPAVX1-NEXT: retq
621 ;
622 ; XOPAVX2-LABEL: vec128_i32_signed_reg_mem:
623 ; XOPAVX2: # %bb.0:
624 ; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1
625 ; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
626 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
627 ; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
628 ; XOPAVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3
629 ; XOPAVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
630 ; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1
631 ; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1
632 ; XOPAVX2-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
633 ; XOPAVX2-NEXT: retq
634 ;
635 ; AVX512F-LABEL: vec128_i32_signed_reg_mem:
636 ; AVX512F: # %bb.0:
637 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
638 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
639 ; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
640 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
641 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
642 ; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
643 ; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm2
644 ; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
645 ; AVX512F-NEXT: vpsubd %xmm2, %xmm1, %xmm1
646 ; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
647 ; AVX512F-NEXT: vpmulld %xmm3, %xmm1, %xmm1
648 ; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm0
649 ; AVX512F-NEXT: vzeroupper
650 ; AVX512F-NEXT: retq
651 ;
652 ; AVX512VL-LABEL: vec128_i32_signed_reg_mem:
653 ; AVX512VL: # %bb.0:
654 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm1
655 ; AVX512VL-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
656 ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
657 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
658 ; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm3 {%k1}
659 ; AVX512VL-NEXT: vpminsd %xmm1, %xmm0, %xmm2
660 ; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
661 ; AVX512VL-NEXT: vpsubd %xmm2, %xmm1, %xmm1
662 ; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
663 ; AVX512VL-NEXT: vpmulld %xmm3, %xmm1, %xmm1
664 ; AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
665 ; AVX512VL-NEXT: retq
666 ;
667 ; AVX512BW-FALLBACK-LABEL: vec128_i32_signed_reg_mem:
668 ; AVX512BW-FALLBACK: # %bb.0:
669 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
670 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
671 ; AVX512BW-FALLBACK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
672 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
673 ; AVX512BW-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
674 ; AVX512BW-FALLBACK-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
675 ; AVX512BW-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm2
676 ; AVX512BW-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
677 ; AVX512BW-FALLBACK-NEXT: vpsubd %xmm2, %xmm1, %xmm1
678 ; AVX512BW-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
679 ; AVX512BW-FALLBACK-NEXT: vpmulld %xmm3, %xmm1, %xmm1
680 ; AVX512BW-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
681 ; AVX512BW-FALLBACK-NEXT: vzeroupper
682 ; AVX512BW-FALLBACK-NEXT: retq
683 %a2 = load <4 x i32>, <4 x i32>* %a2_addr
684 %t3 = icmp sgt <4 x i32> %a1, %a2 ; signed
685 %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
686 %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
687 %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
688 %t7 = sub <4 x i32> %t6, %t5
689 %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
690 %t9 = mul nsw <4 x i32> %t8, %t4 ; signed
691 %a10 = add nsw <4 x i32> %t9, %a1 ; signed
692 ret <4 x i32> %a10
693 }
694
695 define <4 x i32> @vec128_i32_signed_mem_mem(<4 x i32>* %a1_addr, <4 x i32>* %a2_addr) nounwind {
696 ; SSE2-LABEL: vec128_i32_signed_mem_mem:
697 ; SSE2: # %bb.0:
698 ; SSE2-NEXT: movdqa (%rdi), %xmm1
699 ; SSE2-NEXT: movdqa (%rsi), %xmm0
700 ; SSE2-NEXT: movdqa %xmm1, %xmm2
701 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
702 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1]
703 ; SSE2-NEXT: por %xmm2, %xmm3
704 ; SSE2-NEXT: movdqa %xmm0, %xmm4
705 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
706 ; SSE2-NEXT: movdqa %xmm1, %xmm5
707 ; SSE2-NEXT: pand %xmm4, %xmm5
708 ; SSE2-NEXT: pandn %xmm0, %xmm4
709 ; SSE2-NEXT: por %xmm5, %xmm4
710 ; SSE2-NEXT: movdqa %xmm1, %xmm5
711 ; SSE2-NEXT: pand %xmm2, %xmm5
712 ; SSE2-NEXT: pandn %xmm0, %xmm2
713 ; SSE2-NEXT: por %xmm5, %xmm2
714 ; SSE2-NEXT: psubd %xmm4, %xmm2
715 ; SSE2-NEXT: psrld $1, %xmm2
716 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
717 ; SSE2-NEXT: pmuludq %xmm3, %xmm2
718 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
719 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
720 ; SSE2-NEXT: pmuludq %xmm4, %xmm2
721 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
722 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
723 ; SSE2-NEXT: paddd %xmm1, %xmm0
724 ; SSE2-NEXT: retq
725 ;
726 ; SSE41-LABEL: vec128_i32_signed_mem_mem:
727 ; SSE41: # %bb.0:
728 ; SSE41-NEXT: movdqa (%rdi), %xmm1
729 ; SSE41-NEXT: movdqa (%rsi), %xmm0
730 ; SSE41-NEXT: movdqa %xmm1, %xmm2
731 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
732 ; SSE41-NEXT: por {{.*}}(%rip), %xmm2
733 ; SSE41-NEXT: movdqa %xmm1, %xmm3
734 ; SSE41-NEXT: pminsd %xmm0, %xmm3
735 ; SSE41-NEXT: pmaxsd %xmm1, %xmm0
736 ; SSE41-NEXT: psubd %xmm3, %xmm0
737 ; SSE41-NEXT: psrld $1, %xmm0
738 ; SSE41-NEXT: pmulld %xmm2, %xmm0
739 ; SSE41-NEXT: paddd %xmm1, %xmm0
740 ; SSE41-NEXT: retq
741 ;
742 ; AVX1-FALLBACK-LABEL: vec128_i32_signed_mem_mem:
743 ; AVX1-FALLBACK: # %bb.0:
744 ; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
745 ; AVX1-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
746 ; AVX1-FALLBACK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2
747 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
748 ; AVX1-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3
749 ; AVX1-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
750 ; AVX1-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
751 ; AVX1-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
752 ; AVX1-FALLBACK-NEXT: vpmulld %xmm2, %xmm1, %xmm1
753 ; AVX1-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
754 ; AVX1-FALLBACK-NEXT: retq
755 ;
756 ; AVX2-FALLBACK-LABEL: vec128_i32_signed_mem_mem:
757 ; AVX2-FALLBACK: # %bb.0:
758 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
759 ; AVX2-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
760 ; AVX2-FALLBACK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2
761 ; AVX2-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
762 ; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm2
763 ; AVX2-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3
764 ; AVX2-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
765 ; AVX2-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
766 ; AVX2-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
767 ; AVX2-FALLBACK-NEXT: vpmulld %xmm2, %xmm1, %xmm1
768 ; AVX2-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
769 ; AVX2-FALLBACK-NEXT: retq
770 ;
771 ; XOP-FALLBACK-LABEL: vec128_i32_signed_mem_mem:
772 ; XOP-FALLBACK: # %bb.0:
773 ; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
774 ; XOP-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
775 ; XOP-FALLBACK-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
776 ; XOP-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
777 ; XOP-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3
778 ; XOP-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
779 ; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
780 ; XOP-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
781 ; XOP-FALLBACK-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
782 ; XOP-FALLBACK-NEXT: retq
783 ;
784 ; XOPAVX1-LABEL: vec128_i32_signed_mem_mem:
785 ; XOPAVX1: # %bb.0:
786 ; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm0
787 ; XOPAVX1-NEXT: vmovdqa (%rsi), %xmm1
788 ; XOPAVX1-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
789 ; XOPAVX1-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
790 ; XOPAVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3
791 ; XOPAVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
792 ; XOPAVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
793 ; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
794 ; XOPAVX1-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
795 ; XOPAVX1-NEXT: retq
796 ;
797 ; XOPAVX2-LABEL: vec128_i32_signed_mem_mem:
798 ; XOPAVX2: # %bb.0:
799 ; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm0
800 ; XOPAVX2-NEXT: vmovdqa (%rsi), %xmm1
801 ; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
802 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
803 ; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
804 ; XOPAVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3
805 ; XOPAVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
806 ; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1
807 ; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1
808 ; XOPAVX2-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
809 ; XOPAVX2-NEXT: retq
810 ;
811 ; AVX512F-LABEL: vec128_i32_signed_mem_mem:
812 ; AVX512F: # %bb.0:
813 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
814 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm1
815 ; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
816 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
817 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
818 ; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
819 ; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm2
820 ; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
821 ; AVX512F-NEXT: vpsubd %xmm2, %xmm1, %xmm1
822 ; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
823 ; AVX512F-NEXT: vpmulld %xmm3, %xmm1, %xmm1
824 ; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm0
825 ; AVX512F-NEXT: vzeroupper
826 ; AVX512F-NEXT: retq
827 ;
828 ; AVX512VL-LABEL: vec128_i32_signed_mem_mem:
829 ; AVX512VL: # %bb.0:
830 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
831 ; AVX512VL-NEXT: vmovdqa (%rsi), %xmm1
832 ; AVX512VL-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
833 ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
834 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
835 ; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm3 {%k1}
836 ; AVX512VL-NEXT: vpminsd %xmm1, %xmm0, %xmm2
837 ; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
838 ; AVX512VL-NEXT: vpsubd %xmm2, %xmm1, %xmm1
839 ; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
840 ; AVX512VL-NEXT: vpmulld %xmm3, %xmm1, %xmm1
841 ; AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
842 ; AVX512VL-NEXT: retq
843 ;
844 ; AVX512BW-FALLBACK-LABEL: vec128_i32_signed_mem_mem:
845 ; AVX512BW-FALLBACK: # %bb.0:
846 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
847 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
848 ; AVX512BW-FALLBACK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
849 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
850 ; AVX512BW-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
851 ; AVX512BW-FALLBACK-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
852 ; AVX512BW-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm2
853 ; AVX512BW-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
854 ; AVX512BW-FALLBACK-NEXT: vpsubd %xmm2, %xmm1, %xmm1
855 ; AVX512BW-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
856 ; AVX512BW-FALLBACK-NEXT: vpmulld %xmm3, %xmm1, %xmm1
857 ; AVX512BW-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
858 ; AVX512BW-FALLBACK-NEXT: vzeroupper
859 ; AVX512BW-FALLBACK-NEXT: retq
860 %a1 = load <4 x i32>, <4 x i32>* %a1_addr
861 %a2 = load <4 x i32>, <4 x i32>* %a2_addr
862 %t3 = icmp sgt <4 x i32> %a1, %a2 ; signed
863 %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
864 %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
865 %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
866 %t7 = sub <4 x i32> %t6, %t5
867 %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
868 %t9 = mul nsw <4 x i32> %t8, %t4 ; signed
869 %a10 = add nsw <4 x i32> %t9, %a1 ; signed
870 ret <4 x i32> %a10
871 }
872
873 ; ---------------------------------------------------------------------------- ;
874 ; 64-bit width. 128 / 64 = 2 elts.
875 ; ---------------------------------------------------------------------------- ;
876
877 ; Values come from regs
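; Note on the i64 sequences below: baseline SSE2 has no 64-bit element compare
; (pcmpgtq needs SSE4.2), so the signed compare is emulated with pcmpgtd/pcmpeqd
; plus shuffles, and no target below AVX512DQ has a packed 64-bit multiply, so the
; ((max - min) >> 1) * (+1/-1) step is expanded into 32-bit pmuludq partial
; products that are shifted and re-added. That is why these check sequences are
; much longer than the i32 ones above.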
878
879 define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwind {
880 ; SSE2-LABEL: vec128_i64_signed_reg_reg:
881 ; SSE2: # %bb.0:
882 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
883 ; SSE2-NEXT: movdqa %xmm1, %xmm5
884 ; SSE2-NEXT: pxor %xmm4, %xmm5
885 ; SSE2-NEXT: pxor %xmm0, %xmm4
886 ; SSE2-NEXT: movdqa %xmm4, %xmm2
887 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
888 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
889 ; SSE2-NEXT: movdqa %xmm4, %xmm6
890 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
891 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
892 ; SSE2-NEXT: pand %xmm6, %xmm3
893 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
894 ; SSE2-NEXT: por %xmm3, %xmm2
895 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1]
896 ; SSE2-NEXT: por %xmm2, %xmm3
897 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
898 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
899 ; SSE2-NEXT: pand %xmm6, %xmm4
900 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
901 ; SSE2-NEXT: por %xmm4, %xmm5
902 ; SSE2-NEXT: movdqa %xmm0, %xmm4
903 ; SSE2-NEXT: pand %xmm5, %xmm4
904 ; SSE2-NEXT: pandn %xmm1, %xmm5
905 ; SSE2-NEXT: por %xmm4, %xmm5
906 ; SSE2-NEXT: movdqa %xmm0, %xmm4
907 ; SSE2-NEXT: pand %xmm2, %xmm4
908 ; SSE2-NEXT: pandn %xmm1, %xmm2
909 ; SSE2-NEXT: por %xmm4, %xmm2
910 ; SSE2-NEXT: psubq %xmm5, %xmm2
911 ; SSE2-NEXT: psrlq $1, %xmm2
912 ; SSE2-NEXT: movdqa %xmm3, %xmm4
913 ; SSE2-NEXT: psrlq $32, %xmm4
914 ; SSE2-NEXT: pmuludq %xmm2, %xmm4
915 ; SSE2-NEXT: movdqa %xmm2, %xmm1
916 ; SSE2-NEXT: psrlq $32, %xmm1
917 ; SSE2-NEXT: pmuludq %xmm3, %xmm1
918 ; SSE2-NEXT: paddq %xmm4, %xmm1
919 ; SSE2-NEXT: psllq $32, %xmm1
920 ; SSE2-NEXT: pmuludq %xmm3, %xmm2
921 ; SSE2-NEXT: paddq %xmm0, %xmm1
922 ; SSE2-NEXT: paddq %xmm2, %xmm1
923 ; SSE2-NEXT: movdqa %xmm1, %xmm0
924 ; SSE2-NEXT: retq
925 ;
926 ; SSE41-LABEL: vec128_i64_signed_reg_reg:
927 ; SSE41: # %bb.0:
928 ; SSE41-NEXT: movdqa %xmm0, %xmm2
929 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
930 ; SSE41-NEXT: movdqa %xmm1, %xmm5
931 ; SSE41-NEXT: pxor %xmm0, %xmm5
932 ; SSE41-NEXT: pxor %xmm2, %xmm0
933 ; SSE41-NEXT: movdqa %xmm0, %xmm3
934 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm3
935 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
936 ; SSE41-NEXT: movdqa %xmm0, %xmm4
937 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
938 ; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
939 ; SSE41-NEXT: pand %xmm7, %xmm6
940 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
941 ; SSE41-NEXT: por %xmm6, %xmm4
942 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1,1]
943 ; SSE41-NEXT: por %xmm4, %xmm3
944 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
945 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
946 ; SSE41-NEXT: pand %xmm7, %xmm0
947 ; SSE41-NEXT: por %xmm5, %xmm0
948 ; SSE41-NEXT: movdqa %xmm1, %xmm5
949 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
950 ; SSE41-NEXT: movdqa %xmm4, %xmm0
951 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
952 ; SSE41-NEXT: psubq %xmm5, %xmm1
953 ; SSE41-NEXT: psrlq $1, %xmm1
954 ; SSE41-NEXT: movdqa %xmm3, %xmm0
955 ; SSE41-NEXT: psrlq $32, %xmm0
956 ; SSE41-NEXT: pmuludq %xmm1, %xmm0
957 ; SSE41-NEXT: movdqa %xmm1, %xmm4
958 ; SSE41-NEXT: psrlq $32, %xmm4
959 ; SSE41-NEXT: pmuludq %xmm3, %xmm4
960 ; SSE41-NEXT: paddq %xmm0, %xmm4
961 ; SSE41-NEXT: psllq $32, %xmm4
962 ; SSE41-NEXT: pmuludq %xmm1, %xmm3
963 ; SSE41-NEXT: paddq %xmm2, %xmm4
964 ; SSE41-NEXT: paddq %xmm4, %xmm3
965 ; SSE41-NEXT: movdqa %xmm3, %xmm0
966 ; SSE41-NEXT: retq
967 ;
968 ; AVX1-FALLBACK-LABEL: vec128_i64_signed_reg_reg:
969 ; AVX1-FALLBACK: # %bb.0:
970 ; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
971 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm3
972 ; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm4
973 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
974 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
975 ; AVX1-FALLBACK-NEXT: vpsubq %xmm4, %xmm1, %xmm1
976 ; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm1
977 ; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm2
978 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
979 ; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm1, %xmm4
980 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
981 ; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2
982 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
983 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
984 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
985 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
986 ; AVX1-FALLBACK-NEXT: retq
987 ;
988 ; AVX2-FALLBACK-LABEL: vec128_i64_signed_reg_reg:
989 ; AVX2-FALLBACK: # %bb.0:
990 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
991 ; AVX2-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm3
992 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm4
993 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
994 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
995 ; AVX2-FALLBACK-NEXT: vpsubq %xmm4, %xmm1, %xmm1
996 ; AVX2-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm1
997 ; AVX2-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm2
998 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
999 ; AVX2-FALLBACK-NEXT: vpsrlq $32, %xmm1, %xmm4
1000 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1001 ; AVX2-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1002 ; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
1003 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1004 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1005 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1006 ; AVX2-FALLBACK-NEXT: retq
1007 ;
1008 ; XOP-LABEL: vec128_i64_signed_reg_reg:
1009 ; XOP: # %bb.0:
1010 ; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
1011 ; XOP-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm3
1012 ; XOP-NEXT: vpcomltq %xmm1, %xmm0, %xmm4
1013 ; XOP-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
1014 ; XOP-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
1015 ; XOP-NEXT: vpsubq %xmm4, %xmm1, %xmm1
1016 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm1
1017 ; XOP-NEXT: vpsrlq $32, %xmm3, %xmm2
1018 ; XOP-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
1019 ; XOP-NEXT: vpsrlq $32, %xmm1, %xmm4
1020 ; XOP-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1021 ; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1022 ; XOP-NEXT: vpsllq $32, %xmm2, %xmm2
1023 ; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1024 ; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1025 ; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1026 ; XOP-NEXT: retq
1027 ;
1028 ; AVX512F-LABEL: vec128_i64_signed_reg_reg:
1029 ; AVX512F: # %bb.0:
1030 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1031 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1032 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
1033 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1034 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1035 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
1036 ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
1037 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
1038 ; AVX512F-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1039 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1
1040 ; AVX512F-NEXT: vpsrlq $32, %xmm1, %xmm2
1041 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
1042 ; AVX512F-NEXT: vpsrlq $32, %xmm3, %xmm4
1043 ; AVX512F-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
1044 ; AVX512F-NEXT: vpaddq %xmm2, %xmm4, %xmm2
1045 ; AVX512F-NEXT: vpsllq $32, %xmm2, %xmm2
1046 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1047 ; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1048 ; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1049 ; AVX512F-NEXT: vzeroupper
1050 ; AVX512F-NEXT: retq
1051 ;
1052 ; AVX512VL-LABEL: vec128_i64_signed_reg_reg:
1053 ; AVX512VL: # %bb.0:
1054 ; AVX512VL-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
1055 ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1056 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1057 ; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm3 {%k1}
1058 ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm2
1059 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm1
1060 ; AVX512VL-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1061 ; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1
1062 ; AVX512VL-NEXT: vpsrlq $32, %xmm3, %xmm2
1063 ; AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
1064 ; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm4
1065 ; AVX512VL-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1066 ; AVX512VL-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1067 ; AVX512VL-NEXT: vpsllq $32, %xmm2, %xmm2
1068 ; AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1069 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1070 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1071 ; AVX512VL-NEXT: retq
1072 ;
1073 ; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_reg_reg:
1074 ; AVX512BW-FALLBACK: # %bb.0:
1075 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1076 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1077 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
1078 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1079 ; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1080 ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
1081 ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
1082 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
1083 ; AVX512BW-FALLBACK-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1084 ; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm1
1085 ; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm1, %xmm2
1086 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
1087 ; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4
1088 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
1089 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2
1090 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
1091 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1092 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1093 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1094 ; AVX512BW-FALLBACK-NEXT: vzeroupper
1095 ; AVX512BW-FALLBACK-NEXT: retq
1096 %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
1097 %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1>
1098 %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1
1099 %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2
1100 %t7 = sub <2 x i64> %t6, %t5
1101 %t8 = lshr <2 x i64> %t7, <i64 1, i64 1>
1102 %t9 = mul nsw <2 x i64> %t8, %t4 ; signed
1103 %a10 = add nsw <2 x i64> %t9, %a1 ; signed
1104 ret <2 x i64> %a10
1105 }
1106
1107 define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwind {
1108 ; SSE2-LABEL: vec128_i64_unsigned_reg_reg:
1109 ; SSE2: # %bb.0:
1110 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
1111 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1112 ; SSE2-NEXT: pxor %xmm4, %xmm5
1113 ; SSE2-NEXT: pxor %xmm0, %xmm4
1114 ; SSE2-NEXT: movdqa %xmm4, %xmm2
1115 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
1116 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
1117 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1118 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
1119 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1120 ; SSE2-NEXT: pand %xmm6, %xmm3
1121 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1122 ; SSE2-NEXT: por %xmm3, %xmm2
1123 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1]
1124 ; SSE2-NEXT: por %xmm2, %xmm3
1125 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
1126 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
1127 ; SSE2-NEXT: pand %xmm6, %xmm4
1128 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1129 ; SSE2-NEXT: por %xmm4, %xmm5
1130 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1131 ; SSE2-NEXT: pand %xmm5, %xmm4
1132 ; SSE2-NEXT: pandn %xmm1, %xmm5
1133 ; SSE2-NEXT: por %xmm4, %xmm5
1134 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1135 ; SSE2-NEXT: pand %xmm2, %xmm4
1136 ; SSE2-NEXT: pandn %xmm1, %xmm2
1137 ; SSE2-NEXT: por %xmm4, %xmm2
1138 ; SSE2-NEXT: psubq %xmm5, %xmm2
1139 ; SSE2-NEXT: psrlq $1, %xmm2
1140 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1141 ; SSE2-NEXT: psrlq $32, %xmm4
1142 ; SSE2-NEXT: pmuludq %xmm2, %xmm4
1143 ; SSE2-NEXT: movdqa %xmm2, %xmm1
1144 ; SSE2-NEXT: psrlq $32, %xmm1
1145 ; SSE2-NEXT: pmuludq %xmm3, %xmm1
1146 ; SSE2-NEXT: paddq %xmm4, %xmm1
1147 ; SSE2-NEXT: psllq $32, %xmm1
1148 ; SSE2-NEXT: pmuludq %xmm3, %xmm2
1149 ; SSE2-NEXT: paddq %xmm0, %xmm1
1150 ; SSE2-NEXT: paddq %xmm2, %xmm1
1151 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1152 ; SSE2-NEXT: retq
1153 ;
1154 ; SSE41-LABEL: vec128_i64_unsigned_reg_reg:
1155 ; SSE41: # %bb.0:
1156 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1157 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
1158 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1159 ; SSE41-NEXT: pxor %xmm0, %xmm5
1160 ; SSE41-NEXT: pxor %xmm2, %xmm0
1161 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1162 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm3
1163 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
1164 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1165 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
1166 ; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
1167 ; SSE41-NEXT: pand %xmm7, %xmm6
1168 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
1169 ; SSE41-NEXT: por %xmm6, %xmm4
1170 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1,1]
1171 ; SSE41-NEXT: por %xmm4, %xmm3
1172 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
1173 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
1174 ; SSE41-NEXT: pand %xmm7, %xmm0
1175 ; SSE41-NEXT: por %xmm5, %xmm0
1176 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1177 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
1178 ; SSE41-NEXT: movdqa %xmm4, %xmm0
1179 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
1180 ; SSE41-NEXT: psubq %xmm5, %xmm1
1181 ; SSE41-NEXT: psrlq $1, %xmm1
1182 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1183 ; SSE41-NEXT: psrlq $32, %xmm0
1184 ; SSE41-NEXT: pmuludq %xmm1, %xmm0
1185 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1186 ; SSE41-NEXT: psrlq $32, %xmm4
1187 ; SSE41-NEXT: pmuludq %xmm3, %xmm4
1188 ; SSE41-NEXT: paddq %xmm0, %xmm4
1189 ; SSE41-NEXT: psllq $32, %xmm4
1190 ; SSE41-NEXT: pmuludq %xmm1, %xmm3
1191 ; SSE41-NEXT: paddq %xmm2, %xmm4
1192 ; SSE41-NEXT: paddq %xmm4, %xmm3
1193 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1194 ; SSE41-NEXT: retq
1195 ;
1196 ; AVX1-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg:
1197 ; AVX1-FALLBACK: # %bb.0:
1198 ; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
1199 ; AVX1-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm3
1200 ; AVX1-FALLBACK-NEXT: vpxor %xmm2, %xmm0, %xmm2
1201 ; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
1202 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm4, %xmm5
1203 ; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
1204 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
1205 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm1
1206 ; AVX1-FALLBACK-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1207 ; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm1
1208 ; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm2
1209 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
1210 ; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm1, %xmm3
1211 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm3, %xmm3
1212 ; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm2, %xmm2
1213 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
1214 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
1215 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1216 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1217 ; AVX1-FALLBACK-NEXT: retq
1218 ;
1219 ; AVX2-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg:
1220 ; AVX2-FALLBACK: # %bb.0:
1221 ; AVX2-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
1222 ; AVX2-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm3
1223 ; AVX2-FALLBACK-NEXT: vpxor %xmm2, %xmm0, %xmm2
1224 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
1225 ; AVX2-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm4, %xmm5
1226 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
1227 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
1228 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm1
1229 ; AVX2-FALLBACK-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1230 ; AVX2-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm1
1231 ; AVX2-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm2
1232 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
1233 ; AVX2-FALLBACK-NEXT: vpsrlq $32, %xmm1, %xmm3
1234 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm5, %xmm3, %xmm3
1235 ; AVX2-FALLBACK-NEXT: vpaddq %xmm3, %xmm2, %xmm2
1236 ; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
1237 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
1238 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1239 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1240 ; AVX2-FALLBACK-NEXT: retq
1241 ;
1242 ; XOP-LABEL: vec128_i64_unsigned_reg_reg:
1243 ; XOP: # %bb.0:
1244 ; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm2
1245 ; XOP-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm3
1246 ; XOP-NEXT: vpcomltuq %xmm1, %xmm0, %xmm4
1247 ; XOP-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
1248 ; XOP-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
1249 ; XOP-NEXT: vpsubq %xmm4, %xmm1, %xmm1
1250 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm1
1251 ; XOP-NEXT: vpsrlq $32, %xmm3, %xmm2
1252 ; XOP-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
1253 ; XOP-NEXT: vpsrlq $32, %xmm1, %xmm4
1254 ; XOP-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1255 ; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1256 ; XOP-NEXT: vpsllq $32, %xmm2, %xmm2
1257 ; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1258 ; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1259 ; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1260 ; XOP-NEXT: retq
1261 ;
1262 ; AVX512F-LABEL: vec128_i64_unsigned_reg_reg:
1263 ; AVX512F: # %bb.0:
1264 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1265 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1266 ; AVX512F-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
1267 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1268 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1269 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
1270 ; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm2
1271 ; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1
1272 ; AVX512F-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1273 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1
1274 ; AVX512F-NEXT: vpsrlq $32, %xmm1, %xmm2
1275 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
1276 ; AVX512F-NEXT: vpsrlq $32, %xmm3, %xmm4
1277 ; AVX512F-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
1278 ; AVX512F-NEXT: vpaddq %xmm2, %xmm4, %xmm2
1279 ; AVX512F-NEXT: vpsllq $32, %xmm2, %xmm2
1280 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1281 ; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1282 ; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1283 ; AVX512F-NEXT: vzeroupper
1284 ; AVX512F-NEXT: retq
1285 ;
1286 ; AVX512VL-LABEL: vec128_i64_unsigned_reg_reg:
1287 ; AVX512VL: # %bb.0:
1288 ; AVX512VL-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
1289 ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1290 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1291 ; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm3 {%k1}
1292 ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm2
1293 ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm1
1294 ; AVX512VL-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1295 ; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1
1296 ; AVX512VL-NEXT: vpsrlq $32, %xmm3, %xmm2
1297 ; AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
1298 ; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm4
1299 ; AVX512VL-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1300 ; AVX512VL-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1301 ; AVX512VL-NEXT: vpsllq $32, %xmm2, %xmm2
1302 ; AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1303 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1304 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1305 ; AVX512VL-NEXT: retq
1306 ;
1307 ; AVX512BW-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg:
1308 ; AVX512BW-FALLBACK: # %bb.0:
1309 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1310 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1311 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
1312 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1313 ; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1314 ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
1315 ; AVX512BW-FALLBACK-NEXT: vpminuq %zmm1, %zmm0, %zmm2
1316 ; AVX512BW-FALLBACK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1
1317 ; AVX512BW-FALLBACK-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1318 ; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm1
1319 ; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm1, %xmm2
1320 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
1321 ; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4
1322 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
1323 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2
1324 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
1325 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1326 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1327 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1328 ; AVX512BW-FALLBACK-NEXT: vzeroupper
1329 ; AVX512BW-FALLBACK-NEXT: retq
1330 %t3 = icmp ugt <2 x i64> %a1, %a2
1331 %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1>
1332 %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1
1333 %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2
1334 %t7 = sub <2 x i64> %t6, %t5
1335 %t8 = lshr <2 x i64> %t7, <i64 1, i64 1>
1336 %t9 = mul <2 x i64> %t8, %t4
1337 %a10 = add <2 x i64> %t9, %a1
1338 ret <2 x i64> %a10
1339 }
1340
1341 ; Values are loaded. Only check signed case.
1342
1343 define <2 x i64> @vec128_i64_signed_mem_reg(<2 x i64>* %a1_addr, <2 x i64> %a2) nounwind {
1344 ; SSE2-LABEL: vec128_i64_signed_mem_reg:
1345 ; SSE2: # %bb.0:
1346 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1347 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
1348 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1349 ; SSE2-NEXT: pxor %xmm4, %xmm5
1350 ; SSE2-NEXT: pxor %xmm1, %xmm4
1351 ; SSE2-NEXT: movdqa %xmm4, %xmm2
1352 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
1353 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
1354 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1355 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
1356 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1357 ; SSE2-NEXT: pand %xmm6, %xmm3
1358 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1359 ; SSE2-NEXT: por %xmm3, %xmm2
1360 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1]
1361 ; SSE2-NEXT: por %xmm2, %xmm3
1362 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
1363 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
1364 ; SSE2-NEXT: pand %xmm6, %xmm4
1365 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1366 ; SSE2-NEXT: por %xmm4, %xmm5
1367 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1368 ; SSE2-NEXT: pand %xmm5, %xmm4
1369 ; SSE2-NEXT: pandn %xmm0, %xmm5
1370 ; SSE2-NEXT: por %xmm4, %xmm5
1371 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1372 ; SSE2-NEXT: pand %xmm2, %xmm4
1373 ; SSE2-NEXT: pandn %xmm0, %xmm2
1374 ; SSE2-NEXT: por %xmm4, %xmm2
1375 ; SSE2-NEXT: psubq %xmm5, %xmm2
1376 ; SSE2-NEXT: psrlq $1, %xmm2
1377 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1378 ; SSE2-NEXT: psrlq $32, %xmm4
1379 ; SSE2-NEXT: pmuludq %xmm2, %xmm4
1380 ; SSE2-NEXT: movdqa %xmm2, %xmm0
1381 ; SSE2-NEXT: psrlq $32, %xmm0
1382 ; SSE2-NEXT: pmuludq %xmm3, %xmm0
1383 ; SSE2-NEXT: paddq %xmm4, %xmm0
1384 ; SSE2-NEXT: psllq $32, %xmm0
1385 ; SSE2-NEXT: pmuludq %xmm3, %xmm2
1386 ; SSE2-NEXT: paddq %xmm1, %xmm0
1387 ; SSE2-NEXT: paddq %xmm2, %xmm0
1388 ; SSE2-NEXT: retq
1389 ;
1390 ; SSE41-LABEL: vec128_i64_signed_mem_reg:
1391 ; SSE41: # %bb.0:
1392 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1393 ; SSE41-NEXT: movdqa (%rdi), %xmm3
1394 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
1395 ; SSE41-NEXT: pxor %xmm5, %xmm0
1396 ; SSE41-NEXT: pxor %xmm3, %xmm5
1397 ; SSE41-NEXT: movdqa %xmm5, %xmm2
1398 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
1399 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
1400 ; SSE41-NEXT: movdqa %xmm5, %xmm6
1401 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
1402 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1403 ; SSE41-NEXT: pand %xmm6, %xmm4
1404 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1405 ; SSE41-NEXT: por %xmm4, %xmm2
1406 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
1407 ; SSE41-NEXT: por %xmm2, %xmm4
1408 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
1409 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
1410 ; SSE41-NEXT: pand %xmm6, %xmm5
1411 ; SSE41-NEXT: por %xmm5, %xmm0
1412 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1413 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
1414 ; SSE41-NEXT: movdqa %xmm2, %xmm0
1415 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
1416 ; SSE41-NEXT: psubq %xmm5, %xmm1
1417 ; SSE41-NEXT: psrlq $1, %xmm1
1418 ; SSE41-NEXT: movdqa %xmm4, %xmm2
1419 ; SSE41-NEXT: psrlq $32, %xmm2
1420 ; SSE41-NEXT: pmuludq %xmm1, %xmm2
1421 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1422 ; SSE41-NEXT: psrlq $32, %xmm0
1423 ; SSE41-NEXT: pmuludq %xmm4, %xmm0
1424 ; SSE41-NEXT: paddq %xmm2, %xmm0
1425 ; SSE41-NEXT: psllq $32, %xmm0
1426 ; SSE41-NEXT: pmuludq %xmm4, %xmm1
1427 ; SSE41-NEXT: paddq %xmm3, %xmm0
1428 ; SSE41-NEXT: paddq %xmm1, %xmm0
1429 ; SSE41-NEXT: retq
1430 ;
1431 ; AVX1-FALLBACK-LABEL: vec128_i64_signed_mem_reg:
1432 ; AVX1-FALLBACK: # %bb.0:
1433 ; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
1434 ; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
1435 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm3
1436 ; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm4
1437 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm4, %xmm1, %xmm0, %xmm4
1438 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1439 ; AVX1-FALLBACK-NEXT: vpsubq %xmm4, %xmm0, %xmm0
1440 ; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm0, %xmm0
1441 ; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm2
1442 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
1443 ; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm0, %xmm4
1444 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1445 ; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1446 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
1447 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
1448 ; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm2, %xmm1
1449 ; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
1450 ; AVX1-FALLBACK-NEXT: retq
1451 ;
1452 ; AVX2-FALLBACK-LABEL: vec128_i64_signed_mem_reg:
1453 ; AVX2-FALLBACK: # %bb.0:
1454 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
1455 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
1456 ; AVX2-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm3
1457 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm4
1458 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm4, %xmm1, %xmm0, %xmm4
1459 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1460 ; AVX2-FALLBACK-NEXT: vpsubq %xmm4, %xmm0, %xmm0
1461 ; AVX2-FALLBACK-NEXT: vpsrlq $1, %xmm0, %xmm0
1462 ; AVX2-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm2
1463 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
1464 ; AVX2-FALLBACK-NEXT: vpsrlq $32, %xmm0, %xmm4
1465 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1466 ; AVX2-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1467 ; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
1468 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
1469 ; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm2, %xmm1
1470 ; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
1471 ; AVX2-FALLBACK-NEXT: retq
1472 ;
1473 ; XOP-LABEL: vec128_i64_signed_mem_reg:
1474 ; XOP: # %bb.0:
1475 ; XOP-NEXT: vmovdqa (%rdi), %xmm1
1476 ; XOP-NEXT: vpcomgtq %xmm0, %xmm1, %xmm2
1477 ; XOP-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm3
1478 ; XOP-NEXT: vpcomltq %xmm0, %xmm1, %xmm4
1479 ; XOP-NEXT: vblendvpd %xmm4, %xmm1, %xmm0, %xmm4
1480 ; XOP-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1481 ; XOP-NEXT: vpsubq %xmm4, %xmm0, %xmm0
1482 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm0
1483 ; XOP-NEXT: vpsrlq $32, %xmm3, %xmm2
1484 ; XOP-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
1485 ; XOP-NEXT: vpsrlq $32, %xmm0, %xmm4
1486 ; XOP-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1487 ; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1488 ; XOP-NEXT: vpsllq $32, %xmm2, %xmm2
1489 ; XOP-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
1490 ; XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1
1491 ; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
1492 ; XOP-NEXT: retq
1493 ;
1494 ; AVX512F-LABEL: vec128_i64_signed_mem_reg:
1495 ; AVX512F: # %bb.0:
1496 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1497 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1498 ; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
1499 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1500 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1501 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
1502 ; AVX512F-NEXT: vpminsq %zmm0, %zmm1, %zmm2
1503 ; AVX512F-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
1504 ; AVX512F-NEXT: vpsubq %xmm2, %xmm0, %xmm0
1505 ; AVX512F-NEXT: vpsrlq $1, %xmm0, %xmm0
1506 ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm2
1507 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
1508 ; AVX512F-NEXT: vpsrlq $32, %xmm3, %xmm4
1509 ; AVX512F-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
1510 ; AVX512F-NEXT: vpaddq %xmm2, %xmm4, %xmm2
1511 ; AVX512F-NEXT: vpsllq $32, %xmm2, %xmm2
1512 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
1513 ; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1
1514 ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
1515 ; AVX512F-NEXT: vzeroupper
1516 ; AVX512F-NEXT: retq
1517 ;
1518 ; AVX512VL-LABEL: vec128_i64_signed_mem_reg:
1519 ; AVX512VL: # %bb.0:
1520 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm1
1521 ; AVX512VL-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
1522 ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1523 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1524 ; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm3 {%k1}
1525 ; AVX512VL-NEXT: vpminsq %xmm0, %xmm1, %xmm2
1526 ; AVX512VL-NEXT: vpmaxsq %xmm0, %xmm1, %xmm0
1527 ; AVX512VL-NEXT: vpsubq %xmm2, %xmm0, %xmm0
1528 ; AVX512VL-NEXT: vpsrlq $1, %xmm0, %xmm0
1529 ; AVX512VL-NEXT: vpsrlq $32, %xmm3, %xmm2
1530 ; AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
1531 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm4
1532 ; AVX512VL-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1533 ; AVX512VL-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1534 ; AVX512VL-NEXT: vpsllq $32, %xmm2, %xmm2
1535 ; AVX512VL-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
1536 ; AVX512VL-NEXT: vpaddq %xmm1, %xmm2, %xmm1
1537 ; AVX512VL-NEXT: vpaddq %xmm1, %xmm0, %xmm0
1538 ; AVX512VL-NEXT: retq
1539 ;
1540 ; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_mem_reg:
1541 ; AVX512BW-FALLBACK: # %bb.0:
1542 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1543 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
1544 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
1545 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1546 ; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1547 ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
1548 ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm0, %zmm1, %zmm2
1549 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
1550 ; AVX512BW-FALLBACK-NEXT: vpsubq %xmm2, %xmm0, %xmm0
1551 ; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %xmm0, %xmm0
1552 ; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm0, %xmm2
1553 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
1554 ; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4
1555 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
1556 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2
1557 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
1558 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
1559 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm2, %xmm1
1560 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
1561 ; AVX512BW-FALLBACK-NEXT: vzeroupper
1562 ; AVX512BW-FALLBACK-NEXT: retq
1563 %a1 = load <2 x i64>, <2 x i64>* %a1_addr
1564 %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
1565 %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1>
1566 %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1
1567 %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2
1568 %t7 = sub <2 x i64> %t6, %t5
1569 %t8 = lshr <2 x i64> %t7, <i64 1, i64 1>
1570 %t9 = mul nsw <2 x i64> %t8, %t4 ; signed
1571 %a10 = add nsw <2 x i64> %t9, %a1 ; signed
1572 ret <2 x i64> %a10
1573 }
1574
1575 define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, <2 x i64>* %a2_addr) nounwind {
1576 ; SSE2-LABEL: vec128_i64_signed_reg_mem:
1577 ; SSE2: # %bb.0:
1578 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1579 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
1580 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1581 ; SSE2-NEXT: pxor %xmm4, %xmm5
1582 ; SSE2-NEXT: pxor %xmm1, %xmm4
1583 ; SSE2-NEXT: movdqa %xmm5, %xmm2
1584 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
1585 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
1586 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1587 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm6
1588 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1589 ; SSE2-NEXT: pand %xmm6, %xmm3
1590 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1591 ; SSE2-NEXT: por %xmm3, %xmm2
1592 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1]
1593 ; SSE2-NEXT: por %xmm2, %xmm3
1594 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
1595 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1596 ; SSE2-NEXT: pand %xmm6, %xmm5
1597 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1598 ; SSE2-NEXT: por %xmm5, %xmm4
1599 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1600 ; SSE2-NEXT: pand %xmm4, %xmm5
1601 ; SSE2-NEXT: pandn %xmm1, %xmm4
1602 ; SSE2-NEXT: por %xmm5, %xmm4
1603 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1604 ; SSE2-NEXT: pand %xmm2, %xmm5
1605 ; SSE2-NEXT: pandn %xmm1, %xmm2
1606 ; SSE2-NEXT: por %xmm5, %xmm2
1607 ; SSE2-NEXT: psubq %xmm4, %xmm2
1608 ; SSE2-NEXT: psrlq $1, %xmm2
1609 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1610 ; SSE2-NEXT: psrlq $32, %xmm4
1611 ; SSE2-NEXT: pmuludq %xmm2, %xmm4
1612 ; SSE2-NEXT: movdqa %xmm2, %xmm1
1613 ; SSE2-NEXT: psrlq $32, %xmm1
1614 ; SSE2-NEXT: pmuludq %xmm3, %xmm1
1615 ; SSE2-NEXT: paddq %xmm4, %xmm1
1616 ; SSE2-NEXT: psllq $32, %xmm1
1617 ; SSE2-NEXT: pmuludq %xmm3, %xmm2
1618 ; SSE2-NEXT: paddq %xmm0, %xmm1
1619 ; SSE2-NEXT: paddq %xmm2, %xmm1
1620 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1621 ; SSE2-NEXT: retq
1622 ;
1623 ; SSE41-LABEL: vec128_i64_signed_reg_mem:
1624 ; SSE41: # %bb.0:
1625 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1626 ; SSE41-NEXT: movdqa (%rdi), %xmm3
1627 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
1628 ; SSE41-NEXT: pxor %xmm5, %xmm0
1629 ; SSE41-NEXT: pxor %xmm3, %xmm5
1630 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1631 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm2
1632 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
1633 ; SSE41-NEXT: movdqa %xmm0, %xmm6
1634 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
1635 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1636 ; SSE41-NEXT: pand %xmm6, %xmm4
1637 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1638 ; SSE41-NEXT: por %xmm4, %xmm2
1639 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
1640 ; SSE41-NEXT: por %xmm2, %xmm4
1641 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
1642 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
1643 ; SSE41-NEXT: pand %xmm6, %xmm0
1644 ; SSE41-NEXT: por %xmm5, %xmm0
1645 ; SSE41-NEXT: movdqa %xmm3, %xmm5
1646 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
1647 ; SSE41-NEXT: movdqa %xmm2, %xmm0
1648 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
1649 ; SSE41-NEXT: psubq %xmm5, %xmm3
1650 ; SSE41-NEXT: psrlq $1, %xmm3
1651 ; SSE41-NEXT: movdqa %xmm4, %xmm2
1652 ; SSE41-NEXT: psrlq $32, %xmm2
1653 ; SSE41-NEXT: pmuludq %xmm3, %xmm2
1654 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1655 ; SSE41-NEXT: psrlq $32, %xmm0
1656 ; SSE41-NEXT: pmuludq %xmm4, %xmm0
1657 ; SSE41-NEXT: paddq %xmm2, %xmm0
1658 ; SSE41-NEXT: psllq $32, %xmm0
1659 ; SSE41-NEXT: pmuludq %xmm4, %xmm3
1660 ; SSE41-NEXT: paddq %xmm1, %xmm0
1661 ; SSE41-NEXT: paddq %xmm3, %xmm0
1662 ; SSE41-NEXT: retq
1663 ;
1664 ; AVX1-FALLBACK-LABEL: vec128_i64_signed_reg_mem:
1665 ; AVX1-FALLBACK: # %bb.0:
1666 ; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
1667 ; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
1668 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm3
1669 ; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm4
1670 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
1671 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
1672 ; AVX1-FALLBACK-NEXT: vpsubq %xmm4, %xmm1, %xmm1
1673 ; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm1
1674 ; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm2
1675 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
1676 ; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm1, %xmm4
1677 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1678 ; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1679 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
1680 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1681 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1682 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1683 ; AVX1-FALLBACK-NEXT: retq
1684 ;
1685 ; AVX2-FALLBACK-LABEL: vec128_i64_signed_reg_mem:
1686 ; AVX2-FALLBACK: # %bb.0:
1687 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
1688 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
1689 ; AVX2-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm3
1690 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm4
1691 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
1692 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
1693 ; AVX2-FALLBACK-NEXT: vpsubq %xmm4, %xmm1, %xmm1
1694 ; AVX2-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm1
1695 ; AVX2-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm2
1696 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
1697 ; AVX2-FALLBACK-NEXT: vpsrlq $32, %xmm1, %xmm4
1698 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1699 ; AVX2-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1700 ; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
1701 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1702 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1703 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1704 ; AVX2-FALLBACK-NEXT: retq
1705 ;
1706 ; XOP-LABEL: vec128_i64_signed_reg_mem:
1707 ; XOP: # %bb.0:
1708 ; XOP-NEXT: vmovdqa (%rdi), %xmm1
1709 ; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
1710 ; XOP-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm3
1711 ; XOP-NEXT: vpcomltq %xmm1, %xmm0, %xmm4
1712 ; XOP-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
1713 ; XOP-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
1714 ; XOP-NEXT: vpsubq %xmm4, %xmm1, %xmm1
1715 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm1
1716 ; XOP-NEXT: vpsrlq $32, %xmm3, %xmm2
1717 ; XOP-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
1718 ; XOP-NEXT: vpsrlq $32, %xmm1, %xmm4
1719 ; XOP-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1720 ; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1721 ; XOP-NEXT: vpsllq $32, %xmm2, %xmm2
1722 ; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1723 ; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1724 ; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1725 ; XOP-NEXT: retq
1726 ;
1727 ; AVX512F-LABEL: vec128_i64_signed_reg_mem:
1728 ; AVX512F: # %bb.0:
1729 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1730 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1731 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
1732 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1733 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1734 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
1735 ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
1736 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
1737 ; AVX512F-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1738 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1
1739 ; AVX512F-NEXT: vpsrlq $32, %xmm1, %xmm2
1740 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
1741 ; AVX512F-NEXT: vpsrlq $32, %xmm3, %xmm4
1742 ; AVX512F-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
1743 ; AVX512F-NEXT: vpaddq %xmm2, %xmm4, %xmm2
1744 ; AVX512F-NEXT: vpsllq $32, %xmm2, %xmm2
1745 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1746 ; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1747 ; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1748 ; AVX512F-NEXT: vzeroupper
1749 ; AVX512F-NEXT: retq
1750 ;
1751 ; AVX512VL-LABEL: vec128_i64_signed_reg_mem:
1752 ; AVX512VL: # %bb.0:
1753 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm1
1754 ; AVX512VL-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
1755 ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1756 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1757 ; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm3 {%k1}
1758 ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm2
1759 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm1
1760 ; AVX512VL-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1761 ; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1
1762 ; AVX512VL-NEXT: vpsrlq $32, %xmm3, %xmm2
1763 ; AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
1764 ; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm4
1765 ; AVX512VL-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1766 ; AVX512VL-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1767 ; AVX512VL-NEXT: vpsllq $32, %xmm2, %xmm2
1768 ; AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1769 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1770 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1771 ; AVX512VL-NEXT: retq
1772 ;
1773 ; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_reg_mem:
1774 ; AVX512BW-FALLBACK: # %bb.0:
1775 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1776 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
1777 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
1778 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1779 ; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1780 ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
1781 ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
1782 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
1783 ; AVX512BW-FALLBACK-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1784 ; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm1
1785 ; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm1, %xmm2
1786 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
1787 ; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4
1788 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
1789 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2
1790 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
1791 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1792 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1793 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1794 ; AVX512BW-FALLBACK-NEXT: vzeroupper
1795 ; AVX512BW-FALLBACK-NEXT: retq
1796 %a2 = load <2 x i64>, <2 x i64>* %a2_addr
1797 %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
1798 %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1>
1799 %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1
1800 %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2
1801 %t7 = sub <2 x i64> %t6, %t5
1802 %t8 = lshr <2 x i64> %t7, <i64 1, i64 1>
1803 %t9 = mul nsw <2 x i64> %t8, %t4 ; signed
1804 %a10 = add nsw <2 x i64> %t9, %a1 ; signed
1805 ret <2 x i64> %a10
1806 }
1807
1808 define <2 x i64> @vec128_i64_signed_mem_mem(<2 x i64>* %a1_addr, <2 x i64>* %a2_addr) nounwind {
1809 ; SSE2-LABEL: vec128_i64_signed_mem_mem:
1810 ; SSE2: # %bb.0:
1811 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1812 ; SSE2-NEXT: movdqa (%rsi), %xmm0
1813 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
1814 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1815 ; SSE2-NEXT: pxor %xmm4, %xmm5
1816 ; SSE2-NEXT: pxor %xmm1, %xmm4
1817 ; SSE2-NEXT: movdqa %xmm4, %xmm2
1818 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
1819 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
1820 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1821 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
1822 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1823 ; SSE2-NEXT: pand %xmm6, %xmm3
1824 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1825 ; SSE2-NEXT: por %xmm3, %xmm2
1826 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1]
1827 ; SSE2-NEXT: por %xmm2, %xmm3
1828 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
1829 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
1830 ; SSE2-NEXT: pand %xmm6, %xmm4
1831 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1832 ; SSE2-NEXT: por %xmm4, %xmm5
1833 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1834 ; SSE2-NEXT: pand %xmm5, %xmm4
1835 ; SSE2-NEXT: pandn %xmm0, %xmm5
1836 ; SSE2-NEXT: por %xmm4, %xmm5
1837 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1838 ; SSE2-NEXT: pand %xmm2, %xmm4
1839 ; SSE2-NEXT: pandn %xmm0, %xmm2
1840 ; SSE2-NEXT: por %xmm4, %xmm2
1841 ; SSE2-NEXT: psubq %xmm5, %xmm2
1842 ; SSE2-NEXT: psrlq $1, %xmm2
1843 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1844 ; SSE2-NEXT: psrlq $32, %xmm4
1845 ; SSE2-NEXT: pmuludq %xmm2, %xmm4
1846 ; SSE2-NEXT: movdqa %xmm2, %xmm0
1847 ; SSE2-NEXT: psrlq $32, %xmm0
1848 ; SSE2-NEXT: pmuludq %xmm3, %xmm0
1849 ; SSE2-NEXT: paddq %xmm4, %xmm0
1850 ; SSE2-NEXT: psllq $32, %xmm0
1851 ; SSE2-NEXT: pmuludq %xmm3, %xmm2
1852 ; SSE2-NEXT: paddq %xmm1, %xmm0
1853 ; SSE2-NEXT: paddq %xmm2, %xmm0
1854 ; SSE2-NEXT: retq
1855 ;
1856 ; SSE41-LABEL: vec128_i64_signed_mem_mem:
1857 ; SSE41: # %bb.0:
1858 ; SSE41-NEXT: movdqa (%rdi), %xmm3
1859 ; SSE41-NEXT: movdqa (%rsi), %xmm2
1860 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
1861 ; SSE41-NEXT: movdqa %xmm2, %xmm5
1862 ; SSE41-NEXT: pxor %xmm0, %xmm5
1863 ; SSE41-NEXT: pxor %xmm3, %xmm0
1864 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1865 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm1
1866 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
1867 ; SSE41-NEXT: movdqa %xmm0, %xmm6
1868 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
1869 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1870 ; SSE41-NEXT: pand %xmm6, %xmm4
1871 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1872 ; SSE41-NEXT: por %xmm4, %xmm1
1873 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
1874 ; SSE41-NEXT: por %xmm1, %xmm4
1875 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
1876 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
1877 ; SSE41-NEXT: pand %xmm6, %xmm0
1878 ; SSE41-NEXT: por %xmm5, %xmm0
1879 ; SSE41-NEXT: movdqa %xmm2, %xmm5
1880 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
1881 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1882 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
1883 ; SSE41-NEXT: psubq %xmm5, %xmm2
1884 ; SSE41-NEXT: psrlq $1, %xmm2
1885 ; SSE41-NEXT: movdqa %xmm4, %xmm1
1886 ; SSE41-NEXT: psrlq $32, %xmm1
1887 ; SSE41-NEXT: pmuludq %xmm2, %xmm1
1888 ; SSE41-NEXT: movdqa %xmm2, %xmm0
1889 ; SSE41-NEXT: psrlq $32, %xmm0
1890 ; SSE41-NEXT: pmuludq %xmm4, %xmm0
1891 ; SSE41-NEXT: paddq %xmm1, %xmm0
1892 ; SSE41-NEXT: psllq $32, %xmm0
1893 ; SSE41-NEXT: pmuludq %xmm4, %xmm2
1894 ; SSE41-NEXT: paddq %xmm3, %xmm0
1895 ; SSE41-NEXT: paddq %xmm2, %xmm0
1896 ; SSE41-NEXT: retq
1897 ;
1898 ; AVX1-FALLBACK-LABEL: vec128_i64_signed_mem_mem:
1899 ; AVX1-FALLBACK: # %bb.0:
1900 ; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
1901 ; AVX1-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
1902 ; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
1903 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm3
1904 ; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm4
1905 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
1906 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
1907 ; AVX1-FALLBACK-NEXT: vpsubq %xmm4, %xmm1, %xmm1
1908 ; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm1
1909 ; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm2
1910 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
1911 ; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm1, %xmm4
1912 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1913 ; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1914 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
1915 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1916 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1917 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1918 ; AVX1-FALLBACK-NEXT: retq
1919 ;
1920 ; AVX2-FALLBACK-LABEL: vec128_i64_signed_mem_mem:
1921 ; AVX2-FALLBACK: # %bb.0:
1922 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
1923 ; AVX2-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
1924 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
1925 ; AVX2-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm3
1926 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm4
1927 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
1928 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
1929 ; AVX2-FALLBACK-NEXT: vpsubq %xmm4, %xmm1, %xmm1
1930 ; AVX2-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm1
1931 ; AVX2-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm2
1932 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
1933 ; AVX2-FALLBACK-NEXT: vpsrlq $32, %xmm1, %xmm4
1934 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1935 ; AVX2-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1936 ; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
1937 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1938 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1939 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1940 ; AVX2-FALLBACK-NEXT: retq
1941 ;
1942 ; XOP-LABEL: vec128_i64_signed_mem_mem:
1943 ; XOP: # %bb.0:
1944 ; XOP-NEXT: vmovdqa (%rdi), %xmm0
1945 ; XOP-NEXT: vmovdqa (%rsi), %xmm1
1946 ; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
1947 ; XOP-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm3
1948 ; XOP-NEXT: vpcomltq %xmm1, %xmm0, %xmm4
1949 ; XOP-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
1950 ; XOP-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
1951 ; XOP-NEXT: vpsubq %xmm4, %xmm1, %xmm1
1952 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm1
1953 ; XOP-NEXT: vpsrlq $32, %xmm3, %xmm2
1954 ; XOP-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
1955 ; XOP-NEXT: vpsrlq $32, %xmm1, %xmm4
1956 ; XOP-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
1957 ; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1958 ; XOP-NEXT: vpsllq $32, %xmm2, %xmm2
1959 ; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1960 ; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1961 ; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1962 ; XOP-NEXT: retq
1963 ;
1964 ; AVX512F-LABEL: vec128_i64_signed_mem_mem:
1965 ; AVX512F: # %bb.0:
1966 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1967 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm1
1968 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
1969 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1970 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1971 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
1972 ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
1973 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
1974 ; AVX512F-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1975 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1
1976 ; AVX512F-NEXT: vpsrlq $32, %xmm1, %xmm2
1977 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
1978 ; AVX512F-NEXT: vpsrlq $32, %xmm3, %xmm4
1979 ; AVX512F-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
1980 ; AVX512F-NEXT: vpaddq %xmm2, %xmm4, %xmm2
1981 ; AVX512F-NEXT: vpsllq $32, %xmm2, %xmm2
1982 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1983 ; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
1984 ; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1985 ; AVX512F-NEXT: vzeroupper
1986 ; AVX512F-NEXT: retq
1987 ;
1988 ; AVX512VL-LABEL: vec128_i64_signed_mem_mem:
1989 ; AVX512VL: # %bb.0:
1990 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
1991 ; AVX512VL-NEXT: vmovdqa (%rsi), %xmm1
1992 ; AVX512VL-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
1993 ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1994 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
1995 ; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm3 {%k1}
1996 ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm2
1997 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm1
1998 ; AVX512VL-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1999 ; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1
2000 ; AVX512VL-NEXT: vpsrlq $32, %xmm3, %xmm2
2001 ; AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
2002 ; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm4
2003 ; AVX512VL-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
2004 ; AVX512VL-NEXT: vpaddq %xmm4, %xmm2, %xmm2
2005 ; AVX512VL-NEXT: vpsllq $32, %xmm2, %xmm2
2006 ; AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
2007 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0
2008 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2009 ; AVX512VL-NEXT: retq
2010 ;
2011 ; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_mem_mem:
2012 ; AVX512BW-FALLBACK: # %bb.0:
2013 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
2014 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
2015 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
2016 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
2017 ; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
2018 ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
2019 ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
2020 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
2021 ; AVX512BW-FALLBACK-NEXT: vpsubq %xmm2, %xmm1, %xmm1
2022 ; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm1
2023 ; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm1, %xmm2
2024 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
2025 ; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4
2026 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
2027 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2
2028 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
2029 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
2030 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
2031 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2032 ; AVX512BW-FALLBACK-NEXT: vzeroupper
2033 ; AVX512BW-FALLBACK-NEXT: retq
2034 %a1 = load <2 x i64>, <2 x i64>* %a1_addr
2035 %a2 = load <2 x i64>, <2 x i64>* %a2_addr
2036 %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
2037 %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1>
2038 %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1
2039 %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2
2040 %t7 = sub <2 x i64> %t6, %t5
2041 %t8 = lshr <2 x i64> %t7, <i64 1, i64 1>
2042 %t9 = mul nsw <2 x i64> %t8, %t4 ; signed
2043 %a10 = add nsw <2 x i64> %t9, %a1 ; signed
2044 ret <2 x i64> %a10
2045 }
2046
2047 ; ---------------------------------------------------------------------------- ;
2048 ; 16-bit width. 128 / 16 = 8 elts.
2049 ; ---------------------------------------------------------------------------- ;
2050
2051 ; Values come from regs
2052
2053 define <8 x i16> @vec128_i16_signed_reg_reg(<8 x i16> %a1, <8 x i16> %a2) nounwind {
2054 ; SSE-LABEL: vec128_i16_signed_reg_reg:
2055 ; SSE: # %bb.0:
2056 ; SSE-NEXT: movdqa %xmm0, %xmm2
2057 ; SSE-NEXT: pcmpgtw %xmm1, %xmm2
2058 ; SSE-NEXT: por {{.*}}(%rip), %xmm2
2059 ; SSE-NEXT: movdqa %xmm0, %xmm3
2060 ; SSE-NEXT: pminsw %xmm1, %xmm3
2061 ; SSE-NEXT: pmaxsw %xmm0, %xmm1
2062 ; SSE-NEXT: psubw %xmm3, %xmm1
2063 ; SSE-NEXT: psrlw $1, %xmm1
2064 ; SSE-NEXT: pmullw %xmm1, %xmm2
2065 ; SSE-NEXT: paddw %xmm0, %xmm2
2066 ; SSE-NEXT: movdqa %xmm2, %xmm0
2067 ; SSE-NEXT: retq
2068 ;
2069 ; AVX1-FALLBACK-LABEL: vec128_i16_signed_reg_reg:
2070 ; AVX1-FALLBACK: # %bb.0:
2071 ; AVX1-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
2072 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2073 ; AVX1-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2074 ; AVX1-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2075 ; AVX1-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2076 ; AVX1-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2077 ; AVX1-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2078 ; AVX1-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2079 ; AVX1-FALLBACK-NEXT: retq
2080 ;
2081 ; AVX2-FALLBACK-LABEL: vec128_i16_signed_reg_reg:
2082 ; AVX2-FALLBACK: # %bb.0:
2083 ; AVX2-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
2084 ; AVX2-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2085 ; AVX2-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2086 ; AVX2-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2087 ; AVX2-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2088 ; AVX2-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2089 ; AVX2-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2090 ; AVX2-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2091 ; AVX2-FALLBACK-NEXT: retq
2092 ;
2093 ; XOP-LABEL: vec128_i16_signed_reg_reg:
2094 ; XOP: # %bb.0:
2095 ; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
2096 ; XOP-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2097 ; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2098 ; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2099 ; XOP-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2100 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
2101 ; XOP-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
2102 ; XOP-NEXT: retq
2103 ;
2104 ; AVX512F-LABEL: vec128_i16_signed_reg_reg:
2105 ; AVX512F: # %bb.0:
2106 ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
2107 ; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2108 ; AVX512F-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2109 ; AVX512F-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2110 ; AVX512F-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2111 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
2112 ; AVX512F-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2113 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2114 ; AVX512F-NEXT: retq
2115 ;
2116 ; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_reg_reg:
2117 ; AVX512VL-FALLBACK: # %bb.0:
2118 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
2119 ; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2120 ; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2121 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2122 ; AVX512VL-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2123 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2124 ; AVX512VL-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2125 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2126 ; AVX512VL-FALLBACK-NEXT: retq
2127 ;
2128 ; AVX512BW-FALLBACK-LABEL: vec128_i16_signed_reg_reg:
2129 ; AVX512BW-FALLBACK: # %bb.0:
2130 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2131 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2132 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
2133 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
2134 ; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
2135 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
2136 ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
2137 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2138 ; AVX512BW-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
2139 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2140 ; AVX512BW-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1
2141 ; AVX512BW-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2142 ; AVX512BW-FALLBACK-NEXT: vzeroupper
2143 ; AVX512BW-FALLBACK-NEXT: retq
2144 ;
2145 ; AVX512VLBW-LABEL: vec128_i16_signed_reg_reg:
2146 ; AVX512VLBW: # %bb.0:
2147 ; AVX512VLBW-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
2148 ; AVX512VLBW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
2149 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
2150 ; AVX512VLBW-NEXT: vmovdqu16 %xmm2, %xmm3 {%k1}
2151 ; AVX512VLBW-NEXT: vpminsw %xmm1, %xmm0, %xmm2
2152 ; AVX512VLBW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2153 ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm1, %xmm1
2154 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
2155 ; AVX512VLBW-NEXT: vpmullw %xmm3, %xmm1, %xmm1
2156 ; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2157 ; AVX512VLBW-NEXT: retq
2158 %t3 = icmp sgt <8 x i16> %a1, %a2 ; signed
2159 %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2160 %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
2161 %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
2162 %t7 = sub <8 x i16> %t6, %t5
2163 %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2164 %t9 = mul nsw <8 x i16> %t8, %t4 ; signed
2165 %a10 = add nsw <8 x i16> %t9, %a1 ; signed
2166 ret <8 x i16> %a10
2167 }
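;
; For reference: in every lane the IR above computes
;   a1 + ((a1 > a2 ? -1 : 1) * ((max(a1, a2) - min(a1, a2)) >> 1))
; which is the overflow-safe "midpoint" idiom this file exercises. A minimal
; scalar C++ sketch of one lane (illustrative only, assuming <cstdint>; this
; is not the libstdc++ implementation):
;
;   int16_t midpoint_like(int16_t a, int16_t b) {
;     int16_t sign  = (a > b) ? -1 : 1;            // %t4
;     int16_t lo    = (a > b) ? b : a;             // %t5 (min)
;     int16_t hi    = (a > b) ? a : b;             // %t6 (max)
;     uint16_t half = uint16_t(hi - lo) >> 1;      // %t7, %t8 (logical shift)
;     return int16_t(a + sign * int16_t(half));    // %t9, %a10
;   }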
2168
2169 define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) nounwind {
2170 ; SSE2-LABEL: vec128_i16_unsigned_reg_reg:
2171 ; SSE2: # %bb.0:
2172 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
2173 ; SSE2-NEXT: pxor %xmm3, %xmm1
2174 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2175 ; SSE2-NEXT: pxor %xmm3, %xmm2
2176 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2177 ; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
2178 ; SSE2-NEXT: por {{.*}}(%rip), %xmm4
2179 ; SSE2-NEXT: movdqa %xmm2, %xmm5
2180 ; SSE2-NEXT: pminsw %xmm1, %xmm5
2181 ; SSE2-NEXT: pxor %xmm3, %xmm5
2182 ; SSE2-NEXT: pmaxsw %xmm1, %xmm2
2183 ; SSE2-NEXT: pxor %xmm3, %xmm2
2184 ; SSE2-NEXT: psubw %xmm5, %xmm2
2185 ; SSE2-NEXT: psrlw $1, %xmm2
2186 ; SSE2-NEXT: pmullw %xmm4, %xmm2
2187 ; SSE2-NEXT: paddw %xmm0, %xmm2
2188 ; SSE2-NEXT: movdqa %xmm2, %xmm0
2189 ; SSE2-NEXT: retq
2190 ;
2191 ; SSE41-LABEL: vec128_i16_unsigned_reg_reg:
2192 ; SSE41: # %bb.0:
2193 ; SSE41-NEXT: movdqa %xmm0, %xmm2
2194 ; SSE41-NEXT: pminuw %xmm1, %xmm2
2195 ; SSE41-NEXT: movdqa %xmm0, %xmm3
2196 ; SSE41-NEXT: pcmpeqw %xmm2, %xmm3
2197 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
2198 ; SSE41-NEXT: pxor %xmm3, %xmm4
2199 ; SSE41-NEXT: por {{.*}}(%rip), %xmm4
2200 ; SSE41-NEXT: pmaxuw %xmm0, %xmm1
2201 ; SSE41-NEXT: psubw %xmm2, %xmm1
2202 ; SSE41-NEXT: psrlw $1, %xmm1
2203 ; SSE41-NEXT: pmullw %xmm1, %xmm4
2204 ; SSE41-NEXT: paddw %xmm4, %xmm0
2205 ; SSE41-NEXT: retq
2206 ;
2207 ; AVX1-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg:
2208 ; AVX1-FALLBACK: # %bb.0:
2209 ; AVX1-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2
2210 ; AVX1-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
2211 ; AVX1-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
2212 ; AVX1-FALLBACK-NEXT: vpxor %xmm4, %xmm3, %xmm3
2213 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm3, %xmm3
2214 ; AVX1-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
2215 ; AVX1-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
2216 ; AVX1-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2217 ; AVX1-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1
2218 ; AVX1-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2219 ; AVX1-FALLBACK-NEXT: retq
2220 ;
2221 ; AVX2-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg:
2222 ; AVX2-FALLBACK: # %bb.0:
2223 ; AVX2-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2
2224 ; AVX2-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
2225 ; AVX2-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
2226 ; AVX2-FALLBACK-NEXT: vpxor %xmm4, %xmm3, %xmm3
2227 ; AVX2-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm3, %xmm3
2228 ; AVX2-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
2229 ; AVX2-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
2230 ; AVX2-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2231 ; AVX2-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1
2232 ; AVX2-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2233 ; AVX2-FALLBACK-NEXT: retq
2234 ;
2235 ; XOP-LABEL: vec128_i16_unsigned_reg_reg:
2236 ; XOP: # %bb.0:
2237 ; XOP-NEXT: vpcomgtuw %xmm1, %xmm0, %xmm2
2238 ; XOP-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2239 ; XOP-NEXT: vpminuw %xmm1, %xmm0, %xmm3
2240 ; XOP-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
2241 ; XOP-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2242 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
2243 ; XOP-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
2244 ; XOP-NEXT: retq
2245 ;
2246 ; AVX512F-LABEL: vec128_i16_unsigned_reg_reg:
2247 ; AVX512F: # %bb.0:
2248 ; AVX512F-NEXT: vpminuw %xmm1, %xmm0, %xmm2
2249 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
2250 ; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
2251 ; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm3, %xmm3
2252 ; AVX512F-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
2253 ; AVX512F-NEXT: vpsubw %xmm2, %xmm1, %xmm1
2254 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
2255 ; AVX512F-NEXT: vpmullw %xmm3, %xmm1, %xmm1
2256 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2257 ; AVX512F-NEXT: vzeroupper
2258 ; AVX512F-NEXT: retq
2259 ;
2260 ; AVX512VL-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg:
2261 ; AVX512VL-FALLBACK: # %bb.0:
2262 ; AVX512VL-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2
2263 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
2264 ; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm3, %xmm3, %xmm3
2265 ; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm3, %xmm3
2266 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
2267 ; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
2268 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2269 ; AVX512VL-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1
2270 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2271 ; AVX512VL-FALLBACK-NEXT: retq
2272 ;
2273 ; AVX512BW-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg:
2274 ; AVX512BW-FALLBACK: # %bb.0:
2275 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2276 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2277 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1
2278 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
2279 ; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
2280 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
2281 ; AVX512BW-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2
2282 ; AVX512BW-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
2283 ; AVX512BW-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
2284 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2285 ; AVX512BW-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1
2286 ; AVX512BW-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2287 ; AVX512BW-FALLBACK-NEXT: vzeroupper
2288 ; AVX512BW-FALLBACK-NEXT: retq
2289 ;
2290 ; AVX512VLBW-LABEL: vec128_i16_unsigned_reg_reg:
2291 ; AVX512VLBW: # %bb.0:
2292 ; AVX512VLBW-NEXT: vpcmpnleuw %xmm1, %xmm0, %k1
2293 ; AVX512VLBW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
2294 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
2295 ; AVX512VLBW-NEXT: vmovdqu16 %xmm2, %xmm3 {%k1}
2296 ; AVX512VLBW-NEXT: vpminuw %xmm1, %xmm0, %xmm2
2297 ; AVX512VLBW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
2298 ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm1, %xmm1
2299 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
2300 ; AVX512VLBW-NEXT: vpmullw %xmm3, %xmm1, %xmm1
2301 ; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2302 ; AVX512VLBW-NEXT: retq
2303 %t3 = icmp ugt <8 x i16> %a1, %a2
2304 %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2305 %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
2306 %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
2307 %t7 = sub <8 x i16> %t6, %t5
2308 %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2309 %t9 = mul <8 x i16> %t8, %t4
2310 %a10 = add <8 x i16> %t9, %a1
2311 ret <8 x i16> %a10
2312 }
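;
; Note on the unsigned lowering above: there is no unsigned 16-bit compare or
; min/max before SSE4.1, so the SSE2 version flips the sign bit (pxor with
; 0x8000) and reuses the signed pcmpgtw/pminsw/pmaxsw. With SSE4.1/AVX the
; "a1 > a2" mask is instead derived from pminuw plus an inverted pcmpeqw, and
; AVX-512BW compares straight into a mask register with vpcmpnleuw.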
2313
2314 ; Values are loaded. Only check signed case.
2315
2316 define <8 x i16> @vec128_i16_signed_mem_reg(<8 x i16>* %a1_addr, <8 x i16> %a2) nounwind {
2317 ; SSE-LABEL: vec128_i16_signed_mem_reg:
2318 ; SSE: # %bb.0:
2319 ; SSE-NEXT: movdqa (%rdi), %xmm1
2320 ; SSE-NEXT: movdqa %xmm1, %xmm2
2321 ; SSE-NEXT: pcmpgtw %xmm0, %xmm2
2322 ; SSE-NEXT: por {{.*}}(%rip), %xmm2
2323 ; SSE-NEXT: movdqa %xmm1, %xmm3
2324 ; SSE-NEXT: pminsw %xmm0, %xmm3
2325 ; SSE-NEXT: pmaxsw %xmm1, %xmm0
2326 ; SSE-NEXT: psubw %xmm3, %xmm0
2327 ; SSE-NEXT: psrlw $1, %xmm0
2328 ; SSE-NEXT: pmullw %xmm2, %xmm0
2329 ; SSE-NEXT: paddw %xmm1, %xmm0
2330 ; SSE-NEXT: retq
2331 ;
2332 ; AVX1-FALLBACK-LABEL: vec128_i16_signed_mem_reg:
2333 ; AVX1-FALLBACK: # %bb.0:
2334 ; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
2335 ; AVX1-FALLBACK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm2
2336 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2337 ; AVX1-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm3
2338 ; AVX1-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
2339 ; AVX1-FALLBACK-NEXT: vpsubw %xmm3, %xmm0, %xmm0
2340 ; AVX1-FALLBACK-NEXT: vpsrlw $1, %xmm0, %xmm0
2341 ; AVX1-FALLBACK-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2342 ; AVX1-FALLBACK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2343 ; AVX1-FALLBACK-NEXT: retq
2344 ;
2345 ; AVX2-FALLBACK-LABEL: vec128_i16_signed_mem_reg:
2346 ; AVX2-FALLBACK: # %bb.0:
2347 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
2348 ; AVX2-FALLBACK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm2
2349 ; AVX2-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2350 ; AVX2-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm3
2351 ; AVX2-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
2352 ; AVX2-FALLBACK-NEXT: vpsubw %xmm3, %xmm0, %xmm0
2353 ; AVX2-FALLBACK-NEXT: vpsrlw $1, %xmm0, %xmm0
2354 ; AVX2-FALLBACK-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2355 ; AVX2-FALLBACK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2356 ; AVX2-FALLBACK-NEXT: retq
2357 ;
2358 ; XOP-LABEL: vec128_i16_signed_mem_reg:
2359 ; XOP: # %bb.0:
2360 ; XOP-NEXT: vmovdqa (%rdi), %xmm1
2361 ; XOP-NEXT: vpcomgtw %xmm0, %xmm1, %xmm2
2362 ; XOP-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2363 ; XOP-NEXT: vpminsw %xmm0, %xmm1, %xmm3
2364 ; XOP-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
2365 ; XOP-NEXT: vpsubw %xmm3, %xmm0, %xmm0
2366 ; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0
2367 ; XOP-NEXT: vpmacsww %xmm1, %xmm2, %xmm0, %xmm0
2368 ; XOP-NEXT: retq
2369 ;
2370 ; AVX512F-LABEL: vec128_i16_signed_mem_reg:
2371 ; AVX512F: # %bb.0:
2372 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
2373 ; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm2
2374 ; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2375 ; AVX512F-NEXT: vpminsw %xmm0, %xmm1, %xmm3
2376 ; AVX512F-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
2377 ; AVX512F-NEXT: vpsubw %xmm3, %xmm0, %xmm0
2378 ; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm0
2379 ; AVX512F-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2380 ; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2381 ; AVX512F-NEXT: retq
2382 ;
2383 ; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_mem_reg:
2384 ; AVX512VL-FALLBACK: # %bb.0:
2385 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
2386 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm2
2387 ; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2388 ; AVX512VL-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm3
2389 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
2390 ; AVX512VL-FALLBACK-NEXT: vpsubw %xmm3, %xmm0, %xmm0
2391 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm0, %xmm0
2392 ; AVX512VL-FALLBACK-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2393 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2394 ; AVX512VL-FALLBACK-NEXT: retq
2395 ;
2396 ; AVX512BW-FALLBACK-LABEL: vec128_i16_signed_mem_reg:
2397 ; AVX512BW-FALLBACK: # %bb.0:
2398 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2399 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
2400 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm0, %zmm1, %k1
2401 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
2402 ; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
2403 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
2404 ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm2
2405 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
2406 ; AVX512BW-FALLBACK-NEXT: vpsubw %xmm2, %xmm0, %xmm0
2407 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm0, %xmm0
2408 ; AVX512BW-FALLBACK-NEXT: vpmullw %xmm3, %xmm0, %xmm0
2409 ; AVX512BW-FALLBACK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2410 ; AVX512BW-FALLBACK-NEXT: vzeroupper
2411 ; AVX512BW-FALLBACK-NEXT: retq
2412 ;
2413 ; AVX512VLBW-LABEL: vec128_i16_signed_mem_reg:
2414 ; AVX512VLBW: # %bb.0:
2415 ; AVX512VLBW-NEXT: vmovdqa (%rdi), %xmm1
2416 ; AVX512VLBW-NEXT: vpcmpgtw %xmm0, %xmm1, %k1
2417 ; AVX512VLBW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
2418 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
2419 ; AVX512VLBW-NEXT: vmovdqu16 %xmm2, %xmm3 {%k1}
2420 ; AVX512VLBW-NEXT: vpminsw %xmm0, %xmm1, %xmm2
2421 ; AVX512VLBW-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
2422 ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm0, %xmm0
2423 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm0
2424 ; AVX512VLBW-NEXT: vpmullw %xmm3, %xmm0, %xmm0
2425 ; AVX512VLBW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2426 ; AVX512VLBW-NEXT: retq
2427 %a1 = load <8 x i16>, <8 x i16>* %a1_addr
2428 %t3 = icmp sgt <8 x i16> %a1, %a2 ; signed
2429 %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2430 %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
2431 %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
2432 %t7 = sub <8 x i16> %t6, %t5
2433 %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2434 %t9 = mul nsw <8 x i16> %t8, %t4 ; signed
2435 %a10 = add nsw <8 x i16> %t9, %a1 ; signed
2436 ret <8 x i16> %a10
2437 }
2438
2439 define <8 x i16> @vec128_i16_signed_reg_mem(<8 x i16> %a1, <8 x i16>* %a2_addr) nounwind {
2440 ; SSE-LABEL: vec128_i16_signed_reg_mem:
2441 ; SSE: # %bb.0:
2442 ; SSE-NEXT: movdqa (%rdi), %xmm2
2443 ; SSE-NEXT: movdqa %xmm0, %xmm1
2444 ; SSE-NEXT: pcmpgtw %xmm2, %xmm1
2445 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
2446 ; SSE-NEXT: movdqa %xmm0, %xmm3
2447 ; SSE-NEXT: pminsw %xmm2, %xmm3
2448 ; SSE-NEXT: pmaxsw %xmm0, %xmm2
2449 ; SSE-NEXT: psubw %xmm3, %xmm2
2450 ; SSE-NEXT: psrlw $1, %xmm2
2451 ; SSE-NEXT: pmullw %xmm2, %xmm1
2452 ; SSE-NEXT: paddw %xmm0, %xmm1
2453 ; SSE-NEXT: movdqa %xmm1, %xmm0
2454 ; SSE-NEXT: retq
2455 ;
2456 ; AVX1-FALLBACK-LABEL: vec128_i16_signed_reg_mem:
2457 ; AVX1-FALLBACK: # %bb.0:
2458 ; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
2459 ; AVX1-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
2460 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2461 ; AVX1-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2462 ; AVX1-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2463 ; AVX1-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2464 ; AVX1-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2465 ; AVX1-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2466 ; AVX1-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2467 ; AVX1-FALLBACK-NEXT: retq
2468 ;
2469 ; AVX2-FALLBACK-LABEL: vec128_i16_signed_reg_mem:
2470 ; AVX2-FALLBACK: # %bb.0:
2471 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
2472 ; AVX2-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
2473 ; AVX2-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2474 ; AVX2-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2475 ; AVX2-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2476 ; AVX2-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2477 ; AVX2-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2478 ; AVX2-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2479 ; AVX2-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2480 ; AVX2-FALLBACK-NEXT: retq
2481 ;
2482 ; XOP-LABEL: vec128_i16_signed_reg_mem:
2483 ; XOP: # %bb.0:
2484 ; XOP-NEXT: vmovdqa (%rdi), %xmm1
2485 ; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
2486 ; XOP-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2487 ; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2488 ; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2489 ; XOP-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2490 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
2491 ; XOP-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
2492 ; XOP-NEXT: retq
2493 ;
2494 ; AVX512F-LABEL: vec128_i16_signed_reg_mem:
2495 ; AVX512F: # %bb.0:
2496 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
2497 ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
2498 ; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2499 ; AVX512F-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2500 ; AVX512F-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2501 ; AVX512F-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2502 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
2503 ; AVX512F-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2504 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2505 ; AVX512F-NEXT: retq
2506 ;
2507 ; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_reg_mem:
2508 ; AVX512VL-FALLBACK: # %bb.0:
2509 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
2510 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
2511 ; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2512 ; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2513 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2514 ; AVX512VL-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2515 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2516 ; AVX512VL-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2517 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2518 ; AVX512VL-FALLBACK-NEXT: retq
2519 ;
2520 ; AVX512BW-FALLBACK-LABEL: vec128_i16_signed_reg_mem:
2521 ; AVX512BW-FALLBACK: # %bb.0:
2522 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2523 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
2524 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
2525 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
2526 ; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
2527 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
2528 ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
2529 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2530 ; AVX512BW-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
2531 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2532 ; AVX512BW-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1
2533 ; AVX512BW-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2534 ; AVX512BW-FALLBACK-NEXT: vzeroupper
2535 ; AVX512BW-FALLBACK-NEXT: retq
2536 ;
2537 ; AVX512VLBW-LABEL: vec128_i16_signed_reg_mem:
2538 ; AVX512VLBW: # %bb.0:
2539 ; AVX512VLBW-NEXT: vmovdqa (%rdi), %xmm1
2540 ; AVX512VLBW-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
2541 ; AVX512VLBW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
2542 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
2543 ; AVX512VLBW-NEXT: vmovdqu16 %xmm2, %xmm3 {%k1}
2544 ; AVX512VLBW-NEXT: vpminsw %xmm1, %xmm0, %xmm2
2545 ; AVX512VLBW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2546 ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm1, %xmm1
2547 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
2548 ; AVX512VLBW-NEXT: vpmullw %xmm3, %xmm1, %xmm1
2549 ; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2550 ; AVX512VLBW-NEXT: retq
2551 %a2 = load <8 x i16>, <8 x i16>* %a2_addr
2552 %t3 = icmp sgt <8 x i16> %a1, %a2 ; signed
2553 %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2554 %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
2555 %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
2556 %t7 = sub <8 x i16> %t6, %t5
2557 %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2558 %t9 = mul nsw <8 x i16> %t8, %t4 ; signed
2559 %a10 = add nsw <8 x i16> %t9, %a1 ; signed
2560 ret <8 x i16> %a10
2561 }
2562
2563 define <8 x i16> @vec128_i16_signed_mem_mem(<8 x i16>* %a1_addr, <8 x i16>* %a2_addr) nounwind {
2564 ; SSE-LABEL: vec128_i16_signed_mem_mem:
2565 ; SSE: # %bb.0:
2566 ; SSE-NEXT: movdqa (%rdi), %xmm1
2567 ; SSE-NEXT: movdqa (%rsi), %xmm0
2568 ; SSE-NEXT: movdqa %xmm1, %xmm2
2569 ; SSE-NEXT: pcmpgtw %xmm0, %xmm2
2570 ; SSE-NEXT: por {{.*}}(%rip), %xmm2
2571 ; SSE-NEXT: movdqa %xmm1, %xmm3
2572 ; SSE-NEXT: pminsw %xmm0, %xmm3
2573 ; SSE-NEXT: pmaxsw %xmm1, %xmm0
2574 ; SSE-NEXT: psubw %xmm3, %xmm0
2575 ; SSE-NEXT: psrlw $1, %xmm0
2576 ; SSE-NEXT: pmullw %xmm2, %xmm0
2577 ; SSE-NEXT: paddw %xmm1, %xmm0
2578 ; SSE-NEXT: retq
2579 ;
2580 ; AVX1-FALLBACK-LABEL: vec128_i16_signed_mem_mem:
2581 ; AVX1-FALLBACK: # %bb.0:
2582 ; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
2583 ; AVX1-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
2584 ; AVX1-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
2585 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2586 ; AVX1-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2587 ; AVX1-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2588 ; AVX1-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2589 ; AVX1-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2590 ; AVX1-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2591 ; AVX1-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2592 ; AVX1-FALLBACK-NEXT: retq
2593 ;
2594 ; AVX2-FALLBACK-LABEL: vec128_i16_signed_mem_mem:
2595 ; AVX2-FALLBACK: # %bb.0:
2596 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
2597 ; AVX2-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
2598 ; AVX2-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
2599 ; AVX2-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2600 ; AVX2-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2601 ; AVX2-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2602 ; AVX2-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2603 ; AVX2-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2604 ; AVX2-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2605 ; AVX2-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2606 ; AVX2-FALLBACK-NEXT: retq
2607 ;
2608 ; XOP-LABEL: vec128_i16_signed_mem_mem:
2609 ; XOP: # %bb.0:
2610 ; XOP-NEXT: vmovdqa (%rdi), %xmm0
2611 ; XOP-NEXT: vmovdqa (%rsi), %xmm1
2612 ; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
2613 ; XOP-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2614 ; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2615 ; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2616 ; XOP-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2617 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
2618 ; XOP-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
2619 ; XOP-NEXT: retq
2620 ;
2621 ; AVX512F-LABEL: vec128_i16_signed_mem_mem:
2622 ; AVX512F: # %bb.0:
2623 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2624 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm1
2625 ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
2626 ; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2627 ; AVX512F-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2628 ; AVX512F-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2629 ; AVX512F-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2630 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
2631 ; AVX512F-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2632 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2633 ; AVX512F-NEXT: retq
2634 ;
2635 ; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_mem_mem:
2636 ; AVX512VL-FALLBACK: # %bb.0:
2637 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
2638 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
2639 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
2640 ; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2641 ; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
2642 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2643 ; AVX512VL-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
2644 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2645 ; AVX512VL-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2646 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2647 ; AVX512VL-FALLBACK-NEXT: retq
2648 ;
2649 ; AVX512BW-FALLBACK-LABEL: vec128_i16_signed_mem_mem:
2650 ; AVX512BW-FALLBACK: # %bb.0:
2651 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
2652 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
2653 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
2654 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
2655 ; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
2656 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
2657 ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
2658 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2659 ; AVX512BW-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
2660 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2661 ; AVX512BW-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1
2662 ; AVX512BW-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2663 ; AVX512BW-FALLBACK-NEXT: vzeroupper
2664 ; AVX512BW-FALLBACK-NEXT: retq
2665 ;
2666 ; AVX512VLBW-LABEL: vec128_i16_signed_mem_mem:
2667 ; AVX512VLBW: # %bb.0:
2668 ; AVX512VLBW-NEXT: vmovdqa (%rdi), %xmm0
2669 ; AVX512VLBW-NEXT: vmovdqa (%rsi), %xmm1
2670 ; AVX512VLBW-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
2671 ; AVX512VLBW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
2672 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
2673 ; AVX512VLBW-NEXT: vmovdqu16 %xmm2, %xmm3 {%k1}
2674 ; AVX512VLBW-NEXT: vpminsw %xmm1, %xmm0, %xmm2
2675 ; AVX512VLBW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
2676 ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm1, %xmm1
2677 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
2678 ; AVX512VLBW-NEXT: vpmullw %xmm3, %xmm1, %xmm1
2679 ; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2680 ; AVX512VLBW-NEXT: retq
2681 %a1 = load <8 x i16>, <8 x i16>* %a1_addr
2682 %a2 = load <8 x i16>, <8 x i16>* %a2_addr
2683 %t3 = icmp sgt <8 x i16> %a1, %a2 ; signed
2684 %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2685 %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
2686 %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
2687 %t7 = sub <8 x i16> %t6, %t5
2688 %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2689 %t9 = mul nsw <8 x i16> %t8, %t4 ; signed
2690 %a10 = add nsw <8 x i16> %t9, %a1 ; signed
2691 ret <8 x i16> %a10
2692 }
2693
2694 ; ---------------------------------------------------------------------------- ;
2695 ; 8-bit width. 128 / 8 = 16 elts.
2696 ; ---------------------------------------------------------------------------- ;
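;
; Note on the i8 lowering below: x86 has no byte multiply and no byte shift,
; so after the psrlw-by-1 the codegen masks each byte (pand with a constant
; that clears the bit shifted in from the neighboring byte), widens the
; difference and the +/-1 multiplier to 16 bits (punpck*/pmovzxbw), multiplies
; with pmullw, and narrows the product back to bytes (packuswb / vpmovdb).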
2697
2698 ; Values come from regs
2699
2700 define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwind {
2701 ; SSE2-LABEL: vec128_i8_signed_reg_reg:
2702 ; SSE2: # %bb.0:
2703 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2704 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
2705 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2706 ; SSE2-NEXT: por %xmm2, %xmm3
2707 ; SSE2-NEXT: movdqa %xmm1, %xmm4
2708 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
2709 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2710 ; SSE2-NEXT: pand %xmm4, %xmm5
2711 ; SSE2-NEXT: pandn %xmm1, %xmm4
2712 ; SSE2-NEXT: por %xmm5, %xmm4
2713 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2714 ; SSE2-NEXT: pand %xmm2, %xmm5
2715 ; SSE2-NEXT: pandn %xmm1, %xmm2
2716 ; SSE2-NEXT: por %xmm5, %xmm2
2717 ; SSE2-NEXT: psubb %xmm4, %xmm2
2718 ; SSE2-NEXT: psrlw $1, %xmm2
2719 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
2720 ; SSE2-NEXT: movdqa %xmm2, %xmm1
2721 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2722 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2723 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
2724 ; SSE2-NEXT: pmullw %xmm1, %xmm4
2725 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2726 ; SSE2-NEXT: pand %xmm1, %xmm4
2727 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2728 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2729 ; SSE2-NEXT: pmullw %xmm3, %xmm2
2730 ; SSE2-NEXT: pand %xmm1, %xmm2
2731 ; SSE2-NEXT: packuswb %xmm4, %xmm2
2732 ; SSE2-NEXT: paddb %xmm0, %xmm2
2733 ; SSE2-NEXT: movdqa %xmm2, %xmm0
2734 ; SSE2-NEXT: retq
2735 ;
2736 ; SSE41-LABEL: vec128_i8_signed_reg_reg:
2737 ; SSE41: # %bb.0:
2738 ; SSE41-NEXT: movdqa %xmm0, %xmm2
2739 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm2
2740 ; SSE41-NEXT: por {{.*}}(%rip), %xmm2
2741 ; SSE41-NEXT: movdqa %xmm0, %xmm3
2742 ; SSE41-NEXT: pminsb %xmm1, %xmm3
2743 ; SSE41-NEXT: pmaxsb %xmm0, %xmm1
2744 ; SSE41-NEXT: psubb %xmm3, %xmm1
2745 ; SSE41-NEXT: psrlw $1, %xmm1
2746 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
2747 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2748 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2749 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2750 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2751 ; SSE41-NEXT: pmullw %xmm1, %xmm2
2752 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2753 ; SSE41-NEXT: pand %xmm1, %xmm2
2754 ; SSE41-NEXT: pmullw %xmm4, %xmm3
2755 ; SSE41-NEXT: pand %xmm1, %xmm3
2756 ; SSE41-NEXT: packuswb %xmm2, %xmm3
2757 ; SSE41-NEXT: paddb %xmm3, %xmm0
2758 ; SSE41-NEXT: retq
2759 ;
2760 ; AVX1-FALLBACK-LABEL: vec128_i8_signed_reg_reg:
2761 ; AVX1-FALLBACK: # %bb.0:
2762 ; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
2763 ; AVX1-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2764 ; AVX1-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3
2765 ; AVX1-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
2766 ; AVX1-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
2767 ; AVX1-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2768 ; AVX1-FALLBACK-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
2769 ; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2770 ; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2771 ; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3
2772 ; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
2773 ; AVX1-FALLBACK-NEXT: vpand %xmm4, %xmm3, %xmm3
2774 ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2775 ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2776 ; AVX1-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2777 ; AVX1-FALLBACK-NEXT: vpand %xmm4, %xmm1, %xmm1
2778 ; AVX1-FALLBACK-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
2779 ; AVX1-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
2780 ; AVX1-FALLBACK-NEXT: retq
2781 ;
2782 ; AVX2-FALLBACK-LABEL: vec128_i8_signed_reg_reg:
2783 ; AVX2-FALLBACK: # %bb.0:
2784 ; AVX2-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
2785 ; AVX2-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2786 ; AVX2-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3
2787 ; AVX2-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
2788 ; AVX2-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
2789 ; AVX2-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
2790 ; AVX2-FALLBACK-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
2791 ; AVX2-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2792 ; AVX2-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2793 ; AVX2-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1
2794 ; AVX2-FALLBACK-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
2795 ; AVX2-FALLBACK-NEXT: vextracti128 $1, %ymm1, %xmm2
2796 ; AVX2-FALLBACK-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2797 ; AVX2-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
2798 ; AVX2-FALLBACK-NEXT: vzeroupper
2799 ; AVX2-FALLBACK-NEXT: retq
2800 ;
2801 ; XOP-FALLBACK-LABEL: vec128_i8_signed_reg_reg:
2802 ; XOP-FALLBACK: # %bb.0:
2803 ; XOP-FALLBACK-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
2804 ; XOP-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2805 ; XOP-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3
2806 ; XOP-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
2807 ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
2808 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
2809 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
2810 ; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2811 ; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2812 ; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3
2813 ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2814 ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2815 ; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2816 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14]
2817 ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
2818 ; XOP-FALLBACK-NEXT: retq
2819 ;
2820 ; XOPAVX1-LABEL: vec128_i8_signed_reg_reg:
2821 ; XOPAVX1: # %bb.0:
2822 ; XOPAVX1-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
2823 ; XOPAVX1-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2824 ; XOPAVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm3
2825 ; XOPAVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
2826 ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
2827 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
2828 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
2829 ; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2830 ; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2831 ; XOPAVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
2832 ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2833 ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2834 ; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
2835 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14]
2836 ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
2837 ; XOPAVX1-NEXT: retq
2838 ;
2839 ; XOPAVX2-LABEL: vec128_i8_signed_reg_reg:
2840 ; XOPAVX2: # %bb.0:
2841 ; XOPAVX2-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
2842 ; XOPAVX2-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2843 ; XOPAVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm3
2844 ; XOPAVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
2845 ; XOPAVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
2846 ; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
2847 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1
2848 ; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2849 ; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2850 ; XOPAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
2851 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
2852 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2853 ; XOPAVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2854 ; XOPAVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
2855 ; XOPAVX2-NEXT: vzeroupper
2856 ; XOPAVX2-NEXT: retq
2857 ;
2858 ; AVX512F-LABEL: vec128_i8_signed_reg_reg:
2859 ; AVX512F: # %bb.0:
2860 ; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
2861 ; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm2
2862 ; AVX512F-NEXT: vpminsb %xmm1, %xmm0, %xmm3
2863 ; AVX512F-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
2864 ; AVX512F-NEXT: vpsubb %xmm3, %xmm1, %xmm1
2865 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
2866 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
2867 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2868 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2869 ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
2870 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
2871 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
2872 ; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0
2873 ; AVX512F-NEXT: vzeroupper
2874 ; AVX512F-NEXT: retq