[X86] Add SADDO/UADDO and SSUBO/USUBO combine tests

Include scalar and vector test variants covering the folds in DAGCombiner (vector isn't currently supported - PR40442).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@355404 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Simon Pilgrim
2 changed files with 464 additions and 0 deletions.
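Every test below follows the same shape: call a *.with.overflow intrinsic, extract the result value and the overflow flag, and select a fallback value when the flag is set. As a minimal sketch (illustrative only, not part of the commit; the function names are made up), this is the scalar pattern and the form DAGCombiner is expected to reduce it to once the (sadd x, 0) -> x fold fires:

declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)

; Input pattern: saddo(x, 0), falling back to %fallback on overflow.
define i32 @example_sadd_zero_before(i32 %x, i32 %fallback) {
  %res = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %x, i32 0)
  %sum = extractvalue {i32, i1} %res, 0
  %ovf = extractvalue {i32, i1} %res, 1
  %sel = select i1 %ovf, i32 %fallback, i32 %sum
  ret i32 %sel
}

; Adding 0 can never overflow, so after the fold the pattern collapses to the input.
define i32 @example_sadd_zero_after(i32 %x, i32 %fallback) {
  ret i32 %x
}

The first added test file, covering the SADDO/UADDO folds, follows.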
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX

declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone

declare {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; fold (sadd x, 0) -> x
define i32 @combine_sadd_zero(i32 %a0, i32 %a1) {
; SSE-LABEL: combine_sadd_zero:
; SSE: # %bb.0:
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: addl $0, %eax
; SSE-NEXT: cmovol %esi, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_sadd_zero:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: addl $0, %eax
; AVX-NEXT: cmovol %esi, %eax
; AVX-NEXT: retq
%1 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a0, i32 zeroinitializer)
%2 = extractvalue {i32, i1} %1, 0
%3 = extractvalue {i32, i1} %1, 1
%4 = select i1 %3, i32 %a1, i32 %2
ret i32 %4
}

define <4 x i32> @combine_vec_sadd_zero(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_sadd_zero:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pcmpgtd %xmm2, %xmm0
; SSE-NEXT: pcmpeqd %xmm3, %xmm3
; SSE-NEXT: pxor %xmm3, %xmm0
; SSE-NEXT: pcmpeqd %xmm0, %xmm3
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: pandn %xmm3, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sadd_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = call {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> zeroinitializer)
%2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
%3 = extractvalue {<4 x i32>, <4 x i1>} %1, 1
%4 = select <4 x i1> %3, <4 x i32> %a1, <4 x i32> %2
ret <4 x i32> %4
}

; fold (uadd x, 0) -> x
define i32 @combine_uadd_zero(i32 %a0, i32 %a1) {
; SSE-LABEL: combine_uadd_zero:
; SSE: # %bb.0:
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_uadd_zero:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: retq
%1 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a0, i32 zeroinitializer)
%2 = extractvalue {i32, i1} %1, 0
%3 = extractvalue {i32, i1} %1, 1
%4 = select i1 %3, i32 %a1, i32 %2
ret i32 %4
}

define <4 x i32> @combine_vec_uadd_zero(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_uadd_zero:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmaxud %xmm0, %xmm0
; SSE-NEXT: pcmpeqd %xmm2, %xmm0
; SSE-NEXT: pcmpeqd %xmm3, %xmm3
; SSE-NEXT: pxor %xmm3, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_uadd_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpmaxud %xmm0, %xmm0, %xmm2
; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2
; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> zeroinitializer)
%2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
%3 = extractvalue {<4 x i32>, <4 x i1>} %1, 1
%4 = select <4 x i1> %3, <4 x i32> %a1, <4 x i32> %2
ret <4 x i32> %4
}

; fold (uadd (xor a, -1), 1) -> (usub 0, a) and flip carry
define i32 @combine_uadd_not(i32 %a0, i32 %a1) {
; SSE-LABEL: combine_uadd_not:
; SSE: # %bb.0:
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: negl %eax
; SSE-NEXT: cmovael %esi, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_uadd_not:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: negl %eax
; AVX-NEXT: cmovael %esi, %eax
; AVX-NEXT: retq
%1 = xor i32 %a0, -1
%2 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %1, i32 1)
%3 = extractvalue {i32, i1} %2, 0
%4 = extractvalue {i32, i1} %2, 1
%5 = select i1 %4, i32 %a1, i32 %3
ret i32 %5
}

define <4 x i32> @combine_vec_uadd_not(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_uadd_not:
; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm3, %xmm3
; SSE-NEXT: pxor %xmm3, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,4294967295]
; SSE-NEXT: paddd %xmm0, %xmm2
; SSE-NEXT: pmaxud %xmm2, %xmm0
; SSE-NEXT: pcmpeqd %xmm2, %xmm0
; SSE-NEXT: pxor %xmm3, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_uadd_not:
; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm3
; AVX-NEXT: vpmaxud %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpcmpeqd %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm3, %xmm0
; AVX-NEXT: retq
%1 = xor <4 x i32> %a0, <i32 -1, i32 -1, i32 -1, i32 -1>
%2 = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 -1>)
%3 = extractvalue {<4 x i32>, <4 x i1>} %2, 0
%4 = extractvalue {<4 x i32>, <4 x i1>} %2, 1
%5 = select <4 x i1> %4, <4 x i32> %a1, <4 x i32> %3
ret <4 x i32> %5
}

; if uaddo never overflows, replace with add
define i32 @combine_uadd_no_overflow(i32 %a0, i32 %a1, i32 %a2) {
; SSE-LABEL: combine_uadd_no_overflow:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edx killed $edx def $rdx
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: shrl $16, %esi
; SSE-NEXT: shrl $16, %edx
; SSE-NEXT: leal (%rdx,%rsi), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_uadd_no_overflow:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $edx killed $edx def $rdx
; AVX-NEXT: # kill: def $esi killed $esi def $rsi
; AVX-NEXT: shrl $16, %esi
; AVX-NEXT: shrl $16, %edx
; AVX-NEXT: leal (%rdx,%rsi), %eax
; AVX-NEXT: retq
%1 = lshr i32 %a1, 16
%2 = lshr i32 %a2, 16
%3 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %1, i32 %2)
%4 = extractvalue {i32, i1} %3, 0
%5 = extractvalue {i32, i1} %3, 1
%6 = select i1 %5, i32 %a2, i32 %4
ret i32 %4
}

define <4 x i32> @combine_vec_uadd_no_overflow(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; SSE-LABEL: combine_vec_uadd_no_overflow:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_uadd_no_overflow:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $16, %xmm1, %xmm0
; AVX-NEXT: vpsrld $16, %xmm2, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = lshr <4 x i32> %a1, <i32 16, i32 16, i32 16, i32 16>
%2 = lshr <4 x i32> %a2, <i32 16, i32 16, i32 16, i32 16>
%3 = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %1, <4 x i32> %2)
%4 = extractvalue {<4 x i32>, <4 x i1>} %3, 0
%5 = extractvalue {<4 x i32>, <4 x i1>} %3, 1
%6 = select <4 x i1> %5, <4 x i32> %a2, <4 x i32> %4
ret <4 x i32> %4
}
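The combine_uadd_not tests above rely on the identity ~a + 1 == 0 - a: the unsigned add wraps only when a == 0, while the unsigned subtract borrows only when a != 0, so the fold replaces the add with a negate and inverts the carry (hence the negl/cmovael pairing in the scalar output). A scalar sketch of the two equivalent forms (illustrative only, not part of the commit):

declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32)

; Original form: uaddo(~a, 1); the carry is set only when a == 0.
define {i32, i1} @uadd_not_form(i32 %a) {
  %not = xor i32 %a, -1
  %r = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %not, i32 1)
  ret {i32, i1} %r
}

; Folded form: usubo(0, a), whose borrow (set when a != 0) is inverted to recover the carry.
define {i32, i1} @usub_zero_form(i32 %a) {
  %r = call {i32, i1} @llvm.usub.with.overflow.i32(i32 0, i32 %a)
  %neg = extractvalue {i32, i1} %r, 0
  %borrow = extractvalue {i32, i1} %r, 1
  %carry = xor i1 %borrow, true
  %t0 = insertvalue {i32, i1} undef, i32 %neg, 0
  %t1 = insertvalue {i32, i1} %t0, i1 %carry, 1
  ret {i32, i1} %t1
}

The second added test file, covering the SSUBO/USUBO folds, follows.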
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX

declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone

declare {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; fold (ssub x, 0) -> x
define i32 @combine_ssub_zero(i32 %a0, i32 %a1) {
; SSE-LABEL: combine_ssub_zero:
; SSE: # %bb.0:
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: subl $0, %eax
; SSE-NEXT: cmovol %esi, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_ssub_zero:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: subl $0, %eax
; AVX-NEXT: cmovol %esi, %eax
; AVX-NEXT: retq
%1 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a0, i32 zeroinitializer)
%2 = extractvalue {i32, i1} %1, 0
%3 = extractvalue {i32, i1} %1, 1
%4 = select i1 %3, i32 %a1, i32 %2
ret i32 %4
}

define <4 x i32> @combine_vec_ssub_zero(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_ssub_zero:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: pcmpgtd %xmm0, %xmm3
; SSE-NEXT: pcmpeqd %xmm4, %xmm4
; SSE-NEXT: pxor %xmm4, %xmm3
; SSE-NEXT: movdqa %xmm3, %xmm0
; SSE-NEXT: pcmpeqd %xmm4, %xmm0
; SSE-NEXT: pcmpeqd %xmm3, %xmm3
; SSE-NEXT: pxor %xmm4, %xmm3
; SSE-NEXT: pandn %xmm3, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ssub_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm4
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpandn %xmm2, %xmm4, %xmm2
; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = call {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> zeroinitializer)
%2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
%3 = extractvalue {<4 x i32>, <4 x i1>} %1, 1
%4 = select <4 x i1> %3, <4 x i32> %a1, <4 x i32> %2
ret <4 x i32> %4
}

; fold (usub x, 0) -> x
define i32 @combine_usub_zero(i32 %a0, i32 %a1) {
; SSE-LABEL: combine_usub_zero:
; SSE: # %bb.0:
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_usub_zero:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: retq
%1 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a0, i32 zeroinitializer)
%2 = extractvalue {i32, i1} %1, 0
%3 = extractvalue {i32, i1} %1, 1
%4 = select i1 %3, i32 %a1, i32 %2
ret i32 %4
}

define <4 x i32> @combine_vec_usub_zero(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_usub_zero:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pminud %xmm0, %xmm0
; SSE-NEXT: pcmpeqd %xmm2, %xmm0
; SSE-NEXT: pcmpeqd %xmm3, %xmm3
; SSE-NEXT: pxor %xmm3, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_usub_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpminud %xmm0, %xmm0, %xmm2
; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2
; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> zeroinitializer)
%2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
%3 = extractvalue {<4 x i32>, <4 x i1>} %1, 1
%4 = select <4 x i1> %3, <4 x i32> %a1, <4 x i32> %2
ret <4 x i32> %4
}

; fold (ssub x, x) -> 0
define i32 @combine_ssub_self(i32 %a0, i32 %a1) {
; SSE-LABEL: combine_ssub_self:
; SSE: # %bb.0:
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: subl %edi, %eax
; SSE-NEXT: cmovol %esi, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_ssub_self:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: subl %edi, %eax
; AVX-NEXT: cmovol %esi, %eax
; AVX-NEXT: retq
%1 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a0, i32 %a0)
%2 = extractvalue {i32, i1} %1, 0
%3 = extractvalue {i32, i1} %1, 1
%4 = select i1 %3, i32 %a1, i32 %2
ret i32 %4
}

define <4 x i32> @combine_vec_ssub_self(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_ssub_self:
; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ssub_self:
; AVX: # %bb.0:
; AVX-NEXT: vpsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = call {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a0)
%2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
%3 = extractvalue {<4 x i32>, <4 x i1>} %1, 1
%4 = select <4 x i1> %3, <4 x i32> %a1, <4 x i32> %2
ret <4 x i32> %4
}

; fold (usub x, x) -> 0
define i32 @combine_usub_self(i32 %a0, i32 %a1) {
; SSE-LABEL: combine_usub_self:
; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_usub_self:
; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: retq
%1 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a0, i32 %a0)
%2 = extractvalue {i32, i1} %1, 0
%3 = extractvalue {i32, i1} %1, 1
%4 = select i1 %3, i32 %a1, i32 %2
ret i32 %4
}

define <4 x i32> @combine_vec_usub_self(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_usub_self:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psubd %xmm0, %xmm2
; SSE-NEXT: pminud %xmm2, %xmm0
; SSE-NEXT: pcmpeqd %xmm2, %xmm0
; SSE-NEXT: pcmpeqd %xmm3, %xmm3
; SSE-NEXT: pxor %xmm3, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_usub_self:
; AVX: # %bb.0:
; AVX-NEXT: vpsubd %xmm0, %xmm0, %xmm2
; AVX-NEXT: vpminud %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpxor %xmm3, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq
%1 = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a0)
%2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
%3 = extractvalue {<4 x i32>, <4 x i1>} %1, 1
%4 = select <4 x i1> %3, <4 x i32> %a1, <4 x i32> %2
ret <4 x i32> %4
}

; fold (usub -1, x) -> (xor x, -1) + no borrow
define i32 @combine_usub_negone(i32 %a0, i32 %a1) {
; SSE-LABEL: combine_usub_negone:
; SSE: # %bb.0:
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: notl %eax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_usub_negone:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: notl %eax
; AVX-NEXT: retq
%1 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 -1, i32 %a0)
%2 = extractvalue {i32, i1} %1, 0
%3 = extractvalue {i32, i1} %1, 1
%4 = select i1 %3, i32 %a1, i32 %2
ret i32 %4
}

define <4 x i32> @combine_vec_usub_negone(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_usub_negone:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pcmpeqd %xmm3, %xmm3
; SSE-NEXT: pxor %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: pminud %xmm3, %xmm0
; SSE-NEXT: pcmpeqd %xmm2, %xmm0
; SSE-NEXT: pxor %xmm3, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_usub_negone:
; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpminud %xmm2, %xmm0, %xmm3
; AVX-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpxor %xmm2, %xmm3, %xmm2
; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %a0)
%2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
%3 = extractvalue {<4 x i32>, <4 x i1>} %1, 1
%4 = select <4 x i1> %3, <4 x i32> %a1, <4 x i32> %2
ret <4 x i32> %4
}
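The combine_usub_negone tests exercise the last fold in the file: UINT_MAX - x can never borrow and equals ~x, so usubo(-1, x) reduces to a bitwise not with the overflow flag known to be zero, which is exactly the lone notl in the scalar output. A sketch of the before/after scalar forms (illustrative only, not part of the commit):

declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32)

; Original form: usubo(-1, x), selecting %fallback on the (impossible) borrow.
define i32 @usub_negone_before(i32 %x, i32 %fallback) {
  %r = call {i32, i1} @llvm.usub.with.overflow.i32(i32 -1, i32 %x)
  %diff = extractvalue {i32, i1} %r, 0
  %borrow = extractvalue {i32, i1} %r, 1
  %sel = select i1 %borrow, i32 %fallback, i32 %diff
  ret i32 %sel
}

; Expected reduction: the borrow is always false, so only the bitwise not remains.
define i32 @usub_negone_after(i32 %x, i32 %fallback) {
  %not = xor i32 %x, -1
  ret i32 %not
}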