llvm (llvm.org GIT mirror), commit 7bb5d42
Simon Pilgrim, 7 months ago

[X86] Add SMULO/UMULO combine tests

Include scalar and vector test variants covering the folds in DAGCombiner (vector isn't currently supported - PR40442).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@355407 91177308-0d34-0410-b5e6-96231b3b80d8
1 changed file(s) with 136 addition(s) and 0 deletion(s).
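For reference, the scalar folds replace the multiply-with-overflow node with an add-with-overflow of the value with itself, since x * 2 overflows exactly when x + x does. Below is a minimal IR sketch of the post-fold form (not part of this commit; function names are illustrative), which should lower to essentially the same addl + cmovol / addl + cmovbl sequences the scalar tests expect:

; Sketch only: post-fold equivalents of the scalar tests, using the
; add-with-overflow intrinsics directly instead of multiply-by-2.
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone

define i32 @smul_two_folded(i32 %a0, i32 %a1) {
  %1 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a0, i32 %a0)
  %2 = extractvalue {i32, i1} %1, 0
  %3 = extractvalue {i32, i1} %1, 1
  %4 = select i1 %3, i32 %a1, i32 %2
  ret i32 %4
}

define i32 @umul_two_folded(i32 %a0, i32 %a1) {
  %1 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a0, i32 %a0)
  %2 = extractvalue {i32, i1} %1, 0
  %3 = extractvalue {i32, i1} %1, 1
  %4 = select i1 %3, i32 %a1, i32 %2
  ret i32 %4
}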
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX

declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone

declare {<4 x i32>, <4 x i1>} @llvm.smul.with.overflow.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; fold (smulo x, 2) -> (saddo x, x)
define i32 @combine_smul_two(i32 %a0, i32 %a1) {
; SSE-LABEL: combine_smul_two:
; SSE: # %bb.0:
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: addl %edi, %eax
; SSE-NEXT: cmovol %esi, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_smul_two:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: addl %edi, %eax
; AVX-NEXT: cmovol %esi, %eax
; AVX-NEXT: retq
  %1 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a0, i32 2)
  %2 = extractvalue {i32, i1} %1, 0
  %3 = extractvalue {i32, i1} %1, 1
  %4 = select i1 %3, i32 %a1, i32 %2
  ret i32 %4
}

define <4 x i32> @combine_vec_smul_two(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_smul_two:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2,2,2,2]
; SSE-NEXT: pmuldq %xmm3, %xmm0
; SSE-NEXT: pmuldq %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7]
; SSE-NEXT: paddd %xmm2, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: pcmpeqd %xmm3, %xmm0
; SSE-NEXT: pcmpeqd %xmm3, %xmm3
; SSE-NEXT: pxor %xmm3, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_smul_two:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,2,2,2]
; AVX-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = call {<4 x i32>, <4 x i1>} @llvm.smul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
  %2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
  %3 = extractvalue {<4 x i32>, <4 x i1>} %1, 1
  %4 = select <4 x i1> %3, <4 x i32> %a1, <4 x i32> %2
  ret <4 x i32> %4
}

; fold (umulo x, 2) -> (uaddo x, x)
define i32 @combine_umul_two(i32 %a0, i32 %a1) {
; SSE-LABEL: combine_umul_two:
; SSE: # %bb.0:
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: addl %edi, %eax
; SSE-NEXT: cmovbl %esi, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_umul_two:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: addl %edi, %eax
; AVX-NEXT: cmovbl %esi, %eax
; AVX-NEXT: retq
  %1 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a0, i32 2)
  %2 = extractvalue {i32, i1} %1, 0
  %3 = extractvalue {i32, i1} %1, 1
  %4 = select i1 %3, i32 %a1, i32 %2
  ret i32 %4
}

define <4 x i32> @combine_vec_umul_two(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_umul_two:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2,2,2,2]
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7]
; SSE-NEXT: pxor %xmm4, %xmm4
; SSE-NEXT: pcmpeqd %xmm3, %xmm4
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: pxor %xmm4, %xmm0
; SSE-NEXT: paddd %xmm2, %xmm2
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_umul_two:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,2,2,2]
; AVX-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
  %2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
  %3 = extractvalue {<4 x i32>, <4 x i1>} %1, 1
  %4 = select <4 x i1> %3, <4 x i32> %a1, <4 x i32> %2
  ret <4 x i32> %4
}