llvm mirror, commit e9c0b5a
[DAGCombiner] Teach how to fold sext/aext/zext of constant build vectors.

This patch teaches the DAGCombiner how to fold a sext/aext/zext dag node when its input operand is a build vector of constants (or UNDEFs). The inability to fold a sext/zext of a constant build_vector was the root cause of some poor code generation (pcg) bugs affecting vselect expansion on x86-64 with AVX support.

Before this change, the DAGCombiner only knew how to fold a sext/zext/aext of a ConstantSDNode.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@200234 91177308-0d34-0410-b5e6-96231b3b80d8

Andrea Di Biagio, 6 years ago
4 changed files with 372 additions and 21 deletions.
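To make the effect of the new fold concrete, here is a small standalone C++ sketch (not part of the patch; all names are illustrative) that computes, with plain integer casts, the per-element constants the DAGCombiner can now materialize for the vector <4 x i8> <0, -1, 2, -3> used by the new x86 tests further down:

#include <cstdint>
#include <cstdio>

int main() {
  // Element values of the constant build_vector <4 x i8> <0, -1, 2, -3>.
  const int8_t Src[4] = {0, -1, 2, -3};
  for (int8_t E : Src) {
    // sext <4 x i8> ... to <4 x i16>: each element is sign-extended.
    uint16_t SExt = static_cast<uint16_t>(static_cast<int16_t>(E));
    // zext <4 x i8> ... to <4 x i16>: each element is zero-extended.
    uint16_t ZExt = static_cast<uint16_t>(static_cast<uint8_t>(E));
    std::printf("sext: 0x%04X  zext: 0x%04X\n", (unsigned)SExt, (unsigned)ZExt);
  }
  // Prints 0x0000/0x0000, 0xFFFF/0x00FF, 0x0002/0x0002, 0xFFFD/0x00FD:
  // exactly the constants the fold can emit as a build_vector instead of
  // keeping the extend node around.
  return 0;
}

Because every lane of the result is a compile-time constant, the X86 backend can then load the whole vector from the constant pool, which is what the new tests below check with their vmovaps patterns.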
                        SDLoc(N));
 }
 
+// tryToFoldExtendOfConstant - Try to fold a sext/zext/aext
+// dag node into a ConstantSDNode or a build_vector of constants.
+// This function is called by the DAGCombiner when visiting sext/zext/aext
+// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
+// Vector extends are not folded if operations are legal; this is to
+// avoid introducing illegal build_vector dag nodes.
+static SDNode *tryToFoldExtendOfConstant(SDNode *N, SelectionDAG &DAG,
+                                         bool LegalOperations) {
+  unsigned Opcode = N->getOpcode();
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
+          Opcode == ISD::ANY_EXTEND) && "Expected EXTEND dag node in input!");
+
+  // fold (sext c1) -> c1
+  // fold (zext c1) -> c1
+  // fold (aext c1) -> c1
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(Opcode, SDLoc(N), VT, N0).getNode();
+
+  // fold (sext (build_vector AllConstants)) -> (build_vector AllConstants)
+  // fold (zext (build_vector AllConstants)) -> (build_vector AllConstants)
+  // fold (aext (build_vector AllConstants)) -> (build_vector AllConstants)
+  if (!(VT.isVector() && !LegalOperations &&
+        ISD::isBuildVectorOfConstantSDNodes(N0.getNode())))
+    return 0;
+
+  // We can fold this node into a build_vector.
+  unsigned VTBits = VT.getScalarType().getSizeInBits();
+  unsigned EVTBits = N0->getValueType(0).getScalarType().getSizeInBits();
+  unsigned ShAmt = VTBits - EVTBits;
+  SmallVector<SDValue, 8> Elts;
+  unsigned NumElts = N0->getNumOperands();
+  SDLoc DL(N);
+
+  for (unsigned i = 0; i != NumElts; ++i) {
+    SDValue Op = N0->getOperand(i);
+    if (Op->getOpcode() == ISD::UNDEF) {
+      Elts.push_back(DAG.getUNDEF(VT.getScalarType()));
+      continue;
+    }
+
+    ConstantSDNode *CurrentND = cast<ConstantSDNode>(Op);
+    const APInt &C = APInt(VTBits, CurrentND->getAPIntValue().getZExtValue());
+    if (Opcode == ISD::SIGN_EXTEND)
+      Elts.push_back(DAG.getConstant(C.shl(ShAmt).ashr(ShAmt).getZExtValue(),
+                                     VT.getScalarType()));
+    else
+      Elts.push_back(DAG.getConstant(C.shl(ShAmt).lshr(ShAmt).getZExtValue(),
+                                     VT.getScalarType()));
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], NumElts).getNode();
+}
+
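The per-element computation above uses a standard shift trick: the narrow constant is placed in a VTBits-wide APInt, shifted left by VTBits - EVTBits, and then shifted back arithmetically (sext) or logically (zext/aext). Below is a minimal standalone sketch of the same idea, assuming a 64-bit lane in place of APInt (so the shift amount becomes 64 - FromBits); names and widths are illustrative only.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Mirrors C.shl(ShAmt).ashr(ShAmt) / C.shl(ShAmt).lshr(ShAmt) from the loop
// above, but on a 64-bit lane rather than a VTBits-wide APInt.  Assumes '>>'
// on signed integers is an arithmetic shift (true on mainstream compilers).
static uint64_t extendElement(uint64_t Val, unsigned FromBits, unsigned ToBits,
                              bool IsSigned) {
  assert(FromBits > 0 && FromBits <= ToBits && ToBits <= 64);
  unsigned ShAmt = 64 - FromBits;   // move the value to the MSB end of the lane
  uint64_t Hi = Val << ShAmt;
  uint64_t Ext = IsSigned
      ? static_cast<uint64_t>(static_cast<int64_t>(Hi) >> ShAmt) // ashr: copies the sign bit
      : (Hi >> ShAmt);                                           // lshr: zero-fills
  // Truncate to the destination width, as an APInt of width ToBits would.
  return ToBits == 64 ? Ext : (Ext & ((1ULL << ToBits) - 1));
}

int main() {
  // i8 -3 (0xFD) extended to i16: sext -> 0xFFFD, zext -> 0x00FD.
  std::printf("0x%04llX\n", (unsigned long long)extendElement(0xFD, 8, 16, true));
  std::printf("0x%04llX\n", (unsigned long long)extendElement(0xFD, 8, 16, false));
  return 0;
}

Running it for the i8 value -3 prints 0xFFFD for the signed case and 0x00FD for the unsigned case, matching the constants the fold emits.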
 // ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this:
 // "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
 // transformation. Returns true if extension are possible and the above
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
-  // fold (sext c1) -> c1
-  if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N0);
+  if (SDNode *Res = tryToFoldExtendOfConstant(N, DAG, LegalOperations))
+    return SDValue(Res, 0);
 
   // fold (sext (sext x)) -> (sext x)
   // fold (sext (aext x)) -> (sext x)
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
-  // fold (zext c1) -> c1
-  if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0);
+  if (SDNode *Res = tryToFoldExtendOfConstant(N, DAG, LegalOperations))
+    return SDValue(Res, 0);
+
   // fold (zext (zext x)) -> (zext x)
   // fold (zext (aext x)) -> (zext x)
   if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
-  // fold (aext c1) -> c1
-  if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, N0);
+  if (SDNode *Res = tryToFoldExtendOfConstant(N, DAG, LegalOperations))
+    return SDValue(Res, 0);
+
   // fold (aext (aext x)) -> (aext x)
   // fold (aext (zext x)) -> (zext x)
   // fold (aext (sext x)) -> (sext x)
   store <2 x i64> %4, <2 x i64>* %c
   ret void
 
-  ; FIXME: This code is correct, but poor. Ideally it would be similar to
-  ; the code in @false_v4f32
+  ; (setcc $a, $b, SETFALSE) is always folded
   ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], 0
-  ; CHECK-DAG: slli.d [[R3:\$w[0-9]+]], [[R1]], 63
-  ; CHECK-DAG: srai.d [[R4:\$w[0-9]+]], [[R3]], 63
-  ; CHECK-DAG: st.d [[R4]], 0($4)
+  ; CHECK-DAG: st.w [[R1]], 0($4)
   ; CHECK: .size false_v2f64
 }
 
   store <2 x i64> %4, <2 x i64>* %c
   ret void
 
-  ; FIXME: This code is correct, but poor. Ideally it would be similar to
-  ; the code in @true_v4f32
-  ; CHECK-DAG: ldi.d [[R1:\$w[0-9]+]], 1
-  ; CHECK-DAG: slli.d [[R3:\$w[0-9]+]], [[R1]], 63
-  ; CHECK-DAG: srai.d [[R4:\$w[0-9]+]], [[R3]], 63
-  ; CHECK-DAG: st.d [[R4]], 0($4)
+  ; (setcc $a, $b, SETTRUE) is always folded.
+  ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], -1
+  ; CHECK-DAG: st.w [[R1]], 0($4)
   ; CHECK: .size true_v2f64
 }
 


 ;CHECK-LABEL: vsel_float8:
+;CHECK-NOT: vinsertf128
 ;CHECK: vblendvps
 ;CHECK: ret
 define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
 }
 
 ;CHECK-LABEL: vsel_i328:
+;CHECK-NOT: vinsertf128
 ;CHECK: vblendvps
-;CHECK: ret
+;CHECK-NEXT: ret
 define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
   %vsel = select <8 x i1> , <8 x i32> %v1, <8 x i32> %v2
   ret <8 x i32> %vsel
 define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
   %vsel = select <8 x i1> , <8 x i64> %v1, <8 x i64> %v2
   ret <8 x i64> %vsel
+}
+
+;CHECK-LABEL: vsel_double4:
+;CHECK-NOT: vinsertf128
+;CHECK: vblendvpd
+;CHECK-NEXT: ret
+define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
+  %vsel = select <4 x i1> , <4 x double> %v1, <4 x double> %v2
+  ret <4 x double> %vsel
 }
 
 ;; TEST blend + compares
New test file added by this patch:

; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -mattr=+avx | FileCheck %s

; Verify that the backend correctly folds a sign/zero extend of a vector where
; the elements are all constant values or UNDEFs.
; The backend should be able to optimize all the test functions below into
; simple loads from the constant pool, because the resulting vector is known
; at compile time.

define <4 x i16> @test1() {
  %1 = insertelement <4 x i8> undef, i8 0, i32 0
  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
  %3 = insertelement <4 x i8> %2, i8 2, i32 2
  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
  %5 = sext <4 x i8> %4 to <4 x i16>
  ret <4 x i16> %5
}
; CHECK-LABEL: test1
; CHECK: vmovaps
; CHECK-NEXT: ret

define <4 x i16> @test2() {
  %1 = insertelement <4 x i8> undef, i8 undef, i32 0
  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
  %3 = insertelement <4 x i8> %2, i8 undef, i32 2
  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
  %5 = sext <4 x i8> %4 to <4 x i16>
  ret <4 x i16> %5
}
; CHECK-LABEL: test2
; CHECK: vmovaps
; CHECK-NEXT: ret

define <4 x i32> @test3() {
  %1 = insertelement <4 x i8> undef, i8 0, i32 0
  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
  %3 = insertelement <4 x i8> %2, i8 2, i32 2
  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
  %5 = sext <4 x i8> %4 to <4 x i32>
  ret <4 x i32> %5
}
; CHECK-LABEL: test3
; CHECK: vmovaps
; CHECK-NEXT: ret

define <4 x i32> @test4() {
  %1 = insertelement <4 x i8> undef, i8 undef, i32 0
  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
  %3 = insertelement <4 x i8> %2, i8 undef, i32 2
  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
  %5 = sext <4 x i8> %4 to <4 x i32>
  ret <4 x i32> %5
}
; CHECK-LABEL: test4
; CHECK: vmovaps
; CHECK-NEXT: ret


define <4 x i64> @test5() {
  %1 = insertelement <4 x i8> undef, i8 0, i32 0
  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
  %3 = insertelement <4 x i8> %2, i8 2, i32 2
  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
  %5 = sext <4 x i8> %4 to <4 x i64>
  ret <4 x i64> %5
}
; CHECK-LABEL: test5
; CHECK-NOT: vinsertf128
; CHECK: vmovaps
; CHECK-NEXT: ret

define <4 x i64> @test6() {
  %1 = insertelement <4 x i8> undef, i8 undef, i32 0
  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
  %3 = insertelement <4 x i8> %2, i8 undef, i32 2
  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
  %5 = sext <4 x i8> %4 to <4 x i64>
  ret <4 x i64> %5
}
; CHECK-LABEL: test6
; CHECK-NOT: vinsertf128
; CHECK: vmovaps
; CHECK-NEXT: ret

define <8 x i16> @test7() {
  %1 = insertelement <8 x i8> undef, i8 0, i32 0
  %2 = insertelement <8 x i8> %1, i8 -1, i32 1
  %3 = insertelement <8 x i8> %2, i8 2, i32 2
  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
  %5 = insertelement <8 x i8> %4, i8 4, i32 4
  %6 = insertelement <8 x i8> %5, i8 -5, i32 5
  %7 = insertelement <8 x i8> %6, i8 6, i32 6
  %8 = insertelement <8 x i8> %7, i8 -7, i32 7
  %9 = sext <8 x i8> %4 to <8 x i16>
  ret <8 x i16> %9
}
; CHECK-LABEL: test7
; CHECK: vmovaps
; CHECK-NEXT: ret

define <8 x i32> @test8() {
  %1 = insertelement <8 x i8> undef, i8 0, i32 0
  %2 = insertelement <8 x i8> %1, i8 -1, i32 1
  %3 = insertelement <8 x i8> %2, i8 2, i32 2
  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
  %5 = insertelement <8 x i8> %4, i8 4, i32 4
  %6 = insertelement <8 x i8> %5, i8 -5, i32 5
  %7 = insertelement <8 x i8> %6, i8 6, i32 6
  %8 = insertelement <8 x i8> %7, i8 -7, i32 7
  %9 = sext <8 x i8> %4 to <8 x i32>
  ret <8 x i32> %9
}
; CHECK-LABEL: test8
; CHECK-NOT: vinsertf128
; CHECK: vmovaps
; CHECK-NEXT: ret

define <8 x i16> @test9() {
  %1 = insertelement <8 x i8> undef, i8 undef, i32 0
  %2 = insertelement <8 x i8> %1, i8 -1, i32 1
  %3 = insertelement <8 x i8> %2, i8 undef, i32 2
  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
  %5 = insertelement <8 x i8> %4, i8 undef, i32 4
  %6 = insertelement <8 x i8> %5, i8 -5, i32 5
  %7 = insertelement <8 x i8> %6, i8 undef, i32 6
  %8 = insertelement <8 x i8> %7, i8 -7, i32 7
  %9 = sext <8 x i8> %4 to <8 x i16>
  ret <8 x i16> %9
}
; CHECK-LABEL: test9
; CHECK: vmovaps
; CHECK-NEXT: ret

define <8 x i32> @test10() {
  %1 = insertelement <8 x i8> undef, i8 0, i32 0
  %2 = insertelement <8 x i8> %1, i8 undef, i32 1
  %3 = insertelement <8 x i8> %2, i8 2, i32 2
  %4 = insertelement <8 x i8> %3, i8 undef, i32 3
  %5 = insertelement <8 x i8> %4, i8 4, i32 4
  %6 = insertelement <8 x i8> %5, i8 undef, i32 5
  %7 = insertelement <8 x i8> %6, i8 6, i32 6
  %8 = insertelement <8 x i8> %7, i8 undef, i32 7
  %9 = sext <8 x i8> %4 to <8 x i32>
  ret <8 x i32> %9
}
; CHECK-LABEL: test10
; CHECK-NOT: vinsertf128
; CHECK: vmovaps
; CHECK-NEXT: ret


define <4 x i16> @test11() {
  %1 = insertelement <4 x i8> undef, i8 0, i32 0
  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
  %3 = insertelement <4 x i8> %2, i8 2, i32 2
  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
  %5 = zext <4 x i8> %4 to <4 x i16>
  ret <4 x i16> %5
}
; CHECK-LABEL: test11
; CHECK: vmovaps
; CHECK-NEXT: ret

define <4 x i32> @test12() {
  %1 = insertelement <4 x i8> undef, i8 0, i32 0
  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
  %3 = insertelement <4 x i8> %2, i8 2, i32 2
  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
  %5 = zext <4 x i8> %4 to <4 x i32>
  ret <4 x i32> %5
}
; CHECK-LABEL: test12
; CHECK: vmovaps
; CHECK-NEXT: ret

define <4 x i64> @test13() {
  %1 = insertelement <4 x i8> undef, i8 0, i32 0
  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
  %3 = insertelement <4 x i8> %2, i8 2, i32 2
  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
  %5 = zext <4 x i8> %4 to <4 x i64>
  ret <4 x i64> %5
}
; CHECK-LABEL: test13
; CHECK-NOT: vinsertf128
; CHECK: vmovaps
; CHECK-NEXT: ret

define <4 x i16> @test14() {
  %1 = insertelement <4 x i8> undef, i8 undef, i32 0
  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
  %3 = insertelement <4 x i8> %2, i8 undef, i32 2
  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
  %5 = zext <4 x i8> %4 to <4 x i16>
  ret <4 x i16> %5
}
; CHECK-LABEL: test14
; CHECK: vmovaps
; CHECK-NEXT: ret

define <4 x i32> @test15() {
  %1 = insertelement <4 x i8> undef, i8 0, i32 0
  %2 = insertelement <4 x i8> %1, i8 undef, i32 1
  %3 = insertelement <4 x i8> %2, i8 2, i32 2
  %4 = insertelement <4 x i8> %3, i8 undef, i32 3
  %5 = zext <4 x i8> %4 to <4 x i32>
  ret <4 x i32> %5
}
; CHECK-LABEL: test15
; CHECK: vmovaps
; CHECK-NEXT: ret

define <4 x i64> @test16() {
  %1 = insertelement <4 x i8> undef, i8 undef, i32 0
  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
  %3 = insertelement <4 x i8> %2, i8 2, i32 2
  %4 = insertelement <4 x i8> %3, i8 undef, i32 3
  %5 = zext <4 x i8> %4 to <4 x i64>
  ret <4 x i64> %5
}
; CHECK-LABEL: test16
; CHECK-NOT: vinsertf128
; CHECK: vmovaps
; CHECK-NEXT: ret

define <8 x i16> @test17() {
  %1 = insertelement <8 x i8> undef, i8 0, i32 0
  %2 = insertelement <8 x i8> %1, i8 -1, i32 1
  %3 = insertelement <8 x i8> %2, i8 2, i32 2
  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
  %5 = insertelement <8 x i8> %4, i8 4, i32 4
  %6 = insertelement <8 x i8> %5, i8 -5, i32 5
  %7 = insertelement <8 x i8> %6, i8 6, i32 6
  %8 = insertelement <8 x i8> %7, i8 -7, i32 7
  %9 = zext <8 x i8> %8 to <8 x i16>
  ret <8 x i16> %9
}
; CHECK-LABEL: test17
; CHECK: vmovaps
; CHECK-NEXT: ret

define <8 x i32> @test18() {
  %1 = insertelement <8 x i8> undef, i8 0, i32 0
  %2 = insertelement <8 x i8> %1, i8 -1, i32 1
  %3 = insertelement <8 x i8> %2, i8 2, i32 2
  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
  %5 = insertelement <8 x i8> %4, i8 4, i32 4
  %6 = insertelement <8 x i8> %5, i8 -5, i32 5
  %7 = insertelement <8 x i8> %6, i8 6, i32 6
  %8 = insertelement <8 x i8> %7, i8 -7, i32 7
  %9 = zext <8 x i8> %8 to <8 x i32>
  ret <8 x i32> %9
}
; CHECK-LABEL: test18
; CHECK-NOT: vinsertf128
; CHECK: vmovaps
; CHECK-NEXT: ret

define <8 x i16> @test19() {
  %1 = insertelement <8 x i8> undef, i8 undef, i32 0
  %2 = insertelement <8 x i8> %1, i8 -1, i32 1
  %3 = insertelement <8 x i8> %2, i8 undef, i32 2
  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
  %5 = insertelement <8 x i8> %4, i8 undef, i32 4
  %6 = insertelement <8 x i8> %5, i8 -5, i32 5
  %7 = insertelement <8 x i8> %6, i8 undef, i32 6
  %8 = insertelement <8 x i8> %7, i8 -7, i32 7
  %9 = zext <8 x i8> %8 to <8 x i16>
  ret <8 x i16> %9
}
; CHECK-LABEL: test19
; CHECK: vmovaps
; CHECK-NEXT: ret

define <8 x i32> @test20() {
  %1 = insertelement <8 x i8> undef, i8 0, i32 0
  %2 = insertelement <8 x i8> %1, i8 undef, i32 1
  %3 = insertelement <8 x i8> %2, i8 2, i32 2
  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
  %5 = insertelement <8 x i8> %4, i8 4, i32 4
  %6 = insertelement <8 x i8> %5, i8 undef, i32 5
  %7 = insertelement <8 x i8> %6, i8 6, i32 6
  %8 = insertelement <8 x i8> %7, i8 undef, i32 7
  %9 = zext <8 x i8> %8 to <8 x i32>
  ret <8 x i32> %9
}
; CHECK-LABEL: test20
; CHECK-NOT: vinsertf128
; CHECK: vmovaps
; CHECK-NEXT: ret
