[X86] Add two combine rules to simplify dag nodes introduced during type legalization when promoting nodes with illegal vector type.

This patch teaches the backend how to simplify/canonicalize dag node sequences normally introduced by the backend when promoting certain dag nodes with illegal vector type.

This patch adds two new combine rules:
1) fold (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
        (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
2) fold (BINOP (shuffle (A, Undef, <Mask>)), (shuffle (B, Undef, <Mask>))) ->
        (shuffle (BINOP A, B), Undef, <Mask>)

Both rules are only triggered on the type-legalized DAG. In particular, rule 1 is a target-specific combine rule that attempts to sink a bitconvert into the operands of a binary operation. Rule 2 is a target-independent rule that attempts to move a shuffle immediately after a binary operation.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209930 91177308-0d34-0410-b5e6-96231b3b80d8

Andrea Di Biagio
4 changed files with 355 additions and 27 deletions.
     return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), LHS.getValueType(), Ops);
   }
 
+  // Type legalization might introduce new shuffles in the DAG.
+  // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask)))
+  // -> (shuffle (VBinOp (A, B)), Undef, Mask).
+  if (LegalTypes && isa<ShuffleVectorSDNode>(LHS) &&
+      isa<ShuffleVectorSDNode>(RHS) && LHS.hasOneUse() && RHS.hasOneUse() &&
+      LHS.getOperand(1).getOpcode() == ISD::UNDEF &&
+      RHS.getOperand(1).getOpcode() == ISD::UNDEF) {
+    ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(LHS);
+    ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
+
+    if (SVN0->getMask().equals(SVN1->getMask())) {
+      EVT VT = N->getValueType(0);
+      SDValue UndefVector = LHS.getOperand(1);
+      SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
+                                     LHS.getOperand(0), RHS.getOperand(0));
+      AddUsersToWorkList(N);
+      return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector,
+                                  &SVN0->getMask()[0]);
+    }
+  }
+
   return SDValue();
 }
 
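The target-independent fold added above is sound for any elementwise binary operation: shuffling both operands with one shared mask and then applying the operation reads exactly the same source lanes as applying the operation first and shuffling once, and lanes the mask marks as undef carry no requirement. Below is a minimal, self-contained sketch of that argument in plain C++ (not LLVM code; applyShuffle and the concrete mask are illustrative only, with -1 marking an undef lane as in ShuffleVectorSDNode masks).

#include <cassert>
#include <cstddef>
#include <vector>

// Apply a shuffle mask to a single source vector; -1 means "undef lane".
static std::vector<int> applyShuffle(const std::vector<int> &V,
                                     const std::vector<int> &Mask) {
  std::vector<int> R(Mask.size(), 0);
  for (std::size_t i = 0; i != Mask.size(); ++i)
    R[i] = Mask[i] < 0 ? 0 : V[Mask[i]]; // undef lane: any value will do
  return R;
}

int main() {
  std::vector<int> A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
  std::vector<int> Mask = {0, 2, -1, -1}; // keep even lanes, upper half undef

  // (add (shuffle A, undef, Mask), (shuffle B, undef, Mask)) ...
  std::vector<int> SA = applyShuffle(A, Mask), SB = applyShuffle(B, Mask);
  // ... versus (shuffle (add A, B), undef, Mask).
  std::vector<int> Sum(A.size());
  for (std::size_t i = 0; i != A.size(); ++i)
    Sum[i] = A[i] + B[i];
  std::vector<int> Folded = applyShuffle(Sum, Mask);

  // Every lane the mask defines agrees; undef lanes are don't-care.
  for (std::size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i] >= 0)
      assert(SA[i] + SB[i] == Folded[i]);
  return 0;
}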
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget *Subtarget) {
   SDLoc dl(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
   // Don't create instructions with illegal types after legalize types has run.
   if (Subtarget->hasFp256() && VT.is256BitVector() &&
       N->getOpcode() == ISD::VECTOR_SHUFFLE)
     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
+
+  // During Type Legalization, when promoting illegal vector types,
+  // the backend might introduce new shuffle dag nodes and bitcasts.
+  //
+  // This code performs the following transformation:
+  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
+  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
+  //
+  // We do this only if both the bitcast and the BINOP dag nodes have
+  // one use. Also, perform this transformation only if the new binary
+  // operation is legal. This is to avoid introducing dag nodes that
+  // potentially need to be further expanded (or custom lowered) into a
+  // less optimal sequence of dag nodes.
+  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
+      N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
+      N0.getOpcode() == ISD::BITCAST) {
+    SDValue BC0 = N0.getOperand(0);
+    EVT SVT = BC0.getValueType();
+    unsigned Opcode = BC0.getOpcode();
+    unsigned NumElts = VT.getVectorNumElements();
+
+    if (BC0.hasOneUse() && SVT.isVector() &&
+        SVT.getVectorNumElements() * 2 == NumElts &&
+        TLI.isOperationLegal(Opcode, VT)) {
+      bool CanFold = false;
+      switch (Opcode) {
+      default : break;
+      case ISD::ADD :
+      case ISD::FADD :
+      case ISD::SUB :
+      case ISD::FSUB :
+      case ISD::MUL :
+      case ISD::FMUL :
+        CanFold = true;
+      }
+
+      unsigned SVTNumElts = SVT.getVectorNumElements();
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
+        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
+      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
+        CanFold = SVOp->getMaskElt(i) < 0;
+
+      if (CanFold) {
+        SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
+        SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
+        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
+        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
+      }
+    }
+  }
 
   // Only handle 128 wide vector from here on.
   if (!VT.is128BitVector())
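The mask test in the code above can be hard to read inline, so here is the same pattern restated as a standalone predicate over a plain mask vector. This is a sketch, not LLVM code, and isNarrowingMask is a hypothetical name: for a BINOP whose type SVT has SVTNumElts elements and is viewed through the bitcast as a vector with twice as many lanes, the accepted shuffle picks every other lane of the wide value (elements 0, 2, 4, ...) and leaves the upper half undef, e.g. <0, 2, -1, -1> when SVTNumElts is 2.

#include <cassert>
#include <vector>

// Same check as the two CanFold loops above, written over a plain mask;
// negative entries stand for undef lanes.
static bool isNarrowingMask(const std::vector<int> &Mask, unsigned SVTNumElts) {
  if (Mask.size() != 2u * SVTNumElts)
    return false;
  for (unsigned i = 0; i != SVTNumElts; ++i)
    if (Mask[i] != int(i * 2))  // low half: even lanes of the wide vector
      return false;
  for (unsigned i = SVTNumElts; i != Mask.size(); ++i)
    if (Mask[i] >= 0)           // upper half: must be undef
      return false;
  return true;
}

int main() {
  assert(isNarrowingMask({0, 2, -1, -1}, 2));  // a 2-element result seen as 4 lanes
  assert(!isNarrowingMask({0, 1, -1, -1}, 2)); // rejected: lanes 0 and 1 are not the even lanes
  return 0;
}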
; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK -check-prefix=AVX


define double @test1_add(double %A, double %B) {
  %1 = bitcast double %A to <2 x i32>
  %2 = bitcast double %B to <2 x i32>
  %add = add <2 x i32> %1, %2
  %3 = bitcast <2 x i32> %add to double
  ret double %3
}
; CHECK-LABEL: test1_add
; SSE41: paddd
; AVX: vpaddd
; CHECK-NEXT: ret


define double @test2_add(double %A, double %B) {
  %1 = bitcast double %A to <4 x i16>
  %2 = bitcast double %B to <4 x i16>
  %add = add <4 x i16> %1, %2
  %3 = bitcast <4 x i16> %add to double
  ret double %3
}
; CHECK-LABEL: test2_add
; SSE41: paddw
; AVX: vpaddw
; CHECK-NEXT: ret

define double @test3_add(double %A, double %B) {
  %1 = bitcast double %A to <8 x i8>
  %2 = bitcast double %B to <8 x i8>
  %add = add <8 x i8> %1, %2
  %3 = bitcast <8 x i8> %add to double
  ret double %3
}
; CHECK-LABEL: test3_add
; SSE41: paddb
; AVX: vpaddb
; CHECK-NEXT: ret


define double @test1_sub(double %A, double %B) {
  %1 = bitcast double %A to <2 x i32>
  %2 = bitcast double %B to <2 x i32>
  %sub = sub <2 x i32> %1, %2
  %3 = bitcast <2 x i32> %sub to double
  ret double %3
}
; CHECK-LABEL: test1_sub
; SSE41: psubd
; AVX: vpsubd
; CHECK-NEXT: ret


define double @test2_sub(double %A, double %B) {
  %1 = bitcast double %A to <4 x i16>
  %2 = bitcast double %B to <4 x i16>
  %sub = sub <4 x i16> %1, %2
  %3 = bitcast <4 x i16> %sub to double
  ret double %3
}
; CHECK-LABEL: test2_sub
; SSE41: psubw
; AVX: vpsubw
; CHECK-NEXT: ret


define double @test3_sub(double %A, double %B) {
  %1 = bitcast double %A to <8 x i8>
  %2 = bitcast double %B to <8 x i8>
  %sub = sub <8 x i8> %1, %2
  %3 = bitcast <8 x i8> %sub to double
  ret double %3
}
; CHECK-LABEL: test3_sub
; SSE41: psubb
; AVX: vpsubb
; CHECK-NEXT: ret


define double @test1_mul(double %A, double %B) {
  %1 = bitcast double %A to <2 x i32>
  %2 = bitcast double %B to <2 x i32>
  %mul = mul <2 x i32> %1, %2
  %3 = bitcast <2 x i32> %mul to double
  ret double %3
}
; CHECK-LABEL: test1_mul
; SSE41: pmulld
; AVX: vpmulld
; CHECK-NEXT: ret


define double @test2_mul(double %A, double %B) {
  %1 = bitcast double %A to <4 x i16>
  %2 = bitcast double %B to <4 x i16>
  %mul = mul <4 x i16> %1, %2
  %3 = bitcast <4 x i16> %mul to double
  ret double %3
}
; CHECK-LABEL: test2_mul
; SSE41: pmullw
; AVX: vpmullw
; CHECK-NEXT: ret

; There is no legal ISD::MUL with type MVT::v8i16.
define double @test3_mul(double %A, double %B) {
  %1 = bitcast double %A to <8 x i8>
  %2 = bitcast double %B to <8 x i8>
  %mul = mul <8 x i8> %1, %2
  %3 = bitcast <8 x i8> %mul to double
  ret double %3
}
; CHECK-LABEL: test3_mul
; CHECK: pmullw
; CHECK-NEXT: pshufb
; CHECK-NEXT: ret


define double @test1_and(double %A, double %B) {
  %1 = bitcast double %A to <2 x i32>
  %2 = bitcast double %B to <2 x i32>
  %and = and <2 x i32> %1, %2
  %3 = bitcast <2 x i32> %and to double
  ret double %3
}
; CHECK-LABEL: test1_and
; SSE41: andps
; AVX: vandps
; CHECK-NEXT: ret


define double @test2_and(double %A, double %B) {
  %1 = bitcast double %A to <4 x i16>
  %2 = bitcast double %B to <4 x i16>
  %and = and <4 x i16> %1, %2
  %3 = bitcast <4 x i16> %and to double
  ret double %3
}
; CHECK-LABEL: test2_and
; SSE41: andps
; AVX: vandps
; CHECK-NEXT: ret


define double @test3_and(double %A, double %B) {
  %1 = bitcast double %A to <8 x i8>
  %2 = bitcast double %B to <8 x i8>
  %and = and <8 x i8> %1, %2
  %3 = bitcast <8 x i8> %and to double
  ret double %3
}
; CHECK-LABEL: test3_and
; SSE41: andps
; AVX: vandps
; CHECK-NEXT: ret


define double @test1_or(double %A, double %B) {
  %1 = bitcast double %A to <2 x i32>
  %2 = bitcast double %B to <2 x i32>
  %or = or <2 x i32> %1, %2
  %3 = bitcast <2 x i32> %or to double
  ret double %3
}
; CHECK-LABEL: test1_or
; SSE41: orps
; AVX: vorps
; CHECK-NEXT: ret


define double @test2_or(double %A, double %B) {
  %1 = bitcast double %A to <4 x i16>
  %2 = bitcast double %B to <4 x i16>
  %or = or <4 x i16> %1, %2
  %3 = bitcast <4 x i16> %or to double
  ret double %3
}
; CHECK-LABEL: test2_or
; SSE41: orps
; AVX: vorps
; CHECK-NEXT: ret


define double @test3_or(double %A, double %B) {
  %1 = bitcast double %A to <8 x i8>
  %2 = bitcast double %B to <8 x i8>
  %or = or <8 x i8> %1, %2
  %3 = bitcast <8 x i8> %or to double
  ret double %3
}
; CHECK-LABEL: test3_or
; SSE41: orps
; AVX: vorps
; CHECK-NEXT: ret


define double @test1_xor(double %A, double %B) {
  %1 = bitcast double %A to <2 x i32>
  %2 = bitcast double %B to <2 x i32>
  %xor = xor <2 x i32> %1, %2
  %3 = bitcast <2 x i32> %xor to double
  ret double %3
}
; CHECK-LABEL: test1_xor
; SSE41: xorps
; AVX: vxorps
; CHECK-NEXT: ret


define double @test2_xor(double %A, double %B) {
  %1 = bitcast double %A to <4 x i16>
  %2 = bitcast double %B to <4 x i16>
  %xor = xor <4 x i16> %1, %2
  %3 = bitcast <4 x i16> %xor to double
  ret double %3
}
; CHECK-LABEL: test2_xor
; SSE41: xorps
; AVX: vxorps
; CHECK-NEXT: ret


define double @test3_xor(double %A, double %B) {
  %1 = bitcast double %A to <8 x i8>
  %2 = bitcast double %B to <8 x i8>
  %xor = xor <8 x i8> %1, %2
  %3 = bitcast <8 x i8> %xor to double
  ret double %3
}
; CHECK-LABEL: test3_xor
; SSE41: xorps
; AVX: vxorps
; CHECK-NEXT: ret


define double @test_fadd(double %A, double %B) {
  %1 = bitcast double %A to <2 x float>
  %2 = bitcast double %B to <2 x float>
  %add = fadd <2 x float> %1, %2
  %3 = bitcast <2 x float> %add to double
  ret double %3
}
; CHECK-LABEL: test_fadd
; SSE41: addps
; AVX: vaddps
; CHECK-NEXT: ret

define double @test_fsub(double %A, double %B) {
  %1 = bitcast double %A to <2 x float>
  %2 = bitcast double %B to <2 x float>
  %sub = fsub <2 x float> %1, %2
  %3 = bitcast <2 x float> %sub to double
  ret double %3
}
; CHECK-LABEL: test_fsub
; SSE41: subps
; AVX: vsubps
; CHECK-NEXT: ret

define double @test_fmul(double %A, double %B) {
  %1 = bitcast double %A to <2 x float>
  %2 = bitcast double %B to <2 x float>
  %mul = fmul <2 x float> %1, %2
  %3 = bitcast <2 x float> %mul to double
  ret double %3
}
; CHECK-LABEL: test_fmul
; SSE41: mulps
; AVX: vmulps
; CHECK-NEXT: ret

 ; CHECK-LABEL: test1
 ; CHECK-NOT: movsd
 ; CHECK: pshufd
-; CHECK-NEXT: paddq
+; CHECK-NEXT: paddd
 ; CHECK-NEXT: pshufd
 ; CHECK-NEXT: ret
 
   %3 = bitcast <2 x i32> %add to double
   ret double %3
 }
-; FIXME: Ideally we should be able to fold the entire body of @test2 into a
-; single 'paddd %xmm1, %xmm0' instruction. At the moment we produce the
-; sequence pshufd+pshufd+paddq+pshufd.
-
 ; CHECK-LABEL: test2
 ; CHECK-NOT: movsd
-; CHECK: pshufd
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: paddq
-; CHECK-NEXT: pshufd
+; CHECK: paddd
 ; CHECK-NEXT: ret
 
 
 ; CHECK-LABEL: test6
 ; CHECK-NOT: movsd
 ; CHECK: punpcklwd
-; CHECK-NEXT: paddd
+; CHECK-NEXT: paddw
 ; CHECK-NEXT: pshufb
 ; CHECK-NEXT: ret
 
   %3 = bitcast <4 x i16> %add to double
   ret double %3
 }
-; FIXME: Ideally we should be able to fold the entire body of @test7 into a
-; single 'paddw %xmm1, %xmm0' instruction. At the moment we produce the
-; sequence pshufd+pshufd+paddd+pshufd.
-
 ; CHECK-LABEL: test7
 ; CHECK-NOT: movsd
-; CHECK: punpcklwd
-; CHECK-NEXT: punpcklwd
-; CHECK-NEXT: paddd
-; CHECK-NEXT: pshufb
+; CHECK-NOT: punpcklwd
+; CHECK: paddw
 ; CHECK-NEXT: ret
 
 
 ; CHECK-LABEL: test8
 ; CHECK-NOT: movsd
 ; CHECK: punpcklbw
-; CHECK-NEXT: paddw
+; CHECK-NEXT: paddb
 ; CHECK-NEXT: pshufb
 ; CHECK-NEXT: ret
 
   %3 = bitcast <8 x i8> %add to double
   ret double %3
 }
-; FIXME: Ideally we should be able to fold the entire body of @test9 into a
-; single 'paddb %xmm1, %xmm0' instruction. At the moment we produce the
-; sequence pshufd+pshufd+paddw+pshufd.
-
 ; CHECK-LABEL: test9
 ; CHECK-NOT: movsd
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: paddw
-; CHECK-NEXT: pshufb
+; CHECK-NOT: punpcklbw
+; CHECK: paddb
 ; CHECK-NEXT: ret
 