llvm.org GIT mirror: llvm, commit e5a55ce
[DAGCombiner] narrow shuffle of concatenated vectors

  // shuffle (concat X, undef), (concat Y, undef), Mask -->
  // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1)

The ARM changes with 'vtrn' and narrowed 'vuzp' are improvements. The x86 changes look neutral or better. There's one test with an extra instruction, but that could be reversed for a subtarget with the right attributes. But by default, we want to avoid the 256-bit op when possible (in my motivating benchmark, a handful of ymm ops sprinkled into a sequence of xmm ops are triggering frequency throttling on Haswell resulting in significantly worse perf).

Differential Revision: https://reviews.llvm.org/D60545

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@358291 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Sanjay Patel
5 changed files with 86 additions and 50 deletions.
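To make the mask arithmetic from the commit message concrete, here is a minimal standalone sketch (plain C++17 with std::vector rather than LLVM's ArrayRef/SmallVector; splitShuffleMask and the sample mask are illustrative only, not part of the patch). Indices below NumElts select from operand 0 (concat X, undef); indices at or above NumElts select from operand 1 (concat Y, undef) and are re-based by HalfNumElts so they address the same element of Y once the narrow shuffle sees X and Y directly.

#include <cstdio>
#include <utility>
#include <vector>

// Hypothetical helper (not LLVM code) mirroring the patch's mask split:
// indices < NumElts select from operand 0 (concat X, undef); indices >= NumElts
// select from operand 1 (concat Y, undef) and are shifted down by HalfNumElts.
// A -1 entry stays -1 ("undef" lane).
static std::pair<std::vector<int>, std::vector<int>>
splitShuffleMask(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int HalfNumElts = NumElts / 2;
  std::vector<int> Mask0(HalfNumElts, -1), Mask1(HalfNumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] == -1)
      continue;
    int M = Mask[i] < NumElts ? Mask[i] : Mask[i] - HalfNumElts;
    if (i < HalfNumElts)
      Mask0[i] = M;
    else
      Mask1[i - HalfNumElts] = M;
  }
  return {Mask0, Mask1};
}

int main() {
  // Wide v8 mask that interleaves the low halves: indices 8..11 address Y[0..3].
  std::vector<int> Mask = {0, 8, 1, 9, 2, 10, 3, 11};
  auto [Mask0, Mask1] = splitShuffleMask(Mask);
  for (int M : Mask0) std::printf("%d ", M); // prints: 0 4 1 5
  std::printf("\n");
  for (int M : Mask1) std::printf("%d ", M); // prints: 2 6 3 7
  std::printf("\n");
  return 0;
}

Lanes that addressed the undef upper halves of the concats may map anywhere after the split; that is still sound because those lanes were undefined to begin with. The diff below adds the helper to DAGCombiner.cpp, wires it into the shuffle combine, and updates the affected ARM and x86 tests.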
  return SDValue();
}

+/// Try to convert a wide shuffle of concatenated vectors into 2 narrow shuffles
+/// followed by concatenation. Narrow vector ops may have better performance
+/// than wide ops, and this can unlock further narrowing of other vector ops.
+/// Targets can invert this transform later if it is not profitable.
+static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf,
+                                         SelectionDAG &DAG) {
+  SDValue N0 = Shuf->getOperand(0), N1 = Shuf->getOperand(1);
+  if (N0.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
+      N1.getOpcode() != ISD::CONCAT_VECTORS || N1.getNumOperands() != 2 ||
+      !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef())
+    return SDValue();
+
+  // Split the wide shuffle mask into halves. Any mask element that is accessing
+  // operand 1 is offset down to account for narrowing of the vectors.
+  ArrayRef<int> Mask = Shuf->getMask();
+  EVT VT = Shuf->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfNumElts = NumElts / 2;
+  SmallVector<int, 16> Mask0(HalfNumElts, -1);
+  SmallVector<int, 16> Mask1(HalfNumElts, -1);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (Mask[i] == -1)
+      continue;
+    int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts;
+    if (i < HalfNumElts)
+      Mask0[i] = M;
+    else
+      Mask1[i - HalfNumElts] = M;
+  }
+
+  // Ask the target if this is a valid transform.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
+                                HalfNumElts);
+  if (!TLI.isShuffleMaskLegal(Mask0, HalfVT) ||
+      !TLI.isShuffleMaskLegal(Mask1, HalfVT))
+    return SDValue();
+
+  // shuffle (concat X, undef), (concat Y, undef), Mask -->
+  // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1)
+  SDValue X = N0.getOperand(0), Y = N1.getOperand(0);
+  SDLoc DL(Shuf);
+  SDValue Shuf0 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask0);
+  SDValue Shuf1 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask1);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Shuf0, Shuf1);
+}
+
// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
// or turn a shuffle of a single concat into simpler shuffle then concat.
static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
    // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
    return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask);
  }
+
+  if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
+    return V;

  return SDValue();
}
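As a quick sanity check on the rewrite performed at the call added above, the following self-contained simulation (again plain C++, not LLVM code; shuffle, concat, and the UNDEF marker are made-up helpers for this toy model) compares the original wide shuffle of two concat-with-undef operands against the concat of the two narrow shuffles built from the split masks, using the same sample mask as before.

#include <cassert>
#include <climits>
#include <vector>

using Vec = std::vector<int>;
static const int UNDEF = INT_MIN; // marker for an undefined lane in this toy model

// shufflevector semantics over two operands: result[i] is element Mask[i] of the
// concatenation A ++ B; a negative mask entry yields an undefined lane.
static Vec shuffle(const Vec &A, const Vec &B, const std::vector<int> &Mask) {
  Vec AB = A;
  AB.insert(AB.end(), B.begin(), B.end());
  Vec R;
  for (int M : Mask)
    R.push_back(M < 0 ? UNDEF : AB[(size_t)M]);
  return R;
}

static Vec concat(const Vec &A, const Vec &B) {
  Vec R = A;
  R.insert(R.end(), B.begin(), B.end());
  return R;
}

int main() {
  Vec X = {10, 11, 12, 13}, Y = {20, 21, 22, 23};
  Vec Undef4(4, UNDEF);
  std::vector<int> Mask = {0, 8, 1, 9, 2, 10, 3, 11};

  // Original form: one wide shuffle of (concat X, undef) and (concat Y, undef).
  Vec Wide = shuffle(concat(X, Undef4), concat(Y, Undef4), Mask);

  // Folded form: two narrow shuffles of X and Y with the split masks, concatenated.
  std::vector<int> Mask0 = {0, 4, 1, 5}, Mask1 = {2, 6, 3, 7};
  Vec Narrow = concat(shuffle(X, Y, Mask0), shuffle(X, Y, Mask1));

  assert(Wide == Narrow); // both are {10,20,11,21,12,22,13,23}
  return 0;
}

No defined lane changes; only lanes that read the undef upper halves are free to differ. The payoff appears in the test diffs below: the backends match the half-width shuffles directly (vtrn/vuzp/vzip on ARM d-registers, xmm unpck/blend sequences instead of ymm permutes on x86).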
; CHECK-LABEL: vuzp_lower_shufflemask_undef:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vorr q9, q8, q8
-; CHECK-NEXT: vuzp.16 q8, q9
-; CHECK-NEXT: vmov r0, r1, d18
-; CHECK-NEXT: vmov r2, r3, d19
+; CHECK-NEXT: vldr d18, [r0]
+; CHECK-NEXT: vuzp.16 d18, d17
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
entry:
  %tmp1 = load <4 x i16>, <4 x i16>* %A
define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldr d18, [r0]
+; CHECK-NEXT: vorr d19, d18, d18
; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vdup.32 q9, d16[0]
-; CHECK-NEXT: vuzp.32 q8, q9
-; CHECK-NEXT: vext.32 q8, q9, q9, #2
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vtrn.32 d19, d17
+; CHECK-NEXT: vdup.32 d16, d18[0]
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
; CHECK-LABEL: vuzp_rev_shufflemask_vtrn:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vrev64.32 q9, q8
-; CHECK-NEXT: vuzp.32 q8, q9
-; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vtrn.32 d17, d16
+; CHECK-NEXT: vst1.64 {d16, d17}, [r2]
; CHECK-NEXT: mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
; CHECK-LABEL: vzip_lower_shufflemask_undef:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vzip.16 d16, d17
+; CHECK-NEXT: vldr d18, [r0]
+; CHECK-NEXT: vzip.16 d18, d17
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm0[1]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %even0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32>
  %even1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32>
; SSE42-NEXT: movdqa %xmm2, (%rdi)
; SSE42-NEXT: retq
;
-; AVX1-LABEL: v7i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
-; AVX1-NEXT: vmovss %xmm1, 24(%rdi)
-; AVX1-NEXT: vmovlps %xmm0, 16(%rdi)
-; AVX1-NEXT: vmovaps %xmm2, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: v7i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,6,3,6,1,7,4,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vmovss %xmm1, 24(%rdi)
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovlps %xmm1, 16(%rdi)
-; AVX2-NEXT: vmovaps %xmm0, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: v7i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-NEXT: vmovss %xmm1, 24(%rdi)
+; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm2, (%rdi)
+; AVX-NEXT: retq
;
; XOP-LABEL: v7i32:
; XOP: # %bb.0:
; XOP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
; XOP-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
-; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
+; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; XOP-NEXT: vmovss %xmm1, 24(%rdi)
; XOP-NEXT: vmovlps %xmm0, 16(%rdi)
; XOP-NEXT: vmovaps %xmm2, (%rdi)
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7]
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movq %xmm2, 16(%rdi)
; SSE2-NEXT: movdqa %xmm3, (%rdi)
; SSE2-NEXT: retq
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
-; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3,4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,3]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]