llvm.org GIT mirror llvm / b19c087
Lower certain build_vectors to insertps instructions Summary: Vectors built with zeros and elements in the same order as another (source) vector are optimized to be built using a single insertps instruction. Also optimize when we move one element in a vector to a different place in that vector while zeroing out some of the other elements. Further optimizations are possible, described in TODO comments. I will be implementing at least some of them in the near future. Added some tests for different cases where this optimization triggers. Reviewers: nadav, delena, craig.topper Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D3521 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208271 91177308-0d34-0410-b5e6-96231b3b80d8 Filipe Cabecinhas 6 years ago
2 changed file(s) with 332 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
54365436 return V;
54375437 }
54385438
5439 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
5440 static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
5441 unsigned NonZeros, unsigned NumNonZero,
5442 unsigned NumZero, SelectionDAG &DAG,
5443 const X86Subtarget *Subtarget,
5444 const TargetLowering &TLI) {
5445 // We know there's at least one non-zero element
5446 unsigned FirstNonZeroIdx = 0;
5447 SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
5448 while (FirstNonZero.getOpcode() == ISD::UNDEF ||
5449 X86::isZeroNode(FirstNonZero)) {
5450 ++FirstNonZeroIdx;
5451 FirstNonZero = Op->getOperand(FirstNonZeroIdx);
5452 }
5453
5454 if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5455 !isa(FirstNonZero.getOperand(1)))
5456 return SDValue();
5457
5458 SDValue V = FirstNonZero.getOperand(0);
5459 unsigned FirstNonZeroDst = cast(FirstNonZero.getOperand(1))->getZExtValue();
5460 unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
5461 unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
5462 unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
5463
5464 for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
5465 SDValue Elem = Op.getOperand(Idx);
5466 if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
5467 continue;
5468
5469 // TODO: What else can be here? Deal with it.
5470 if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5471 return SDValue();
5472
5473 // TODO: Some optimizations are still possible here
5474 // ex: Getting one element from a vector, and the rest from another.
5475 if (Elem.getOperand(0) != V)
5476 return SDValue();
5477
5478 unsigned Dst = cast(Elem.getOperand(1))->getZExtValue();
5479 if (Dst == Idx)
5480 ++CorrectIdx;
5481 else if (IncorrectIdx == -1U) {
5482 IncorrectIdx = Idx;
5483 IncorrectDst = Dst;
5484 } else
5485 // There was already one element with an incorrect index.
5486 // We can't optimize this case to an insertps.
5487 return SDValue();
5488 }
5489
5490 if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
5491 SDLoc dl(Op);
5492 EVT VT = Op.getSimpleValueType();
5493 unsigned ElementMoveMask = 0;
5494 if (IncorrectIdx == -1U)
5495 ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
5496 else
5497 ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
5498
5499 SDValue InsertpsMask = DAG.getIntPtrConstant(
5500 ElementMoveMask | (~NonZeros & 0xf));
5501 return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
5502 }
5503
5504 return SDValue();
5505 }
5506
54395507 /// getVShift - Return a vector logical shift node.
54405508 ///
54415509 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
61846252 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
61856253 Subtarget, *this);
61866254 if (V.getNode()) return V;
6255 }
6256
6257 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
6258 if (EVTBits == 32 && NumElems == 4) {
6259 SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
6260 NumZero, DAG, Subtarget, *this);
6261 if (V.getNode())
6262 return V;
61876263 }
61886264
61896265 // If element VT is == 32 bits, turn it into a number of shuffles.
319319 %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32>
320320 ret <4 x i32> %result
321321 }
322
323 ;;;;;; Shuffles optimizable with a single insertps instruction
; Lanes 0-2 are taken in place from %x and lane 3 is zero. The expected
; immediate $8 only sets zero-mask bit 3.
define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
; CHECK-LABEL: shuf_XYZ0:
; CHECK-NOT: pextrd
; CHECK-NOT: punpckldq
; CHECK: insertps $8
; CHECK: ret
  %x0 = extractelement <4 x float> %x, i32 0
  %r0 = insertelement <4 x float> undef, float %x0, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %r1 = insertelement <4 x float> %r0, float %x1, i32 1
  %x2 = extractelement <4 x float> %x, i32 2
  %r2 = insertelement <4 x float> %r1, float %x2, i32 2
  %r3 = insertelement <4 x float> %r2, float 0.0, i32 3
  ret <4 x float> %r3
}
339
; Lanes 0-1 are taken in place from %x, lanes 2-3 are zero. The expected
; immediate $12 sets zero-mask bits 2 and 3.
define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
; CHECK-LABEL: shuf_XY00:
; CHECK-NOT: pextrd
; CHECK-NOT: punpckldq
; CHECK: insertps $12
; CHECK: ret
  %x0 = extractelement <4 x float> %x, i32 0
  %r0 = insertelement <4 x float> undef, float %x0, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %r1 = insertelement <4 x float> %r0, float %x1, i32 1
  %r2 = insertelement <4 x float> %r1, float 0.0, i32 2
  %r3 = insertelement <4 x float> %r2, float 0.0, i32 3
  ret <4 x float> %r3
}
354
; Element 1 of %x is used for both lane 1 and lane 2; lane 3 is zero. The
; expected immediate $104 (0x68) moves source lane 1 to destination lane 2 and
; zeroes lane 3.
define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
; CHECK-LABEL: shuf_XYY0:
; CHECK-NOT: pextrd
; CHECK-NOT: punpckldq
; CHECK: insertps $104
; CHECK: ret
  %x0 = extractelement <4 x float> %x, i32 0
  %r0 = insertelement <4 x float> undef, float %x0, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %r1 = insertelement <4 x float> %r0, float %x1, i32 1
  %r2 = insertelement <4 x float> %r1, float %x1, i32 2
  %r3 = insertelement <4 x float> %r2, float 0.0, i32 3
  ret <4 x float> %r3
}
369
; Lanes 0-1 come in place from %x, lane 2 receives element 3 of %x and lane 3
; is zero. The expected immediate $232 (0xE8) moves source lane 3 to
; destination lane 2 and zeroes lane 3.
define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
; CHECK-LABEL: shuf_XYW0:
; CHECK: insertps $232
; CHECK: ret
  %x0 = extractelement <4 x float> %x, i32 0
  %r0 = insertelement <4 x float> undef, float %x0, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %r1 = insertelement <4 x float> %r0, float %x1, i32 1
  %x3 = extractelement <4 x float> %x, i32 3
  %r2 = insertelement <4 x float> %r1, float %x3, i32 2
  %r3 = insertelement <4 x float> %r2, float 0.0, i32 3
  ret <4 x float> %r3
}
383
; Element 3 of %x is placed in lanes 0 and 3; lanes 1-2 are zero. The expected
; immediate $198 (0xC6) moves source lane 3 to destination lane 0 and zeroes
; lanes 1 and 2, leaving lane 3 as it is in the source.
define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
; CHECK-LABEL: shuf_W00W:
; CHECK-NOT: pextrd
; CHECK-NOT: punpckldq
; CHECK: insertps $198
; CHECK: ret
  %x3 = extractelement <4 x float> %x, i32 3
  %r0 = insertelement <4 x float> undef, float %x3, i32 0
  %r1 = insertelement <4 x float> %r0, float 0.0, i32 1
  %r2 = insertelement <4 x float> %r1, float 0.0, i32 2
  %r3 = insertelement <4 x float> %r2, float %x3, i32 3
  ret <4 x float> %r3
}
397
; Lane 0 is element 0 of %x, lanes 1-2 are zero and lane 3 is element 0 of %a
; (shuffle index 4 = first element of the second operand). The expected
; immediate $48 (0x30) moves source lane 0 to destination lane 3.
define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
; CHECK-LABEL: shuf_X00A:
; CHECK-NOT: movaps
; CHECK-NOT: shufps
; CHECK: insertps $48
; CHECK: ret
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}
411
; Lane 0 is element 0 of %x, lanes 1-2 are zero and lane 3 is element 0 of %x
; again (shuffle index 4 = first element of the second operand, which is %x).
; The expected immediate $48 (0x30) moves source lane 0 to destination lane 3.
define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
; CHECK-LABEL: shuf_X00X:
; CHECK-NOT: movaps
; CHECK-NOT: shufps
; CHECK: insertps $48
; CHECK: ret
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}
425
; Lane 0 is element 0 of %x, lane 1 is zero, lane 2 is element 1 of %x
; (shuffle index 5) and lane 3 is element 2 of %a (shuffle index 6). One
; shufps is expected, followed by an insertps whose immediate $176 (0xB0)
; moves source lane 2 to destination lane 3.
define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
; CHECK-LABEL: shuf_X0YC:
; CHECK: shufps
; CHECK-NOT: movhlps
; CHECK-NOT: shufps
; CHECK: insertps $176
; CHECK: ret
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x float> %vecinit5
}
440
; Integer variant: lanes 0-2 are taken in place from %x and lane 3 is zero.
; The expected immediate $8 only sets zero-mask bit 3.
define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
; CHECK-LABEL: i32_shuf_XYZ0:
; CHECK-NOT: pextrd
; CHECK-NOT: punpckldq
; CHECK: insertps $8
; CHECK: ret
  %x0 = extractelement <4 x i32> %x, i32 0
  %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
  %x1 = extractelement <4 x i32> %x, i32 1
  %r1 = insertelement <4 x i32> %r0, i32 %x1, i32 1
  %x2 = extractelement <4 x i32> %x, i32 2
  %r2 = insertelement <4 x i32> %r1, i32 %x2, i32 2
  %r3 = insertelement <4 x i32> %r2, i32 0, i32 3
  ret <4 x i32> %r3
}
456
; Integer variant: lanes 0-1 are taken in place from %x, lanes 2-3 are zero.
; The expected immediate $12 sets zero-mask bits 2 and 3.
define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
; CHECK-LABEL: i32_shuf_XY00:
; CHECK-NOT: pextrd
; CHECK-NOT: punpckldq
; CHECK: insertps $12
; CHECK: ret
  %x0 = extractelement <4 x i32> %x, i32 0
  %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
  %x1 = extractelement <4 x i32> %x, i32 1
  %r1 = insertelement <4 x i32> %r0, i32 %x1, i32 1
  %r2 = insertelement <4 x i32> %r1, i32 0, i32 2
  %r3 = insertelement <4 x i32> %r2, i32 0, i32 3
  ret <4 x i32> %r3
}
471
; Integer variant: element 1 of %x is used for both lane 1 and lane 2; lane 3
; is zero. The expected immediate $104 (0x68) moves source lane 1 to
; destination lane 2 and zeroes lane 3.
define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
; CHECK-LABEL: i32_shuf_XYY0:
; CHECK-NOT: pextrd
; CHECK-NOT: punpckldq
; CHECK: insertps $104
; CHECK: ret
  %x0 = extractelement <4 x i32> %x, i32 0
  %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
  %x1 = extractelement <4 x i32> %x, i32 1
  %r1 = insertelement <4 x i32> %r0, i32 %x1, i32 1
  %r2 = insertelement <4 x i32> %r1, i32 %x1, i32 2
  %r3 = insertelement <4 x i32> %r2, i32 0, i32 3
  ret <4 x i32> %r3
}
486
; Integer variant: lanes 0-1 come in place from %x, lane 2 receives element 3
; of %x and lane 3 is zero. The expected immediate $232 (0xE8) moves source
; lane 3 to destination lane 2 and zeroes lane 3.
define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
; CHECK-LABEL: i32_shuf_XYW0:
; CHECK-NOT: pextrd
; CHECK-NOT: punpckldq
; CHECK: insertps $232
; CHECK: ret
  %x0 = extractelement <4 x i32> %x, i32 0
  %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
  %x1 = extractelement <4 x i32> %x, i32 1
  %r1 = insertelement <4 x i32> %r0, i32 %x1, i32 1
  %x3 = extractelement <4 x i32> %x, i32 3
  %r2 = insertelement <4 x i32> %r1, i32 %x3, i32 2
  %r3 = insertelement <4 x i32> %r2, i32 0, i32 3
  ret <4 x i32> %r3
}
502
; Integer variant: element 3 of %x is placed in lanes 0 and 3; lanes 1-2 are
; zero. The expected immediate $198 (0xC6) moves source lane 3 to destination
; lane 0 and zeroes lanes 1 and 2.
define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
; CHECK-LABEL: i32_shuf_W00W:
; CHECK-NOT: pextrd
; CHECK-NOT: punpckldq
; CHECK: insertps $198
; CHECK: ret
  %x3 = extractelement <4 x i32> %x, i32 3
  %r0 = insertelement <4 x i32> undef, i32 %x3, i32 0
  %r1 = insertelement <4 x i32> %r0, i32 0, i32 1
  %r2 = insertelement <4 x i32> %r1, i32 0, i32 2
  %r3 = insertelement <4 x i32> %r2, i32 %x3, i32 3
  ret <4 x i32> %r3
}
516
; Integer variant: lane 0 is element 0 of %x, lanes 1-2 are zero and lane 3 is
; element 0 of %a (shuffle index 4 = first element of the second operand).
; The expected immediate $48 (0x30) moves source lane 0 to destination lane 3.
define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
; CHECK-LABEL: i32_shuf_X00A:
; CHECK-NOT: movaps
; CHECK-NOT: shufps
; CHECK: insertps $48
; CHECK: ret
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}
530
; Integer variant: lane 0 is element 0 of %x, lanes 1-2 are zero and lane 3 is
; element 0 of %x again (shuffle index 4 = first element of the second
; operand, which is %x). The expected immediate $48 (0x30) moves source lane 0
; to destination lane 3.
define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
; CHECK-LABEL: i32_shuf_X00X:
; CHECK-NOT: movaps
; CHECK-NOT: shufps
; CHECK: insertps $48
; CHECK: ret
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}
544
; Integer variant: lane 0 is element 0 of %x, lane 1 is zero, lane 2 is
; element 1 of %x (shuffle index 5) and lane 3 is element 2 of %a (shuffle
; index 6). One shufps is expected, followed by an insertps whose immediate
; $176 (0xB0) moves source lane 2 to destination lane 3.
define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
; CHECK-LABEL: i32_shuf_X0YC:
; CHECK: shufps
; CHECK-NOT: movhlps
; CHECK-NOT: shufps
; CHECK: insertps $176
; CHECK: ret
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x i32> %vecinit5
}
559
;; Test for a bug in the first implementation of LowerBuildVectorv4x32
;; The built vector (lanes 0-2 of %x, lane 3 zero) is compared against %x and
;; then selected against it, so %x stays live: the expected code copies %xmm0
;; into %xmm1 and runs the insertps on that copy only.
define <4 x float> @test_insertps_no_undef(<4 x float> %x) {
; CHECK-LABEL: test_insertps_no_undef:
; CHECK: movaps %xmm0, %xmm1
; CHECK-NEXT: insertps $8, %xmm1, %xmm1
; CHECK-NEXT: maxps %xmm1, %xmm0
; CHECK-NEXT: ret
  %x0 = extractelement <4 x float> %x, i32 0
  %r0 = insertelement <4 x float> undef, float %x0, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %r1 = insertelement <4 x float> %r0, float %x1, i32 1
  %x2 = extractelement <4 x float> %x, i32 2
  %r2 = insertelement <4 x float> %r1, float %x2, i32 2
  %xyz0 = insertelement <4 x float> %r2, float 0.0, i32 3
  %lt = fcmp olt <4 x float> %xyz0, %x
  %max = select <4 x i1> %lt, <4 x float> %x, <4 x float> %xyz0
  ret <4 x float> %max
}