llvm.org GIT mirror: llvm / commit 440f5b2
[X86] Support SHLD/SHRD masked shift-counts (PR34641)

Peek through shift modulo masks while matching double shift patterns. I was hoping to delay this until I could remove the X86 code with generic funnel shift matching (PR40081), but this will do for now.

Differential Revision: https://reviews.llvm.org/D56199

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@350222 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim, 9 months ago
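For context, the idiom this targets is a double shift whose counts have been explicitly reduced modulo the bit width, as in the shld_safe_i32 test updated below. A minimal C++ sketch of that source pattern (the function name is illustrative, mirroring the test's IR):

    // Double shift with modulo-masked counts; with this patch the
    // X86 backend can fold the whole expression to a single SHLD.
    unsigned shld_safe(unsigned x, unsigned y, unsigned c) {
      return (x << (c & 31)) | (y >> ((0u - c) & 31));
    }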
2 changed files with 43 additions and 34 deletions.
@@ -36513,6 +36513,7 @@
 
   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
   bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+  unsigned Bits = VT.getScalarSizeInBits();
 
   // SHLD/SHRD instructions have lower register pressure, but on some
   // platforms they have higher latency than the equivalent
@@ -36535,6 +36536,23 @@
   SDValue ShAmt1 = N1.getOperand(1);
   if (ShAmt1.getValueType() != MVT::i8)
     return SDValue();
+
+  // Peek through any modulo shift masks.
+  SDValue ShMsk0;
+  if (ShAmt0.getOpcode() == ISD::AND &&
+      isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
+      ShAmt0.getConstantOperandVal(1) == (Bits - 1)) {
+    ShMsk0 = ShAmt0;
+    ShAmt0 = ShAmt0.getOperand(0);
+  }
+  SDValue ShMsk1;
+  if (ShAmt1.getOpcode() == ISD::AND &&
+      isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
+      ShAmt1.getConstantOperandVal(1) == (Bits - 1)) {
+    ShMsk1 = ShAmt1;
+    ShAmt1 = ShAmt1.getOperand(0);
+  }
+
   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
     ShAmt0 = ShAmt0.getOperand(0);
   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
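The peek-through above is a strip-and-remember step: when a shift amount has the form AND(C, Bits-1), matching continues against the unmasked C while the masked node is kept (ShMsk0/ShMsk1) so later checks can compare against either form. The same idea as a standalone sketch (hypothetical helper, not part of the patch):

    #include "llvm/CodeGen/SelectionDAGNodes.h"
    using namespace llvm;

    // Hypothetical helper: strip an AND(C, Bits-1) modulo mask from a
    // shift amount, remembering the masked node for later comparisons.
    static SDValue peekThroughModuloMask(SDValue Amt, unsigned Bits,
                                         SDValue &Msk) {
      if (Amt.getOpcode() == ISD::AND &&
          isa<ConstantSDNode>(Amt.getOperand(1)) &&
          Amt.getConstantOperandVal(1) == (Bits - 1)) {
        Msk = Amt;                 // remember AND(C, Bits-1) itself
        return Amt.getOperand(0);  // keep matching against C
      }
      return Amt;
    }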
@@ -36549,24 +36567,26 @@
     Opc = X86ISD::SHRD;
     std::swap(Op0, Op1);
     std::swap(ShAmt0, ShAmt1);
+    std::swap(ShMsk0, ShMsk1);
   }
 
   // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
   // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
   // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
   // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
-  unsigned Bits = VT.getScalarSizeInBits();
+  // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> SHLD( X, Y, C )
+  // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> SHRD( X, Y, C )
   if (ShAmt1.getOpcode() == ISD::SUB) {
    SDValue Sum = ShAmt1.getOperand(0);
    if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
      if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
-      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
-        return DAG.getNode(Opc, DL, VT,
-                           Op0, Op1,
-                           DAG.getNode(ISD::TRUNCATE, DL,
-                                       MVT::i8, ShAmt0));
+      if ((SumC->getAPIntValue() == Bits ||
+           (SumC->getAPIntValue() == 0 && ShMsk1)) &&
+          ShAmt1Op1 == ShAmt0)
+        return DAG.getNode(Opc, DL, VT, Op0, Op1,
+                           DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
    }
  } else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
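The new SumC == 0 case is justified by modular arithmetic: a hardware shift only consumes the count modulo the bit width, and (0 - C) & (Bits - 1) equals (Bits - C) & (Bits - 1) for every C, so SUB(0, C) under a modulo mask (ShMsk1) matches the same amounts as the existing SUB(Bits, C) form. A self-contained check of that identity (illustrative only, not part of the patch):

    #include <cassert>

    int main() {
      const unsigned Bits = 32;
      // (0 - C) and (Bits - C) agree modulo Bits for every count C.
      for (unsigned C = 0; C < 256; ++C)
        assert(((0u - C) & (Bits - 1)) == ((Bits - C) & (Bits - 1)));
      return 0;
    }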
@@ -36582,7 +36602,8 @@
       SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
       if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
         ShAmt1Op0 = ShAmt1Op0.getOperand(0);
-      if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
+      if (MaskC->getSExtValue() == (Bits - 1) &&
+          (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
         if (Op1.getOpcode() == InnerShift &&
             isa<ConstantSDNode>(Op1.getOperand(1)) &&
             Op1.getConstantOperandVal(1) == 1) {
@@ -36593,7 +36614,7 @@
         if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
             Op1.getOperand(0) == Op1.getOperand(1)) {
           return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
-                            DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+                             DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
         }
       }
     }
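The XOR form handled here also reduces to Bits - C: for a count already reduced modulo 32, XOR(C, 31) equals 31 - C (31 is all ones in the low five bits), so the pre-shift by one plus a shift by XOR(C, 31) totals 32 - C. The change above merely widens the operand check so the XOR may be fed by the masked count (ShMsk0) as well as the raw one. A small illustrative check of the identity:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t Y = 0xDEADBEEF;
      for (uint32_t C = 1; C < 32; ++C)
        // SRL(SRL(Y, 1), XOR(C, 31)) == SRL(Y, 32 - C) for 0 < C < 32.
        assert(((Y >> 1) >> (C ^ 31u)) == (Y >> (32 - C)));
      return 0;
    }

The remaining hunks update the accompanying codegen tests.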
@@ -459,24 +459,18 @@
 define i32 @shld_safe_i32(i32, i32, i32) {
 ; X86-LABEL: shld_safe_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    negb %cl
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shldl %cl, %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: shld_safe_i32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    shll %cl, %edi
-; X64-NEXT:    negb %cl
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %eax
-; X64-NEXT:    orl %edi, %eax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shldl %cl, %esi, %eax
 ; X64-NEXT:    retq
   %4 = and i32 %2, 31
   %5 = shl i32 %0, %4
@@ -490,24 +484,18 @@
 define i32 @shrd_safe_i32(i32, i32, i32) {
 ; X86-LABEL: shrd_safe_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    negb %cl
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrdl %cl, %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: shrd_safe_i32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    negb %cl
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %eax
-; X64-NEXT:    orl %edi, %eax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrdl %cl, %esi, %eax
 ; X64-NEXT:    retq
   %4 = and i32 %2, 31
   %5 = lshr i32 %0, %4
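The updated CHECK lines confirm that the masked shift/or sequence now collapses to a single shldl or shrdl. As a reference point, for counts in 1..31 the masked-count expression in these tests reduces to the classic double shift that SHLD implements; a quick illustrative comparison in plain C++ (independent of the patch):

    #include <cassert>
    #include <cstdint>

    // Reference double shift: what SHLD computes for counts 1..31.
    static uint32_t shld_ref(uint32_t x, uint32_t y, uint32_t c) {
      return (x << c) | (y >> (32 - c));
    }

    int main() {
      const uint32_t x = 0x12345678, y = 0x9ABCDEF0;
      for (uint32_t c = 1; c < 32; ++c)
        // The masked-count form from shld_safe_i32, as in the test IR.
        assert(((x << (c & 31)) | (y >> ((0u - c) & 31))) ==
               shld_ref(x, y, c));
      return 0;
    }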