llvm.org GIT mirror llvm / f42965e
[AArch64] Add patterns to replace fsub fmul with fma fneg.

Summary:
This patch adds MachineCombiner patterns for transforming (fsub (fmul x y) z)
into (fma x y (fneg z)). This has lower latency on microarchitectures where
fneg is cheap.

Patch based on work by George Steed.

Reviewers: rengolin, joelkevinjones, joel_k_jones, evandro, efriedma

Reviewed By: evandro

Subscribers: aemerson, javed.absar, llvm-commits, kristof.beyls

Differential Revision: https://reviews.llvm.org/D40306

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319980 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Florian Hahn
3 changed files with 194 additions and 7 deletions.
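As a concrete illustration, here is the rewrite this enables on the f1_2s function from the new MIR test added below (register numbers as in that test; a sketch of the combiner's output on a subtarget where the pattern is profitable). FMUL+FSUB computes (%0 * %1) - %2; FNEG+FMLA computes the same value as (-%2) + (%0 * %1), moving the cheap FNEG off the critical path through %0 and %1:

  # Before the machine-combiner runs:
  %3:fpr64 = FMULv2f32 %0, %1
  %4:fpr64 = FSUBv2f32 killed %3, %2

  # After the FMLSv2f32_OP1 pattern is applied:
  %5:fpr64 = FNEGv2f32 %2
  %4:fpr64 = FMLAv2f32 killed %5, %0, %1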
   FMLAv4i32_indexed_OP2,
   FMLSv1i32_indexed_OP2,
   FMLSv1i64_indexed_OP2,
+  FMLSv2f32_OP1,
+  FMLSv2f32_OP2,
+  FMLSv2f64_OP1,
+  FMLSv2f64_OP2,
+  FMLSv2i32_indexed_OP1,
   FMLSv2i32_indexed_OP2,
+  FMLSv2i64_indexed_OP1,
   FMLSv2i64_indexed_OP2,
-  FMLSv2f32_OP2,
-  FMLSv2f64_OP2,
-  FMLSv4i32_indexed_OP2,
-  FMLSv4f32_OP2
+  FMLSv4f32_OP1,
+  FMLSv4f32_OP2,
+  FMLSv4i32_indexed_OP1,
+  FMLSv4i32_indexed_OP2
 };

 } // end namespace llvm
     }
     break;
   case AArch64::FSUBv2f32:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                           AArch64::FMULv2i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                                  AArch64::FMULv2f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1);
+      Found = true;
+    }
     if (canCombineWithFMUL(MBB, Root.getOperand(2),
                            AArch64::FMULv2i32_indexed)) {
       Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
       ...
     }
     break;
   case AArch64::FSUBv2f64:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                           AArch64::FMULv2i64_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP1);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                                  AArch64::FMULv2f64)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP1);
+      Found = true;
+    }
     if (canCombineWithFMUL(MBB, Root.getOperand(2),
                            AArch64::FMULv2i64_indexed)) {
       Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
       ...
     }
     break;
   case AArch64::FSUBv4f32:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                           AArch64::FMULv4i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP1);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                                  AArch64::FMULv4f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP1);
+      Found = true;
+    }
     if (canCombineWithFMUL(MBB, Root.getOperand(2),
                            AArch64::FMULv4i32_indexed)) {
       Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
       ...
 /// \param MaddOpc the opcode of the f|madd instruction
 /// \param RC Register class of operands
 /// \param kind Kind of fma instruction (addressing mode) to be generated
+/// \param ReplacedAddend is the result register from the instruction
+/// replacing the non-combined operand, if any.
 static MachineInstr *
 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                  const TargetInstrInfo *TII, MachineInstr &Root,
                  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                  unsigned MaddOpc, const TargetRegisterClass *RC,
-                 FMAInstKind kind = FMAInstKind::Default) {
+                 FMAInstKind kind = FMAInstKind::Default,
+                 const unsigned *ReplacedAddend = nullptr) {
   assert(IdxMulOpd == 1 || IdxMulOpd == 2);

   unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
   ...
   bool Src0IsKill = MUL->getOperand(1).isKill();
   unsigned SrcReg1 = MUL->getOperand(2).getReg();
   bool Src1IsKill = MUL->getOperand(2).isKill();
-  unsigned SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
-  bool Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
+
+  unsigned SrcReg2;
+  bool Src2IsKill;
+  if (ReplacedAddend) {
+    // If we just generated a new addend, this FMA must be its only use.
+    SrcReg2 = *ReplacedAddend;
+    Src2IsKill = true;
+  } else {
+    SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
+    Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
+  }

   if (TargetRegisterInfo::isVirtualRegister(ResultReg))
     MRI.constrainRegClass(ResultReg, RC);
                             FMAInstKind::Accumulator);
     }
     break;
+  case MachineCombinerPattern::FMLSv2f32_OP1:
+  case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
+    RC = &AArch64::FPR64RegClass;
+    unsigned NewVR = MRI.createVirtualRegister(RC);
+    MachineInstrBuilder MIB1 =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
+            .add(Root.getOperand(2));
+    InsInstrs.push_back(MIB1);
+    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+    if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
+      Opc = AArch64::FMLAv2i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Indexed, &NewVR);
+    } else {
+      Opc = AArch64::FMLAv2f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Accumulator, &NewVR);
+    }
+    break;
+  }
+  case MachineCombinerPattern::FMLSv4f32_OP1:
+  case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
+    RC = &AArch64::FPR128RegClass;
+    unsigned NewVR = MRI.createVirtualRegister(RC);
+    MachineInstrBuilder MIB1 =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
+            .add(Root.getOperand(2));
+    InsInstrs.push_back(MIB1);
+    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+    if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
+      Opc = AArch64::FMLAv4i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Indexed, &NewVR);
+    } else {
+      Opc = AArch64::FMLAv4f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Accumulator, &NewVR);
+    }
+    break;
+  }
+  case MachineCombinerPattern::FMLSv2f64_OP1:
+  case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
+    RC = &AArch64::FPR128RegClass;
+    unsigned NewVR = MRI.createVirtualRegister(RC);
+    MachineInstrBuilder MIB1 =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
+            .add(Root.getOperand(2));
+    InsInstrs.push_back(MIB1);
+    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+    if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
+      Opc = AArch64::FMLAv2i64_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Indexed, &NewVR);
+    } else {
+      Opc = AArch64::FMLAv2f64;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Accumulator, &NewVR);
+    }
+    break;
+  }
   } // end switch (Pattern)
   // Record MUL and ADD/SUB for deletion
   DelInstrs.push_back(MUL);

# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math %s | FileCheck --check-prefix=UNPROFITABLE %s
# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s | FileCheck --check-prefix=PROFITABLE %s
# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynosm1 -enable-unsafe-fp-math %s | FileCheck --check-prefix=PROFITABLE %s
# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math %s | FileCheck --check-prefix=PROFITABLE %s
#
name: f1_2s
registers:
  - { id: 0, class: fpr64 }
  - { id: 1, class: fpr64 }
  - { id: 2, class: fpr64 }
  - { id: 3, class: fpr64 }
  - { id: 4, class: fpr64 }
body: |
  bb.0.entry:
    %2:fpr64 = COPY %d2
    %1:fpr64 = COPY %d1
    %0:fpr64 = COPY %d0
    %3:fpr64 = FMULv2f32 %0, %1
    %4:fpr64 = FSUBv2f32 killed %3, %2
    %d0 = COPY %4
    RET_ReallyLR implicit %d0

...
# UNPROFITABLE-LABEL: name: f1_2s
# UNPROFITABLE: %3:fpr64 = FMULv2f32 %0, %1
# UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2
#
# PROFITABLE-LABEL: name: f1_2s
# PROFITABLE: %5:fpr64 = FNEGv2f32 %2
# PROFITABLE-NEXT: FMLAv2f32 killed %5, %0, %1
---
name: f1_4s
registers:
  - { id: 0, class: fpr128 }
  - { id: 1, class: fpr128 }
  - { id: 2, class: fpr128 }
  - { id: 3, class: fpr128 }
  - { id: 4, class: fpr128 }
body: |
  bb.0.entry:
    %2:fpr128 = COPY %q2
    %1:fpr128 = COPY %q1
    %0:fpr128 = COPY %q0
    %3:fpr128 = FMULv4f32 %0, %1
    %4:fpr128 = FSUBv4f32 killed %3, %2
    %q0 = COPY %4
    RET_ReallyLR implicit %q0

...
# UNPROFITABLE-LABEL: name: f1_4s
# UNPROFITABLE: %3:fpr128 = FMULv4f32 %0, %1
# UNPROFITABLE-NEXT: FSUBv4f32 killed %3, %2
#
# PROFITABLE-LABEL: name: f1_4s
# PROFITABLE: %5:fpr128 = FNEGv4f32 %2
# PROFITABLE-NEXT: FMLAv4f32 killed %5, %0, %1
---
name: f1_2d
registers:
  - { id: 0, class: fpr128 }
  - { id: 1, class: fpr128 }
  - { id: 2, class: fpr128 }
  - { id: 3, class: fpr128 }
  - { id: 4, class: fpr128 }
body: |
  bb.0.entry:
    %2:fpr128 = COPY %q2
    %1:fpr128 = COPY %q1
    %0:fpr128 = COPY %q0
    %3:fpr128 = FMULv2f64 %0, %1
    %4:fpr128 = FSUBv2f64 killed %3, %2
    %q0 = COPY %4
    RET_ReallyLR implicit %q0

...
# UNPROFITABLE-LABEL: name: f1_2d
# UNPROFITABLE: %3:fpr128 = FMULv2f64 %0, %1
# UNPROFITABLE-NEXT: FSUBv2f64 killed %3, %2
#
# PROFITABLE-LABEL: name: f1_2d
# PROFITABLE: %5:fpr128 = FNEGv2f64 %2
# PROFITABLE-NEXT: FMLAv2f64 killed %5, %0, %1