llvm.org GIT mirror llvm / 584fedf
Teach two-address lowering how to unfold a load to open up commuting opportunities. For example, this lets it emit this:

    movq (%rax), %rcx
    addq %rdx, %rcx

instead of this:

    movq %rdx, %rcx
    addq (%rax), %rcx

in the case where %rdx has subsequent uses. It's the same number of instructions, and usually the same encoding size on x86, but it appears faster, and in general, it may allow better scheduling for the load.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@106493 91177308-0d34-0410-b5e6-96231b3b80d8

Dan Gohman 9 years ago
6 changed file(s) with 105 addition(s) and 18 deletion(s). Raw diff Collapse all Expand all
897897 }
898898 }
899899 }
900
901 // If this is an instruction with a load folded into it, try unfolding
902 // the load, e.g. avoid this:
903 // movq %rdx, %rcx
904 // addq (%rax), %rcx
905 // in favor of this:
906 // movq (%rax), %rcx
907 // addq %rdx, %rcx
908 // because it's preferable to schedule a load than a register copy.
909 if (TID.mayLoad() && !regBKilled) {
910 // Determine if a load can be unfolded.
911 unsigned LoadRegIndex;
912 unsigned NewOpc =
913 TII->getOpcodeAfterMemoryUnfold(mi->getOpcode(),
914 /*UnfoldLoad=*/true,
915 /*UnfoldStore=*/false,
916 &LoadRegIndex);
917 if (NewOpc != 0) {
918 const TargetInstrDesc &UnfoldTID = TII->get(NewOpc);
919 if (UnfoldTID.getNumDefs() == 1) {
920 MachineFunction &MF = *mbbi->getParent();
921
922 // Unfold the load.
923 DEBUG(dbgs() << "2addr: UNFOLDING: " << *mi);
924 const TargetRegisterClass *RC =
925 UnfoldTID.OpInfo[LoadRegIndex].getRegClass(TRI);
926 unsigned Reg = MRI->createVirtualRegister(RC);
927 SmallVector<MachineInstr *, 2> NewMIs;
928 bool Success =
929 TII->unfoldMemoryOperand(MF, mi, Reg,
930 /*UnfoldLoad=*/true, /*UnfoldStore=*/false,
931 NewMIs);
932 (void)Success;
933 assert(Success &&
934 "unfoldMemoryOperand failed when getOpcodeAfterMemoryUnfold "
935 "succeeded!");
936 assert(NewMIs.size() == 2 &&
937 "Unfolded a load into multiple instructions!");
938 // The load was previously folded, so this is the only use.
939 NewMIs[1]->addRegisterKilled(Reg, TRI);
940
941 // Tentatively insert the instructions into the block so that they
942 // look "normal" to the transformation logic.
943 mbbi->insert(mi, NewMIs[0]);
944 mbbi->insert(mi, NewMIs[1]);
945
946 DEBUG(dbgs() << "2addr: NEW LOAD: " << *NewMIs[0]
947 << "2addr: NEW INST: " << *NewMIs[1]);
948
949 // Transform the instruction, now that it no longer has a load.
950 unsigned NewDstIdx = NewMIs[1]->findRegisterDefOperandIdx(regA);
951 unsigned NewSrcIdx = NewMIs[1]->findRegisterUseOperandIdx(regB);
952 MachineBasicBlock::iterator NewMI = NewMIs[1];
953 bool TransformSuccess =
954 TryInstructionTransform(NewMI, mi, mbbi,
955 NewSrcIdx, NewDstIdx, Dist);
956 if (TransformSuccess ||
957 NewMIs[1]->getOperand(NewSrcIdx).isKill()) {
958 // Success, or at least we made an improvement. Keep the unfolded
959 // instructions and discard the original.
960 if (LV) {
961 for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
962 MachineOperand &MO = mi->getOperand(i);
963 if (MO.isReg() && MO.isUse() && MO.isKill())
964 LV->replaceKillInstruction(Reg, mi, NewMIs[0]);
965 }
966 LV->addVirtualRegisterKilled(Reg, NewMIs[1]);
967 }
968 mi->eraseFromParent();
969 mi = NewMIs[1];
970 if (TransformSuccess)
971 return true;
972 } else {
973 // Transforming didn't eliminate the tie and didn't lead to an
974 // improvement. Clean up the unfolded instructions and keep the
975 // original.
976 DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
977 NewMIs[0]->eraseFromParent();
978 NewMIs[1]->eraseFromParent();
979 }
980 }
981 }
982 }
983
900984 return false;
901985 }
902986
1010 %tmp14 = fadd float %tmp12, %tmp7
1111 ret float %tmp14
1212
13 ; CHECK: mulss LCPI0_0(%rip)
14 ; CHECK: mulss LCPI0_1(%rip)
13 ; CHECK: mulss
14 ; CHECK: mulss
1515 ; CHECK: addss
16 ; CHECK: mulss LCPI0_2(%rip)
16 ; CHECK: mulss
1717 ; CHECK: addss
18 ; CHECK: mulss LCPI0_3(%rip)
18 ; CHECK: mulss
1919 ; CHECK: addss
2020 ; CHECK: ret
2121 }
464464 ; And the one at %bb68, where we want to be sure to use superhero mode:
465465
466466 ; CHECK: BB10_10:
467 ; CHECK-NEXT: movaps %xmm{{.*}}, %xmm{{.*}}
468 ; CHECK-NEXT: mulps 48(%r{{[^,]*}}), %xmm{{.*}}
469 ; CHECK-NEXT: movaps %xmm{{.*}}, %xmm{{.*}}
470 ; CHECK-NEXT: mulps 32(%r{{[^,]*}}), %xmm{{.*}}
471 ; CHECK-NEXT: movaps %xmm{{.*}}, %xmm{{.*}}
472 ; CHECK-NEXT: mulps 16(%r{{[^,]*}}), %xmm{{.*}}
473 ; CHECK-NEXT: movaps %xmm{{.*}}, %xmm{{.*}}
474 ; CHECK-NEXT: mulps (%r{{[^,]*}}), %xmm{{.*}}
467 ; CHECK-NEXT: movaps 48(%r{{[^,]*}}), %xmm{{.*}}
468 ; CHECK-NEXT: mulps %xmm{{.*}}, %xmm{{.*}}
469 ; CHECK-NEXT: movaps 32(%r{{[^,]*}}), %xmm{{.*}}
470 ; CHECK-NEXT: mulps %xmm{{.*}}, %xmm{{.*}}
471 ; CHECK-NEXT: movaps 16(%r{{[^,]*}}), %xmm{{.*}}
472 ; CHECK-NEXT: mulps %xmm{{.*}}, %xmm{{.*}}
473 ; CHECK-NEXT: movaps (%r{{[^,]*}}), %xmm{{.*}}
474 ; CHECK-NEXT: mulps %xmm{{.*}}, %xmm{{.*}}
475475 ; CHECK-NEXT: movaps %xmm{{.*}}, (%r{{[^,]*}})
476476 ; CHECK-NEXT: movaps %xmm{{.*}}, 16(%r{{[^,]*}})
477477 ; CHECK-NEXT: movaps %xmm{{.*}}, 32(%r{{[^,]*}})
188188 ; LINUX: call .L7$pb
189189 ; LINUX: .L7$pb:
190190 ; LINUX: addl $_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L7$pb),
191 ; LINUX: addl .LJTI7_0@GOTOFF(
191 ; LINUX: .LJTI7_0@GOTOFF(
192192 ; LINUX: jmpl *
193193
194194 ; LINUX: .LJTI7_0:
88
99 define void @test({ double, double }* byval %z, double* %P) {
1010 entry:
11 %tmp3 = load double* @G, align 16 ; [#uses=1]
12 %tmp4 = tail call double @fabs( double %tmp3 ) ; [#uses=1]
13 volatile store double %tmp4, double* %P
1114 %tmp = getelementptr { double, double }* %z, i32 0, i32 0 ; [#uses=1]
12 %tmp1 = load double* %tmp, align 8 ; [#uses=1]
15 %tmp1 = volatile load double* %tmp, align 8 ; [#uses=1]
1316 %tmp2 = tail call double @fabs( double %tmp1 ) ; [#uses=1]
1417 ; CHECK: andpd{{.*}}4(%esp), %xmm
15 %tmp3 = load double* @G, align 16 ; [#uses=1]
16 %tmp4 = tail call double @fabs( double %tmp3 ) ; [#uses=1]
1718 %tmp6 = fadd double %tmp4, %tmp2 ; [#uses=1]
18 store double %tmp6, double* %P, align 8
19 volatile store double %tmp6, double* %P, align 8
1920 ret void
2021 }
2122
11
22 ; Check that lowered arguments on the stack do not overwrite each other.
33 ; Add %in1 %p1 to a different temporary register (%eax).
4 ; CHECK: movl %edi, %eax
4 ; CHECK: movl 32(%rsp), %eax
55 ; Move param %in1 to temp register (%r10d).
66 ; CHECK: movl 40(%rsp), %r10d
7 ; Add %in1 %p1 to a different temporary register (%eax).
8 ; CHECK: addl %edi, %eax
79 ; Move param %in2 to stack.
810 ; CHECK: movl %r10d, 32(%rsp)
911 ; Move result of addition to stack.