llvm.org GIT mirror llvm / c207a75
Allow X86::COND_NE_OR_P and X86::COND_NP_OR_E to be reversed. Currently, AnalyzeBranch() fails non-equality comparison between floating points on X86 (see https://llvm.org/bugs/show_bug.cgi?id=23875). This is because this function can modify the branch by reversing the conditional jump and removing the unconditional jump if there is a proper fall-through. However, in the case of non-equality comparison between floating points, this can turn the branch "unanalyzable". Consider the following case: jne .BB1 jp .BB1 jmp .BB2 .BB1: ... .BB2: ... AnalyzeBranch() will reverse "jp .BB1" to "jnp .BB2" and then "jmp .BB2" will be removed: jne .BB1 jnp .BB2 .BB1: ... .BB2: ... However, AnalyzeBranch() cannot analyze this branch anymore as there are two conditional jumps with different targets. This may disable some optimizations like block-placement: in this case the fall-through behavior is enforced even if the fall-through block is very cold, which is suboptimal. Actually this optimization is also done in the block-placement pass, which means we can remove this optimization from AnalyzeBranch(). However, currently X86::COND_NE_OR_P and X86::COND_NP_OR_E are not reversible: there are no defined negation conditions for them. In order to reverse them, this patch defines two new CondCodes X86::COND_E_AND_NP and X86::COND_P_AND_NE. It also defines how to synthesize instructions for them. Here only the second conditional jump is reversed. This is valid as we only need them to do this "unconditional jump removal" optimization. Differential Revision: http://reviews.llvm.org/D11393 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@258847 91177308-0d34-0410-b5e6-96231b3b80d8 Cong Hou 4 years ago
6 changed file(s) with 163 addition(s) and 72 deletion(s). Raw diff Collapse all Expand all
38043804 case X86::COND_NP: return X86::COND_P;
38053805 case X86::COND_O: return X86::COND_NO;
38063806 case X86::COND_NO: return X86::COND_O;
3807 case X86::COND_NE_OR_P: return X86::COND_E_AND_NP;
3808 case X86::COND_NP_OR_E: return X86::COND_P_AND_NE;
3809 case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;
3810 case X86::COND_P_AND_NE: return X86::COND_NP_OR_E;
38073811 }
38083812 }
38093813
39974001 MachineBasicBlock::iterator OldInst = I;
39984002
39994003 BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
4000 .addMBB(UnCondBrIter->getOperand(0).getMBB());
4004 .addMBB(UnCondBrIter->getOperand(0).getMBB());
40014005 BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
4002 .addMBB(TargetBB);
4006 .addMBB(TargetBB);
40034007
40044008 OldInst->eraseFromParent();
40054009 UnCondBrIter->eraseFromParent();
40234027 assert(Cond.size() == 1);
40244028 assert(TBB);
40254029
4026 // Only handle the case where all conditional branches branch to the same
4027 // destination.
4028 if (TBB != I->getOperand(0).getMBB())
4029 return true;
4030
40314030 // If the conditions are the same, we can leave them alone.
40324031 X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
40334032 if (OldBranchCode == BranchCode)
40364035 // If they differ, see if they fit one of the known patterns. Theoretically,
40374036 // we could handle more patterns here, but we shouldn't expect to see them
40384037 // if instruction selection has done a reasonable job.
4039 if ((OldBranchCode == X86::COND_NP &&
4040 BranchCode == X86::COND_E) ||
4041 (OldBranchCode == X86::COND_E &&
4042 BranchCode == X86::COND_NP))
4038 auto NewTBB = I->getOperand(0).getMBB();
4039 if (TBB == NewTBB &&
4040 ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_E) ||
4041 (OldBranchCode == X86::COND_E && BranchCode == X86::COND_NP))) {
40434042 BranchCode = X86::COND_NP_OR_E;
4044 else if ((OldBranchCode == X86::COND_P &&
4045 BranchCode == X86::COND_NE) ||
4046 (OldBranchCode == X86::COND_NE &&
4047 BranchCode == X86::COND_P))
4043 } else if (TBB == NewTBB &&
4044 ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
4045 (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
40484046 BranchCode = X86::COND_NE_OR_P;
4049 else
4047 } else if ((OldBranchCode == X86::COND_NE && BranchCode == X86::COND_NP) ||
4048 (OldBranchCode == X86::COND_P && BranchCode == X86::COND_E)) {
4049 // X86::COND_P_AND_NE usually has two different branch destinations.
4050 //
4051 // JNP B1
4052 // JNE B2
4053 // B1: (fall-through)
4054 // B2:
4055 //
4056 // Here this condition branches to B2 only if P && NE. It has another
4057 // equivalent form:
4058 //
4059 // JE B1
4060 // JP B2
4061 // B1: (fall-through)
4062 // B2:
4063 //
4064 // Similarly it branches to B2 only if NE && P. That is why this condition
4065 // is named COND_P_AND_NE.
4066 BranchCode = X86::COND_P_AND_NE;
4067 } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
4068 (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
4069 // See comments above for X86::COND_P_AND_NE.
4070 BranchCode = X86::COND_E_AND_NP;
4071 } else
40504072 return true;
40514073
40524074 // Update the MachineOperand.
41554177 return Count;
41564178 }
41574179
4180 static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB) {
4181 auto I = std::next(MBB->getIterator());
4182 if (I == MBB->getParent()->end())
4183 return nullptr;
4184 return &*I;
4185 }
4186
41584187 unsigned
41594188 X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
41604189 MachineBasicBlock *FBB, ArrayRef Cond,
41704199 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
41714200 return 1;
41724201 }
4202
4203 // If FBB is null, it is implied to be a fall-through block.
4204 bool FallThru = FBB == nullptr;
41734205
41744206 // Conditional branch.
41754207 unsigned Count = 0;
41894221 BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB);
41904222 ++Count;
41914223 break;
4224 case X86::COND_P_AND_NE:
4225 // Use the next block of MBB as FBB if it is null.
4226 if (FBB == nullptr) {
4227 FBB = getFallThroughMBB(&MBB);
4228 assert(FBB && "MBB cannot be the last block in function when the false "
4229 "body is a fall-through.");
4230 }
4231 // Synthesize COND_P_AND_NE (the negation of COND_NP_OR_E) with two branches.
4232 BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(FBB);
4233 ++Count;
4234 BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB);
4235 ++Count;
4236 break;
4237 case X86::COND_E_AND_NP:
4238 // Use the next block of MBB as FBB if it is null.
4239 if (FBB == nullptr) {
4240 FBB = getFallThroughMBB(&MBB);
4241 assert(FBB && "MBB cannot be the last block in function when the false "
4242 "body is a fall-through.");
4243 }
4244 // Synthesize COND_E_AND_NP (the negation of COND_NE_OR_P) with two branches.
4245 BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB);
4246 ++Count;
4247 BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB);
4248 ++Count;
4249 break;
41924250 default: {
41934251 unsigned Opc = GetCondBranchFromCond(CC);
41944252 BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
41954253 ++Count;
41964254 }
41974255 }
4198 if (FBB) {
4256 if (!FallThru) {
41994257 // Two-way Conditional branch. Insert the second branch.
42004258 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
42014259 ++Count;
67166774 ReverseBranchCondition(SmallVectorImpl &Cond) const {
67176775 assert(Cond.size() == 1 && "Invalid X86 branch condition!");
67186776 X86::CondCode CC = static_cast(Cond[0].getImm());
6719 if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
6720 return true;
67216777 Cond[0].setImm(GetOppositeBranchCondition(CC));
67226778 return false;
67236779 }
2828 namespace X86 {
2929 // X86 specific condition code. These correspond to X86_*_COND in
3030 // X86InstrInfo.td. They must be kept in synch.
31 enum CondCode {
32 COND_A = 0,
33 COND_AE = 1,
34 COND_B = 2,
35 COND_BE = 3,
36 COND_E = 4,
37 COND_G = 5,
38 COND_GE = 6,
39 COND_L = 7,
40 COND_LE = 8,
41 COND_NE = 9,
42 COND_NO = 10,
43 COND_NP = 11,
44 COND_NS = 12,
45 COND_O = 13,
46 COND_P = 14,
47 COND_S = 15,
48 LAST_VALID_COND = COND_S,
49
50 // Artificial condition codes. These are used by AnalyzeBranch
51 // to indicate a block terminated with two conditional branches to
52 // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
53 // which can't be represented on x86 with a single condition. These
54 // are never used in MachineInstrs.
55 COND_NE_OR_P,
56 COND_NP_OR_E,
57
58 COND_INVALID
59 };
31 enum CondCode {
32 COND_A = 0,
33 COND_AE = 1,
34 COND_B = 2,
35 COND_BE = 3,
36 COND_E = 4,
37 COND_G = 5,
38 COND_GE = 6,
39 COND_L = 7,
40 COND_LE = 8,
41 COND_NE = 9,
42 COND_NO = 10,
43 COND_NP = 11,
44 COND_NS = 12,
45 COND_O = 13,
46 COND_P = 14,
47 COND_S = 15,
48 LAST_VALID_COND = COND_S,
49
50 // Artificial condition codes. These are used by AnalyzeBranch
51 // to indicate a block terminated with two conditional branches to
52 // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
53 // which can't be represented on x86 with a single condition. These
54 // are never used in MachineInstrs.
55 COND_NE_OR_P,
56 COND_NP_OR_E,
57
58 // Artificial condition codes. These are used to represent the negation of
59 // the above two conditions. The only scenario in which we need them is
60 // when we reverse the above two conditions in order to remove redundant
61 // unconditional jumps. Note that both the true and false bodies need to be
62 // available in order to correctly synthesize instructions for them. These
63 // are never used in MachineInstrs.
64 COND_E_AND_NP, // negation of COND_NE_OR_P
65 COND_P_AND_NE, // negation of COND_NP_OR_E
66
67 COND_INVALID
68 };
6069
6170 // Turn condition code into conditional branch opcode.
6271 unsigned GetCondBranchFromCond(CondCode CC);
462462 }
463463
464464 define void @fpcmp_unanalyzable_branch(i1 %cond) {
465 ; This function's CFG contains an unanalyzable branch that is likely to be
466 ; split due to having a different high-probability predecessor.
465 ; This function's CFG contains a once-unanalyzable branch (une on floating
466 ; points). Now that it is analyzable, we should get the best layout in which each
467 ; edge in 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end' is
468 ; fall-through.
467469 ; CHECK: fpcmp_unanalyzable_branch
468470 ; CHECK: %entry
469 ; CHECK: %exit
470 ; CHECK-NOT: %if.then
471 ; CHECK-NOT: %if.end
472 ; CHECK-NOT: jne
473 ; CHECK-NOT: jnp
471 ; CHECK: %entry.if.then_crit_edge
472 ; CHECK: %if.then
473 ; CHECK: %if.end
474 ; CHECK: %exit
474475 ; CHECK: jne
475476 ; CHECK-NEXT: jnp
476 ; CHECK-NEXT: %if.then
477477
478478 entry:
479479 ; Note that this branch must be strongly biased toward
480480 ; 'entry.if.then_crit_edge' to ensure that we would try to form a chain for
481 ; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then'. It is the last edge in that
482 ; chain which would violate the unanalyzable branch in 'exit', but we won't even
483 ; try this trick unless 'if.then' is believed to almost always be reached from
484 ; 'entry.if.then_crit_edge'.
481 ; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end'.
485482 br i1 %cond, label %entry.if.then_crit_edge, label %lor.lhs.false, !prof !1
486483
487484 entry.if.then_crit_edge:
493490
494491 exit:
495492 %cmp.i = fcmp une double 0.000000e+00, undef
496 br i1 %cmp.i, label %if.then, label %if.end
493 br i1 %cmp.i, label %if.then, label %if.end, !prof !3
497494
498495 if.then:
499496 %0 = phi i8 [ %.pre14, %entry.if.then_crit_edge ], [ undef, %exit ]
506503 }
507504
508505 !1 = !{!"branch_weights", i32 1000, i32 1}
506 !3 = !{!"branch_weights", i32 1, i32 1000}
509507
510508 declare i32 @f()
511509 declare i32 @g()
664662 ; Ensure that we can handle unanalyzable branches where the destination block
665663 ; gets selected as the optimal successor to merge.
666664 ;
665 ; This branch is now analyzable and hence the destination block becomes the
666 ; hotter one. The right order is entry->bar->exit->foo.
667 ;
667668 ; CHECK: unanalyzable_branch_to_best_succ
668669 ; CHECK: %entry
670 ; CHECK: %bar
671 ; CHECK: %exit
669672 ; CHECK: %foo
670 ; CHECK: %bar
671 ; CHECK: %exit
672673
673674 entry:
674675 ; Bias this branch toward bar to ensure we form that chain.
44 ; CHECK-LABEL: fcmp_oeq
55 ; CHECK: ucomiss %xmm1, %xmm0
66 ; CHECK-NEXT: jne {{LBB.+_1}}
7 ; CHECK-NEXT: jnp {{LBB.+_2}}
7 ; CHECK-NEXT: jp {{LBB.+_1}}
88 %1 = fcmp oeq float %x, %y
99 br i1 %1, label %bb1, label %bb2
1010 bb2:
161161 ; CHECK-LABEL: fcmp_une
162162 ; CHECK: ucomiss %xmm1, %xmm0
163163 ; CHECK-NEXT: jne {{LBB.+_2}}
164 ; CHECK-NEXT: jp {{LBB.+_2}}
165 ; CHECK-NEXT: jmp {{LBB.+_1}}
164 ; CHECK-NEXT: jnp {{LBB.+_1}}
166165 %1 = fcmp une float %x, %y
167166 br i1 %1, label %bb1, label %bb2
168167 bb2:
1616 ; CHECK: xorps %xmm1, %xmm1
1717 ; CHECK-NEXT: ucomiss %xmm1, %xmm0
1818 ; CHECK-NEXT: jne {{LBB.+_1}}
19 ; CHECK-NEXT: jnp {{LBB.+_2}}
19 ; CHECK-NEXT: jp {{LBB.+_1}}
2020 %1 = fcmp oeq float %x, 0.000000e+00
2121 br i1 %1, label %bb1, label %bb2
2222 bb2:
337337 ; CHECK: xorps %xmm1, %xmm1
338338 ; CHECK-NEXT: ucomiss %xmm1, %xmm0
339339 ; CHECK-NEXT: jne {{LBB.+_2}}
340 ; CHECK-NEXT: jp {{LBB.+_2}}
341 ; CHECK-NEXT: jmp {{LBB.+_1}}
340 ; CHECK-NEXT: jnp {{LBB.+_1}}
342341 %1 = fcmp une float %x, 0.000000e+00
343342 br i1 %1, label %bb1, label %bb2
344343 bb2:
1818 ; addsd ...
1919 ; LBB0_2:
2020
21 ; CHECK: func
21 define float @func1(float %x, float %y) nounwind readnone optsize ssp {
22 ; CHECK: func1
2223 ; CHECK: jne [[LABEL:.*]]
2324 ; CHECK-NEXT: jp [[LABEL]]
2425 ; CHECK-NOT: jmp
25
26 define float @func(float %x, float %y) nounwind readnone optsize ssp {
26 ;
2727 entry:
2828 %0 = fpext float %x to double
2929 %1 = fpext float %y to double
4040 %.0 = fptrunc double %.0.in to float
4141 ret float %.0
4242 }
43
44 define float @func2(float %x, float %y) nounwind readnone optsize ssp {
45 ; CHECK: func2
46 ; CHECK: jne [[LABEL:.*]]
47 ; CHECK-NEXT: jp [[LABEL]]
48 ; CHECK: %bb2
49 ; CHECK: %bb1
50 ; CHECK: jmp
51 ;
52 entry:
53 %0 = fpext float %x to double
54 %1 = fpext float %y to double
55 %2 = fmul double %0, %1
56 %3 = fcmp une double %2, 0.000000e+00
57 br i1 %3, label %bb1, label %bb2, !prof !1
58
59 bb1:
60 %4 = fadd double %2, -1.000000e+00
61 br label %bb2
62
63 bb2:
64 %.0.in = phi double [ %4, %bb1 ], [ %2, %entry ]
65 %.0 = fptrunc double %.0.in to float
66 ret float %.0
67 }
68
69 !1 = !{!"branch_weights", i32 1, i32 1000}