llvm.org GIT mirror llvm / 5dde893
Avoid some 16-bit 's' instructions which partially update CPSR (and add a false dependency) when the instruction isn't dependent on the last CPSR-defining instruction. rdar://8928208 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@129773 91177308-0d34-0410-b5e6-96231b3b80d8 Bob Wilson 9 years ago
5 changed file(s) with 198 addition(s) and 86 deletion(s).
6666 def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
6767 "Prefer 32-bit Thumb instrs">;
6868
69 /// Some instructions update CPSR partially, which can add a false dependency
70 /// for out-of-order implementations, e.g. Cortex-A9, unless each individual
71 /// bit is mapped to a separate physical register. Avoid partial CPSR updates
72 /// for these processors.
73 def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr",
74 "AvoidCPSRPartialUpdate", "true",
75 "Avoid CPSR partial update for OOO execution">;
76
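The comment above can be made concrete with a small stand-alone sketch (illustrative only, not part of this commit): a flag-setting 16-bit instruction rewrites only the NZCV field of CPSR, so producing the new CPSR value requires reading the old one. On an out-of-order core that does not rename the flag fields individually, every such 's' instruction therefore becomes a consumer of the previous CPSR definition.

// Hypothetical illustration, not LLVM code: a partial flag update is a
// read-modify-write of CPSR, which is the false-dependency chain the new
// feature tries to avoid.
#include <cstdint>
#include <cstdio>

static uint32_t updateNZCV(uint32_t OldCPSR, bool N, bool Z, bool C, bool V) {
  const uint32_t NZCVMask = 0xF0000000u;        // bits 31..28: N, Z, C, V
  uint32_t Flags = (uint32_t(N) << 31) | (uint32_t(Z) << 30) |
                   (uint32_t(C) << 29) | (uint32_t(V) << 28);
  return (OldCPSR & ~NZCVMask) | Flags;         // must read OldCPSR: partial update
}

int main() {
  uint32_t CPSR = 0x000F0010u;                  // pretend GE/mode bits are set
  CPSR = updateNZCV(CPSR, false, true, false, false);  // e.g. a 'subs' hitting zero
  std::printf("CPSR = %#010x\n", CPSR);         // GE/mode bits survive: 0x400f0010
  return 0;
}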
6977 // Multiprocessing extension.
7078 def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
7179 "Supports Multiprocessing extension">;
110118 def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
111119 "Cortex-A9 ARM processors",
112120 [FeatureHasSlowFPVMLx, FeatureVMLxForwarding,
113 FeatureT2XtPk, FeatureFP16]>;
121 FeatureT2XtPk, FeatureFP16,
122 FeatureAvoidPartialCPSR]>;
114123
115124 class ProcNoItin<string Name, list<SubtargetFeature> Features>
116125  : Processor<Name, GenericItineraries, Features>;
5151 , HasT2ExtractPack(false)
5252 , HasDataBarrier(false)
5353 , Pref32BitThumb(false)
54 , AvoidCPSRPartialUpdate(false)
5455 , HasMPExtension(false)
5556 , FPOnlySP(false)
5657 , AllowsUnalignedMem(false)
108108 /// Pref32BitThumb - If true, codegen would prefer 32-bit Thumb instructions
109109 /// over 16-bit ones.
110110 bool Pref32BitThumb;
111
112 /// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions
113 /// that partially update CPSR and add a false dependency on the previous
114 /// CPSR-setting instruction.
115 bool AvoidCPSRPartialUpdate;
111116
112117 /// HasMPExtension - True if the subtarget supports Multiprocessing
113118 /// extension (ARMv7 only).
189194 bool isFPBrccSlow() const { return SlowFPBrcc; }
190195 bool isFPOnlySP() const { return FPOnlySP; }
191196 bool prefers32BitThumb() const { return Pref32BitThumb; }
197 bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
192198 bool hasMPExtension() const { return HasMPExtension; }
193199
194200 bool hasFP16() const { return HasFP16; }
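How the feature string reaches this accessor is worth a quick illustration. The following is a simplified mock with hypothetical names, not the TableGen-generated subtarget code: requesting "+avoid-partial-cpsr", whether given explicitly via -mattr or implied by -mcpu=cortex-a9, sets the AvoidCPSRPartialUpdate member that avoidCPSRPartialUpdate() reports.

#include <cstdio>
#include <cstring>

struct MiniSubtarget {
  bool AvoidCPSRPartialUpdate;          // mirrors the member added above
  MiniSubtarget() : AvoidCPSRPartialUpdate(false) {}
  bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
  void applyFeature(const char *F) {    // stand-in for the generated feature parser
    if (std::strcmp(F, "+avoid-partial-cpsr") == 0)
      AvoidCPSRPartialUpdate = true;
  }
};

int main() {
  MiniSubtarget ST;
  ST.applyFeature("+avoid-partial-cpsr");   // as FeatureAvoidPartialCPSR would imply
  std::printf("avoid partial CPSR updates: %d\n", ST.avoidCPSRPartialUpdate());
  return 0;
}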
1111 #include "ARMAddressingModes.h"
1212 #include "ARMBaseRegisterInfo.h"
1313 #include "ARMBaseInstrInfo.h"
14 #include "ARMSubtarget.h"
1415 #include "Thumb2InstrInfo.h"
1516 #include "llvm/CodeGen/MachineInstr.h"
1617 #include "llvm/CodeGen/MachineInstrBuilder.h"
4849 // 1 - No cc field.
4950 // 2 - Always set CPSR.
5051 unsigned PredCC2 : 2;
52 unsigned PartFlag : 1; // 16-bit instruction does partial flag update
5153 unsigned Special : 1; // Needs to be dealt with specially
5254 };
5355
5456 static const ReduceEntry ReduceTable[] = {
55 // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C, S
56 { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0 },
57 { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0 },
58 { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0 },
57 // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C, PF, S
58 { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0,0 },
59 { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0,0 },
60 { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0,0 },
5961 // Note: immediate scale is 4.
60 { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 1 },
61 { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 1 },
62 { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 1 },
63 { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 0 },
64 { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 0 },
65 { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 0 },
66 { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 0 },
62 { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 0,1 },
63 { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 0,1 },
64 { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 0,1 },
65 { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 1,0 },
66 { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 1,0 },
67 { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 1,0 },
68 { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 1,0 },
6769 //FIXME: Disable CMN, as CCodes are backwards from compare expectations
68 //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0 },
69 { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0 },
70 { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 1 },
71 { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 0 },
70 //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0,0 },
71 { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0,0 },
72 { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0,1 },
73 { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 1,0 },
7274 // FIXME: adr.n immediate offset must be multiple of 4.
73 //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0 },
74 { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 0 },
75 { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 0 },
76 { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 0 },
77 { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 0 },
78 { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 },
79 { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1 },
75 //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0,0 },
76 { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 1,0 },
77 { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 1,0 },
78 { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 1,0 },
79 { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 1,0 },
80 // FIXME: tMOVi8 and tMVN also partially update CPSR, but they are less
81 // likely to cause an issue in a loop. As a size / performance trade-off,
82 // they are not marked as such.
83 { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,0 },
84 { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,1 },
8085 // FIXME: Do we need the 16-bit 'S' variant?
81 { ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0 },
82 { ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0 },
83 { ARM::t2MOVCCi,0, ARM::tMOVCCi, 0, 8, 0, 1, 0,1, 0 },
84 { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 0 },
85 { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0 },
86 { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 0 },
87 { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0 },
88 { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0 },
89 { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0 },
90 { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 0 },
91 { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 1 },
92 { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 1 },
93 { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0 },
94 { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0 },
95 { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0 },
96 { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0 },
97 { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0 },
98 { ARM::t2SXTBr, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0 },
99 { ARM::t2SXTHr, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0 },
100 { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0 },
101 { ARM::t2UXTBr, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0 },
102 { ARM::t2UXTHr, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0 },
86 { ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0,0 },
87 { ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0,0 },
88 { ARM::t2MOVCCi,0, ARM::tMOVCCi, 0, 8, 0, 1, 0,1, 0,0 },
89 { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 1,0 },
90 { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0,0 },
91 { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 1,0 },
92 { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0,0 },
93 { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0,0 },
94 { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0,0 },
95 { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 1,0 },
96 { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 0,1 },
97 { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 0,1 },
98 { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0,0 },
99 { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0,0 },
100 { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0,0 },
101 { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0,0 },
102 { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0,0 },
103 { ARM::t2SXTBr, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0,0 },
104 { ARM::t2SXTHr, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0,0 },
105 { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0,0 },
106 { ARM::t2UXTBr, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0,0 },
107 { ARM::t2UXTHr, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0,0 },
103108
104109 // FIXME: Clean this up after splitting each Thumb load / store opcode
105110 // into multiple ones.
106 { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 1 },
107 { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 1 },
108 { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 1 },
109 { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 1 },
110 { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 1 },
111 { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 1 },
112 { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 1 },
113 { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 1 },
114 { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 1 },
115 { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 1 },
116 { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 1 },
117 { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 1 },
118 { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 1 },
119 { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 1 },
120
121 { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 1 },
122 { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 1 },
123 { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 1 },
111 { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 0,1 },
112 { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 0,1 },
113 { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 0,1 },
114 { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 0,1 },
115 { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 0,1 },
116 { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 0,1 },
117 { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 0,1 },
118 { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 0,1 },
119 { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 0,1 },
120 { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 0,1 },
121 { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 0,1 },
122 { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 0,1 },
123 { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 0,1 },
124 { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 0,1 },
125
126 { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1 },
127 { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1 },
128 { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1 },
124129 // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent
125 { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 1 },
126 { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 1 },
130 { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1 },
131 { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1 },
127132 };
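For readers decoding the new PF column, here is a hedged mock (field names follow the ReduceEntry struct above, values copied from the t2MUL row) that spells out how one row of the table is read:

#include <cstdio>

struct MockEntry {                // mirrors ReduceEntry's field order above
  const char *WideOpc;
  const char *NarrowOpc1;         // 16-bit non-two-address opcode (0 if none)
  const char *NarrowOpc2;         // 16-bit two-address opcode (0 if none)
  unsigned Imm1Limit, Imm2Limit;  // immediate width limits, in bits
  unsigned LowRegs1 : 1, LowRegs2 : 1;
  unsigned PredCC1 : 2, PredCC2 : 2;
  unsigned PartFlag : 1;          // new column: 16-bit form partially updates CPSR
  unsigned Special  : 1;          // handled in ReduceSpecial
};

int main() {
  // The t2MUL row: only a two-address narrow form (tMUL), operands must be in
  // low registers for it, and PartFlag is set because the 16-bit tMUL always
  // writes the flags when it is not inside an IT block.
  MockEntry MulRow = { "t2MUL", 0, "tMUL", 0, 0, 0, 1, 0, 0, 1, 0 };
  std::printf("%s -> %s (PartFlag=%u, Special=%u)\n",
              MulRow.WideOpc, MulRow.NarrowOpc2, MulRow.PartFlag, MulRow.Special);
  return 0;
}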
128133
129134 class Thumb2SizeReduce : public MachineFunctionPass {
132137 Thumb2SizeReduce();
133138
134139 const Thumb2InstrInfo *TII;
140 const ARMSubtarget *STI;
135141
136142 virtual bool runOnMachineFunction(MachineFunction &MF);
137143
143149 /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
144150 DenseMap<unsigned, unsigned> ReduceOpcodeMap;
145151
152 bool canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use);
153
146154 bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
147155 bool is2Addr, ARMCC::CondCodes Pred,
148156 bool LiveCPSR, bool &HasCC, bool &CCDead);
151159 const ReduceEntry &Entry);
152160
153161 bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
154 const ReduceEntry &Entry, bool LiveCPSR);
162 const ReduceEntry &Entry, bool LiveCPSR,
163 MachineInstr *CPSRDef);
155164
156165 /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
157166 /// instruction.
158167 bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
159168 const ReduceEntry &Entry,
160 bool LiveCPSR);
169 bool LiveCPSR, MachineInstr *CPSRDef);
161170
162171 /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
163172 /// non-two-address instruction.
164173 bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
165174 const ReduceEntry &Entry,
166 bool LiveCPSR);
175 bool LiveCPSR, MachineInstr *CPSRDef);
167176
168177 /// ReduceMBB - Reduce width of instructions in the specified basic block.
169178 bool ReduceMBB(MachineBasicBlock &MBB);
184193 if (*Regs == ARM::CPSR)
185194 return true;
186195 return false;
196 }
197
198 /// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations,
199 /// the 's' forms of 16-bit instructions partially update CPSR. Abort the
200 /// transformation to avoid adding a false dependency on the last CPSR-setting
201 /// instruction, which hurts the out-of-order execution engine's ability to
202 /// do register renaming.
203 /// This function checks if there is a read-after-write dependency between
204 /// the last instruction that defines the CPSR and the current instruction.
205 /// If there is, then there is no harm done since the instruction cannot be
206 /// retired before the CPSR-setting instruction anyway.
207 /// Note: we are not doing full dependency analysis here for the sake of
208 /// compile time. We're not looking for cases like:
209 /// r0 = muls ...
210 /// r1 = add.w r0, ...
211 /// ...
212 ///    = mul.w r1
213 /// In this case it would have been ok to narrow the mul.w to muls since there
214 /// is an indirect RAW dependency between the muls and the mul.w.
215 bool
216 Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use) {
217 if (!Def || !STI->avoidCPSRPartialUpdate())
218 return false;
219
220 SmallSet<unsigned, 2> Defs;
221 for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
222 const MachineOperand &MO = Def->getOperand(i);
223 if (!MO.isReg() || MO.isUndef() || MO.isUse())
224 continue;
225 unsigned Reg = MO.getReg();
226 if (Reg == 0 || Reg == ARM::CPSR)
227 continue;
228 Defs.insert(Reg);
229 }
230
231 for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
232 const MachineOperand &MO = Use->getOperand(i);
233 if (!MO.isReg() || MO.isUndef() || MO.isDef())
234 continue;
235 unsigned Reg = MO.getReg();
236 if (Defs.count(Reg))
237 return false;
238 }
239
240 // No read-after-write dependency. The narrowing will add a false dependency.
241 return true;
187242 }
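The same decision can be restated over plain data structures. This is a self-contained sketch with hypothetical types, not the LLVM API, but it follows the function above: return true exactly when the narrowed instruction has no register-level RAW dependency on the last CPSR writer, i.e. when the implicit flag write would be the only thing ordering the two.

#include <cstdio>
#include <set>
#include <vector>

// Registers written / read by an instruction; a stand-in for MachineOperand scans.
struct FakeInstr {
  std::vector<unsigned> Defs;
  std::vector<unsigned> Uses;
};

static bool wouldAddFalseFlagDep(const FakeInstr *Def, const FakeInstr *Use,
                                 bool AvoidPartialCPSR) {
  if (!Def || !AvoidPartialCPSR)
    return false;                              // nothing to protect against
  std::set<unsigned> Written(Def->Defs.begin(), Def->Defs.end());
  for (unsigned R : Use->Uses)
    if (Written.count(R))
      return false;                            // true RAW dep already orders them
  return true;                                 // narrowing would add a false CPSR dep
}

int main() {
  enum { R0 = 0, R1, R2, R3 };
  FakeInstr Subs     = { { R2 }, { R3 } };      // subs r2, r3, #1  (also writes CPSR)
  FakeInstr AddDep   = { { R0 }, { R2, R1 } };  // add.w r0, r2, r1: already depends on r2
  FakeInstr AddIndep = { { R0 }, { R3, R1 } };  // add.w r0, r3, r1: no dep on the subs
  std::printf("dep case: %d, indep case: %d\n",
              wouldAddFalseFlagDep(&Subs, &AddDep, true),
              wouldAddFalseFlagDep(&Subs, &AddIndep, true));   // prints 0, 1
  return 0;
}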
188243
189244 bool
424479 bool
425480 Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
426481 const ReduceEntry &Entry,
427 bool LiveCPSR) {
482 bool LiveCPSR, MachineInstr *CPSRDef) {
428483 if (Entry.LowRegs1 && !VerifyLowRegs(MI))
429484 return false;
430485
442497 switch (Opc) {
443498 default: break;
444499 case ARM::t2ADDSri: {
445 if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR))
500 if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef))
446501 return true;
447502 // fallthrough
448503 }
449504 case ARM::t2ADDSrr:
450 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
505 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
451506 }
452507 }
453508 break;
455510 case ARM::t2RSBri:
456511 case ARM::t2RSBSri:
457512 if (MI->getOperand(2).getImm() == 0)
458 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
513 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
459514 break;
460515 case ARM::t2MOVi16:
461516 // Can convert only 'pure' immediate operands, not immediates obtained as
462517 // globals' addresses.
463518 if (MI->getOperand(1).isImm())
464 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
519 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
465520 break;
466521 case ARM::t2CMPrr: {
467522 // Try to reduce to the lo-reg only version first. Why there are two
470525 // are prioritized, but the table assumes a unique entry for each
471526 // source insn opcode. So for now, we hack a local entry record to use.
472527 static const ReduceEntry NarrowEntry =
473 { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 1 };
474 if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR))
528 { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1 };
529 if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef))
475530 return true;
476 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
531 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
477532 }
478533 case ARM::t2ADDrSPi: {
479534 static const ReduceEntry NarrowEntry =
480 { ARM::t2ADDrSPi,ARM::tADDspi, 0, 7, 0, 1, 0, 1, 0, 1 };
535 { ARM::t2ADDrSPi,ARM::tADDspi, 0, 7, 0, 1, 0, 1, 0, 0,1 };
481536 if (MI->getOperand(0).getReg() == ARM::SP)
482 return ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR);
483 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
537 return ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef);
538 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
484539 }
485540 }
486541 return false;
489544 bool
490545 Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
491546 const ReduceEntry &Entry,
492 bool LiveCPSR) {
547 bool LiveCPSR, MachineInstr *CPSRDef) {
493548
494549 if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
495550 return false;
542597 CCDead = true;
543598 }
544599 if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead))
600 return false;
601
602 // Avoid adding a false dependency on the partial flag update by 16-bit
603 // instructions which have the 's' bit set.
604 if (Entry.PartFlag && NewTID.hasOptionalDef() && HasCC &&
605 canAddPseudoFlagDep(CPSRDef, MI))
545606 return false;
546607
547608 // Add the 16-bit instruction.
578639 bool
579640 Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
580641 const ReduceEntry &Entry,
581 bool LiveCPSR) {
642 bool LiveCPSR, MachineInstr *CPSRDef) {
582643 if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
583644 return false;
584645
631692 if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead))
632693 return false;
633694
695 // Avoid adding a false dependency on the partial flag update by 16-bit
696 // instructions which have the 's' bit set.
697 if (Entry.PartFlag && NewTID.hasOptionalDef() && HasCC &&
698 canAddPseudoFlagDep(CPSRDef, MI))
699 return false;
700
634701 // Add the 16-bit instruction.
635702 DebugLoc dl = MI->getDebugLoc();
636703 MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID);
678745 return true;
679746 }
680747
681 static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) {
748 static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) {
682749 bool HasDef = false;
683750 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
684751 const MachineOperand &MO = MI.getOperand(i);
686753 continue;
687754 if (MO.getReg() != ARM::CPSR)
688755 continue;
756
757 DefCPSR = true;
689758 if (!MO.isDead())
690759 HasDef = true;
691760 }
715784
716785 // Yes, CPSR could be livein.
717786 bool LiveCPSR = MBB.isLiveIn(ARM::CPSR);
787 MachineInstr *CPSRDef = 0;
718788
719789 MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
720790 MachineBasicBlock::iterator NextMII;
730800 const ReduceEntry &Entry = ReduceTable[OPI->second];
731801 // Ignore "special" cases for now.
732802 if (Entry.Special) {
733 if (ReduceSpecial(MBB, MI, Entry, LiveCPSR)) {
803 if (ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef)) {
734804 Modified = true;
735805 MachineBasicBlock::iterator I = prior(NextMII);
736806 MI = &*I;
739809 }
740810
741811 // Try to transform to a 16-bit two-address instruction.
742 if (Entry.NarrowOpc2 && ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) {
812 if (Entry.NarrowOpc2 &&
813 ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef)) {
743814 Modified = true;
744815 MachineBasicBlock::iterator I = prior(NextMII);
745816 MI = &*I;
747818 }
748819
749820 // Try to transform to a 16-bit non-two-address instruction.
750 if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR)) {
821 if (Entry.NarrowOpc1 &&
822 ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef)) {
751823 Modified = true;
752824 MachineBasicBlock::iterator I = prior(NextMII);
753825 MI = &*I;
755827 }
756828
757829 ProcessNext:
758 LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR);
830 bool DefCPSR = false;
831 LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR);
832 if (MI->getDesc().isCall())
833 // Calls don't really set CPSR.
834 CPSRDef = 0;
835 else if (DefCPSR)
836 // This is the last CPSR defining instruction.
837 CPSRDef = MI;
759838 }
760839
761840 return Modified;
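The CPSRDef bookkeeping threaded through this loop reduces to a small state machine. A minimal model with hypothetical types (not MachineInstr) is sketched below: the most recent flag-writing instruction is remembered, and a call clears it because, for this purpose, calls don't really set CPSR.

#include <cstdio>

struct Inst { bool DefinesCPSR; bool IsCall; };

static const Inst *trackCPSRDef(const Inst *Prev, const Inst &MI) {
  if (MI.IsCall)
    return 0;            // calls reset the tracked CPSR definition
  if (MI.DefinesCPSR)
    return &MI;          // MI becomes the last CPSR-defining instruction
  return Prev;           // otherwise keep the previous tracker
}

int main() {
  Inst Subs = { true, false }, Add = { false, false }, Call = { false, true };
  const Inst *CPSRDef = 0;
  CPSRDef = trackCPSRDef(CPSRDef, Subs);   // now &Subs
  CPSRDef = trackCPSRDef(CPSRDef, Add);    // still &Subs
  CPSRDef = trackCPSRDef(CPSRDef, Call);   // reset to null
  std::printf("tracking cleared after call: %d\n", CPSRDef == 0);
  return 0;
}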
764843 bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
765844 const TargetMachine &TM = MF.getTarget();
766845 TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
846 STI = &TM.getSubtarget<ARMSubtarget>();
767847
768848 bool Modified = false;
769849 for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
0 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s
1 ; Avoid some 16-bit 's' instructions which partially update CPSR (and add a false
2 ; dependency) when the instruction isn't dependent on the last CPSR-defining instruction.
3 ; rdar://8928208
4
5 define i32 @t(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
6 entry:
7 ; CHECK: t:
8 ; CHECK: muls r2, r3, r2
9 ; CHECK-NEXT: mul r0, r0, r1
10 ; CHECK-NEXT: muls r0, r2, r0
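; Only the first and last multiplies are narrowed to 16-bit muls. Narrowing the
; middle one would make it a partial CPSR writer with no register dependency on
; the first muls, i.e. a false dependency, so it stays as 32-bit mul. The last
; multiply reads both earlier products, so its flag write adds no new ordering
; and the smaller encoding is used.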
11 %0 = mul nsw i32 %a, %b
12 %1 = mul nsw i32 %c, %d
13 %2 = mul nsw i32 %0, %1
14 ret i32 %2
15 }