[SLH] Introduce a new pass to do Speculative Load Hardening to mitigate Spectre variant #1 for x86.

There is a lengthy, detailed RFC thread on llvm-dev which discusses the high level issues. High level discussion is probably best there. I've split the design document out of this patch and will land it separately once I update it to reflect the latest edits and updates to the Google doc used in the RFC thread.

This patch is really just an initial step. It isn't quite ready for prime time and is only exposed via debugging flags. It has three major limitations currently:
1) It only supports x86-64, and only certain ABIs. Many assumptions are currently hard-coded and need to be factored out of the code here.
2) It doesn't include any options for more fine-grained control, either over which control flow edges are significant or over which loads are important to harden.
3) The code is still quite rough and the testing lighter than I'd like.

However, this is enough for people to begin using. I have had numerous requests from people to be able to experiment with this patch to understand the trade-offs it presents and how to use it. We would also like to encourage work to similar effect in other toolchains. The ARM folks are actively developing a system based on this for AArch64. We hope to merge this with their efforts when both are far enough along. But we also don't want to block making this available on that effort.

Many thanks to the *numerous* people who helped along the way here. For this patch in particular, both Eric and Craig did a ton of review to even have confidence in it as an early, rough cut at this functionality.

Differential Revision: https://reviews.llvm.org/D44824

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@336990 91177308-0d34-0410-b5e6-96231b3b80d8

Chandler Carruth
7 changed files with 2272 additions and 0 deletions.
476476 /// probabilities may need to be normalized.
477477 void copySuccessor(MachineBasicBlock *Orig, succ_iterator I);
478478
479 /// Split the old successor into old plus new and update the probability
480 /// info.
481 void splitSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New,
482 bool NormalizeSuccProbs = false);
483
479484 /// Transfers all the successors from MBB to this machine basic block (i.e.,
480485 /// copies all the successors from FromMBB and removes all the successors from
481486 /// FromMBB).
658658 Succ->addPredecessor(this);
659659 }
660660
661 void MachineBasicBlock::splitSuccessor(MachineBasicBlock *Old,
662 MachineBasicBlock *New,
663 bool NormalizeSuccProbs) {
664 succ_iterator OldI = llvm::find(successors(), Old);
665 assert(OldI != succ_end() && "Old is not a successor of this block!");
666 assert(llvm::find(successors(), New) == succ_end() &&
667 "New is already a successor of this block!");
668
669 // Add a new successor with equal probability as the original one. Note
670 // that we directly copy the probability using the iterator rather than
671 // getting a potentially synthetic probability computed when unknown. This
672 // preserves the probabilities as-is and then we can renormalize them and
673 // query them effectively afterward.
674 addSuccessor(New, Probs.empty() ? BranchProbability::getUnknown()
675 : *getProbabilityIterator(OldI));
676 if (NormalizeSuccProbs)
677 normalizeSuccProbs();
678 }
679
661680 void MachineBasicBlock::removeSuccessor(MachineBasicBlock *Succ,
662681 bool NormalizeSuccProbs) {
663682 succ_iterator I = find(Successors, Succ);
5656 X86RetpolineThunks.cpp
5757 X86SelectionDAGInfo.cpp
5858 X86ShuffleDecodeConstantPool.cpp
59 X86SpeculativeLoadHardening.cpp
5960 X86Subtarget.cpp
6061 X86TargetMachine.cpp
6162 X86TargetObjectFile.cpp
126126
127127 void initializeEvexToVexInstPassPass(PassRegistry &);
128128
129 FunctionPass *createX86SpeculativeLoadHardeningPass();
130
129131 } // End llvm namespace
130132
131133 #endif
0 //====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 ///
10 /// Provide a pass which mitigates speculative execution attacks which operate
11 /// by speculating incorrectly past some predicate (a type check, bounds check,
12 /// or other condition) to reach a load with invalid inputs and leak the data
13 /// accessed by that load using a side channel out of the speculative domain.
14 ///
15 /// For details on the attacks, see the first variant in both the Project Zero
16 /// writeup and the Spectre paper:
17 /// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
18 /// https://spectreattack.com/spectre.pdf
19 ///
20 //===----------------------------------------------------------------------===//
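// Editor's illustration, not part of this patch: a minimal sketch of the kind
// of Spectre variant #1 gadget this pass is designed to harden. The function,
// array, and table names are made up for the example.
#include <cstdint>

static uint8_t spectreV1GadgetExample(uint64_t Idx, uint64_t Size,
                                      const uint8_t *Array,
                                      const uint8_t *ProbeTable) {
  // If the bounds check mispredicts, both loads may still execute
  // speculatively; the second, secret-dependent access leaves a cache
  // footprint that can be measured after the mis-speculation is unwound.
  if (Idx < Size)
    return ProbeTable[Array[Idx] * 64];
  return 0;
}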
21
22 #include "X86.h"
23 #include "X86InstrBuilder.h"
24 #include "X86InstrInfo.h"
25 #include "X86Subtarget.h"
26 #include "llvm/ADT/ArrayRef.h"
27 #include "llvm/ADT/DenseMap.h"
28 #include "llvm/ADT/STLExtras.h"
29 #include "llvm/ADT/ScopeExit.h"
30 #include "llvm/ADT/SmallPtrSet.h"
31 #include "llvm/ADT/SmallSet.h"
32 #include "llvm/ADT/SmallVector.h"
33 #include "llvm/ADT/SparseBitVector.h"
34 #include "llvm/ADT/Statistic.h"
35 #include "llvm/CodeGen/MachineBasicBlock.h"
36 #include "llvm/CodeGen/MachineConstantPool.h"
37 #include "llvm/CodeGen/MachineFunction.h"
38 #include "llvm/CodeGen/MachineFunctionPass.h"
39 #include "llvm/CodeGen/MachineInstr.h"
40 #include "llvm/CodeGen/MachineInstrBuilder.h"
41 #include "llvm/CodeGen/MachineModuleInfo.h"
42 #include "llvm/CodeGen/MachineOperand.h"
43 #include "llvm/CodeGen/MachineRegisterInfo.h"
44 #include "llvm/CodeGen/MachineSSAUpdater.h"
45 #include "llvm/CodeGen/TargetInstrInfo.h"
46 #include "llvm/CodeGen/TargetRegisterInfo.h"
47 #include "llvm/CodeGen/TargetSchedule.h"
48 #include "llvm/CodeGen/TargetSubtargetInfo.h"
49 #include "llvm/IR/DebugLoc.h"
50 #include "llvm/MC/MCSchedule.h"
51 #include "llvm/Pass.h"
52 #include "llvm/Support/CommandLine.h"
53 #include "llvm/Support/Debug.h"
54 #include "llvm/Support/raw_ostream.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <iterator>
58 #include <utility>
59
60 using namespace llvm;
61
62 #define PASS_KEY "x86-speculative-load-hardening"
63 #define DEBUG_TYPE PASS_KEY
64
65 STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
66 STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
67 STATISTIC(NumAddrRegsHardened,
68 "Number of address mode used registers hardaned");
69 STATISTIC(NumPostLoadRegsHardened,
70 "Number of post-load register values hardened");
71 STATISTIC(NumInstsInserted, "Number of instructions inserted");
72 STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
73
74 static cl::opt<bool> HardenEdgesWithLFENCE(
75 PASS_KEY "-lfence",
76 cl::desc(
77 "Use LFENCE along each conditional edge to harden against speculative "
78 "loads rather than conditional movs and poisoned pointers."),
79 cl::init(false), cl::Hidden);
80
81 static cl::opt<bool> EnablePostLoadHardening(
82 PASS_KEY "-post-load",
83 cl::desc("Harden the value loaded *after* it is loaded by "
84 "flushing the loaded bits to 1. This is hard to do "
85 "in general but can be done easily for GPRs."),
86 cl::init(true), cl::Hidden);
87
88 static cl::opt<bool> FenceCallAndRet(
89 PASS_KEY "-fence-call-and-ret",
90 cl::desc("Use a full speculation fence to harden both call and ret edges "
91 "rather than a lighter weight mitigation."),
92 cl::init(false), cl::Hidden);
93
94 static cl::opt<bool> HardenInterprocedurally(
95 PASS_KEY "-ip",
96 cl::desc("Harden interprocedurally by passing our state in and out of "
97 "functions in the high bits of the stack pointer."),
98 cl::init(true), cl::Hidden);
99
100 static cl::opt<bool>
101 HardenLoads(PASS_KEY "-loads",
102 cl::desc("Sanitize loads from memory. When disable, no "
103 "significant security is provided."),
104 cl::init(true), cl::Hidden);
105
106 namespace llvm {
107
108 void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
109
110 } // end namespace llvm
111
112 namespace {
113
114 class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
115 public:
116 X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
117 initializeX86SpeculativeLoadHardeningPassPass(
118 *PassRegistry::getPassRegistry());
119 }
120
121 StringRef getPassName() const override {
122 return "X86 speculative load hardening";
123 }
124 bool runOnMachineFunction(MachineFunction &MF) override;
125 void getAnalysisUsage(AnalysisUsage &AU) const override;
126
127 /// Pass identification, replacement for typeid.
128 static char ID;
129
130 private:
131 /// The information about a block's conditional terminators needed to trace
132 /// our predicate state through the exiting edges.
133 struct BlockCondInfo {
134 MachineBasicBlock *MBB;
135
136 // We mostly have one conditional branch, and in extremely rare cases have
137 // two. Three and more are so rare as to be unimportant for compile time.
138 SmallVector<MachineInstr *, 2> CondBrs;
139
140 MachineInstr *UncondBr;
141 };
142
143 const X86Subtarget *Subtarget;
144 MachineRegisterInfo *MRI;
145 const X86InstrInfo *TII;
146 const TargetRegisterInfo *TRI;
147 const TargetRegisterClass *PredStateRC;
148
149 void hardenEdgesWithLFENCE(MachineFunction &MF);
150
151 SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
152
153 void checkAllLoads(MachineFunction &MF, MachineSSAUpdater &PredStateSSA);
154
155 unsigned saveEFLAGS(MachineBasicBlock &MBB,
156 MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
157 void restoreEFLAGS(MachineBasicBlock &MBB,
158 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
159 unsigned OFReg);
160
161 void mergePredStateIntoSP(MachineBasicBlock &MBB,
162 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
163 unsigned PredStateReg);
164 unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
165 MachineBasicBlock::iterator InsertPt,
166 DebugLoc Loc);
167
168 void
169 hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
170 MachineOperand &IndexMO, MachineSSAUpdater &PredStateSSA,
171 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
172 MachineInstr *
173 sinkPostLoadHardenedInst(MachineInstr &MI,
174 SmallPtrSetImpl<MachineInstr *> &HardenedLoads);
175 void hardenPostLoad(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
176 void checkReturnInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
177 void checkCallInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
178 };
179
180 } // end anonymous namespace
181
182 char X86SpeculativeLoadHardeningPass::ID = 0;
183
184 void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
185 AnalysisUsage &AU) const {
186 MachineFunctionPass::getAnalysisUsage(AU);
187 }
188
189 static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
190 MachineBasicBlock &Succ, int SuccCount,
191 MachineInstr *Br, MachineInstr *&UncondBr,
192 const X86InstrInfo &TII) {
193 assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
194
195 MachineFunction &MF = *MBB.getParent();
196
197 MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
198
199 // We have to insert the new block immediately after the current one as we
200 // don't know what layout-successor relationships the successor has and we
201 // may not be able to (and generally don't want to) try to fix those up.
202 MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
203
204 // Update the branch instruction if necessary.
205 if (Br) {
206 assert(Br->getOperand(0).getMBB() == &Succ &&
207 "Didn't start with the right target!");
208 Br->getOperand(0).setMBB(&NewMBB);
209
210 // If this successor was reached through a branch rather than fallthrough,
211 // we might have *broken* fallthrough and so need to inject a new
212 // unconditional branch.
213 if (!UncondBr) {
214 MachineBasicBlock &OldLayoutSucc =
215 *std::next(MachineFunction::iterator(&NewMBB));
216 assert(MBB.isSuccessor(&OldLayoutSucc) &&
217 "Without an unconditional branch, the old layout successor should "
218 "be an actual successor!");
219 auto BrBuilder =
220 BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
221 // Update the unconditional branch now that we've added one.
222 UncondBr = &*BrBuilder;
223 }
224
225 // Insert unconditional "jump Succ" instruction in the new block if
226 // necessary.
227 if (!NewMBB.isLayoutSuccessor(&Succ)) {
228 SmallVector<MachineOperand, 1> Cond;
229 TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
230 }
231 } else {
232 assert(!UncondBr &&
233 "Cannot have a branchless successor and an unconditional branch!");
234 assert(NewMBB.isLayoutSuccessor(&Succ) &&
235 "A non-branch successor must have been a layout successor before "
236 "and now is a layout successor of the new block.");
237 }
238
239 // If this is the only edge to the successor, we can just replace it in the
240 // CFG. Otherwise we need to add a new entry in the CFG for the new
241 // successor.
242 if (SuccCount == 1) {
243 MBB.replaceSuccessor(&Succ, &NewMBB);
244 } else {
245 MBB.splitSuccessor(&Succ, &NewMBB);
246 }
247
248 // Hook up the edge from the new basic block to the old successor in the CFG.
249 NewMBB.addSuccessor(&Succ);
250
251 // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
252 for (MachineInstr &MI : Succ) {
253 if (!MI.isPHI())
254 break;
255 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
256 OpIdx += 2) {
257 MachineOperand &OpV = MI.getOperand(OpIdx);
258 MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
259 assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
260 if (OpMBB.getMBB() != &MBB)
261 continue;
262
263 // If this is the last edge to the successor, just replace MBB in the PHI.
264 if (SuccCount == 1) {
265 OpMBB.setMBB(&NewMBB);
266 break;
267 }
268
269 // Otherwise, append a new pair of operands for the new incoming edge.
270 MI.addOperand(MF, OpV);
271 MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
272 break;
273 }
274 }
275
276 // Inherit live-ins from the successor
277 for (auto &LI : Succ.liveins())
278 NewMBB.addLiveIn(LI);
279
280 LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
281 << Succ.getName() << "'.\n");
282 return NewMBB;
283 }
284
285 bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
286 MachineFunction &MF) {
287 LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
288 << " **********\n");
289
290 Subtarget = &MF.getSubtarget<X86Subtarget>();
291 MRI = &MF.getRegInfo();
292 TII = Subtarget->getInstrInfo();
293 TRI = Subtarget->getRegisterInfo();
294 // FIXME: Support for 32-bit.
295 PredStateRC = &X86::GR64_NOSPRegClass;
296
297 if (MF.begin() == MF.end())
298 // Nothing to do for a degenerate empty function...
299 return false;
300
301 // We support an alternative hardening technique based on a debug flag.
302 if (HardenEdgesWithLFENCE) {
303 hardenEdgesWithLFENCE(MF);
304 return true;
305 }
306
307 // Create a dummy debug loc to use for all the generated code here.
308 DebugLoc Loc;
309
310 MachineBasicBlock &Entry = *MF.begin();
311 auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
312
313 // Do a quick scan to see if we have any checkable loads.
314 bool HasCheckableLoad = false;
315 for (MachineBasicBlock &MBB : MF) {
316 for (MachineInstr &MI : MBB) {
317 // Stop searching blocks at an LFENCE.
318 if (MI.getOpcode() == X86::LFENCE)
319 break;
320
321 // Looking for loads only.
322 if (!MI.mayLoad())
323 continue;
324
325 // An MFENCE is modeled as a load but doesn't require hardening.
326 if (MI.getOpcode() == X86::MFENCE)
327 continue;
328
329 HasCheckableLoad = true;
330 break;
331 }
332 if (HasCheckableLoad)
333 break;
334 }
335
336 // See if we have any conditional branching blocks that we will need to trace
337 // predicate state through.
338 SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
339
340 // If we have no interesting conditions or loads, nothing to do here.
341 if (!HasCheckableLoad && Infos.empty())
342 return true;
343
344 unsigned PredStateReg;
345 unsigned PredStateSizeInBytes = TRI->getRegSizeInBits(*PredStateRC) / 8;
346
347 // The poison value is required to be an all-ones value for many aspects of
348 // this mitigation.
349 const int PoisonVal = -1;
350 unsigned PoisonReg = MRI->createVirtualRegister(PredStateRC);
351 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PoisonReg)
352 .addImm(PoisonVal);
353 ++NumInstsInserted;
354
355 // If we have loads being hardened and we've asked for call and ret edges to
356 // get a full fence-based mitigation, inject that fence.
357 if (HasCheckableLoad && FenceCallAndRet) {
358 // We need to insert an LFENCE at the start of the function to suspend any
359 // incoming misspeculation from the caller. This helps two-fold: the caller
360 // may not have been protected as this code has been, and this code gets to
361 // not take any specific action to protect across calls.
362 // FIXME: We could skip this for functions which unconditionally return
363 // a constant.
364 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
365 ++NumInstsInserted;
366 ++NumLFENCEsInserted;
367 }
368
369 // If we have no conditionals to protect in blocks, then all we needed to do
370 // was protect the entry and so we're done.
371 if (Infos.empty())
372 // We may have changed the function's code at this point to insert fences.
373 return true;
374
375 // For every basic block in the function which can b
376 if (HardenInterprocedurally && !FenceCallAndRet) {
377 // Set up the predicate state by extracting it from the incoming stack
378 // pointer so we pick up any misspeculation in our caller.
379 PredStateReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
380 } else {
381 // Otherwise, just build the predicate state itself by zeroing a register
382 // as we don't need any initial state.
383 PredStateReg = MRI->createVirtualRegister(PredStateRC);
384 unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
385 auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
386 PredStateSubReg);
387 ++NumInstsInserted;
388 MachineOperand *ZeroEFLAGSDefOp =
389 ZeroI->findRegisterDefOperand(X86::EFLAGS);
390 assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
391 "Must have an implicit def of EFLAGS!");
392 ZeroEFLAGSDefOp->setIsDead(true);
393 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
394 PredStateReg)
395 .addImm(0)
396 .addReg(PredStateSubReg)
397 .addImm(X86::sub_32bit);
398 }
399
400 // We're going to need to trace predicate state throughout the function's
401 // CFG. Prepare for this by setting up our initial state of PHIs with unique
402 // predecessor entries and all the initial predicate state.
403
404 // FIXME: It's really frustrating that we have to do this, but SSA-form in
405 // MIR isn't what you might expect. We may have multiple entries in PHI nodes
406 // for a single predecessor. This makes CFG-updating extremely complex, so
407 // here we simplify all PHI nodes to a model even simpler than the IR's
408 // model: exactly one entry per predecessor, regardless of how many edges
409 // there are.
410 SmallPtrSet<MachineBasicBlock *, 4> Preds;
411 SmallVector<int, 4> DupIndices;
412 for (auto &MBB : MF)
413 for (auto &MI : MBB) {
414 if (!MI.isPHI())
415 break;
416
417 // First we scan the operands of the PHI looking for duplicate entries
418 // for a particular predecessor. We retain the operand index of each duplicate
419 // entry found.
420 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
421 OpIdx += 2)
422 if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
423 DupIndices.push_back(OpIdx);
424
425 // Now walk the duplicate indices, removing both the block and value. Note
426 // that these are stored as a vector making this element-wise removal
428 // potentially quadratic.
429 //
430 // FIXME: It is really frustrating that we have to use a quadratic
431 // removal algorithm here. There should be a better way, but the use-def
432 // updates required make that impossible using the public API.
433 //
434 // Note that we have to process these backwards so that we don't
435 // invalidate other indices with each removal.
436 while (!DupIndices.empty()) {
437 int OpIdx = DupIndices.pop_back_val();
438 // Remove both the block and value operand, again in reverse order to
439 // preserve indices.
440 MI.RemoveOperand(OpIdx + 1);
441 MI.RemoveOperand(OpIdx);
442 }
443
444 Preds.clear();
445 }
446
447 // Track the updated values in an SSA updater to rewrite into SSA form at the
448 // end.
449 MachineSSAUpdater PredStateSSA(MF);
450 PredStateSSA.Initialize(PredStateReg);
451 PredStateSSA.AddAvailableValue(&Entry, PredStateReg);
452 // Collect the inserted instructions so we can rewrite their uses of the
453 // predicate state into SSA form.
454 SmallVector<MachineInstr *, 16> CMovs;
455
456 // Now walk all of the basic blocks looking for ones that end in conditional
457 // jumps where we need to update this register along each edge.
458 for (BlockCondInfo &Info : Infos) {
459 MachineBasicBlock &MBB = *Info.MBB;
460 SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
461 MachineInstr *UncondBr = Info.UncondBr;
462
463 LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
464 << "\n");
465 ++NumCondBranchesTraced;
466
467 // Compute the non-conditional successor as either the target of any
468 // unconditional branch or the layout successor.
469 MachineBasicBlock *UncondSucc =
470 UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
471 ? UncondBr->getOperand(0).getMBB()
472 : nullptr)
473 : &*std::next(MachineFunction::iterator(&MBB));
474
475 // Count how many edges there are to any given successor.
476 SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
477 if (UncondSucc)
478 ++SuccCounts[UncondSucc];
479 for (auto *CondBr : CondBrs)
480 ++SuccCounts[CondBr->getOperand(0).getMBB()];
481
482 // A lambda to insert cmov instructions into a block checking all of the
483 // condition codes in a sequence.
484 auto BuildCheckingBlockForSuccAndConds =
485 [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
486 MachineInstr *Br, MachineInstr *&UncondBr,
487 ArrayRef<X86::CondCode> Conds) {
488 // First, we split the edge to insert the checking block into a safe
489 // location.
490 auto &CheckingMBB =
491 (SuccCount == 1 && Succ.pred_size() == 1)
492 ? Succ
493 : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
494
495 bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
496 if (!LiveEFLAGS)
497 CheckingMBB.addLiveIn(X86::EFLAGS);
498
499 // Now insert the cmovs to implement the checks.
500 auto InsertPt = CheckingMBB.begin();
501 assert(
502 InsertPt == CheckingMBB.end() ||
503 !InsertPt->isPHI() &&
504 "Should never have a PHI in the initial checking block as it "
505 "always has a single predecessor!");
506
507 // We will wire each cmov to each other, but need to start with the
508 // incoming pred state.
509 unsigned CurStateReg = PredStateReg;
510
511 for (X86::CondCode Cond : Conds) {
512 auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
513
514 unsigned UpdatedStateReg = MRI->createVirtualRegister(PredStateRC);
515 auto CMovI = BuildMI(CheckingMBB, InsertPt, Loc, TII->get(CMovOp),
516 UpdatedStateReg)
517 .addReg(CurStateReg)
518 .addReg(PoisonReg);
519 // If this is the last cmov and the EFLAGS weren't originally
520 // live-in, mark them as killed.
521 if (!LiveEFLAGS && Cond == Conds.back())
522 CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
523
524 ++NumInstsInserted;
525 LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
526 dbgs() << "\n");
527
528 // The first one of the cmovs will be using the top level
529 // `PredStateReg` and need to get rewritten into SSA form.
530 if (CurStateReg == PredStateReg)
531 CMovs.push_back(&*CMovI);
532
533 // The next cmov should start from this one's def.
534 CurStateReg = UpdatedStateReg;
535 }
536
537 // And put the last one into the available values for PredStateSSA.
538 PredStateSSA.AddAvailableValue(&CheckingMBB, CurStateReg);
539 };
540
541 std::vector<X86::CondCode> UncondCodeSeq;
542 for (auto *CondBr : CondBrs) {
543 MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
544 int &SuccCount = SuccCounts[&Succ];
545
546 X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
547 X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
548 UncondCodeSeq.push_back(Cond);
549
550 BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
551 {InvCond});
552
553 // Decrement the successor count now that we've split one of the edges.
554 // We need to keep the count of edges to the successor accurate in order
555 // to know above when to *replace* the successor in the CFG vs. just
556 // adding the new successor.
557 --SuccCount;
558 }
559
560 // Since we may have split edges and changed the number of successors,
561 // normalize the probabilities. This avoids doing it each time we split an
562 // edge.
563 MBB.normalizeSuccProbs();
564
565 // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
566 // need to intersect the other condition codes. We can do this by just
567 // doing a cmov for each one.
568 if (!UncondSucc)
569 // If we have no fallthrough to protect (perhaps it is an indirect jump?)
570 // just skip this and continue.
571 continue;
572
573 assert(SuccCounts[UncondSucc] == 1 &&
574 "We should never have more than one edge to the unconditional "
575 "successor at this point because every other edge must have been "
576 "split above!");
577
578 // Sort and unique the codes to minimize them.
579 llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end());
580 UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
581 UncondCodeSeq.end());
582
583 // Build a checking version of the successor.
584 BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
585 UncondBr, UncondBr, UncondCodeSeq);
586 }
587
588 // We may also enter basic blocks in this function via exception handling
589 // control flow. Here, if we are hardening interprocedurally, we need to
590 // re-capture the predicate state from the throwing code. In the Itanium ABI,
591 // the throw will always look like a call to __cxa_throw and will have the
592 // predicate state in the stack pointer, so extract fresh predicate state from
593 // the stack pointer and make it available in SSA.
594 // FIXME: Handle non-itanium ABI EH models.
595 if (HardenInterprocedurally) {
596 for (MachineBasicBlock &MBB : MF) {
597 assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
598 assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
599 assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
600 if (!MBB.isEHPad())
601 continue;
602 PredStateSSA.AddAvailableValue(
603 &MBB,
604 extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
605 }
606 }
607
608 // Now check all of the loads using the predicate state.
609 checkAllLoads(MF, PredStateSSA);
610
611 // Now rewrite all the uses of the pred state using the SSA updater so that
612 // we track updates through the CFG.
613 for (MachineInstr *CMovI : CMovs)
614 for (MachineOperand &Op : CMovI->operands()) {
615 if (!Op.isReg() || Op.getReg() != PredStateReg)
616 continue;
617
618 PredStateSSA.RewriteUse(Op);
619 }
620
621 // If we are hardening interprocedurally, find each returning block and
622 // protect the caller from being returned to through misspeculation.
623 if (HardenInterprocedurally)
624 for (MachineBasicBlock &MBB : MF) {
625 if (MBB.empty())
626 continue;
627
628 MachineInstr &MI = MBB.back();
629 if (!MI.isReturn())
630 continue;
631
632 checkReturnInstr(MI, PredStateSSA);
633 }
634
635 LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
636 dbgs() << "\n"; MF.verify(this));
637 return true;
638 }
639
640 /// Implements the naive hardening approach of putting an LFENCE after every
641 /// potentially mis-predicted control flow construct.
642 ///
643 /// We include this as an alternative mostly for the purpose of comparison. The
644 /// performance impact of this is expected to be extremely severe and not
645 /// practical for any real-world users.
646 void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
647 MachineFunction &MF) {
648 // First, we scan the function looking for blocks that are reached along edges
649 // that we might want to harden.
650 SmallSetVector<MachineBasicBlock *, 8> Blocks;
651 for (MachineBasicBlock &MBB : MF) {
652 // If there are no or only one successor, nothing to do here.
653 if (MBB.succ_size() <= 1)
654 continue;
655
656 // Skip blocks unless their terminators start with a branch. Other
657 // terminators don't seem interesting for guarding against misspeculation.
658 auto TermIt = MBB.getFirstTerminator();
659 if (TermIt == MBB.end() || !TermIt->isBranch())
660 continue;
661
662 // Add all the non-EH-pad successors to the blocks we want to harden. We
663 // skip EH pads because there isn't really a condition of interest on
664 // entering.
665 for (MachineBasicBlock *SuccMBB : MBB.successors())
666 if (!SuccMBB->isEHPad())
667 Blocks.insert(SuccMBB);
668 }
669
670 for (MachineBasicBlock *MBB : Blocks) {
671 auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
672 BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
673 ++NumInstsInserted;
674 ++NumLFENCEsInserted;
675 }
676 }
677
678 SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
679 X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
680 SmallVector<BlockCondInfo, 16> Infos;
681
682 // Walk the function and build up a summary for each block's conditions that
683 // we need to trace through.
684 for (MachineBasicBlock &MBB : MF) {
685 // If there are no or only one successor, nothing to do here.
686 if (MBB.succ_size() <= 1)
687 continue;
688
689 // We want to reliably handle any conditional branch terminators in the
690 // MBB, so we manually analyze the branch. We can handle all of the
691 // permutations here, including ones that analyzeBranch cannot.
692 //
693 // The approach is to walk backwards across the terminators, resetting at
694 // any unconditional non-indirect branch, and track all conditional edges
695 // to basic blocks as well as the fallthrough or unconditional successor
696 // edge. For each conditional edge, we track the target and the opposite
697 // condition code in order to inject a "no-op" cmov into that successor
698 // that will harden the predicate. For the fallthrough/unconditional
699 // edge, we inject a separate cmov for each conditional branch with
700 // matching condition codes. This effectively implements an "and" of the
701 // condition flags, even if there isn't a single condition flag that would
702 // directly implement that. We don't bother trying to optimize either of
703 // these cases because if such an optimization is possible, LLVM should
704 // have optimized the conditional *branches* in that way already to reduce
705 // instruction count. This late, we simply assume the minimal number of
706 // branch instructions is being emitted and use that to guide our cmov
707 // insertion.
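// Editor's illustration, not part of this patch: for a block whose
// terminators are
// ```
// jne .L_taken
// jmp .L_fall
// ```
// the `jne` ends up in CondBrs and the `jmp` in UncondBr. When tracing the
// predicate state, the pass later inserts along the edge to `.L_taken`:
// ```
// cmoveq %poison, %pred_state   # opposite condition: observing "equal" on
//                               # the not-equal edge means mis-speculation
// ```
// and along the fallthrough/unconditional edge to `.L_fall`:
// ```
// cmovneq %poison, %pred_state  # the branch's own condition: observing
//                               # "not equal" here means mis-speculation
// ```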
708
709 BlockCondInfo Info = {&MBB, {}, nullptr};
710
711 // Now walk backwards through the terminators and build up successors they
712 // reach and the conditions.
713 for (MachineInstr &MI : llvm::reverse(MBB)) {
714 // Once we've handled all the terminators, we're done.
715 if (!MI.isTerminator())
716 break;
717
718 // If we see a non-branch terminator, we can't handle anything so bail.
719 if (!MI.isBranch()) {
720 Info.CondBrs.clear();
721 break;
722 }
723
724 // If we see an unconditional branch, reset our state, clear any
725 // fallthrough, and set this as the "else" successor.
726 if (MI.getOpcode() == X86::JMP_1) {
727 Info.CondBrs.clear();
728 Info.UncondBr = &MI;
729 continue;
730 }
731
732 // If we get an invalid condition, we have an indirect branch or some
733 // other unanalyzable "fallthrough" case. We model this as a nullptr for
734 // the destination so we can still guard any conditional successors.
735 // Consider code sequences like:
736 // ```
737 // jCC L1
738 // jmpq *%rax
739 // ```
740 // We still want to harden the edge to `L1`.
741 if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
742 Info.CondBrs.clear();
743 Info.UncondBr = &MI;
744 continue;
745 }
746
747 // We have a vanilla conditional branch, add it to our list.
748 Info.CondBrs.push_back(&MI);
749 }
750 if (Info.CondBrs.empty()) {
751 ++NumBranchesUntraced;
752 LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
753 MBB.dump());
754 continue;
755 }
756
757 Infos.push_back(Info);
758 }
759
760 return Infos;
761 }
762
763 /// Returns true if the instruction has no behavior (specified or otherwise)
764 /// that is based on the value of any of its register operands
765 ///
766 /// A classical example of something that is inherently not data invariant is an
767 /// indirect jump -- the destination is loaded into icache based on the bits set
768 /// in the jump destination register.
769 ///
770 /// FIXME: This should become part of our instruction tables.
771 static bool isDataInvariant(MachineInstr &MI) {
772 switch (MI.getOpcode()) {
773 default:
774 // By default, assume that the instruction is not data invariant.
775 return false;
776
777 // FIXME: For now, we just use a very boring, conservative set of unary
778 // instructions because we're mostly interested in handling simple
779 // transformations.
780 case TargetOpcode::COPY:
781 return true;
782 }
783 }
784
785 /// Returns true if the instruction has no behavior (specified or otherwise)
786 /// that is based on the value loaded from memory or the value of any
787 /// non-address register operands.
788 ///
789 /// For example, the instruction is not data invariant if its latency depends
790 /// on the particular bits set in any of the registers *or* any of the bits
791 /// loaded from memory.
792 ///
793 /// A classical example of something that is inherently not data invariant is an
794 /// indirect jump -- the destination is loaded into icache based on the bits set
795 /// in the jump destination register.
796 ///
797 /// FIXME: This should become part of our instruction tables.
798 static bool isDataInvariantLoad(MachineInstr &MI) {
799 switch (MI.getOpcode()) {
800 default:
801 // By default, assume that the load will immediately leak.
802 return false;
803
804 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
805 // However, they set flags and are perhaps the most surprisingly constant
806 // time operations so we call them out here separately.
807 case X86::IMUL16rm:
808 case X86::IMUL16rmi8:
809 case X86::IMUL16rmi:
810 case X86::IMUL32rm:
811 case X86::IMUL32rmi8:
812 case X86::IMUL32rmi:
813 case X86::IMUL64rm:
814 case X86::IMUL64rmi32:
815 case X86::IMUL64rmi8:
816
817 // Bitfield and bit scanning instructions that are somewhat surprisingly
818 // constant time as they scan across bits and do other fairly complex
819 // operations like popcnt, but are believed to be constant time on x86.
820 // However, these set flags.
821 case X86::BLCFILL32rm:
822 case X86::BLCFILL64rm:
823 case X86::BLCI32rm:
824 case X86::BLCI64rm:
825 case X86::BLCIC32rm:
826 case X86::BLCIC64rm:
827 case X86::BLCMSK32rm:
828 case X86::BLCMSK64rm:
829 case X86::BLCS32rm:
830 case X86::BLCS64rm:
831 case X86::BLSFILL32rm:
832 case X86::BLSFILL64rm:
833 case X86::BLSI32rm:
834 case X86::BLSI64rm:
835 case X86::BLSIC32rm:
836 case X86::BLSIC64rm:
837 case X86::BLSMSK32rm:
838 case X86::BLSMSK64rm:
839 case X86::BLSR32rm:
840 case X86::BLSR64rm:
841 case X86::BZHI32rm:
842 case X86::BZHI64rm:
843 case X86::LZCNT16rm:
844 case X86::LZCNT32rm:
845 case X86::LZCNT64rm:
846 case X86::POPCNT16rm:
847 case X86::POPCNT32rm:
848 case X86::POPCNT64rm:
849 case X86::TZCNT16rm:
850 case X86::TZCNT32rm:
851 case X86::TZCNT64rm:
852 case X86::TZMSK32rm:
853 case X86::TZMSK64rm:
854
855 // Basic arithmetic is constant time on the input but does set flags.
856 case X86::ADC8rm:
857 case X86::ADC16rm:
858 case X86::ADC32rm:
859 case X86::ADC64rm:
860 case X86::ADCX32rm:
861 case X86::ADCX64rm:
862 case X86::ADD8rm:
863 case X86::ADD16rm:
864 case X86::ADD32rm:
865 case X86::ADD64rm:
866 case X86::ADOX32rm:
867 case X86::ADOX64rm:
868 case X86::AND8rm:
869 case X86::AND16rm:
870 case X86::AND32rm:
871 case X86::AND64rm:
872 case X86::ANDN32rm:
873 case X86::ANDN64rm:
874 case X86::BSF16rm:
875 case X86::BSF32rm:
876 case X86::BSF64rm:
877 case X86::BSR16rm:
878 case X86::BSR32rm:
879 case X86::BSR64rm:
880 case X86::OR8rm:
881 case X86::OR16rm:
882 case X86::OR32rm:
883 case X86::OR64rm:
884 case X86::SBB8rm:
885 case X86::SBB16rm:
886 case X86::SBB32rm:
887 case X86::SBB64rm:
888 case X86::SUB8rm:
889 case X86::SUB16rm:
890 case X86::SUB32rm:
891 case X86::SUB64rm:
892 case X86::XOR8rm:
893 case X86::XOR16rm:
894 case X86::XOR32rm:
895 case X86::XOR64rm:
896 case X86::BEXTR32rm:
897 case X86::BEXTR64rm:
898 case X86::BEXTRI32mi:
899 case X86::BEXTRI64mi:
900 // Check whether the EFLAGS implicit-def is dead. We assume that this will
901 // always find the implicit-def because this code should only be reached
902 // for instructions that do in fact implicitly def this.
903 if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
904 // If we would clobber EFLAGS that are used, just bail for now.
905 LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
906 MI.dump(); dbgs() << "\n");
907 return false;
908 }
909
910 // Otherwise, fallthrough to handle these the same as instructions that
911 // don't set EFLAGS.
912 LLVM_FALLTHROUGH;
913
914 // Integer multiply w/o affecting flags is still believed to be constant
915 // time on x86. Called out separately as this is among the most surprising
916 // instructions to exhibit that behavior.
917 case X86::MULX32rm:
918 case X86::MULX64rm:
919
920 // Arithmetic instructions that are both constant time and don't set flags.
921 case X86::PDEP32rm:
922 case X86::PDEP64rm:
923 case X86::PEXT32rm:
924 case X86::PEXT64rm:
925 case X86::RORX32mi:
926 case X86::RORX64mi:
927 case X86::SARX32rm:
928 case X86::SARX64rm:
929 case X86::SHLX32rm:
930 case X86::SHLX64rm:
931 case X86::SHRX32rm:
932 case X86::SHRX64rm:
933
934 // Conversions are believed to be constant time and don't set flags.
935 // FIXME: Add AVX versions.
936 case X86::CVTSD2SI64rm_Int:
937 case X86::CVTSD2SIrm_Int:
938 case X86::CVTSS2SI64rm_Int:
939 case X86::CVTSS2SIrm_Int:
940 case X86::CVTTSD2SI64rm:
941 case X86::CVTTSD2SI64rm_Int:
942 case X86::CVTTSD2SIrm:
943 case X86::CVTTSD2SIrm_Int:
944 case X86::CVTTSS2SI64rm:
945 case X86::CVTTSS2SI64rm_Int:
946 case X86::CVTTSS2SIrm:
947 case X86::CVTTSS2SIrm_Int:
948
949 // Loads to register don't set flags.
950 case X86::MOV8rm:
951 case X86::MOV8rm_NOREX:
952 case X86::MOV16rm:
953 case X86::MOV32rm:
954 case X86::MOV64rm:
955 case X86::MOVSX16rm8:
956 case X86::MOVSX32rm16:
957 case X86::MOVSX32rm8:
958 case X86::MOVSX32rm8_NOREX:
959 case X86::MOVSX64rm16:
960 case X86::MOVSX64rm32:
961 case X86::MOVSX64rm8:
962 case X86::MOVZX16rm8:
963 case X86::MOVZX32rm16:
964 case X86::MOVZX32rm8:
965 case X86::MOVZX32rm8_NOREX:
966 case X86::MOVZX64rm16:
967 case X86::MOVZX64rm8:
968 return true;
969 }
970 }
971
972 static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
973 const TargetRegisterInfo &TRI) {
974 // Check if EFLAGS are alive by seeing if there is a def of them or they
975 // live-in, and then seeing if that def is in turn used.
976 for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
977 if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
978 // If the def is dead, then EFLAGS is not live.
979 if (DefOp->isDead())
980 return false;
981
982 // Otherwise we've def'ed it, and it is live.
983 return true;
984 }
985 // While at this instruction, also check if we use and kill EFLAGS
986 // which means it isn't live.
987 if (MI.killsRegister(X86::EFLAGS, &TRI))
988 return false;
989 }
990
991 // If we didn't find anything conclusive (neither definitely alive nor
992 // definitely dead) return whether it lives into the block.
993 return MBB.isLiveIn(X86::EFLAGS);
994 }
995
996 void X86SpeculativeLoadHardeningPass::checkAllLoads(
997 MachineFunction &MF, MachineSSAUpdater &PredStateSSA) {
998 // If the actual checking of loads is disabled, skip doing anything here.
999 if (!HardenLoads)
1000 return;
1001
1002 SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
1003 SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
1004
1005 SmallSet<unsigned, 16> HardenedAddrRegs;
1006
1007 SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
1008
1009 // Track the set of load-dependent registers through the basic block. Because
1010 // the values of these registers have an existing data dependency on a loaded
1011 // value which we would have checked, we can omit any checks on them.
1012 SparseBitVector<> LoadDepRegs;
1013
1014 for (MachineBasicBlock &MBB : MF) {
1015 // We harden the loads of a basic block in several passes:
1016 //
1017 // 1) Collect all the loads which can have their loaded value hardened
1018 // and all the loads that instead need their address hardened. During
1019 // this walk we propagate load dependence for address hardened loads and
1020 // also look for LFENCE to stop hardening wherever possible. When
1021 // deciding whether or not to harden the loaded value or not, we check
1022 // to see if any registers used in the address will have been hardened
1023 // at this point and if so, harden any remaining address registers as
1024 // that often successfully re-uses hardened addresses and minimizes
1025 // instructions. FIXME: We should consider an aggressive mode where we
1026 // continue to value-harden as many loads as possible even when some address
1027 // register hardening would be free (due to reuse).
1028 for (MachineInstr &MI : MBB) {
1029 // We naively assume that all def'ed registers of an instruction have
1030 // a data dependency on all of their operands.
1031 // FIXME: Do a more careful analysis of x86 to build a conservative model
1032 // here.
1033 if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
1034 return Op.isReg() && LoadDepRegs.test(Op.getReg());
1035 }))
1036 for (MachineOperand &Def : MI.defs())
1037 if (Def.isReg())
1038 LoadDepRegs.set(Def.getReg());
1039
1040 // Both Intel and AMD are guiding that they will change the semantics of
1041 // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
1042 // no more need to guard things in this block.
1043 if (MI.getOpcode() == X86::LFENCE)
1044 break;
1045
1046 // If this instruction cannot load, nothing to do.
1047 if (!MI.mayLoad())
1048 continue;
1049
1050 // Some instructions which "load" are trivially safe or unimportant.
1051 if (MI.getOpcode() == X86::MFENCE)
1052 continue;
1053
1054 // Extract the memory operand information about this instruction.
1055 // FIXME: This doesn't handle loading pseudo instructions which we often
1056 // could handle with similarly generic logic. We probably need to add an
1057 // MI-layer routine similar to the MC-layer one we use here which maps
1058 // pseudos much like this maps real instructions.
1059 const MCInstrDesc &Desc = MI.getDesc();
1060 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1061 if (MemRefBeginIdx < 0) {
1062 LLVM_DEBUG(dbgs() << "WARNING: unable to harden loading instruction: ";
1063 MI.dump());
1064 continue;
1065 }
1066
1067 MemRefBeginIdx += X86II::getOperandBias(Desc);
1068
1069 MachineOperand &BaseMO = MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1070 MachineOperand &IndexMO =
1071 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1072
1073 // If we have at least one (non-frame-index, non-RIP) register operand,
1074 // and neither operand is load-dependent, we need to check the load.
1075 unsigned BaseReg = 0, IndexReg = 0;
1076 if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
1077 BaseMO.getReg() != X86::NoRegister)
1078 BaseReg = BaseMO.getReg();
1079 if (IndexMO.getReg() != X86::NoRegister)
1080 IndexReg = IndexMO.getReg();
1081
1082 if (!BaseReg && !IndexReg)
1083 // No register operands!
1084 continue;
1085
1086 // If any register operand is dependent, this load is dependent and we
1087 // needn't check it.
1088 // FIXME: Is this true in the case where we are hardening loads after
1089 // they complete? Unclear, need to investigate.
1090 if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
1091 (IndexReg && LoadDepRegs.test(IndexReg)))
1092 continue;
1093
1094 // If post-load hardening is enabled, this load is known to be
1095 // data-invariant, and we aren't already going to harden one of the
1096 // address registers, queue it up to be hardened post-load. Notably, even
1097 // once hardened this won't introduce a useful dependency that could prune
1098 // out subsequent loads.
1099 if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
1100 !HardenedAddrRegs.count(BaseReg) &&
1101 !HardenedAddrRegs.count(IndexReg)) {
1102 HardenPostLoad.insert(&MI);
1103 HardenedAddrRegs.insert(MI.getOperand(0).getReg());
1104 continue;
1105 }
1106
1107 // Record this instruction for address hardening and record its register
1108 // operands as being address-hardened.
1109 HardenLoadAddr.insert(&MI);
1110 if (BaseReg)
1111 HardenedAddrRegs.insert(BaseReg);
1112 if (IndexReg)
1113 HardenedAddrRegs.insert(IndexReg);
1114
1115 for (MachineOperand &Def : MI.defs())
1116 if (Def.isReg())
1117 LoadDepRegs.set(Def.getReg());
1118 }
1119
1120 // Now re-walk the instructions in the basic block, and apply whichever
1121 // hardening strategy we have elected. Note that we do this in a second
1122 // pass specifically so that we have the complete set of instructions for
1123 // which we will do post-load hardening and can defer it in certain
1124 // circumstances.
1125 //
1126 // FIXME: This could probably be made even more effective by doing it
1127 // across the entire function. Rather than just walking the flat list
1128 // backwards here, we could walk the function in PO and each block bottom
1129 // up, allowing us in some cases to sink hardening across basic blocks. As
1130 // long as the in-block predicate state is used at the eventual hardening
1131 // site, this remains safe.
1132 for (MachineInstr &MI : MBB) {
1133 // We cannot both require hardening the def of a load and its address.
1134 assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
1135 "Requested to harden both the address and def of a load!");
1136
1137 // Check if this is a load whose address needs to be hardened.
1138 if (HardenLoadAddr.erase(&MI)) {
1139 const MCInstrDesc &Desc = MI.getDesc();
1140 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1141 assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");
1142
1143 MemRefBeginIdx += X86II::getOperandBias(Desc);
1144
1145 MachineOperand &BaseMO =
1146 MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1147 MachineOperand &IndexMO =
1148 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1149 hardenLoadAddr(MI, BaseMO, IndexMO, PredStateSSA, AddrRegToHardenedReg);
1150 continue;
1151 }
1152
1153 // Test if this instruction is one of our post load instructions (and
1154 // remove it from the set if so).
1155 if (HardenPostLoad.erase(&MI)) {
1156 assert(!MI.isCall() && "Must not try to post-load harden a call!");
1157
1158 // If this is a data-invariant load, we want to try and sink any
1159 // hardening as far as possible.
1160 if (isDataInvariantLoad(MI)) {
1161 // Sink the instruction we'll need to harden as far as we can down the
1162 // graph.
1163 MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
1164
1165 // If we managed to sink this instruction, update everything so we
1166 // harden that instruction when we reach it in the instruction
1167 // sequence.
1168 if (SunkMI != &MI) {
1169 // If in sinking there was no instruction needing to be hardened,
1170 // we're done.
1171 if (!SunkMI)
1172 continue;
1173
1174 // Otherwise, add this to the set of defs we harden.
1175 HardenPostLoad.insert(SunkMI);
1176 continue;
1177 }
1178 }
1179
1180 // The register def'ed by this instruction is trivially hardened so map
1181 // it to itself.
1182 AddrRegToHardenedReg[MI.getOperand(0).getReg()] =
1183 MI.getOperand(0).getReg();
1184
1185 hardenPostLoad(MI, PredStateSSA);
1186 continue;
1187 }
1188
1189 // After we finish processing the instruction and doing any hardening
1190 // necessary for it, we need to handle transferring the predicate state
1191 // into a call and recovering it after the call returns (if it returns).
1192 if (!MI.isCall())
1193 continue;
1194
1195 // If we're not hardening interprocedurally, we can just skip calls.
1196 if (!HardenInterprocedurally)
1197 continue;
1198
1199 auto InsertPt = MI.getIterator();
1200 DebugLoc Loc = MI.getDebugLoc();
1201
1202 // First, we transfer the predicate state into the called function by
1203 // merging it into the stack pointer. This will kill the current def of
1204 // the state.
1205 unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
1206 mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
1207
1208 // If this call is also a return (because it is a tail call) we're done.
1209 if (MI.isReturn())
1210 continue;
1211
1212 // Otherwise we need to step past the call and recover the predicate
1213 // state from SP after the return, and make this new state available.
1214 ++InsertPt;
1215 unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
1216 PredStateSSA.AddAvailableValue(&MBB, NewStateReg);
1217 }
1218
1219 HardenPostLoad.clear();
1220 HardenLoadAddr.clear();
1221 HardenedAddrRegs.clear();
1222 AddrRegToHardenedReg.clear();
1223
1224 // Currently, we only track data-dependent loads within a basic block.
1225 // FIXME: We should see if this is necessary or if we could be more
1226 // aggressive here without opening up attack avenues.
1227 LoadDepRegs.clear();
1228 }
1229 }
1230
1231 /// Save EFLAGS into the returned GPR. This can in turn be restored with
1232 /// `restoreEFLAGS`.
1233 ///
1234 /// Note that LLVM can only lower very simple patterns of saved and restored
1235 /// EFLAGS registers. The restore should always be within the same basic block
1236 /// as the save so that no PHI nodes are inserted.
1237 unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
1238 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1239 DebugLoc Loc) {
1240 // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
1241 // what instruction selection does.
1242 unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
1243 // We directly copy the FLAGS register and rely on later lowering to clean
1244 // this up into the appropriate setCC instructions.
1245 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
1246 ++NumInstsInserted;
1247 return Reg;
1248 }
1249
1250 /// Restore EFLAGS from the provided GPR. This should be produced by
1251 /// `saveEFLAGS`.
1252 ///
1253 /// This must be done within the same basic block as the save in order to
1254 /// reliably lower.
1255 void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
1256 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1257 unsigned Reg) {
1258 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
1259 ++NumInstsInserted;
1260 }
1261
1262 /// Takes the current predicate state (in a register) and merges it into the
1263 /// stack pointer. The state is essentially a single bit, but we merge this in
1264 /// a way that won't form non-canonical pointers and also will be preserved
1265 /// across normal stack adjustments.
1266 void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
1267 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1268 unsigned PredStateReg) {
1269 unsigned TmpReg = MRI->createVirtualRegister(PredStateRC);
1270 // FIXME: This hard codes a shift distance based on the number of bits needed
1271 // to stay canonical on 64-bit. We should compute this somehow and support
1272 // 32-bit as part of that.
1273 auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
1274 .addReg(PredStateReg, RegState::Kill)
1275 .addImm(47);
1276 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1277 ++NumInstsInserted;
1278 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
1279 .addReg(X86::RSP)
1280 .addReg(TmpReg, RegState::Kill);
1281 OrI->addRegisterDead(X86::EFLAGS, TRI);
1282 ++NumInstsInserted;
1283 }
1284
1285 /// Extracts the predicate state stored in the high bits of the stack pointer.
1286 unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
1287 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1288 DebugLoc Loc) {
1289 unsigned PredStateReg = MRI->createVirtualRegister(PredStateRC);
1290 unsigned TmpReg = MRI->createVirtualRegister(PredStateRC);
1291
1292 // We know that the stack pointer will have any preserved predicate state in
1293 // its high bit. We just want to smear this across the other bits. Turns out,
1294 // this is exactly what an arithmetic right shift does.
1295 BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
1296 .addReg(X86::RSP);
1297 auto ShiftI =
1298 BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
1299 .addReg(TmpReg, RegState::Kill)
1300 .addImm(TRI->getRegSizeInBits(*PredStateRC) - 1);
1301 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1302 ++NumInstsInserted;
1303
1304 return PredStateReg;
1305 }
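// Editor's sketch, not part of this patch: the integer arithmetic that
// mergePredStateIntoSP and extractPredStateFromSP rely on, written with plain
// integers. The predicate state is either 0 (no mis-speculation) or all-ones
// (mis-speculation detected); the "Model" names are made up for illustration.
static uint64_t mergePredStateIntoSPModel(uint64_t RSP, uint64_t State) {
  // State == 0 leaves RSP untouched; State == ~0 sets bits 47..63, which keeps
  // the "all high bits equal" canonical shape and survives stack adjustments.
  return RSP | (State << 47);
}

static uint64_t extractPredStateFromSPModel(uint64_t RSP) {
  // An arithmetic right shift by 63 smears the top bit across the register,
  // recovering exactly 0 or all-ones.
  return static_cast<uint64_t>(static_cast<int64_t>(RSP) >> 63);
}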
1306
1307 void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
1308 MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
1309 MachineSSAUpdater &PredStateSSA,
1310 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
1311 MachineBasicBlock &MBB = *MI.getParent();
1312 DebugLoc Loc = MI.getDebugLoc();
1313
1314 // Check if EFLAGS are alive by seeing if there is a def of them or they
1315 // live-in, and then seeing if that def is in turn used.
1316 bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);
1317
1318 SmallVector<MachineOperand *, 2> HardenOpRegs;
1319
1320 if (BaseMO.isFI()) {
1321 // A frame index is never a dynamically controllable load, so only
1322 // harden it if we're covering fixed address loads as well.
1323 LLVM_DEBUG(
1324 dbgs() << " Skipping hardening base of explicit stack frame load: ";
1325 MI.dump(); dbgs() << "\n");
1326 } else if (BaseMO.getReg() == X86::RIP ||
1327 BaseMO.getReg() == X86::NoRegister) {
1328 // For both RIP-relative addressed loads or absolute loads, we cannot
1329 // meaningfully harden them because the address being loaded has no
1330 // dynamic component.
1331 //
1332 // FIXME: When using a segment base (like TLS does) we end up with the
1333 // dynamic address being the base plus -1 because we can't mutate the
1334 // segment register here. This allows the signed 32-bit offset to point at
1335 // valid segment-relative addresses and load them successfully.
1336 LLVM_DEBUG(
1337 dbgs() << " Cannot harden base of "
1338 << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
1339 << " address in a load!");
1340 } else {
1341 assert(BaseMO.isReg() &&
1342 "Only allowed to have a frame index or register base.");
1343 HardenOpRegs.push_back(&BaseMO);
1344 }
1345
1346 if (IndexMO.getReg() != X86::NoRegister &&
1347 (HardenOpRegs.empty() ||
1348 HardenOpRegs.front()->getReg() != IndexMO.getReg()))
1349 HardenOpRegs.push_back(&IndexMO);
1350
1351 assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
1352 "Should have exactly one or two registers to harden!");
1353 assert((HardenOpRegs.size() == 1 ||
1354 HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
1355 "Should not have two of the same registers!");
1356
1357 // Remove any registers that have already been checked.
1358 llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
1359 // See if this operand's register has already been checked.
1360 auto It = AddrRegToHardenedReg.find(Op->getReg());
1361 if (It == AddrRegToHardenedReg.end())
1362 // Not checked, so retain this one.
1363 return false;
1364
1365 // Otherwise, we can directly update this operand and remove it.
1366 Op->setReg(It->second);
1367 return true;
1368 });
1369 // If there are none left, we're done.
1370 if (HardenOpRegs.empty())
1371 return;
1372
1373 // Compute the current predicate state.
1374 unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
1375
1376 auto InsertPt = MI.getIterator();
1377
1378 // If EFLAGS are live and we don't have access to instructions that avoid
1379 // clobbering EFLAGS we need to save and restore them. This in turn makes
1380 // the EFLAGS no longer live.
1381 unsigned FlagsReg = 0;
1382 if (EFLAGSLive && !Subtarget->hasBMI2()) {
1383 EFLAGSLive = false;
1384 FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
1385 }
1386
1387 for (MachineOperand *Op : HardenOpRegs) {
1388 auto *OpRC = MRI->getRegClass(Op->getReg());
1389
1390 unsigned OpReg = Op->getReg();
1391 unsigned TmpReg = MRI->createVirtualRegister(OpRC);
1392
1393 if (!EFLAGSLive) {
1394 // Merge our potential poison state into the value with an or.
1395 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
1396 .addReg(StateReg)
1397 .addReg(OpReg);
1398 OrI->addRegisterDead(X86::EFLAGS, TRI);
1399 ++NumInstsInserted;
1400 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1401 } else {
1402 // We need to avoid touching EFLAGS, so shift with SHRX, which doesn't update
1403 // flags; under misspeculation the shift discards all but one bit of the address.
1404 auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
1405 .addReg(OpReg)
1406 .addReg(StateReg);
1407 (void)ShiftI;
1408 ++NumInstsInserted;
1409 LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
1410 dbgs() << "\n");
1411 }
1412
1413 // Record this register as checked and update the operand.
1414 assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
1415 "Should not have checked this register yet!");
1416 AddrRegToHardenedReg[Op->getReg()] = TmpReg;
1417 Op->setReg(TmpReg);
1418 ++NumAddrRegsHardened;
1419 }
1420
1421 // And restore the flags if needed.
1422 if (FlagsReg)
1423 restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
1424 }
1425
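hardenLoadAddr above checks each base and index register of a load by ORing it with the predicate state. A minimal scalar sketch of why that suffices (illustrative only; helper name and constants are mine): because the state is either all zeros or all ones, the OR is the identity on the correct path and pins the address to -1 under misspeculation, so the speculative load can no longer target attacker-chosen memory.

    #include <cassert>
    #include <cstdint>

    // Scalar model of the OR64rr inserted per address register; `state` is
    // 0 or ~0ULL as maintained by the pass. (The SHRX variant used when EFLAGS
    // are live collapses the address by shifting instead.)
    uint64_t hardenAddrReg(uint64_t addrReg, uint64_t state) {
      return addrReg | state;
    }

    int main() {
      const uint64_t attacker = 0x00007f1234567890;
      assert(hardenAddrReg(attacker, 0) == attacker);   // correct path: unchanged
      assert(hardenAddrReg(attacker, ~0ULL) == ~0ULL);  // misspeculating: pinned
    }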
1426 MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
1427 MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedLoads) {
1428 assert(isDataInvariantLoad(InitialMI) &&
1429 "Cannot get here with a non-invariant load!");
1430
1431 // See if we can sink hardening the loaded value.
1432 auto SinkCheckToSingleUse =
1433 [&](MachineInstr &MI) -> Optional<MachineInstr *> {
1434 unsigned DefReg = MI.getOperand(0).getReg();
1435
1436 // We need to find a single use to which we can sink the check. We can
1437 // primarily do this because many uses may already end up checked on their
1438 // own.
1439 MachineInstr *SingleUseMI = nullptr;
1440 for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
1441 // If we're already going to harden this use, it is data invariant and
1442 // within our block and we just need to check that the use isn't in an
1443 // address.
1444 if (HardenedLoads.count(&UseMI)) {
1445 const MCInstrDesc &Desc = UseMI.getDesc();
1446 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1447 assert(MemRefBeginIdx >= 0 &&
1448 "Should always have mem references here!");
1449 MemRefBeginIdx += X86II::getOperandBias(Desc);
1450
1451 MachineOperand &BaseMO =
1452 UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1453 MachineOperand &IndexMO =
1454 UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1455 if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
1456 (IndexMO.isReg() && IndexMO.getReg() == DefReg))
1457 // The load uses the register as part of its address making it not
1458 // invariant.
1459 return {};
1460
1461 continue;
1462 }
1463
1464 if (SingleUseMI)
1465 // We already have a single use, this would make two. Bail.
1466 return {};
1467
1468 // If this single use isn't data invariant, isn't in this block, or has
1469 // interfering EFLAGS, we can't sink the hardening to it.
1470 if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
1471 return {};
1472
1473 // If this instruction defines multiple registers bail as we won't harden
1474 // all of them.
1475 if (UseMI.getDesc().getNumDefs() > 1)
1476 return {};
1477
1478 // If this register isn't a virtual register we can't sanely walk its uses,
1479 // so just bail. Also check that its register class is one of the ones we
1480 // can harden.
1481 unsigned UseDefReg = UseMI.getOperand(0).getReg();
1482 if (!TRI->isVirtualRegister(UseDefReg) ||
1483 !MRI->getRegClass(UseDefReg)->hasSubClassEq(&X86::GR64RegClass))
1484 return {};
1485
1486 SingleUseMI = &UseMI;
1487 }
1488
1489 // If SingleUseMI is still null, there is no use that needs its own
1490 // checking. Otherwise, it is the single use that needs checking.
1491 return {SingleUseMI};
1492 };
1493
1494 MachineInstr *MI = &InitialMI;
1495 while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
1496 // Update which MI we're checking now.
1497 MI = *SingleUse;
1498 if (!MI)
1499 break;
1500 }
1501
1502 return MI;
1503 }
1504
1505 // We can harden non-leaking loads into registers without touching the address
1506 // by just hiding all of the loaded bits. We use an `or` instruction to do
1507 // this because having the poison value be all ones allows us to use the same
1508 // value below. The goal is just for the loaded bits not to be exposed to
1509 // speculative execution, and coercing them to all ones is sufficient.
1510 void X86SpeculativeLoadHardeningPass::hardenPostLoad(
1511 MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
1512 assert(isDataInvariantLoad(MI) &&
1513 "Cannot get here with a non-invariant load!");
1514
1515 MachineBasicBlock &MBB = *MI.getParent();
1516 DebugLoc Loc = MI.getDebugLoc();
1517
1518 // For all of these, the def'ed register operand is operand zero.
1519 auto &DefOp = MI.getOperand(0);
1520 unsigned OldDefReg = DefOp.getReg();
1521
1522 auto *DefRC = MRI->getRegClass(OldDefReg);
1523 int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;
1524
1525 unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
1526 unsigned OrOpCode = OrOpCodes[Log2_32(DefRegBytes)];
1527
1528 unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
1529
1530 auto GetStateRegInRC = [&](const TargetRegisterClass &RC) {
1531 unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
1532
1533 int Bytes = TRI->getRegSizeInBits(RC) / 8;
1534 // FIXME: Need to teach this about 32-bit mode.
1535 if (Bytes != 8) {
1536 unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
1537 unsigned NarrowStateReg = MRI->createVirtualRegister(&RC);
1538 BuildMI(MBB, MI.getIterator(), Loc, TII->get(TargetOpcode::COPY),
1539 NarrowStateReg)
1540 .addReg(StateReg, 0, SubRegImm);
1541 StateReg = NarrowStateReg;
1542 }
1543 return StateReg;
1544 };
1545
1546 auto InsertPt = std::next(MI.getIterator());
1547 unsigned FlagsReg = 0;
1548 bool EFLAGSLive = isEFLAGSLive(MBB, InsertPt, *TRI);
1549 if (EFLAGSLive && !Subtarget->hasBMI2()) {
1550 FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
1551 EFLAGSLive = false;
1552 }
1553
1554 if (!EFLAGSLive) {
1555 unsigned StateReg = GetStateRegInRC(*DefRC);
1556 unsigned NewDefReg = MRI->createVirtualRegister(DefRC);
1557 DefOp.setReg(NewDefReg);
1558 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg)
1559 .addReg(StateReg)
1560 .addReg(NewDefReg);
1561 OrI->addRegisterDead(X86::EFLAGS, TRI);
1562 ++NumInstsInserted;
1563 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1564 } else {
1565 assert(Subtarget->hasBMI2() &&
1566 "Cannot harden loads and preserve EFLAGS without BMI2!");
1567
1568 unsigned ShiftOpCode = DefRegBytes < 4 ? X86::SHRX32rr : X86::SHRX64rr;
1569 auto &ShiftRC =
1570 DefRegBytes < 4 ? X86::GR32_NOSPRegClass : X86::GR64_NOSPRegClass;
1571 int ShiftRegBytes = TRI->getRegSizeInBits(ShiftRC) / 8;
1572 unsigned DefSubRegImm = SubRegImms[Log2_32(DefRegBytes)];
1573
1574 unsigned StateReg = GetStateRegInRC(ShiftRC);
1575
1576 // First have the def instruction def a temporary register.
1577 unsigned TmpReg = MRI->createVirtualRegister(DefRC);
1578 DefOp.setReg(TmpReg);
1579 // Now copy it into a register of the shift RC.
1580 unsigned ShiftInputReg = TmpReg;
1581 if (DefRegBytes != ShiftRegBytes) {
1582 unsigned UndefReg = MRI->createVirtualRegister(&ShiftRC);
1583 BuildMI(MBB, InsertPt, Loc, TII->get(X86::IMPLICIT_DEF), UndefReg);
1584 ShiftInputReg = MRI->createVirtualRegister(&ShiftRC);
1585 BuildMI(MBB, InsertPt, Loc, TII->get(X86::INSERT_SUBREG), ShiftInputReg)
1586 .addReg(UndefReg)
1587 .addReg(TmpReg)
1588 .addImm(DefSubRegImm);
1589 }
1590
1591 // We shift this once if the shift is wider than the def and thus we can
1592 // shift *all* of the def'ed bytes out. Otherwise we need to do two shifts.
1593
1594 unsigned ShiftedReg = MRI->createVirtualRegister(&ShiftRC);
1595 auto Shift1I =
1596 BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), ShiftedReg)
1597 .addReg(ShiftInputReg)
1598 .addReg(StateReg);
1599 (void)Shift1I;
1600 ++NumInstsInserted;
1601 LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift1I->dump(); dbgs() << "\n");
1602
1603 // The only way we have a bit left is if all 8 bytes were defined. Do an
1604 // extra shift to get the last bit in this case.
1605 if (DefRegBytes == ShiftRegBytes) {
1606 // We can just directly def the old def register as it's the same size.
1607 ShiftInputReg = ShiftedReg;
1608 auto Shift2I =
1609 BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), OldDefReg)
1610 .addReg(ShiftInputReg)
1611 .addReg(StateReg);
1612 (void)Shift2I;
1613 ++NumInstsInserted;
1614 LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift2I->dump();
1615 dbgs() << "\n");
1616 } else {
1617 // When the shift register is a different size, we need to fix up the
1618 // register class as we copy the result into the old def register.
1619 BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), OldDefReg)
1620 .addReg(ShiftedReg, 0, DefSubRegImm);
1621 }
1622 }
1623
1624 if (FlagsReg)
1625 restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
1626
1627 ++NumPostLoadRegsHardened;
1628 }
1629
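For loads whose address is not hardened, hardenPostLoad above hides the loaded value instead: an OR with the predicate state when flags are dead, or one or two SHRX shifts when EFLAGS must be preserved. A small sketch of both value transforms (my modeling, with hypothetical helper names; SHRX masks its count to 6 bits for 64-bit operands):

    #include <cassert>
    #include <cstdint>

    // OR variant: the loaded value becomes all ones when state == ~0ULL.
    uint64_t hardenValueOr(uint64_t loaded, uint64_t state) {
      return loaded | state;
    }

    // SHRX variant for a full 64-bit def: the shift count `state & 63` is either
    // 0 (no-op) or 63, and a second shift clears the one remaining bit.
    uint64_t hardenValueShrx64(uint64_t loaded, uint64_t state) {
      unsigned count = state & 63;
      return (loaded >> count) >> count;
    }

    int main() {
      assert(hardenValueOr(0x1234, 0) == 0x1234);
      assert(hardenValueOr(0x1234, ~0ULL) == ~0ULL);
      assert(hardenValueShrx64(0xdeadbeefdeadbeef, 0) == 0xdeadbeefdeadbeef);
      assert(hardenValueShrx64(0xdeadbeefdeadbeef, ~0ULL) == 0);
    }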
1630 void X86SpeculativeLoadHardeningPass::checkReturnInstr(
1631 MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
1632 MachineBasicBlock &MBB = *MI.getParent();
1633 DebugLoc Loc = MI.getDebugLoc();
1634 auto InsertPt = MI.getIterator();
1635
1636 if (FenceCallAndRet) {
1637 // Simply forcibly block speculation of loads out of the function by using
1638 // an LFENCE. This is potentially a heavy-weight mitigation strategy, but
1639 // should be secure, is simple from an ABI perspective, and the cost can be
1640 // minimized through inlining.
1641 //
1642 // FIXME: We should investigate ways to establish a strong data-dependency
1643 // on the return. However, poisoning the stack pointer is unlikely to work
1644 // because the return is *predicted* rather than relying on the load of the
1645 // return address to actually resolve.
1646 BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE));
1647 ++NumInstsInserted;
1648 ++NumLFENCEsInserted;
1649 return;
1650 }
1651
1652 // Take our predicate state, shift it to the high 17 bits (so that we keep
1653 // pointers canonical) and merge it into RSP. This will allow the caller to
1654 // extract it when we return (speculatively).
1655 mergePredStateIntoSP(MBB, InsertPt, Loc,
1656 PredStateSSA.GetValueAtEndOfBlock(&MBB));
1657 }
1658
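checkReturnInstr relies on mergePredStateIntoSP to carry the predicate state across the return: the all-zeros/all-ones state is shifted into the top 17 bits of RSP (the shlq $47 / orq pair visible in the test output below), which leaves a canonical stack pointer untouched on the correct path and lets the caller re-derive the poison with the sign shift sketched earlier. A hedged scalar model (names and sample values are mine):

    #include <cassert>
    #include <cstdint>

    // Model of the shl $47 / or %rsp sequence emitted before a return.
    uint64_t mergePredStateIntoSP(uint64_t sp, uint64_t state) {
      return sp | (state << 47); // state occupies the top 17 bits
    }

    int main() {
      const uint64_t sp = 0x00007fffffffe000;
      assert(mergePredStateIntoSP(sp, 0) == sp);  // correct path: RSP unchanged
      // Misspeculating: the high bits are set and the poison is recoverable via
      // an arithmetic right shift (two's-complement representation assumed).
      assert((static_cast<int64_t>(mergePredStateIntoSP(sp, ~0ULL)) >> 63) == -1);
    }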
1659 INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
1660 "X86 speculative load hardener", false, false)
1661 INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
1662 "X86 speculative load hardener", false, false)
1663
1664 FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
1665 return new X86SpeculativeLoadHardeningPass();
1666 }
5353 cl::desc("Enable the machine combiner pass"),
5454 cl::init(true), cl::Hidden);
5555
56 static cl::opt<bool> EnableSpeculativeLoadHardening(
57 "x86-speculative-load-hardening",
58 cl::desc("Enable speculative load hardening"), cl::init(false), cl::Hidden);
59
5660 namespace llvm {
5761
5862 void initializeWinEHStatePassPass(PassRegistry &);
462466 addPass(createX86AvoidStoreForwardingBlocks());
463467 }
464468
469 if (EnableSpeculativeLoadHardening)
470 addPass(createX86SpeculativeLoadHardeningPass());
471
465472 addPass(createX86FlagsCopyLoweringPass());
466473 addPass(createX86WinAllocaExpander());
467474 }
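With the hook above, the pass is only added to the pipeline when the hidden flag is passed explicitly; the autogenerated test that follows exercises both modes via its RUN lines, e.g. llc -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening (plus -x86-speculative-load-hardening-lfence for the LFENCE-based variant).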
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening | FileCheck %s --check-prefix=X64
2 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -x86-speculative-load-hardening-lfence | FileCheck %s --check-prefix=X64-LFENCE
3 ;
4 ; FIXME: Add support for 32-bit and other EH ABIs.
5
6 declare void @leak(i32 %v1, i32 %v2)
7
8 declare void @sink(i32)
9
10 define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr2, i32** %ptr3) nounwind {
11 ; X64-LABEL: test_basic_conditions:
12 ; X64: # %bb.0: # %entry
13 ; X64-NEXT: pushq %r15
14 ; X64-NEXT: pushq %r14
15 ; X64-NEXT: pushq %rbx
16 ; X64-NEXT: movq %rsp, %rax
17 ; X64-NEXT: movq $-1, %rbx
18 ; X64-NEXT: sarq $63, %rax
19 ; X64-NEXT: testl %edi, %edi
20 ; X64-NEXT: jne .LBB0_1
21 ; X64-NEXT: # %bb.2: # %then1
22 ; X64-NEXT: cmovneq %rbx, %rax
23 ; X64-NEXT: testl %esi, %esi
24 ; X64-NEXT: je .LBB0_4
25 ; X64-NEXT: .LBB0_1:
26 ; X64-NEXT: cmoveq %rbx, %rax
27 ; X64-NEXT: .LBB0_8: # %exit
28 ; X64-NEXT: shlq $47, %rax
29 ; X64-NEXT: orq %rax, %rsp
30 ; X64-NEXT: popq %rbx
31 ; X64-NEXT: popq %r14
32 ; X64-NEXT: popq %r15
33 ; X64-NEXT: retq
34 ; X64-NEXT: .LBB0_4: # %then2
35 ; X64-NEXT: movq %r8, %r15
36 ; X64-NEXT: cmovneq %rbx, %rax
37 ; X64-NEXT: testl %edx, %edx
38 ; X64-NEXT: je .LBB0_6
39 ; X64-NEXT: # %bb.5: # %else3
40 ; X64-NEXT: cmoveq %rbx, %rax
41 ; X64-NEXT: movslq (%r9), %rcx
42 ; X64-NEXT: orq %rax, %rcx
43 ; X64-NEXT: leaq (%r15,%rcx,4), %r14
44 ; X64-NEXT: movl %ecx, (%r15,%rcx,4)
45 ; X64-NEXT: jmp .LBB0_7
46 ; X64-NEXT: .LBB0_6: # %then3
47 ; X64-NEXT: cmovneq %rbx, %rax
48 ; X64-NEXT: movl (%rcx), %ecx
49 ; X64-NEXT: addl (%r15), %ecx
50 ; X64-NEXT: orl %eax, %ecx
51 ; X64-NEXT: movslq %ecx, %rdi
52 ; X64-NEXT: movl (%r15,%rdi,4), %esi
53 ; X64-NEXT: orl %eax, %esi
54 ; X64-NEXT: movq (%r9), %r14
55 ; X64-NEXT: orq %rax, %r14
56 ; X64-NEXT: addl (%r14), %esi
57 ; X64-NEXT: shlq $47, %rax
58 ; X64-NEXT: # kill: def $edi killed $edi killed $rdi
59 ; X64-NEXT: orq %rax, %rsp
60 ; X64-NEXT: callq leak
61 ; X64-NEXT: movq %rsp, %rax
62 ; X64-NEXT: sarq $63, %rax
63 ; X64-NEXT: .LBB0_7: # %merge
64 ; X64-NEXT: movslq (%r14), %rcx
65 ; X64-NEXT: orq %rax, %rcx
66 ; X64-NEXT: movl $0, (%r15,%rcx,4)
67 ; X64-NEXT: jmp .LBB0_8
68 ;
69 ; X64-LFENCE-LABEL: test_basic_conditions:
70 ; X64-LFENCE: # %bb.0: # %entry
71 ; X64-LFENCE-NEXT: testl %edi, %edi
72 ; X64-LFENCE-NEXT: jne .LBB0_6
73 ; X64-LFENCE-NEXT: # %bb.1: # %then1
74 ; X64-LFENCE-NEXT: lfence
75 ; X64-LFENCE-NEXT: testl %esi, %esi
76 ; X64-LFENCE-NEXT: je .LBB0_2
77 ; X64-LFENCE-NEXT: .LBB0_6: # %exit
78 ; X64-LFENCE-NEXT: lfence
79 ; X64-LFENCE-NEXT: retq
80 ; X64-LFENCE-NEXT: .LBB0_2: # %then2
81 ; X64-LFENCE-NEXT: pushq %r14
82 ; X64-LFENCE-NEXT: pushq %rbx
83 ; X64-LFENCE-NEXT: pushq %rax
84 ; X64-LFENCE-NEXT: movq %r8, %rbx
85 ; X64-LFENCE-NEXT: lfence
86 ; X64-LFENCE-NEXT: testl %edx, %edx
87 ; X64-LFENCE-NEXT: je .LBB0_3
88 ; X64-LFENCE-NEXT: # %bb.4: # %else3
89 ; X64-LFENCE-NEXT: lfence
90 ; X64-LFENCE-NEXT: movslq (%r9), %rax
91 ; X64-LFENCE-NEXT: leaq (%rbx,%rax,4), %r14
92 ; X64-LFENCE-NEXT: movl %eax, (%rbx,%rax,4)
93 ; X64-LFENCE-NEXT: jmp .LBB0_5
94 ; X64-LFENCE-NEXT: .LBB0_3: # %then3
95 ; X64-LFENCE-NEXT: lfence
96 ; X64-LFENCE-NEXT: movl (%rcx), %eax
97 ; X64-LFENCE-NEXT: addl (%rbx), %eax
98 ; X64-LFENCE-NEXT: movslq %eax, %rdi
99 ; X64-LFENCE-NEXT: movl (%rbx,%rdi,4), %esi
100 ; X64-LFENCE-NEXT: movq (%r9), %r14
101 ; X64-LFENCE-NEXT: addl (%r14), %esi
102 ; X64-LFENCE-NEXT: # kill: def $edi killed $edi killed $rdi
103 ; X64-LFENCE-NEXT: callq leak
104 ; X64-LFENCE-NEXT: .LBB0_5: # %merge
105 ; X64-LFENCE-NEXT: movslq (%r14), %rax
106 ; X64-LFENCE-NEXT: movl $0, (%rbx,%rax,4)
107 ; X64-LFENCE-NEXT: addq $8, %rsp
108 ; X64-LFENCE-NEXT: popq %rbx
109 ; X64-LFENCE-NEXT: popq %r14
110 ; X64-LFENCE-NEXT: lfence
111 ; X64-LFENCE-NEXT: retq
112 entry:
113 %a.cmp = icmp eq i32 %a, 0
114 br i1 %a.cmp, label %then1, label %exit
115
116 then1:
117 %b.cmp = icmp eq i32 %b, 0
118 br i1 %b.cmp, label %then2, label %exit
119
120 then2:
121 %c.cmp = icmp eq i32 %c, 0
122 br i1 %c.cmp, label %then3, label %else3
123
124 then3:
125 %secret1 = load i32, i32* %ptr1
126 %secret2 = load i32, i32* %ptr2
127 %secret.sum1 = add i32 %secret1, %secret2
128 %ptr2.idx = getelementptr i32, i32* %ptr2, i32 %secret.sum1
129 %secret3 = load i32, i32* %ptr2.idx
130 %secret4 = load i32*, i32** %ptr3
131 %secret5 = load i32, i32* %secret4
132 %secret.sum2 = add i32 %secret3, %secret5
133 call void @leak(i32 %secret.sum1, i32 %secret.sum2)
134 br label %merge
135
136 else3:
137 %secret6 = load i32*, i32** %ptr3
138 %cast = ptrtoint i32* %secret6 to i32
139 %ptr2.idx2 = getelementptr i32, i32* %ptr2, i32 %cast
140 store i32 %cast, i32* %ptr2.idx2
141 br label %merge
142
143 merge:
144 %phi = phi i32* [ %secret4, %then3 ], [ %ptr2.idx2, %else3 ]
145 %secret7 = load i32, i32* %phi
146 %ptr2.idx3 = getelementptr i32, i32* %ptr2, i32 %secret7
147 store i32 0, i32* %ptr2.idx3
148 br label %exit
149
150 exit:
151 ret void
152 }
153
154 define void @test_basic_loop(i32 %a, i32 %b, i32* %ptr1, i32* %ptr2) nounwind {
155 ; X64-LABEL: test_basic_loop:
156 ; X64: # %bb.0: # %entry
157 ; X64-NEXT: pushq %rbp
158 ; X64-NEXT: pushq %r15
159 ; X64-NEXT: pushq %r14
160 ; X64-NEXT: pushq %r12
161 ; X64-NEXT: pushq %rbx
162 ; X64-NEXT: movq %rsp, %rax
163 ; X64-NEXT: movq $-1, %r15
164 ; X64-NEXT: sarq $63, %rax
165 ; X64-NEXT: testl %edi, %edi
166 ; X64-NEXT: je .LBB1_2
167 ; X64-NEXT: # %bb.1:
168 ; X64-NEXT: cmoveq %r15, %rax
169 ; X64-NEXT: jmp .LBB1_5
170 ; X64-NEXT: .LBB1_2: # %l.header.preheader
171 ; X64-NEXT: movq %rcx, %r14
172 ; X64-NEXT: movq %rdx, %r12
173 ; X64-NEXT: movl %esi, %ebp
174 ; X64-NEXT: cmovneq %r15, %rax
175 ; X64-NEXT: xorl %ebx, %ebx
176 ; X64-NEXT: jmp .LBB1_3
177 ; X64-NEXT: .p2align 4, 0x90
178 ; X64-NEXT: .LBB1_6: # in Loop: Header=BB1_3 Depth=1
179 ; X64-NEXT: cmovgeq %r15, %rax
180 ; X64-NEXT: .LBB1_3: # %l.header
181 ; X64-NEXT: # =>This Inner Loop Header: Depth=1
182 ; X64-NEXT: movslq (%r12), %rcx
183 ; X64-NEXT: orq %rax, %rcx
184 ; X64-NEXT: movq %rax, %rdx
185 ; X64-NEXT: orq %r14, %rdx
186 ; X64-NEXT: movl (%rdx,%rcx,4), %edi
187 ; X64-NEXT: shlq $47, %rax
188 ; X64-NEXT: orq %rax, %rsp
189 ; X64-NEXT: callq sink
190 ; X64-NEXT: movq %rsp, %rax
191 ; X64-NEXT: sarq $63, %rax
192 ; X64-NEXT: incl %ebx
193 ; X64-NEXT: cmpl %ebp, %ebx
194 ; X64-NEXT: jl .LBB1_6
195 ; X64-NEXT: # %bb.4:
196 ; X64-NEXT: cmovlq %r15, %rax
197 ; X64-NEXT: .LBB1_5: # %exit
198 ; X64-NEXT: shlq $47, %rax
199 ; X64-NEXT: orq %rax, %rsp
200 ; X64-NEXT: popq %rbx
201 ; X64-NEXT: popq %r12
202 ; X64-NEXT: popq %r14
203 ; X64-NEXT: popq %r15
204 ; X64-NEXT: popq %rbp
205 ; X64-NEXT: retq
206 ;
207 ; X64-LFENCE-LABEL: test_basic_loop:
208 ; X64-LFENCE: # %bb.0: # %entry
209 ; X64-LFENCE-NEXT: pushq %rbp
210 ; X64-LFENCE-NEXT: pushq %r15
211 ; X64-LFENCE-NEXT: pushq %r14
212 ; X64-LFENCE-NEXT: pushq %rbx
213 ; X64-LFENCE-NEXT: pushq %rax
214 ; X64-LFENCE-NEXT: testl %edi, %edi
215 ; X64-LFENCE-NEXT: jne .LBB1_3
216 ; X64-LFENCE-NEXT: # %bb.1: # %l.header.preheader
217 ; X64-LFENCE-NEXT: movq %rcx, %r14
218 ; X64-LFENCE-NEXT: movq %rdx, %r15
219 ; X64-LFENCE-NEXT: movl %esi, %ebp
220 ; X64-LFENCE-NEXT: lfence
221 ; X64-LFENCE-NEXT: xorl %ebx, %ebx
222 ; X64-LFENCE-NEXT: .p2align 4, 0x90
223 ; X64-LFENCE-NEXT: .LBB1_2: # %l.header
224 ; X64-LFENCE-NEXT: # =>This Inner Loop Header: Depth=1
225 ; X64-LFENCE-NEXT: lfence
226 ; X64-LFENCE-NEXT: movslq (%r15), %rax
227 ; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi
228 ; X64-LFENCE-NEXT: callq sink
229 ; X64-LFENCE-NEXT: incl %ebx
230 ; X64-LFENCE-NEXT: cmpl %ebp, %ebx
231 ; X64-LFENCE-NEXT: jl .LBB1_2
232 ; X64-LFENCE-NEXT: .LBB1_3: # %exit
233 ; X64-LFENCE-NEXT: lfence
234 ; X64-LFENCE-NEXT: addq $8, %rsp
235 ; X64-LFENCE-NEXT: popq %rbx
236 ; X64-LFENCE-NEXT: popq %r14
237 ; X64-LFENCE-NEXT: popq %r15
238 ; X64-LFENCE-NEXT: popq %rbp
239 ; X64-LFENCE-NEXT: retq
240 entry:
241 %a.cmp = icmp eq i32 %a, 0
242 br i1 %a.cmp, label %l.header, label %exit
243
244 l.header:
245 %i = phi i32 [ 0, %entry ], [ %i.next, %l.header ]
246 %secret = load i32, i32* %ptr1
247 %ptr2.idx = getelementptr i32, i32* %ptr2, i32 %secret
248 %leak = load i32, i32* %ptr2.idx
249 call void @sink(i32 %leak)
250 %i.next = add i32 %i, 1
251 %i.cmp = icmp slt i32 %i.next, %b
252 br i1 %i.cmp, label %l.header, label %exit
253
254 exit:
255 ret void
256 }
257
258 define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr2) nounwind {
259 ; X64-LABEL: test_basic_nested_loop:
260 ; X64: # %bb.0: # %entry
261 ; X64-NEXT: pushq %rbp
262 ; X64-NEXT: pushq %r15
263 ; X64-NEXT: pushq %r14
264 ; X64-NEXT: pushq %r13
265 ; X64-NEXT: pushq %r12
266 ; X64-NEXT: pushq %rbx
267 ; X64-NEXT: pushq %rax
268 ; X64-NEXT: movq %rsp, %rax
269 ; X64-NEXT: movq $-1, %r12
270 ; X64-NEXT: sarq $63, %rax
271 ; X64-NEXT: testl %edi, %edi
272 ; X64-NEXT: je .LBB2_2
273 ; X64-NEXT: # %bb.1:
274 ; X64-NEXT: cmoveq %r12, %rax
275 ; X64-NEXT: jmp .LBB2_10
276 ; X64-NEXT: .LBB2_2: # %l1.header.preheader
277 ; X64-NEXT: movq %r8, %r14
278 ; X64-NEXT: movq %rcx, %rbx
279 ; X64-NEXT: movl %edx, %ebp
280 ; X64-NEXT: movl %esi, %r15d
281 ; X64-NEXT: cmovneq %r12, %rax
282 ; X64-NEXT: xorl %r13d, %r13d
283 ; X64-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
284 ; X64-NEXT: testl %r15d, %r15d
285 ; X64-NEXT: jg .LBB2_5
286 ; X64-NEXT: jmp .LBB2_4
287 ; X64-NEXT: .p2align 4, 0x90
288 ; X64-NEXT: .LBB2_12:
289 ; X64-NEXT: cmovgeq %r12, %rax
290 ; X64-NEXT: testl %r15d, %r15d
291 ; X64-NEXT: jle .LBB2_4
292 ; X64-NEXT: .LBB2_5: # %l2.header.preheader
293 ; X64-NEXT: cmovleq %r12, %rax
294 ; X64-NEXT: xorl %r15d, %r15d
295 ; X64-NEXT: jmp .LBB2_6
296 ; X64-NEXT: .p2align 4, 0x90
297 ; X64-NEXT: .LBB2_11: # in Loop: Header=BB2_6 Depth=1
298 ; X64-NEXT: cmovgeq %r12, %rax
299 ; X64-NEXT: .LBB2_6: # %l2.header
300 ; X64-NEXT: # =>This Inner Loop Header: Depth=1
301 ; X64-NEXT: movslq (%rbx), %rcx
302 ; X64-NEXT: orq %rax, %rcx
303 ; X64-NEXT: movq %rax, %rdx
304 ; X64-NEXT: orq %r14, %rdx
305 ; X64-NEXT: movl (%rdx,%rcx,4), %edi
306 ; X64-NEXT: shlq $47, %rax
307 ; X64-NEXT: orq %rax, %rsp
308 ; X64-NEXT: callq sink
309 ; X64-NEXT: movq %rsp, %rax
310 ; X64-NEXT: sarq $63, %rax
311 ; X64-NEXT: incl %r15d
312 ; X64-NEXT: cmpl %ebp, %r15d
313 ; X64-NEXT: jl .LBB2_11
314 ; X64-NEXT: # %bb.7:
315 ; X64-NEXT: cmovlq %r12, %rax
316 ; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Reload
317 ; X64-NEXT: jmp .LBB2_8
318 ; X64-NEXT: .p2align 4, 0x90
319 ; X64-NEXT: .LBB2_4:
320 ; X64-NEXT: cmovgq %r12, %rax
321 ; X64-NEXT: .LBB2_8: # %l1.latch
322 ; X64-NEXT: movslq (%rbx), %rcx
323 ; X64-NEXT: orq %rax, %rcx
324 ; X64-NEXT: movq %rax, %rdx
325 ; X64-NEXT: orq %r14, %rdx
326 ; X64-NEXT: movl (%rdx,%rcx,4), %edi
327 ; X64-NEXT: shlq $47, %rax
328 ; X64-NEXT: orq %rax, %rsp
329 ; X64-NEXT: callq sink
330 ; X64-NEXT: movq %rsp, %rax
331 ; X64-NEXT: sarq $63, %rax
332 ; X64-NEXT: incl %r13d
333 ; X64-NEXT: cmpl %r15d, %r13d
334 ; X64-NEXT: jl .LBB2_12
335 ; X64-NEXT: # %bb.9:
336 ; X64-NEXT: cmovlq %r12, %rax
337 ; X64-NEXT: .LBB2_10: # %exit
338 ; X64-NEXT: shlq $47, %rax
339 ; X64-NEXT: orq %rax, %rsp
340 ; X64-NEXT: addq $8, %rsp
341 ; X64-NEXT: popq %rbx
342 ; X64-NEXT: popq %r12
343 ; X64-NEXT: popq %r13
344 ; X64-NEXT: popq %r14
345 ; X64-NEXT: popq %r15
346 ; X64-NEXT: popq %rbp
347 ; X64-NEXT: retq
348 ;
349 ; X64-LFENCE-LABEL: test_basic_nested_loop:
350 ; X64-LFENCE: # %bb.0: # %entry
351 ; X64-LFENCE-NEXT: pushq %rbp
352 ; X64-LFENCE-NEXT: pushq %r15
353 ; X64-LFENCE-NEXT: pushq %r14
354 ; X64-LFENCE-NEXT: pushq %r13
355 ; X64-LFENCE-NEXT: pushq %r12
356 ; X64-LFENCE-NEXT: pushq %rbx
357 ; X64-LFENCE-NEXT: pushq %rax
358 ; X64-LFENCE-NEXT: testl %edi, %edi
359 ; X64-LFENCE-NEXT: jne .LBB2_6
360 ; X64-LFENCE-NEXT: # %bb.1: # %l1.header.preheader
361 ; X64-LFENCE-NEXT: movq %r8, %r14
362 ; X64-LFENCE-NEXT: movq %rcx, %rbx
363 ; X64-LFENCE-NEXT: movl %edx, %r13d
364 ; X64-LFENCE-NEXT: movl %esi, %r15d
365 ; X64-LFENCE-NEXT: lfence
366 ; X64-LFENCE-NEXT: xorl %r12d, %r12d
367 ; X64-LFENCE-NEXT: .p2align 4, 0x90
368 ; X64-LFENCE-NEXT: .LBB2_2: # %l1.header
369 ; X64-LFENCE-NEXT: # =>This Loop Header: Depth=1
370 ; X64-LFENCE-NEXT: # Child Loop BB2_4 Depth 2
371 ; X64-LFENCE-NEXT: lfence
372 ; X64-LFENCE-NEXT: testl %r15d, %r15d
373 ; X64-LFENCE-NEXT: jle .LBB2_5
374 ; X64-LFENCE-NEXT: # %bb.3: # %l2.header.preheader
375 ; X64-LFENCE-NEXT: # in Loop: Header=BB2_2 Depth=1
376 ; X64-LFENCE-NEXT: lfence
377 ; X64-LFENCE-NEXT: xorl %ebp, %ebp
378 ; X64-LFENCE-NEXT: .p2align 4, 0x90
379 ; X64-LFENCE-NEXT: .LBB2_4: # %l2.header
380 ; X64-LFENCE-NEXT: # Parent Loop BB2_2 Depth=1
381 ; X64-LFENCE-NEXT: # => This Inner Loop Header: Depth=2
382 ; X64-LFENCE-NEXT: lfence
383 ; X64-LFENCE-NEXT: movslq (%rbx), %rax
384 ; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi
385 ; X64-LFENCE-NEXT: callq sink
386 ; X64-LFENCE-NEXT: incl %ebp
387 ; X64-LFENCE-NEXT: cmpl %r13d, %ebp
388 ; X64-LFENCE-NEXT: jl .LBB2_4
389 ; X64-LFENCE-NEXT: .LBB2_5: # %l1.latch
390 ; X64-LFENCE-NEXT: # in Loop: Header=BB2_2 Depth=1
391 ; X64-LFENCE-NEXT: lfence
392 ; X64-LFENCE-NEXT: movslq (%rbx), %rax
393 ; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi
394 ; X64-LFENCE-NEXT: callq sink
395 ; X64-LFENCE-NEXT: incl %r12d
396 ; X64-LFENCE-NEXT: cmpl %r15d, %r12d
397 ; X64-LFENCE-NEXT: jl .LBB2_2
398 ; X64-LFENCE-NEXT: .LBB2_6: # %exit
399 ; X64-LFENCE-NEXT: lfence
400 ; X64-LFENCE-NEXT: addq $8, %rsp
401 ; X64-LFENCE-NEXT: popq %rbx
402 ; X64-LFENCE-NEXT: popq %r12
403 ; X64-LFENCE-NEXT: popq %r13
404 ; X64-LFENCE-NEXT: popq %r14
405 ; X64-LFENCE-NEXT: popq %r15
406 ; X64-LFENCE-NEXT: popq %rbp
407 ; X64-LFENCE-NEXT: retq
408 entry:
409 %a.cmp = icmp eq i32 %a, 0
410 br i1 %a.cmp, label %l1.header, label %exit
411
412 l1.header:
413 %i = phi i32 [ 0, %entry ], [ %i.next, %l1.latch ]
414 %b.cmp = icmp sgt i32 %b, 0
415 br i1 %b.cmp, label %l2.header, label %l1.latch
416
417 l2.header:
418 %j = phi i32 [ 0, %l1.header ], [ %j.next, %l2.header ]
419 %secret = load i32, i32* %ptr1
420 %ptr2.idx = getelementptr i32, i32* %ptr2, i32 %secret
421 %leak = load i32, i32* %ptr2.idx
422 call void @sink(i32 %leak)
423 %j.next = add i32 %j, 1
424 %j.cmp = icmp slt i32 %j.next, %c
425 br i1 %j.cmp, label %l2.header, label %l1.latch
426
427 l1.latch:
428 %secret2 = load i32, i32* %ptr1
429 %ptr2.idx2 = getelementptr i32, i32* %ptr2, i32 %secret2
430 %leak2 = load i32, i32* %ptr2.idx2
431 call void @sink(i32 %leak2)
432 %i.next = add i32 %i, 1
433 %i.cmp = icmp slt i32 %i.next, %b
434 br i1 %i.cmp, label %l1.header, label %exit
435
436 exit:
437 ret void
438 }
439
440 declare i32 @__gxx_personality_v0(...)
441
442 declare i8* @__cxa_allocate_exception(i64) local_unnamed_addr
443
444 declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr
445
446 define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) nounwind personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
447 ; X64-LABEL: test_basic_eh:
448 ; X64: # %bb.0: # %entry
449 ; X64-NEXT: pushq %rbp
450 ; X64-NEXT: pushq %r14
451 ; X64-NEXT: pushq %rbx
452 ; X64-NEXT: movq %rsp, %rax
453 ; X64-NEXT: movq $-1, %rcx
454 ; X64-NEXT: sarq $63, %rax
455 ; X64-NEXT: cmpl $41, %edi
456 ; X64-NEXT: jg .LBB3_1
457 ; X64-NEXT: # %bb.2: # %thrower
458 ; X64-NEXT: movq %rdx, %r14
459 ; X64-NEXT: movq %rsi, %rbx
460 ; X64-NEXT: cmovgq %rcx, %rax
461 ; X64-NEXT: movslq %edi, %rcx
462 ; X64-NEXT: movl (%rsi,%rcx,4), %ebp
463 ; X64-NEXT: orl %eax, %ebp
464 ; X64-NEXT: movl $4, %edi
465 ; X64-NEXT: shlq $47, %rax
466 ; X64-NEXT: orq %rax, %rsp
467 ; X64-NEXT: callq __cxa_allocate_exception
468 ; X64-NEXT: movq %rsp, %rcx
469 ; X64-NEXT: sarq $63, %rcx
470 ; X64-NEXT: movl %ebp, (%rax)
471 ; X64-NEXT: .Ltmp0:
472 ; X64-NEXT: xorl %esi, %esi
473 ; X64-NEXT: xorl %edx, %edx
474 ; X64-NEXT: shlq $47, %rcx
475 ; X64-NEXT: movq %rax, %rdi
476 ; X64-NEXT: orq %rcx, %rsp
477 ; X64-NEXT: callq __cxa_throw
478 ; X64-NEXT: movq %rsp, %rax
479 ; X64-NEXT: sarq $63, %rax
480 ; X64-NEXT: .Ltmp1:
481 ; X64-NEXT: jmp .LBB3_3
482 ; X64-NEXT: .LBB3_1:
483 ; X64-NEXT: cmovleq %rcx, %rax
484 ; X64-NEXT: .LBB3_3: # %exit
485 ; X64-NEXT: shlq $47, %rax
486 ; X64-NEXT: orq %rax, %rsp
487 ; X64-NEXT: popq %rbx
488 ; X64-NEXT: popq %r14
489 ; X64-NEXT: popq %rbp
490 ; X64-NEXT: retq
491 ; X64-NEXT: .LBB3_4: # %lpad
492 ; X64-NEXT: .Ltmp2:
493 ; X64-NEXT: movq %rsp, %rcx
494 ; X64-NEXT: sarq $63, %rcx
495 ; X64-NEXT: movl (%rax), %eax
496 ; X64-NEXT: addl (%rbx), %eax
497 ; X64-NEXT: orl %ecx, %eax
498 ; X64-NEXT: cltq
499 ; X64-NEXT: movl (%r14,%rax,4), %edi
500 ; X64-NEXT: orl %ecx, %edi
501 ; X64-NEXT: shlq $47, %rcx
502 ; X64-NEXT: orq %rcx, %rsp
503 ; X64-NEXT: callq sink
504 ; X64-NEXT: movq %rsp, %rax
505 ; X64-NEXT: sarq $63, %rax
506 ;
507 ; X64-LFENCE-LABEL: test_basic_eh:
508 ; X64-LFENCE: # %bb.0: # %entry
509 ; X64-LFENCE-NEXT: pushq %rbp
510 ; X64-LFENCE-NEXT: pushq %r14
511 ; X64-LFENCE-NEXT: pushq %rbx
512 ; X64-LFENCE-NEXT: cmpl $41, %edi
513 ; X64-LFENCE-NEXT: jg .LBB3_2
514 ; X64-LFENCE-NEXT: # %bb.1: # %thrower
515 ; X64-LFENCE-NEXT: movq %rdx, %r14
516 ; X64-LFENCE-NEXT: movq %rsi, %rbx
517 ; X64-LFENCE-NEXT: lfence
518 ; X64-LFENCE-NEXT: movslq %edi, %rax
519 ; X64-LFENCE-NEXT: movl (%rsi,%rax,4), %ebp
520 ; X64-LFENCE-NEXT: movl $4, %edi
521 ; X64-LFENCE-NEXT: callq __cxa_allocate_exception
522 ; X64-LFENCE-NEXT: movl %ebp, (%rax)
523 ; X64-LFENCE-NEXT: .Ltmp0:
524 ; X64-LFENCE-NEXT: xorl %esi, %esi
525 ; X64-LFENCE-NEXT: xorl %edx, %edx
526 ; X64-LFENCE-NEXT: movq %rax, %rdi
527 ; X64-LFENCE-NEXT: callq __cxa_throw
528 ; X64-LFENCE-NEXT: .Ltmp1:
529 ; X64-LFENCE-NEXT: .LBB3_2: # %exit
530 ; X64-LFENCE-NEXT: lfence
531 ; X64-LFENCE-NEXT: popq %rbx
532 ; X64-LFENCE-NEXT: popq %r14
533 ; X64-LFENCE-NEXT: popq %rbp
534 ; X64-LFENCE-NEXT: retq
535 ; X64-LFENCE-NEXT: .LBB3_3: # %lpad
536 ; X64-LFENCE-NEXT: .Ltmp2:
537 ; X64-LFENCE-NEXT: movl (%rax), %eax
538 ; X64-LFENCE-NEXT: addl (%rbx), %eax
539 ; X64-LFENCE-NEXT: cltq
540 ; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi
541 ; X64-LFENCE-NEXT: callq sink
542 entry:
543 %a.cmp = icmp slt i32 %a, 42
544 br i1 %a.cmp, label %thrower, label %exit
545
546 thrower:
547 %badidx = getelementptr i32, i32* %ptr1, i32 %a
548 %secret1 = load i32, i32* %badidx
549 %e.ptr = call i8* @__cxa_allocate_exception(i64 4)
550 %e.ptr.cast = bitcast i8* %e.ptr to i32*
551 store i32 %secret1, i32* %e.ptr.cast
552 invoke void @__cxa_throw(i8* %e.ptr, i8* null, i8* null)
553 to label %exit unwind label %lpad
554
555 exit:
556 ret void
557
558 lpad:
559 %e = landingpad { i8*, i32 }
560 catch i8* null
561 %e.catch.ptr = extractvalue { i8*, i32 } %e, 0
562 %e.catch.ptr.cast = bitcast i8* %e.catch.ptr to i32*
563 %secret1.catch = load i32, i32* %e.catch.ptr.cast
564 %secret2 = load i32, i32* %ptr1
565 %secret.sum = add i32 %secret1.catch, %secret2
566 %ptr2.idx = getelementptr i32, i32* %ptr2, i32 %secret.sum
567 %leak = load i32, i32* %ptr2.idx
568 call void @sink(i32 %leak)
569 unreachable
570 }