llvm.org GIT mirror llvm / fc177d3
Revert r325128 ("[X86] Reduce Store Forward Block issues in HW"). This is causing miscompiles in some situations. See the llvm-commits thread for the commit for details.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@325852 91177308-0d34-0410-b5e6-96231b3b80d8
Richard Smith, 2 years ago
5 changed file(s) with 0 addition(s) and 2511 deletion(s).
30 30 X86FastISel.cpp
31 31 X86FixupBWInsts.cpp
32 32 X86FixupLEAs.cpp
33    X86FixupSFB.cpp
34 33 X86FixupSetCC.cpp
35 34 X86FloatingPoint.cpp
36 35 X86FrameLowering.cpp
69 69 /// Return a pass that transforms setcc + movzx pairs into xor + setcc.
70 70 FunctionPass *createX86FixupSetCC();
71 71
72    /// Return a pass that avoids creating store forward block issues in the hardware.
73    FunctionPass *createX86FixupSFB();
74
75 72 /// Return a pass that expands WinAlloca pseudo-instructions.
76 73 FunctionPass *createX86WinAllocaExpander();
77 74
+0
-580
lib/Target/X86/X86FixupSFB.cpp
0 //===- X86FixupSFB.cpp - Avoid HW Store Forward Block issues -----------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // If a load follows a store and reloads data that the store has written to
10 // memory, Intel microarchitectures can in many cases forward the data directly
11 // from the store to the load. This "store forwarding" saves cycles by enabling
12 // the load to obtain the data directly instead of accessing it from
13 // cache or memory.
14 // A "store forward block" occurs when a store cannot be forwarded to the
15 // load. The most typical case on Intel Core microarchitectures is when a
16 // small store cannot be forwarded to a larger load.
17 // The estimated penalty for a store forward block is ~13 cycles.
18 //
19 // This pass tries to recognize and handle cases where a "store forward
20 // block" is created by the compiler when lowering memcpy calls to a
21 // sequence of a load and a store.
22 //
23 // The pass currently only handles cases where the memcpy is lowered to
24 // XMM/YMM registers; it tries to break the memcpy into smaller copies.
25 // Breaking the memcpy should be legal since there is no atomicity
26 // guarantee for loads and stores to XMM/YMM registers.
27 //
28 // It could be better for performance to solve the problem by loading to
29 // XMM/YMM, inserting the partial store, and then storing back to memory,
30 // but this would be a more conservative optimization since it requires
31 // proving that all memory accesses between the blocking store and the load
32 // must alias/don't alias before we can move the store, whereas the
33 // transformation done here is correct regardless of other memory accesses.
34 //===----------------------------------------------------------------------===//
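// Editorial illustration (not part of the original file): a minimal C++
// sketch of the pattern this pass targets, modeled on the tests removed
// below. The 4-byte store to src->b is followed by a 16-byte vector copy of
// *src lowered from memcpy, so the hardware cannot forward the small store
// into the wide load:
//
//   struct S { int a, b, c, d; };
//   void copy(S *dst, S *src, int x) {
//     src->b = x;                              // small blocking store
//     __builtin_memcpy(dst, src, sizeof(S));   // wide load reloads src->b
//   }
//
// After this pass the wide copy is re-emitted as narrower loads and stores
// (see the CHECK lines in the removed tests), so each reload is no wider
// than the store that feeds it.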
35
36 #include "X86InstrInfo.h"
37 #include "X86Subtarget.h"
38 #include "llvm/CodeGen/MachineBasicBlock.h"
39 #include "llvm/CodeGen/MachineFunction.h"
40 #include "llvm/CodeGen/MachineFunctionPass.h"
41 #include "llvm/CodeGen/MachineInstr.h"
42 #include "llvm/CodeGen/MachineInstrBuilder.h"
43 #include "llvm/CodeGen/MachineOperand.h"
44 #include "llvm/CodeGen/MachineRegisterInfo.h"
45 #include "llvm/IR/DebugInfoMetadata.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/IR/Function.h"
48 #include "llvm/MC/MCInstrDesc.h"
49
50 using namespace llvm;
51
52 #define DEBUG_TYPE "x86-fixup-SFB"
53
54 static cl::opt<bool> DisableX86FixupSFB("disable-fixup-SFB", cl::Hidden,
55                                         cl::desc("X86: Disable SFB fixup."),
56                                         cl::init(false));
57 namespace {
58
59 class FixupSFBPass : public MachineFunctionPass {
60 public:
61 FixupSFBPass() : MachineFunctionPass(ID) {}
62
63 StringRef getPassName() const override {
64 return "X86 Fixup Store Forward Block";
65 }
66
67 bool runOnMachineFunction(MachineFunction &MF) override;
68
69 private:
70 MachineRegisterInfo *MRI;
71 const X86InstrInfo *TII;
72 const X86RegisterInfo *TRI;
73 SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2> BlockedLoadsStores;
74 SmallVector<MachineInstr *, 2> ForRemoval;
75 bool Is64Bit;
76
77 /// \brief Returns pairs of a load followed by a store to memory which
78 /// look like a memcpy.
79 void findPotentiallyBlockedCopies(MachineFunction &MF);
80 /// \brief Break the memcpy's load and store into smaller copies
81 /// such that each memory load that was blocked by a smaller store
82 /// would now be copied separately.
83 void
84 breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
85                    const std::map<int64_t, unsigned> &BlockingStoresDisp);
86 /// \brief Break a copy of size Size to smaller copies.
87 void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
88 MachineInstr *StoreInst, int64_t StDispImm,
89 int64_t LMMOffset, int64_t SMMOffset);
90
91 void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
92 MachineInstr *StoreInst, unsigned NStoreOpcode,
93 int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
94 int64_t SMMOffset);
95
96 unsigned getRegSizeInBytes(MachineInstr *Inst);
97 static char ID;
98 };
99
100 } // end anonymous namespace
101
102 char FixupSFBPass::ID = 0;
103
104 FunctionPass *llvm::createX86FixupSFB() { return new FixupSFBPass(); }
105
106 static bool isXMMLoadOpcode(unsigned Opcode) {
107 return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
108 Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
109 Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
110 Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
111 Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
112 Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
113 Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
114 Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
115 }
116 static bool isYMMLoadOpcode(unsigned Opcode) {
117 return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
118 Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
119 Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
120 Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
121 Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
122 Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
123 Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
124 }
125
126 static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
127 return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
128 }
129
130 std::map<unsigned, std::pair<unsigned, unsigned>> PotentialBlockedMemCpy{
131 {X86::MOVUPSrm, {X86::MOVUPSmr, X86::MOVAPSmr}},
132 {X86::MOVAPSrm, {X86::MOVUPSmr, X86::MOVAPSmr}},
133 {X86::VMOVUPSrm, {X86::VMOVUPSmr, X86::VMOVAPSmr}},
134 {X86::VMOVAPSrm, {X86::VMOVUPSmr, X86::VMOVAPSmr}},
135 {X86::VMOVUPDrm, {X86::VMOVUPDmr, X86::VMOVAPDmr}},
136 {X86::VMOVAPDrm, {X86::VMOVUPDmr, X86::VMOVAPDmr}},
137 {X86::VMOVDQUrm, {X86::VMOVDQUmr, X86::VMOVDQAmr}},
138 {X86::VMOVDQArm, {X86::VMOVDQUmr, X86::VMOVDQAmr}},
139 {X86::VMOVUPSZ128rm, {X86::VMOVUPSZ128mr, X86::VMOVAPSZ128mr}},
140 {X86::VMOVAPSZ128rm, {X86::VMOVUPSZ128mr, X86::VMOVAPSZ128mr}},
141 {X86::VMOVUPDZ128rm, {X86::VMOVUPDZ128mr, X86::VMOVAPDZ128mr}},
142 {X86::VMOVAPDZ128rm, {X86::VMOVUPDZ128mr, X86::VMOVAPDZ128mr}},
143 {X86::VMOVUPSYrm, {X86::VMOVUPSYmr, X86::VMOVAPSYmr}},
144 {X86::VMOVAPSYrm, {X86::VMOVUPSYmr, X86::VMOVAPSYmr}},
145 {X86::VMOVUPDYrm, {X86::VMOVUPDYmr, X86::VMOVAPDYmr}},
146 {X86::VMOVAPDYrm, {X86::VMOVUPDYmr, X86::VMOVAPDYmr}},
147 {X86::VMOVDQUYrm, {X86::VMOVDQUYmr, X86::VMOVDQAYmr}},
148 {X86::VMOVDQAYrm, {X86::VMOVDQUYmr, X86::VMOVDQAYmr}},
149 {X86::VMOVUPSZ256rm, {X86::VMOVUPSZ256mr, X86::VMOVAPSZ256mr}},
150 {X86::VMOVAPSZ256rm, {X86::VMOVUPSZ256mr, X86::VMOVAPSZ256mr}},
151 {X86::VMOVUPDZ256rm, {X86::VMOVUPDZ256mr, X86::VMOVAPDZ256mr}},
152 {X86::VMOVAPDZ256rm, {X86::VMOVUPDZ256mr, X86::VMOVAPDZ256mr}},
153 {X86::VMOVDQU64Z128rm, {X86::VMOVDQU64Z128mr, X86::VMOVDQA64Z128mr}},
154 {X86::VMOVDQA64Z128rm, {X86::VMOVDQU64Z128mr, X86::VMOVDQA64Z128mr}},
155 {X86::VMOVDQU32Z128rm, {X86::VMOVDQU32Z128mr, X86::VMOVDQA32Z128mr}},
156 {X86::VMOVDQA32Z128rm, {X86::VMOVDQU32Z128mr, X86::VMOVDQA32Z128mr}},
157 {X86::VMOVDQU64Z256rm, {X86::VMOVDQU64Z256mr, X86::VMOVDQA64Z256mr}},
158 {X86::VMOVDQA64Z256rm, {X86::VMOVDQU64Z256mr, X86::VMOVDQA64Z256mr}},
159 {X86::VMOVDQU32Z256rm, {X86::VMOVDQU32Z256mr, X86::VMOVDQA32Z256mr}},
160 {X86::VMOVDQA32Z256rm, {X86::VMOVDQU32Z256mr, X86::VMOVDQA32Z256mr}},
161 };
162
163 static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
164 auto PotentialStores = PotentialBlockedMemCpy.at(LdOpcode);
165 return PotentialStores.first == StOpcode ||
166 PotentialStores.second == StOpcode;
167 }
168
169 static bool isPotentialBlockingStoreInst(int Opcode, int LoadOpcode) {
170 bool PBlock = false;
171 PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
172 Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
173 Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
174 Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
175 if (isYMMLoadOpcode(LoadOpcode))
176 PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
177 Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
178 Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
179 Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
180 Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
181 Opcode == X86::VMOVDQU64Z128mr ||
182 Opcode == X86::VMOVDQA64Z128mr ||
183 Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
184 return PBlock;
185 }
186
187 static const int MOV128SZ = 16;
188 static const int MOV64SZ = 8;
189 static const int MOV32SZ = 4;
190 static const int MOV16SZ = 2;
191 static const int MOV8SZ = 1;
192
193 std::map<unsigned, unsigned> YMMtoXMMLoadMap = {
194 {X86::VMOVUPSYrm, X86::VMOVUPSrm},
195 {X86::VMOVAPSYrm, X86::VMOVUPSrm},
196 {X86::VMOVUPDYrm, X86::VMOVUPDrm},
197 {X86::VMOVAPDYrm, X86::VMOVUPDrm},
198 {X86::VMOVDQUYrm, X86::VMOVDQUrm},
199 {X86::VMOVDQAYrm, X86::VMOVDQUrm},
200 {X86::VMOVUPSZ256rm, X86::VMOVUPSZ128rm},
201 {X86::VMOVAPSZ256rm, X86::VMOVUPSZ128rm},
202 {X86::VMOVUPDZ256rm, X86::VMOVUPDZ128rm},
203 {X86::VMOVAPDZ256rm, X86::VMOVUPDZ128rm},
204 {X86::VMOVDQU64Z256rm, X86::VMOVDQU64Z128rm},
205 {X86::VMOVDQA64Z256rm, X86::VMOVDQU64Z128rm},
206 {X86::VMOVDQU32Z256rm, X86::VMOVDQU32Z128rm},
207 {X86::VMOVDQA32Z256rm, X86::VMOVDQU32Z128rm},
208 };
209
210 std::map<unsigned, unsigned> YMMtoXMMStoreMap = {
211 {X86::VMOVUPSYmr, X86::VMOVUPSmr},
212 {X86::VMOVAPSYmr, X86::VMOVUPSmr},
213 {X86::VMOVUPDYmr, X86::VMOVUPDmr},
214 {X86::VMOVAPDYmr, X86::VMOVUPDmr},
215 {X86::VMOVDQUYmr, X86::VMOVDQUmr},
216 {X86::VMOVDQAYmr, X86::VMOVDQUmr},
217 {X86::VMOVUPSZ256mr, X86::VMOVUPSZ128mr},
218 {X86::VMOVAPSZ256mr, X86::VMOVUPSZ128mr},
219 {X86::VMOVUPDZ256mr, X86::VMOVUPDZ128mr},
220 {X86::VMOVAPDZ256mr, X86::VMOVUPDZ128mr},
221 {X86::VMOVDQU64Z256mr, X86::VMOVDQU64Z128mr},
222 {X86::VMOVDQA64Z256mr, X86::VMOVDQU64Z128mr},
223 {X86::VMOVDQU32Z256mr, X86::VMOVDQU32Z128mr},
224 {X86::VMOVDQA32Z256mr, X86::VMOVDQU32Z128mr},
225 };
226
227 static int getAddrOffset(MachineInstr *MI) {
228 const MCInstrDesc &Descl = MI->getDesc();
229 int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
230 assert(AddrOffset != -1 && "Expected Memory Operand");
231 AddrOffset += X86II::getOperandBias(Descl);
232 return AddrOffset;
233 }
234
235 static MachineOperand &getBaseOperand(MachineInstr *MI) {
236 int AddrOffset = getAddrOffset(MI);
237 return MI->getOperand(AddrOffset + X86::AddrBaseReg);
238 }
239
240 static MachineOperand &getDispOperand(MachineInstr *MI) {
241 int AddrOffset = getAddrOffset(MI);
242 return MI->getOperand(AddrOffset + X86::AddrDisp);
243 }
244
245 // Relevant addressing modes contain only a base register plus immediate
246 // displacement, or a frame index plus immediate displacement.
247 // TODO: Consider expanding to other addressing modes in the future
248 static bool isRelevantAddressingMode(MachineInstr *MI) {
249 int AddrOffset = getAddrOffset(MI);
250 MachineOperand &Base = MI->getOperand(AddrOffset + X86::AddrBaseReg);
251 MachineOperand &Disp = MI->getOperand(AddrOffset + X86::AddrDisp);
252 MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
253 MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
254 MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);
255
256 if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
257 return false;
258 if (!Disp.isImm())
259 return false;
260 if (Scale.getImm() != 1)
261 return false;
262 if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
263 return false;
264 if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
265 return false;
266 return true;
267 }
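// Editorial example: for `movl 12(%ecx), %edx` the memory operand decomposes
// as Base=%ecx, Scale=1, Index=%noreg, Disp=12, Segment=%noreg, which this
// predicate accepts; an indexed form such as `movl (%ecx,%esi,4), %edx` is
// rejected because Index is a real register.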
268
269 // Collect potentially blocking stores.
270 // Limit the number of instructions we inspect backwards, since the
271 // effect of a store forward block won't be visible if the store and
272 // load instructions have enough instructions in between to keep the
273 // core busy.
274 static const unsigned LIMIT = 20;
275 static SmallVector<MachineInstr *, 2>
276 findPotentialBlockers(MachineInstr *LoadInst) {
277   SmallVector<MachineInstr *, 2> PotentialBlockers;
278 unsigned BlockLimit = 0;
279 for (MachineBasicBlock::iterator LI = LoadInst,
280 BB = LoadInst->getParent()->begin();
281 LI != BB; --LI) {
282 BlockLimit++;
283 if (BlockLimit >= LIMIT)
284 break;
285 MachineInstr &MI = *LI;
286 if (MI.getDesc().isCall())
287 break;
288 PotentialBlockers.push_back(&MI);
289 }
290 // If we didn't reach the instruction limit, try the predecessor blocks.
291 // Ideally we should traverse the predecessor blocks in depth with some
292 // coloring algorithm, but for now let's just look at the first order
293 // predecessors.
294 if (BlockLimit < LIMIT) {
295 MachineBasicBlock *MBB = LoadInst->getParent();
296 int LimitLeft = LIMIT - BlockLimit;
297 for (MachineBasicBlock::pred_iterator PB = MBB->pred_begin(),
298 PE = MBB->pred_end();
299 PB != PE; ++PB) {
300 MachineBasicBlock *PMBB = *PB;
301 int PredLimit = 0;
302 for (MachineBasicBlock::reverse_iterator PMI = PMBB->rbegin(),
303 PME = PMBB->rend();
304 PMI != PME; ++PMI) {
305 PredLimit++;
306 if (PredLimit >= LimitLeft)
307 break;
308 if (PMI->getDesc().isCall())
309 break;
310 PotentialBlockers.push_back(&*PMI);
311 }
312 }
313 }
314 return PotentialBlockers;
315 }
316
317 void FixupSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
318 int64_t LoadDisp, MachineInstr *StoreInst,
319 unsigned NStoreOpcode, int64_t StoreDisp,
320 unsigned Size, int64_t LMMOffset,
321 int64_t SMMOffset) {
322 MachineOperand &LoadBase = getBaseOperand(LoadInst);
323 MachineOperand &StoreBase = getBaseOperand(StoreInst);
324 MachineBasicBlock *MBB = LoadInst->getParent();
325 MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
326 MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
327
328 unsigned Reg1 = MRI->createVirtualRegister(
329 TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
330 BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode), Reg1)
331 .add(LoadBase)
332 .addImm(1)
333 .addReg(X86::NoRegister)
334 .addImm(LoadDisp)
335 .addReg(X86::NoRegister)
336 .addMemOperand(
337 MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
338 DEBUG(LoadInst->getPrevNode()->dump());
339 // If the load and store are consecutive, use the loadInst location to
340 // reduce register pressure.
341 MachineInstr *StInst = StoreInst;
342 if (StoreInst->getPrevNode() == LoadInst)
343 StInst = LoadInst;
344 BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
345 .add(StoreBase)
346 .addImm(1)
347 .addReg(X86::NoRegister)
348 .addImm(StoreDisp)
349 .addReg(X86::NoRegister)
350 .addReg(Reg1)
351 .addMemOperand(
352 MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
353 DEBUG(StInst->getPrevNode()->dump());
354 }
355
356 void FixupSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
357 int64_t LdDispImm, MachineInstr *StoreInst,
358 int64_t StDispImm, int64_t LMMOffset,
359 int64_t SMMOffset) {
360 int LdDisp = LdDispImm;
361 int StDisp = StDispImm;
362 while (Size > 0) {
363 if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
364 Size = Size - MOV128SZ;
365 buildCopy(LoadInst, YMMtoXMMLoadMap.at(LoadInst->getOpcode()), LdDisp,
366 StoreInst, YMMtoXMMStoreMap.at(StoreInst->getOpcode()), StDisp,
367 MOV128SZ, LMMOffset, SMMOffset);
368 LdDisp += MOV128SZ;
369 StDisp += MOV128SZ;
370 LMMOffset += MOV128SZ;
371 SMMOffset += MOV128SZ;
372 continue;
373 }
374 if (Size - MOV64SZ >= 0 && Is64Bit) {
375 Size = Size - MOV64SZ;
376 buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
377 MOV64SZ, LMMOffset, SMMOffset);
378 LdDisp += MOV64SZ;
379 StDisp += MOV64SZ;
380 LMMOffset += MOV64SZ;
381 SMMOffset += MOV64SZ;
382 continue;
383 }
384 if (Size - MOV32SZ >= 0) {
385 Size = Size - MOV32SZ;
386 buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
387 MOV32SZ, LMMOffset, SMMOffset);
388 LdDisp += MOV32SZ;
389 StDisp += MOV32SZ;
390 LMMOffset += MOV32SZ;
391 SMMOffset += MOV32SZ;
392 continue;
393 }
394 if (Size - MOV16SZ >= 0) {
395 Size = Size - MOV16SZ;
396 buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
397 MOV16SZ, LMMOffset, SMMOffset);
398 LdDisp += MOV16SZ;
399 StDisp += MOV16SZ;
400 LMMOffset += MOV16SZ;
401 SMMOffset += MOV16SZ;
402 continue;
403 }
404 if (Size - MOV8SZ >= 0) {
405 Size = Size - MOV8SZ;
406 buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
407 MOV8SZ, LMMOffset, SMMOffset);
408 LdDisp += MOV8SZ;
409 StDisp += MOV8SZ;
410 LMMOffset += MOV8SZ;
411 SMMOffset += MOV8SZ;
412 continue;
413 }
414 }
415 assert(Size == 0 && "Wrong size division");
416 }
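// Editorial sketch (a standalone model, not code from this file): the loop
// above is a greedy change-making over the available move widths.
//
//   #include <vector>
//   static std::vector<int> splitCopy(int Size, bool IsYMM, bool Is64Bit) {
//     std::vector<int> Chunks;
//     while (Size > 0) {
//       int W = (IsYMM && Size >= 16)  ? 16
//             : (Is64Bit && Size >= 8) ? 8
//             : (Size >= 4)            ? 4
//             : (Size >= 2)            ? 2
//                                      : 1;
//       Chunks.push_back(W); // one MOV of width W
//       Size -= W;
//     }
//     return Chunks; // e.g. splitCopy(13, false, true) == {8, 4, 1}
//   }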
417
418 static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
419 MachineOperand &LoadBase = getBaseOperand(LoadInst);
420 MachineOperand &StoreBase = getBaseOperand(StoreInst);
421 if (LoadBase.isReg()) {
422 MachineInstr *LastLoad = LoadInst->getPrevNode();
423 // If the original load and store to xmm/ymm were consecutive
424 // then the partial copies were also created in
425 // a consecutive order to reduce register pressure,
426 // and the location of the last load is before the last store.
427 if (StoreInst->getPrevNode() == LoadInst)
428 LastLoad = LoadInst->getPrevNode()->getPrevNode();
429 getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
430 }
431 if (StoreBase.isReg()) {
432 MachineInstr *StInst = StoreInst;
433 if (StoreInst->getPrevNode() == LoadInst)
434 StInst = LoadInst;
435 getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
436 }
437 }
438
439 void FixupSFBPass::findPotentiallyBlockedCopies(MachineFunction &MF) {
440 for (auto &MBB : MF)
441 for (auto &MI : MBB)
442 if (isPotentialBlockedMemCpyLd(MI.getOpcode())) {
443 int DefVR = MI.getOperand(0).getReg();
444 if (MRI->hasOneUse(DefVR))
445 for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
446 UI != UE;) {
447 MachineOperand &StoreMO = *UI++;
448 MachineInstr &StoreMI = *StoreMO.getParent();
449 if (isPotentialBlockedMemCpyPair(MI.getOpcode(),
450 StoreMI.getOpcode()) &&
451 (StoreMI.getParent() == MI.getParent()))
452 if (isRelevantAddressingMode(&MI) &&
453 isRelevantAddressingMode(&StoreMI))
454 BlockedLoadsStores.push_back(
455     std::pair<MachineInstr *, MachineInstr *>(&MI, &StoreMI));
456 }
457 }
458 }
459 unsigned FixupSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
460 auto TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
461 *LoadInst->getParent()->getParent());
462 return TRI->getRegSizeInBits(*TRC) / 8;
463 }
464
465 void FixupSFBPass::breakBlockedCopies(
466     MachineInstr *LoadInst, MachineInstr *StoreInst,
467     const std::map<int64_t, unsigned> &BlockingStoresDisp) {
468 int64_t LdDispImm = getDispOperand(LoadInst).getImm();
469 int64_t StDispImm = getDispOperand(StoreInst).getImm();
470 int64_t LMMOffset = (*LoadInst->memoperands_begin())->getOffset();
471 int64_t SMMOffset = (*StoreInst->memoperands_begin())->getOffset();
472
473 int64_t LdDisp1 = LdDispImm;
474 int64_t LdDisp2 = 0;
475 int64_t StDisp1 = StDispImm;
476 int64_t StDisp2 = 0;
477 unsigned Size1 = 0;
478 unsigned Size2 = 0;
479 int64_t LdStDelta = StDispImm - LdDispImm;
480 for (auto inst : BlockingStoresDisp) {
481 LdDisp2 = inst.first;
482 StDisp2 = inst.first + LdStDelta;
483 Size1 = std::abs(std::abs(LdDisp2) - std::abs(LdDisp1));
484 Size2 = inst.second;
485 buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
486 SMMOffset);
487 buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
488 SMMOffset + Size1);
489 LdDisp1 = LdDisp2 + Size2;
490 StDisp1 = StDisp2 + Size2;
491 LMMOffset += Size1 + Size2;
492 SMMOffset += Size1 + Size2;
493 }
494 unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
495 buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
496 LMMOffset);
497 }
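// Editorial worked example of the interval arithmetic above: a 16-byte load
// at displacement 0 blocked by a 4-byte store at displacement 4 gives
// BlockingStoresDisp = {{4, 4}}. The single iteration copies the prefix
// [0,4) (Size1 = 4) and the blocked region [4,8) (Size2 = 4), and the final
// buildCopies call covers the tail [8,16) (Size3 = (0 + 16) - 8 = 8), so no
// remaining reload overlaps the small store partially.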
498
499 bool FixupSFBPass::runOnMachineFunction(MachineFunction &MF) {
500 bool Changed = false;
501
502 if (DisableX86FixupSFB || skipFunction(MF.getFunction()))
503 return false;
504
505 MRI = &MF.getRegInfo();
506 assert(MRI->isSSA() && "Expected MIR to be in SSA form");
507 TII = MF.getSubtarget().getInstrInfo();
508 TRI = MF.getSubtarget().getRegisterInfo();
509 Is64Bit = MF.getSubtarget().is64Bit();
510 DEBUG(dbgs() << "Start X86FixupSFB\n";);
511 // Look for a load then a store to XMM/YMM which look like a memcpy.
512 findPotentiallyBlockedCopies(MF);
513
514 for (auto LoadStoreInst : BlockedLoadsStores) {
515 MachineInstr *LoadInst = LoadStoreInst.first;
516 SmallVector<MachineInstr *, 2> PotentialBlockers =
517     findPotentialBlockers(LoadInst);
518
519 MachineOperand &LoadBase = getBaseOperand(LoadInst);
520 int64_t LdDispImm = getDispOperand(LoadInst).getImm();
521 std::map<int64_t, unsigned> BlockingStoresDisp;
522 int LdBaseReg = LoadBase.isReg() ? LoadBase.getReg() : LoadBase.getIndex();
523
524 for (auto PBInst : PotentialBlockers) {
525 if (isPotentialBlockingStoreInst(PBInst->getOpcode(),
526 LoadInst->getOpcode())) {
527 if (!isRelevantAddressingMode(PBInst))
528 continue;
529 MachineOperand &PBstoreBase = getBaseOperand(PBInst);
530 int64_t PBstDispImm = getDispOperand(PBInst).getImm();
531 assert(PBInst->hasOneMemOperand() && "Expected One Memory Operand");
532 unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
533 int PBstBaseReg =
534 PBstoreBase.isReg() ? PBstoreBase.getReg() : PBstoreBase.getIndex();
535 // This check doesn't cover all cases, but it will suffice for now.
536 // TODO: take branch probability into consideration; if the blocking
537 // store is in a rarely executed block, breaking the memcpy could lose
538 // performance.
539 if (((LoadBase.isReg() && PBstoreBase.isReg()) ||
540 (LoadBase.isFI() && PBstoreBase.isFI())) &&
541 LdBaseReg == PBstBaseReg &&
542 ((PBstDispImm >= LdDispImm) &&
543 (PBstDispImm <=
544 LdDispImm + (getRegSizeInBytes(LoadInst) - PBstSize)))) {
545 if (BlockingStoresDisp.count(PBstDispImm)) {
546 if (BlockingStoresDisp[PBstDispImm] > PBstSize)
547 BlockingStoresDisp[PBstDispImm] = PBstSize;
548
549 } else
550 BlockingStoresDisp[PBstDispImm] = PBstSize;
551 }
552 }
553 }
554
555 if (BlockingStoresDisp.empty())
556 continue;
557
558 // We found a store forward block, break the memcpy's load and store
559 // into smaller copies such that each smaller store that was causing
560 // a store block would now be copied separately.
561 MachineInstr *StoreInst = LoadStoreInst.second;
562 DEBUG(dbgs() << "Blocked load and store instructions: \n");
563 DEBUG(LoadInst->dump());
564 DEBUG(StoreInst->dump());
565 DEBUG(dbgs() << "Replaced with:\n");
566 breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDisp);
567 updateKillStatus(LoadInst, StoreInst);
568 ForRemoval.push_back(LoadInst);
569 ForRemoval.push_back(StoreInst);
    Changed = true; // Editorial fix: record that the function was modified.
570 }
571 for (auto RemovedInst : ForRemoval) {
572 RemovedInst->eraseFromParent();
573 }
574 ForRemoval.clear();
575 BlockedLoadsStores.clear();
576 DEBUG(dbgs() << "End X86FixupSFB\n";);
577
578 return Changed;
579 }
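Editorial sketch: the blocking test in the loop above is equivalent to asking whether the store writes entirely inside the loaded range, since PBstDispImm <= LdDispImm + (RegSize - PBstSize) rearranges to StDisp + StSize <= LdDisp + LdSize. The standalone helper below is an assumption for illustration, not part of the original patch:

    // A store of StSize bytes at StDisp blocks a load of LdSize bytes at
    // LdDisp when the stored range lies fully inside the loaded range.
    static bool storeBlocksLoad(int64_t LdDisp, unsigned LdSize,
                                int64_t StDisp, unsigned StSize) {
      return StDisp >= LdDisp && StDisp + StSize <= LdDisp + LdSize;
    }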
448 448     addPass(createX86FixupSetCC());
449 449     addPass(createX86OptimizeLEAs());
450 450     addPass(createX86CallFrameOptimization());
451         addPass(createX86FixupSFB());
452 451   }
453 452
454 453   addPass(createX86WinAllocaExpander());
+0
-1926
test/CodeGen/X86/fixup-sfb-32.ll
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefix=CHECK
2 ; RUN: llc < %s -mtriple=i686-linux --disable-fixup-SFB | FileCheck %s --check-prefix=DISABLED
3 ; RUN: llc < %s -mtriple=i686-linux -mattr=+sse4.1 | FileCheck %s -check-prefix=CHECK-AVX2
4 ; RUN: llc < %s -mtriple=i686-linux -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s -check-prefix=CHECK-AVX512
5
6 %struct.S = type { i32, i32, i32, i32 }
7
8 ; Function Attrs: nounwind uwtable
9 define void @test_conditional_block(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4) local_unnamed_addr #0 {
10 ; CHECK-LABEL: test_conditional_block:
11 ; CHECK: # %bb.0: # %entry
12 ; CHECK-NEXT: pushl %edi
13 ; CHECK-NEXT: .cfi_def_cfa_offset 8
14 ; CHECK-NEXT: pushl %esi
15 ; CHECK-NEXT: .cfi_def_cfa_offset 12
16 ; CHECK-NEXT: .cfi_offset %esi, -12
17 ; CHECK-NEXT: .cfi_offset %edi, -8
18 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
19 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
20 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
21 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
22 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
23 ; CHECK-NEXT: cmpl $18, %edi
24 ; CHECK-NEXT: jl .LBB0_2
25 ; CHECK-NEXT: # %bb.1: # %if.then
26 ; CHECK-NEXT: movl %edi, 4(%ecx)
27 ; CHECK-NEXT: .LBB0_2: # %if.end
28 ; CHECK-NEXT: movups (%esi), %xmm0
29 ; CHECK-NEXT: movups %xmm0, (%edx)
30 ; CHECK-NEXT: movl (%ecx), %edx
31 ; CHECK-NEXT: movl %edx, (%eax)
32 ; CHECK-NEXT: movl 4(%ecx), %edx
33 ; CHECK-NEXT: movl %edx, 4(%eax)
34 ; CHECK-NEXT: movl 8(%ecx), %edx
35 ; CHECK-NEXT: movl %edx, 8(%eax)
36 ; CHECK-NEXT: movl 12(%ecx), %ecx
37 ; CHECK-NEXT: movl %ecx, 12(%eax)
38 ; CHECK-NEXT: popl %esi
39 ; CHECK-NEXT: popl %edi
40 ; CHECK-NEXT: retl
41 ;
42 ; DISABLED-LABEL: test_conditional_block:
43 ; DISABLED: # %bb.0: # %entry
44 ; DISABLED-NEXT: pushl %edi
45 ; DISABLED-NEXT: .cfi_def_cfa_offset 8
46 ; DISABLED-NEXT: pushl %esi
47 ; DISABLED-NEXT: .cfi_def_cfa_offset 12
48 ; DISABLED-NEXT: .cfi_offset %esi, -12
49 ; DISABLED-NEXT: .cfi_offset %edi, -8
50 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edx
51 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx
52 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edi
53 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
54 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %esi
55 ; DISABLED-NEXT: cmpl $18, %edi
56 ; DISABLED-NEXT: jl .LBB0_2
57 ; DISABLED-NEXT: # %bb.1: # %if.then
58 ; DISABLED-NEXT: movl %edi, 4(%esi)
59 ; DISABLED-NEXT: .LBB0_2: # %if.end
60 ; DISABLED-NEXT: movups (%edx), %xmm0
61 ; DISABLED-NEXT: movups %xmm0, (%ecx)
62 ; DISABLED-NEXT: movups (%esi), %xmm0
63 ; DISABLED-NEXT: movups %xmm0, (%eax)
64 ; DISABLED-NEXT: popl %esi
65 ; DISABLED-NEXT: popl %edi
66 ; DISABLED-NEXT: retl
67 ;
68 ; CHECK-AVX2-LABEL: test_conditional_block:
69 ; CHECK-AVX2: # %bb.0: # %entry
70 ; CHECK-AVX2-NEXT: pushl %edi
71 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
72 ; CHECK-AVX2-NEXT: pushl %esi
73 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 12
74 ; CHECK-AVX2-NEXT: .cfi_offset %esi, -12
75 ; CHECK-AVX2-NEXT: .cfi_offset %edi, -8
76 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
77 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
78 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edi
79 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
80 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
81 ; CHECK-AVX2-NEXT: cmpl $18, %edi
82 ; CHECK-AVX2-NEXT: jl .LBB0_2
83 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
84 ; CHECK-AVX2-NEXT: movl %edi, 4(%ecx)
85 ; CHECK-AVX2-NEXT: .LBB0_2: # %if.end
86 ; CHECK-AVX2-NEXT: movups (%esi), %xmm0
87 ; CHECK-AVX2-NEXT: movups %xmm0, (%edx)
88 ; CHECK-AVX2-NEXT: movl (%ecx), %edx
89 ; CHECK-AVX2-NEXT: movl %edx, (%eax)
90 ; CHECK-AVX2-NEXT: movl 4(%ecx), %edx
91 ; CHECK-AVX2-NEXT: movl %edx, 4(%eax)
92 ; CHECK-AVX2-NEXT: movl 8(%ecx), %edx
93 ; CHECK-AVX2-NEXT: movl %edx, 8(%eax)
94 ; CHECK-AVX2-NEXT: movl 12(%ecx), %ecx
95 ; CHECK-AVX2-NEXT: movl %ecx, 12(%eax)
96 ; CHECK-AVX2-NEXT: popl %esi
97 ; CHECK-AVX2-NEXT: popl %edi
98 ; CHECK-AVX2-NEXT: retl
99 ;
100 ; CHECK-AVX512-LABEL: test_conditional_block:
101 ; CHECK-AVX512: # %bb.0: # %entry
102 ; CHECK-AVX512-NEXT: pushl %edi
103 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 8
104 ; CHECK-AVX512-NEXT: pushl %esi
105 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 12
106 ; CHECK-AVX512-NEXT: .cfi_offset %esi, -12
107 ; CHECK-AVX512-NEXT: .cfi_offset %edi, -8
108 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi
109 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
110 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edi
111 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
112 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
113 ; CHECK-AVX512-NEXT: cmpl $18, %edi
114 ; CHECK-AVX512-NEXT: jl .LBB0_2
115 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
116 ; CHECK-AVX512-NEXT: movl %edi, 4(%ecx)
117 ; CHECK-AVX512-NEXT: .LBB0_2: # %if.end
118 ; CHECK-AVX512-NEXT: vmovups (%esi), %xmm0
119 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%edx)
120 ; CHECK-AVX512-NEXT: movl (%ecx), %edx
121 ; CHECK-AVX512-NEXT: movl %edx, (%eax)
122 ; CHECK-AVX512-NEXT: movl 4(%ecx), %edx
123 ; CHECK-AVX512-NEXT: movl %edx, 4(%eax)
124 ; CHECK-AVX512-NEXT: movl 8(%ecx), %edx
125 ; CHECK-AVX512-NEXT: movl %edx, 8(%eax)
126 ; CHECK-AVX512-NEXT: movl 12(%ecx), %ecx
127 ; CHECK-AVX512-NEXT: movl %ecx, 12(%eax)
128 ; CHECK-AVX512-NEXT: popl %esi
129 ; CHECK-AVX512-NEXT: popl %edi
130 ; CHECK-AVX512-NEXT: retl
131 entry:
132 %cmp = icmp sgt i32 %x, 17
133 br i1 %cmp, label %if.then, label %if.end
134
135 if.then: ; preds = %entry
136 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
137 store i32 %x, i32* %b, align 4
138 br label %if.end
139
140 if.end: ; preds = %if.then, %entry
141 %0 = bitcast %struct.S* %s3 to i8*
142 %1 = bitcast %struct.S* %s4 to i8*
143 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
144 %2 = bitcast %struct.S* %s2 to i8*
145 %3 = bitcast %struct.S* %s1 to i8*
146 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
147 ret void
148 }
149
150 ; Function Attrs: nounwind uwtable
151 define void @test_imm_store(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3) local_unnamed_addr #0 {
152 ; CHECK-LABEL: test_imm_store:
153 ; CHECK: # %bb.0: # %entry
154 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
155 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
156 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
157 ; CHECK-NEXT: movl $0, (%edx)
158 ; CHECK-NEXT: movl $1, (%ecx)
159 ; CHECK-NEXT: movl (%edx), %ecx
160 ; CHECK-NEXT: movl %ecx, (%eax)
161 ; CHECK-NEXT: movl 4(%edx), %ecx
162 ; CHECK-NEXT: movl %ecx, 4(%eax)
163 ; CHECK-NEXT: movl 8(%edx), %ecx
164 ; CHECK-NEXT: movl %ecx, 8(%eax)
165 ; CHECK-NEXT: movl 12(%edx), %ecx
166 ; CHECK-NEXT: movl %ecx, 12(%eax)
167 ; CHECK-NEXT: retl
168 ;
169 ; DISABLED-LABEL: test_imm_store:
170 ; DISABLED: # %bb.0: # %entry
171 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
172 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx
173 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edx
174 ; DISABLED-NEXT: movl $0, (%edx)
175 ; DISABLED-NEXT: movl $1, (%ecx)
176 ; DISABLED-NEXT: movups (%edx), %xmm0
177 ; DISABLED-NEXT: movups %xmm0, (%eax)
178 ; DISABLED-NEXT: retl
179 ;
180 ; CHECK-AVX2-LABEL: test_imm_store:
181 ; CHECK-AVX2: # %bb.0: # %entry
182 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
183 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
184 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
185 ; CHECK-AVX2-NEXT: movl $0, (%edx)
186 ; CHECK-AVX2-NEXT: movl $1, (%ecx)
187 ; CHECK-AVX2-NEXT: movl (%edx), %ecx
188 ; CHECK-AVX2-NEXT: movl %ecx, (%eax)
189 ; CHECK-AVX2-NEXT: movl 4(%edx), %ecx
190 ; CHECK-AVX2-NEXT: movl %ecx, 4(%eax)
191 ; CHECK-AVX2-NEXT: movl 8(%edx), %ecx
192 ; CHECK-AVX2-NEXT: movl %ecx, 8(%eax)
193 ; CHECK-AVX2-NEXT: movl 12(%edx), %ecx
194 ; CHECK-AVX2-NEXT: movl %ecx, 12(%eax)
195 ; CHECK-AVX2-NEXT: retl
196 ;
197 ; CHECK-AVX512-LABEL: test_imm_store:
198 ; CHECK-AVX512: # %bb.0: # %entry
199 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
200 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
201 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
202 ; CHECK-AVX512-NEXT: movl $0, (%edx)
203 ; CHECK-AVX512-NEXT: movl $1, (%ecx)
204 ; CHECK-AVX512-NEXT: movl (%edx), %ecx
205 ; CHECK-AVX512-NEXT: movl %ecx, (%eax)
206 ; CHECK-AVX512-NEXT: movl 4(%edx), %ecx
207 ; CHECK-AVX512-NEXT: movl %ecx, 4(%eax)
208 ; CHECK-AVX512-NEXT: movl 8(%edx), %ecx
209 ; CHECK-AVX512-NEXT: movl %ecx, 8(%eax)
210 ; CHECK-AVX512-NEXT: movl 12(%edx), %ecx
211 ; CHECK-AVX512-NEXT: movl %ecx, 12(%eax)
212 ; CHECK-AVX512-NEXT: retl
213 entry:
214 %a = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 0
215 store i32 0, i32* %a, align 4
216 %a1 = getelementptr inbounds %struct.S, %struct.S* %s3, i64 0, i32 0
217 store i32 1, i32* %a1, align 4
218 %0 = bitcast %struct.S* %s2 to i8*
219 %1 = bitcast %struct.S* %s1 to i8*
220 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
221 ret void
222 }
223
224 ; Function Attrs: nounwind uwtable
225 define void @test_nondirect_br(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
226 ; CHECK-LABEL: test_nondirect_br:
227 ; CHECK: # %bb.0: # %entry
228 ; CHECK-NEXT: pushl %edi
229 ; CHECK-NEXT: .cfi_def_cfa_offset 8
230 ; CHECK-NEXT: pushl %esi
231 ; CHECK-NEXT: .cfi_def_cfa_offset 12
232 ; CHECK-NEXT: .cfi_offset %esi, -12
233 ; CHECK-NEXT: .cfi_offset %edi, -8
234 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
235 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
236 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
237 ; CHECK-NEXT: cmpl $18, %ecx
238 ; CHECK-NEXT: jl .LBB2_2
239 ; CHECK-NEXT: # %bb.1: # %if.then
240 ; CHECK-NEXT: movl %ecx, 4(%eax)
241 ; CHECK-NEXT: .LBB2_2: # %if.end
242 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
243 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
244 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
245 ; CHECK-NEXT: cmpl $14, %edx
246 ; CHECK-NEXT: jl .LBB2_4
247 ; CHECK-NEXT: # %bb.3: # %if.then2
248 ; CHECK-NEXT: movl %edx, 12(%eax)
249 ; CHECK-NEXT: .LBB2_4: # %if.end3
250 ; CHECK-NEXT: movups (%edi), %xmm0
251 ; CHECK-NEXT: movups %xmm0, (%esi)
252 ; CHECK-NEXT: movl (%eax), %edx
253 ; CHECK-NEXT: movl %edx, (%ecx)
254 ; CHECK-NEXT: movl 4(%eax), %edx
255 ; CHECK-NEXT: movl %edx, 4(%ecx)
256 ; CHECK-NEXT: movl 8(%eax), %edx
257 ; CHECK-NEXT: movl %edx, 8(%ecx)
258 ; CHECK-NEXT: movl 12(%eax), %eax
259 ; CHECK-NEXT: movl %eax, 12(%ecx)
260 ; CHECK-NEXT: popl %esi
261 ; CHECK-NEXT: popl %edi
262 ; CHECK-NEXT: retl
263 ;
264 ; DISABLED-LABEL: test_nondirect_br:
265 ; DISABLED: # %bb.0: # %entry
266 ; DISABLED-NEXT: pushl %edi
267 ; DISABLED-NEXT: .cfi_def_cfa_offset 8
268 ; DISABLED-NEXT: pushl %esi
269 ; DISABLED-NEXT: .cfi_def_cfa_offset 12
270 ; DISABLED-NEXT: .cfi_offset %esi, -12
271 ; DISABLED-NEXT: .cfi_offset %edi, -8
272 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx
273 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edx
274 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
275 ; DISABLED-NEXT: cmpl $18, %edx
276 ; DISABLED-NEXT: jl .LBB2_2
277 ; DISABLED-NEXT: # %bb.1: # %if.then
278 ; DISABLED-NEXT: movl %edx, 4(%eax)
279 ; DISABLED-NEXT: .LBB2_2: # %if.end
280 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edi
281 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %esi
282 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edx
283 ; DISABLED-NEXT: cmpl $14, %ecx
284 ; DISABLED-NEXT: jl .LBB2_4
285 ; DISABLED-NEXT: # %bb.3: # %if.then2
286 ; DISABLED-NEXT: movl %ecx, 12(%eax)
287 ; DISABLED-NEXT: .LBB2_4: # %if.end3
288 ; DISABLED-NEXT: movups (%edi), %xmm0
289 ; DISABLED-NEXT: movups %xmm0, (%esi)
290 ; DISABLED-NEXT: movups (%eax), %xmm0
291 ; DISABLED-NEXT: movups %xmm0, (%edx)
292 ; DISABLED-NEXT: popl %esi
293 ; DISABLED-NEXT: popl %edi
294 ; DISABLED-NEXT: retl
295 ;
296 ; CHECK-AVX2-LABEL: test_nondirect_br:
297 ; CHECK-AVX2: # %bb.0: # %entry
298 ; CHECK-AVX2-NEXT: pushl %edi
299 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
300 ; CHECK-AVX2-NEXT: pushl %esi
301 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 12
302 ; CHECK-AVX2-NEXT: .cfi_offset %esi, -12
303 ; CHECK-AVX2-NEXT: .cfi_offset %edi, -8
304 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
305 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
306 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
307 ; CHECK-AVX2-NEXT: cmpl $18, %ecx
308 ; CHECK-AVX2-NEXT: jl .LBB2_2
309 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
310 ; CHECK-AVX2-NEXT: movl %ecx, 4(%eax)
311 ; CHECK-AVX2-NEXT: .LBB2_2: # %if.end
312 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edi
313 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
314 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
315 ; CHECK-AVX2-NEXT: cmpl $14, %edx
316 ; CHECK-AVX2-NEXT: jl .LBB2_4
317 ; CHECK-AVX2-NEXT: # %bb.3: # %if.then2
318 ; CHECK-AVX2-NEXT: movl %edx, 12(%eax)
319 ; CHECK-AVX2-NEXT: .LBB2_4: # %if.end3
320 ; CHECK-AVX2-NEXT: movups (%edi), %xmm0
321 ; CHECK-AVX2-NEXT: movups %xmm0, (%esi)
322 ; CHECK-AVX2-NEXT: movl (%eax), %edx
323 ; CHECK-AVX2-NEXT: movl %edx, (%ecx)
324 ; CHECK-AVX2-NEXT: movl 4(%eax), %edx
325 ; CHECK-AVX2-NEXT: movl %edx, 4(%ecx)
326 ; CHECK-AVX2-NEXT: movl 8(%eax), %edx
327 ; CHECK-AVX2-NEXT: movl %edx, 8(%ecx)
328 ; CHECK-AVX2-NEXT: movl 12(%eax), %eax
329 ; CHECK-AVX2-NEXT: movl %eax, 12(%ecx)
330 ; CHECK-AVX2-NEXT: popl %esi
331 ; CHECK-AVX2-NEXT: popl %edi
332 ; CHECK-AVX2-NEXT: retl
333 ;
334 ; CHECK-AVX512-LABEL: test_nondirect_br:
335 ; CHECK-AVX512: # %bb.0: # %entry
336 ; CHECK-AVX512-NEXT: pushl %edi
337 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 8
338 ; CHECK-AVX512-NEXT: pushl %esi
339 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 12
340 ; CHECK-AVX512-NEXT: .cfi_offset %esi, -12
341 ; CHECK-AVX512-NEXT: .cfi_offset %edi, -8
342 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
343 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
344 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
345 ; CHECK-AVX512-NEXT: cmpl $18, %ecx
346 ; CHECK-AVX512-NEXT: jl .LBB2_2
347 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
348 ; CHECK-AVX512-NEXT: movl %ecx, 4(%eax)
349 ; CHECK-AVX512-NEXT: .LBB2_2: # %if.end
350 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edi
351 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi
352 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
353 ; CHECK-AVX512-NEXT: cmpl $14, %edx
354 ; CHECK-AVX512-NEXT: jl .LBB2_4
355 ; CHECK-AVX512-NEXT: # %bb.3: # %if.then2
356 ; CHECK-AVX512-NEXT: movl %edx, 12(%eax)
357 ; CHECK-AVX512-NEXT: .LBB2_4: # %if.end3
358 ; CHECK-AVX512-NEXT: vmovups (%edi), %xmm0
359 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%esi)
360 ; CHECK-AVX512-NEXT: movl (%eax), %edx
361 ; CHECK-AVX512-NEXT: movl %edx, (%ecx)
362 ; CHECK-AVX512-NEXT: movl 4(%eax), %edx
363 ; CHECK-AVX512-NEXT: movl %edx, 4(%ecx)
364 ; CHECK-AVX512-NEXT: movl 8(%eax), %edx
365 ; CHECK-AVX512-NEXT: movl %edx, 8(%ecx)
366 ; CHECK-AVX512-NEXT: movl 12(%eax), %eax
367 ; CHECK-AVX512-NEXT: movl %eax, 12(%ecx)
368 ; CHECK-AVX512-NEXT: popl %esi
369 ; CHECK-AVX512-NEXT: popl %edi
370 ; CHECK-AVX512-NEXT: retl
371 entry:
372 %cmp = icmp sgt i32 %x, 17
373 br i1 %cmp, label %if.then, label %if.end
374
375 if.then: ; preds = %entry
376 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
377 store i32 %x, i32* %b, align 4
378 br label %if.end
379
380 if.end: ; preds = %if.then, %entry
381 %cmp1 = icmp sgt i32 %x2, 13
382 br i1 %cmp1, label %if.then2, label %if.end3
383
384 if.then2: ; preds = %if.end
385 %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
386 store i32 %x2, i32* %d, align 4
387 br label %if.end3
388
389 if.end3: ; preds = %if.then2, %if.end
390 %0 = bitcast %struct.S* %s3 to i8*
391 %1 = bitcast %struct.S* %s4 to i8*
392 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
393 %2 = bitcast %struct.S* %s2 to i8*
394 %3 = bitcast %struct.S* %s1 to i8*
395 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
396 ret void
397 }
398
399 ; Function Attrs: nounwind uwtable
400 define void @test_2preds_block(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
401 ; CHECK-LABEL: test_2preds_block:
402 ; CHECK: # %bb.0: # %entry
403 ; CHECK-NEXT: pushl %ebx
404 ; CHECK-NEXT: .cfi_def_cfa_offset 8
405 ; CHECK-NEXT: pushl %edi
406 ; CHECK-NEXT: .cfi_def_cfa_offset 12
407 ; CHECK-NEXT: pushl %esi
408 ; CHECK-NEXT: .cfi_def_cfa_offset 16
409 ; CHECK-NEXT: .cfi_offset %esi, -16
410 ; CHECK-NEXT: .cfi_offset %edi, -12
411 ; CHECK-NEXT: .cfi_offset %ebx, -8
412 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
413 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
414 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
415 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
416 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
417 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
418 ; CHECK-NEXT: movl %ebx, 12(%ecx)
419 ; CHECK-NEXT: cmpl $18, %edi
420 ; CHECK-NEXT: jl .LBB3_2
421 ; CHECK-NEXT: # %bb.1: # %if.then
422 ; CHECK-NEXT: movl %edi, 4(%ecx)
423 ; CHECK-NEXT: .LBB3_2: # %if.end
424 ; CHECK-NEXT: movups (%esi), %xmm0
425 ; CHECK-NEXT: movups %xmm0, (%edx)
426 ; CHECK-NEXT: movl (%ecx), %edx
427 ; CHECK-NEXT: movl %edx, (%eax)
428 ; CHECK-NEXT: movl 4(%ecx), %edx
429 ; CHECK-NEXT: movl %edx, 4(%eax)
430 ; CHECK-NEXT: movl 8(%ecx), %edx
431 ; CHECK-NEXT: movl %edx, 8(%eax)
432 ; CHECK-NEXT: movl 12(%ecx), %ecx
433 ; CHECK-NEXT: movl %ecx, 12(%eax)
434 ; CHECK-NEXT: popl %esi
435 ; CHECK-NEXT: popl %edi
436 ; CHECK-NEXT: popl %ebx
437 ; CHECK-NEXT: retl
438 ;
439 ; DISABLED-LABEL: test_2preds_block:
440 ; DISABLED: # %bb.0: # %entry
441 ; DISABLED-NEXT: pushl %ebx
442 ; DISABLED-NEXT: .cfi_def_cfa_offset 8
443 ; DISABLED-NEXT: pushl %edi
444 ; DISABLED-NEXT: .cfi_def_cfa_offset 12
445 ; DISABLED-NEXT: pushl %esi
446 ; DISABLED-NEXT: .cfi_def_cfa_offset 16
447 ; DISABLED-NEXT: .cfi_offset %esi, -16
448 ; DISABLED-NEXT: .cfi_offset %edi, -12
449 ; DISABLED-NEXT: .cfi_offset %ebx, -8
450 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edx
451 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx
452 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edi
453 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
454 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %esi
455 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ebx
456 ; DISABLED-NEXT: movl %ebx, 12(%esi)
457 ; DISABLED-NEXT: cmpl $18, %edi
458 ; DISABLED-NEXT: jl .LBB3_2
459 ; DISABLED-NEXT: # %bb.1: # %if.then
460 ; DISABLED-NEXT: movl %edi, 4(%esi)
461 ; DISABLED-NEXT: .LBB3_2: # %if.end
462 ; DISABLED-NEXT: movups (%edx), %xmm0
463 ; DISABLED-NEXT: movups %xmm0, (%ecx)
464 ; DISABLED-NEXT: movups (%esi), %xmm0
465 ; DISABLED-NEXT: movups %xmm0, (%eax)
466 ; DISABLED-NEXT: popl %esi
467 ; DISABLED-NEXT: popl %edi
468 ; DISABLED-NEXT: popl %ebx
469 ; DISABLED-NEXT: retl
470 ;
471 ; CHECK-AVX2-LABEL: test_2preds_block:
472 ; CHECK-AVX2: # %bb.0: # %entry
473 ; CHECK-AVX2-NEXT: pushl %ebx
474 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
475 ; CHECK-AVX2-NEXT: pushl %edi
476 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 12
477 ; CHECK-AVX2-NEXT: pushl %esi
478 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16
479 ; CHECK-AVX2-NEXT: .cfi_offset %esi, -16
480 ; CHECK-AVX2-NEXT: .cfi_offset %edi, -12
481 ; CHECK-AVX2-NEXT: .cfi_offset %ebx, -8
482 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
483 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
484 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edi
485 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
486 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
487 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ebx
488 ; CHECK-AVX2-NEXT: movl %ebx, 12(%ecx)
489 ; CHECK-AVX2-NEXT: cmpl $18, %edi
490 ; CHECK-AVX2-NEXT: jl .LBB3_2
491 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
492 ; CHECK-AVX2-NEXT: movl %edi, 4(%ecx)
493 ; CHECK-AVX2-NEXT: .LBB3_2: # %if.end
494 ; CHECK-AVX2-NEXT: movups (%esi), %xmm0
495 ; CHECK-AVX2-NEXT: movups %xmm0, (%edx)
496 ; CHECK-AVX2-NEXT: movl (%ecx), %edx
497 ; CHECK-AVX2-NEXT: movl %edx, (%eax)
498 ; CHECK-AVX2-NEXT: movl 4(%ecx), %edx
499 ; CHECK-AVX2-NEXT: movl %edx, 4(%eax)
500 ; CHECK-AVX2-NEXT: movl 8(%ecx), %edx
501 ; CHECK-AVX2-NEXT: movl %edx, 8(%eax)
502 ; CHECK-AVX2-NEXT: movl 12(%ecx), %ecx
503 ; CHECK-AVX2-NEXT: movl %ecx, 12(%eax)
504 ; CHECK-AVX2-NEXT: popl %esi
505 ; CHECK-AVX2-NEXT: popl %edi
506 ; CHECK-AVX2-NEXT: popl %ebx
507 ; CHECK-AVX2-NEXT: retl
508 ;
509 ; CHECK-AVX512-LABEL: test_2preds_block:
510 ; CHECK-AVX512: # %bb.0: # %entry
511 ; CHECK-AVX512-NEXT: pushl %ebx
512 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 8
513 ; CHECK-AVX512-NEXT: pushl %edi
514 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 12
515 ; CHECK-AVX512-NEXT: pushl %esi
516 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 16
517 ; CHECK-AVX512-NEXT: .cfi_offset %esi, -16
518 ; CHECK-AVX512-NEXT: .cfi_offset %edi, -12
519 ; CHECK-AVX512-NEXT: .cfi_offset %ebx, -8
520 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi
521 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
522 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edi
523 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
524 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
525 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebx
526 ; CHECK-AVX512-NEXT: movl %ebx, 12(%ecx)
527 ; CHECK-AVX512-NEXT: cmpl $18, %edi
528 ; CHECK-AVX512-NEXT: jl .LBB3_2
529 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
530 ; CHECK-AVX512-NEXT: movl %edi, 4(%ecx)
531 ; CHECK-AVX512-NEXT: .LBB3_2: # %if.end
532 ; CHECK-AVX512-NEXT: vmovups (%esi), %xmm0
533 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%edx)
534 ; CHECK-AVX512-NEXT: movl (%ecx), %edx
535 ; CHECK-AVX512-NEXT: movl %edx, (%eax)
536 ; CHECK-AVX512-NEXT: movl 4(%ecx), %edx
537 ; CHECK-AVX512-NEXT: movl %edx, 4(%eax)
538 ; CHECK-AVX512-NEXT: movl 8(%ecx), %edx
539 ; CHECK-AVX512-NEXT: movl %edx, 8(%eax)
540 ; CHECK-AVX512-NEXT: movl 12(%ecx), %ecx
541 ; CHECK-AVX512-NEXT: movl %ecx, 12(%eax)
542 ; CHECK-AVX512-NEXT: popl %esi
543 ; CHECK-AVX512-NEXT: popl %edi
544 ; CHECK-AVX512-NEXT: popl %ebx
545 ; CHECK-AVX512-NEXT: retl
546 entry:
547 %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
548 store i32 %x2, i32* %d, align 4
549 %cmp = icmp sgt i32 %x, 17
550 br i1 %cmp, label %if.then, label %if.end
551
552 if.then: ; preds = %entry
553 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
554 store i32 %x, i32* %b, align 4
555 br label %if.end
556
557 if.end: ; preds = %if.then, %entry
558 %0 = bitcast %struct.S* %s3 to i8*
559 %1 = bitcast %struct.S* %s4 to i8*
560 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
561 %2 = bitcast %struct.S* %s2 to i8*
562 %3 = bitcast %struct.S* %s1 to i8*
563 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
564 ret void
565 }
566 %struct.S2 = type { i64, i64 }
567
568 ; Function Attrs: nounwind uwtable
569 define void @test_type64(%struct.S2* nocapture %s1, %struct.S2* nocapture %s2, i32 %x, %struct.S2* nocapture %s3, %struct.S2* nocapture readonly %s4) local_unnamed_addr #0 {
570 ; CHECK-LABEL: test_type64:
571 ; CHECK: # %bb.0: # %entry
572 ; CHECK-NEXT: pushl %edi
573 ; CHECK-NEXT: .cfi_def_cfa_offset 8
574 ; CHECK-NEXT: pushl %esi
575 ; CHECK-NEXT: .cfi_def_cfa_offset 12
576 ; CHECK-NEXT: .cfi_offset %esi, -12
577 ; CHECK-NEXT: .cfi_offset %edi, -8
578 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
579 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
580 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
581 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
582 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
583 ; CHECK-NEXT: cmpl $18, %edi
584 ; CHECK-NEXT: jl .LBB4_2
585 ; CHECK-NEXT: # %bb.1: # %if.then
586 ; CHECK-NEXT: movl %edi, 8(%ecx)
587 ; CHECK-NEXT: sarl $31, %edi
588 ; CHECK-NEXT: movl %edi, 12(%ecx)
589 ; CHECK-NEXT: .LBB4_2: # %if.end
590 ; CHECK-NEXT: movups (%esi), %xmm0
591 ; CHECK-NEXT: movups %xmm0, (%edx)
592 ; CHECK-NEXT: movl (%ecx), %edx
593 ; CHECK-NEXT: movl %edx, (%eax)
594 ; CHECK-NEXT: movl 4(%ecx), %edx
595 ; CHECK-NEXT: movl %edx, 4(%eax)
596 ; CHECK-NEXT: movl 8(%ecx), %edx
597 ; CHECK-NEXT: movl %edx, 8(%eax)
598 ; CHECK-NEXT: movl 12(%ecx), %ecx
599 ; CHECK-NEXT: movl %ecx, 12(%eax)
600 ; CHECK-NEXT: popl %esi
601 ; CHECK-NEXT: popl %edi
602 ; CHECK-NEXT: retl
603 ;
604 ; DISABLED-LABEL: test_type64:
605 ; DISABLED: # %bb.0: # %entry
606 ; DISABLED-NEXT: pushl %edi
607 ; DISABLED-NEXT: .cfi_def_cfa_offset 8
608 ; DISABLED-NEXT: pushl %esi
609 ; DISABLED-NEXT: .cfi_def_cfa_offset 12
610 ; DISABLED-NEXT: .cfi_offset %esi, -12
611 ; DISABLED-NEXT: .cfi_offset %edi, -8
612 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edx
613 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx
614 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edi
615 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
616 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %esi
617 ; DISABLED-NEXT: cmpl $18, %edi
618 ; DISABLED-NEXT: jl .LBB4_2
619 ; DISABLED-NEXT: # %bb.1: # %if.then
620 ; DISABLED-NEXT: movl %edi, 8(%esi)
621 ; DISABLED-NEXT: sarl $31, %edi
622 ; DISABLED-NEXT: movl %edi, 12(%esi)
623 ; DISABLED-NEXT: .LBB4_2: # %if.end
624 ; DISABLED-NEXT: movups (%edx), %xmm0
625 ; DISABLED-NEXT: movups %xmm0, (%ecx)
626 ; DISABLED-NEXT: movups (%esi), %xmm0
627 ; DISABLED-NEXT: movups %xmm0, (%eax)
628 ; DISABLED-NEXT: popl %esi
629 ; DISABLED-NEXT: popl %edi
630 ; DISABLED-NEXT: retl
631 ;
632 ; CHECK-AVX2-LABEL: test_type64:
633 ; CHECK-AVX2: # %bb.0: # %entry
634 ; CHECK-AVX2-NEXT: pushl %edi
635 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
636 ; CHECK-AVX2-NEXT: pushl %esi
637 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 12
638 ; CHECK-AVX2-NEXT: .cfi_offset %esi, -12
639 ; CHECK-AVX2-NEXT: .cfi_offset %edi, -8
640 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
641 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
642 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edi
643 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
644 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
645 ; CHECK-AVX2-NEXT: cmpl $18, %edi
646 ; CHECK-AVX2-NEXT: jl .LBB4_2
647 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
648 ; CHECK-AVX2-NEXT: movl %edi, 8(%ecx)
649 ; CHECK-AVX2-NEXT: sarl $31, %edi
650 ; CHECK-AVX2-NEXT: movl %edi, 12(%ecx)
651 ; CHECK-AVX2-NEXT: .LBB4_2: # %if.end
652 ; CHECK-AVX2-NEXT: movups (%esi), %xmm0
653 ; CHECK-AVX2-NEXT: movups %xmm0, (%edx)
654 ; CHECK-AVX2-NEXT: movl (%ecx), %edx
655 ; CHECK-AVX2-NEXT: movl %edx, (%eax)
656 ; CHECK-AVX2-NEXT: movl 4(%ecx), %edx
657 ; CHECK-AVX2-NEXT: movl %edx, 4(%eax)
658 ; CHECK-AVX2-NEXT: movl 8(%ecx), %edx
659 ; CHECK-AVX2-NEXT: movl %edx, 8(%eax)
660 ; CHECK-AVX2-NEXT: movl 12(%ecx), %ecx
661 ; CHECK-AVX2-NEXT: movl %ecx, 12(%eax)
662 ; CHECK-AVX2-NEXT: popl %esi
663 ; CHECK-AVX2-NEXT: popl %edi
664 ; CHECK-AVX2-NEXT: retl
665 ;
666 ; CHECK-AVX512-LABEL: test_type64:
667 ; CHECK-AVX512: # %bb.0: # %entry
668 ; CHECK-AVX512-NEXT: pushl %edi
669 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 8
670 ; CHECK-AVX512-NEXT: pushl %esi
671 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 12
672 ; CHECK-AVX512-NEXT: .cfi_offset %esi, -12
673 ; CHECK-AVX512-NEXT: .cfi_offset %edi, -8
674 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi
675 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
676 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edi
677 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
678 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
679 ; CHECK-AVX512-NEXT: cmpl $18, %edi
680 ; CHECK-AVX512-NEXT: jl .LBB4_2
681 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
682 ; CHECK-AVX512-NEXT: movl %edi, 8(%ecx)
683 ; CHECK-AVX512-NEXT: sarl $31, %edi
684 ; CHECK-AVX512-NEXT: movl %edi, 12(%ecx)
685 ; CHECK-AVX512-NEXT: .LBB4_2: # %if.end
686 ; CHECK-AVX512-NEXT: vmovups (%esi), %xmm0
687 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%edx)
688 ; CHECK-AVX512-NEXT: movl (%ecx), %edx
689 ; CHECK-AVX512-NEXT: movl %edx, (%eax)
690 ; CHECK-AVX512-NEXT: movl 4(%ecx), %edx
691 ; CHECK-AVX512-NEXT: movl %edx, 4(%eax)
692 ; CHECK-AVX512-NEXT: movl 8(%ecx), %edx
693 ; CHECK-AVX512-NEXT: movl %edx, 8(%eax)
694 ; CHECK-AVX512-NEXT: movl 12(%ecx), %ecx
695 ; CHECK-AVX512-NEXT: movl %ecx, 12(%eax)
696 ; CHECK-AVX512-NEXT: popl %esi
697 ; CHECK-AVX512-NEXT: popl %edi
698 ; CHECK-AVX512-NEXT: retl
699 entry:
700 %cmp = icmp sgt i32 %x, 17
701 br i1 %cmp, label %if.then, label %if.end
702
703 if.then: ; preds = %entry
704 %conv = sext i32 %x to i64
705 %b = getelementptr inbounds %struct.S2, %struct.S2* %s1, i64 0, i32 1
706 store i64 %conv, i64* %b, align 8
707 br label %if.end
708
709 if.end: ; preds = %if.then, %entry
710 %0 = bitcast %struct.S2* %s3 to i8*
711 %1 = bitcast %struct.S2* %s4 to i8*
712 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 8, i1 false)
713 %2 = bitcast %struct.S2* %s2 to i8*
714 %3 = bitcast %struct.S2* %s1 to i8*
715 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 8, i1 false)
716 ret void
717 }
718 %struct.S3 = type { i64, i8, i8, i16, i32 }
719
720 ; Function Attrs: noinline nounwind uwtable
721 define void @test_mixed_type(%struct.S3* nocapture %s1, %struct.S3* nocapture %s2, i32 %x, %struct.S3* nocapture readnone %s3, %struct.S3* nocapture readnone %s4) local_unnamed_addr #0 {
722 ; CHECK-LABEL: test_mixed_type:
723 ; CHECK: # %bb.0: # %entry
724 ; CHECK-NEXT: pushl %esi
725 ; CHECK-NEXT: .cfi_def_cfa_offset 8
726 ; CHECK-NEXT: .cfi_offset %esi, -8
727 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
728 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
729 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
730 ; CHECK-NEXT: cmpl $18, %edx
731 ; CHECK-NEXT: jl .LBB5_2
732 ; CHECK-NEXT: # %bb.1: # %if.then
733 ; CHECK-NEXT: movl %edx, %esi
734 ; CHECK-NEXT: sarl $31, %esi
735 ; CHECK-NEXT: movl %edx, (%ecx)
736 ; CHECK-NEXT: movl %esi, 4(%ecx)
737 ; CHECK-NEXT: movb %dl, 8(%ecx)
738 ; CHECK-NEXT: .LBB5_2: # %if.end
739 ; CHECK-NEXT: movl (%ecx), %edx
740 ; CHECK-NEXT: movl %edx, (%eax)
741 ; CHECK-NEXT: movl 4(%ecx), %edx
742 ; CHECK-NEXT: movl %edx, 4(%eax)
743 ; CHECK-NEXT: movb 8(%ecx), %dl
744 ; CHECK-NEXT: movb %dl, 8(%eax)
745 ; CHECK-NEXT: movl 9(%ecx), %edx
746 ; CHECK-NEXT: movl %edx, 9(%eax)
747 ; CHECK-NEXT: movzwl 13(%ecx), %edx
748 ; CHECK-NEXT: movw %dx, 13(%eax)
749 ; CHECK-NEXT: movb 15(%ecx), %cl
750 ; CHECK-NEXT: movb %cl, 15(%eax)
751 ; CHECK-NEXT: popl %esi
752 ; CHECK-NEXT: retl
753 ;
754 ; DISABLED-LABEL: test_mixed_type:
755 ; DISABLED: # %bb.0: # %entry
756 ; DISABLED-NEXT: pushl %esi
757 ; DISABLED-NEXT: .cfi_def_cfa_offset 8
758 ; DISABLED-NEXT: .cfi_offset %esi, -8
759 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edx
760 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
761 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx
762 ; DISABLED-NEXT: cmpl $18, %edx
763 ; DISABLED-NEXT: jl .LBB5_2
764 ; DISABLED-NEXT: # %bb.1: # %if.then
765 ; DISABLED-NEXT: movl %edx, %esi
766 ; DISABLED-NEXT: sarl $31, %esi
767 ; DISABLED-NEXT: movl %edx, (%ecx)
768 ; DISABLED-NEXT: movl %esi, 4(%ecx)
769 ; DISABLED-NEXT: movb %dl, 8(%ecx)
770 ; DISABLED-NEXT: .LBB5_2: # %if.end
771 ; DISABLED-NEXT: movups (%ecx), %xmm0
772 ; DISABLED-NEXT: movups %xmm0, (%eax)
773 ; DISABLED-NEXT: popl %esi
774 ; DISABLED-NEXT: retl
775 ;
776 ; CHECK-AVX2-LABEL: test_mixed_type:
777 ; CHECK-AVX2: # %bb.0: # %entry
778 ; CHECK-AVX2-NEXT: pushl %esi
779 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
780 ; CHECK-AVX2-NEXT: .cfi_offset %esi, -8
781 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
782 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
783 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
784 ; CHECK-AVX2-NEXT: cmpl $18, %edx
785 ; CHECK-AVX2-NEXT: jl .LBB5_2
786 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
787 ; CHECK-AVX2-NEXT: movl %edx, %esi
788 ; CHECK-AVX2-NEXT: sarl $31, %esi
789 ; CHECK-AVX2-NEXT: movl %edx, (%ecx)
790 ; CHECK-AVX2-NEXT: movl %esi, 4(%ecx)
791 ; CHECK-AVX2-NEXT: movb %dl, 8(%ecx)
792 ; CHECK-AVX2-NEXT: .LBB5_2: # %if.end
793 ; CHECK-AVX2-NEXT: movl (%ecx), %edx
794 ; CHECK-AVX2-NEXT: movl %edx, (%eax)
795 ; CHECK-AVX2-NEXT: movl 4(%ecx), %edx
796 ; CHECK-AVX2-NEXT: movl %edx, 4(%eax)
797 ; CHECK-AVX2-NEXT: movb 8(%ecx), %dl
798 ; CHECK-AVX2-NEXT: movb %dl, 8(%eax)
799 ; CHECK-AVX2-NEXT: movl 9(%ecx), %edx
800 ; CHECK-AVX2-NEXT: movl %edx, 9(%eax)
801 ; CHECK-AVX2-NEXT: movzwl 13(%ecx), %edx
802 ; CHECK-AVX2-NEXT: movw %dx, 13(%eax)
803 ; CHECK-AVX2-NEXT: movb 15(%ecx), %cl
804 ; CHECK-AVX2-NEXT: movb %cl, 15(%eax)
805 ; CHECK-AVX2-NEXT: popl %esi
806 ; CHECK-AVX2-NEXT: retl
807 ;
808 ; CHECK-AVX512-LABEL: test_mixed_type:
809 ; CHECK-AVX512: # %bb.0: # %entry
810 ; CHECK-AVX512-NEXT: pushl %esi
811 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 8
812 ; CHECK-AVX512-NEXT: .cfi_offset %esi, -8
813 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
814 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
815 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
816 ; CHECK-AVX512-NEXT: cmpl $18, %edx
817 ; CHECK-AVX512-NEXT: jl .LBB5_2
818 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
819 ; CHECK-AVX512-NEXT: movl %edx, %esi
820 ; CHECK-AVX512-NEXT: sarl $31, %esi
821 ; CHECK-AVX512-NEXT: movl %edx, (%ecx)
822 ; CHECK-AVX512-NEXT: movl %esi, 4(%ecx)
823 ; CHECK-AVX512-NEXT: movb %dl, 8(%ecx)
824 ; CHECK-AVX512-NEXT: .LBB5_2: # %if.end
825 ; CHECK-AVX512-NEXT: movl (%ecx), %edx
826 ; CHECK-AVX512-NEXT: movl %edx, (%eax)
827 ; CHECK-AVX512-NEXT: movl 4(%ecx), %edx
828 ; CHECK-AVX512-NEXT: movl %edx, 4(%eax)
829 ; CHECK-AVX512-NEXT: movb 8(%ecx), %dl
830 ; CHECK-AVX512-NEXT: movb %dl, 8(%eax)
831 ; CHECK-AVX512-NEXT: movl 9(%ecx), %edx
832 ; CHECK-AVX512-NEXT: movl %edx, 9(%eax)
833 ; CHECK-AVX512-NEXT: movzwl 13(%ecx), %edx
834 ; CHECK-AVX512-NEXT: movw %dx, 13(%eax)
835 ; CHECK-AVX512-NEXT: movb 15(%ecx), %cl
836 ; CHECK-AVX512-NEXT: movb %cl, 15(%eax)
837 ; CHECK-AVX512-NEXT: popl %esi
838 ; CHECK-AVX512-NEXT: retl
839 entry:
840 %cmp = icmp sgt i32 %x, 17
841 br i1 %cmp, label %if.then, label %if.end
842
843 if.then: ; preds = %entry
844 %conv = sext i32 %x to i64
845 %a = getelementptr inbounds %struct.S3, %struct.S3* %s1, i64 0, i32 0
846 store i64 %conv, i64* %a, align 8
847 %conv1 = trunc i32 %x to i8
848 %b = getelementptr inbounds %struct.S3, %struct.S3* %s1, i64 0, i32 1
849 store i8 %conv1, i8* %b, align 8
850 br label %if.end
851
852 if.end: ; preds = %if.then, %entry
853 %0 = bitcast %struct.S3* %s2 to i8*
854 %1 = bitcast %struct.S3* %s1 to i8*
855 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 8, i1 false)
856 ret void
857 }
858 %struct.S4 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
859
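; Two i32 stores (offsets 4 and 36) hit two different 16-byte chunks of
; the 48-byte copy: both affected chunks are split into i32 moves and the
; untouched middle chunk stays a single movups.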
860 ; Function Attrs: nounwind uwtable
861 define void @test_multiple_blocks(%struct.S4* nocapture %s1, %struct.S4* nocapture %s2) local_unnamed_addr #0 {
862 ; CHECK-LABEL: test_multiple_blocks:
863 ; CHECK: # %bb.0: # %entry
864 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
865 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
866 ; CHECK-NEXT: movl $0, 4(%ecx)
867 ; CHECK-NEXT: movl $0, 36(%ecx)
868 ; CHECK-NEXT: movups 16(%ecx), %xmm0
869 ; CHECK-NEXT: movups %xmm0, 16(%eax)
870 ; CHECK-NEXT: movl 32(%ecx), %edx
871 ; CHECK-NEXT: movl %edx, 32(%eax)
872 ; CHECK-NEXT: movl 36(%ecx), %edx
873 ; CHECK-NEXT: movl %edx, 36(%eax)
874 ; CHECK-NEXT: movl 40(%ecx), %edx
875 ; CHECK-NEXT: movl %edx, 40(%eax)
876 ; CHECK-NEXT: movl 44(%ecx), %edx
877 ; CHECK-NEXT: movl %edx, 44(%eax)
878 ; CHECK-NEXT: movl (%ecx), %edx
879 ; CHECK-NEXT: movl %edx, (%eax)
880 ; CHECK-NEXT: movl 4(%ecx), %edx
881 ; CHECK-NEXT: movl %edx, 4(%eax)
882 ; CHECK-NEXT: movl 8(%ecx), %edx
883 ; CHECK-NEXT: movl %edx, 8(%eax)
884 ; CHECK-NEXT: movl 12(%ecx), %ecx
885 ; CHECK-NEXT: movl %ecx, 12(%eax)
886 ; CHECK-NEXT: retl
887 ;
888 ; DISABLED-LABEL: test_multiple_blocks:
889 ; DISABLED: # %bb.0: # %entry
890 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
891 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx
892 ; DISABLED-NEXT: movl $0, 4(%ecx)
893 ; DISABLED-NEXT: movl $0, 36(%ecx)
894 ; DISABLED-NEXT: movups 16(%ecx), %xmm0
895 ; DISABLED-NEXT: movups %xmm0, 16(%eax)
896 ; DISABLED-NEXT: movups 32(%ecx), %xmm0
897 ; DISABLED-NEXT: movups %xmm0, 32(%eax)
898 ; DISABLED-NEXT: movups (%ecx), %xmm0
899 ; DISABLED-NEXT: movups %xmm0, (%eax)
900 ; DISABLED-NEXT: retl
901 ;
902 ; CHECK-AVX2-LABEL: test_multiple_blocks:
903 ; CHECK-AVX2: # %bb.0: # %entry
904 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
905 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
906 ; CHECK-AVX2-NEXT: movl $0, 4(%ecx)
907 ; CHECK-AVX2-NEXT: movl $0, 36(%ecx)
908 ; CHECK-AVX2-NEXT: movups 16(%ecx), %xmm0
909 ; CHECK-AVX2-NEXT: movups %xmm0, 16(%eax)
910 ; CHECK-AVX2-NEXT: movl 32(%ecx), %edx
911 ; CHECK-AVX2-NEXT: movl %edx, 32(%eax)
912 ; CHECK-AVX2-NEXT: movl 36(%ecx), %edx
913 ; CHECK-AVX2-NEXT: movl %edx, 36(%eax)
914 ; CHECK-AVX2-NEXT: movl 40(%ecx), %edx
915 ; CHECK-AVX2-NEXT: movl %edx, 40(%eax)
916 ; CHECK-AVX2-NEXT: movl 44(%ecx), %edx
917 ; CHECK-AVX2-NEXT: movl %edx, 44(%eax)
918 ; CHECK-AVX2-NEXT: movl (%ecx), %edx
919 ; CHECK-AVX2-NEXT: movl %edx, (%eax)
920 ; CHECK-AVX2-NEXT: movl 4(%ecx), %edx
921 ; CHECK-AVX2-NEXT: movl %edx, 4(%eax)
922 ; CHECK-AVX2-NEXT: movl 8(%ecx), %edx
923 ; CHECK-AVX2-NEXT: movl %edx, 8(%eax)
924 ; CHECK-AVX2-NEXT: movl 12(%ecx), %ecx
925 ; CHECK-AVX2-NEXT: movl %ecx, 12(%eax)
926 ; CHECK-AVX2-NEXT: retl
927 ;
928 ; CHECK-AVX512-LABEL: test_multiple_blocks:
929 ; CHECK-AVX512: # %bb.0: # %entry
930 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
931 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
932 ; CHECK-AVX512-NEXT: movl $0, 4(%ecx)
933 ; CHECK-AVX512-NEXT: movl $0, 36(%ecx)
934 ; CHECK-AVX512-NEXT: vmovups 16(%ecx), %xmm0
935 ; CHECK-AVX512-NEXT: vmovups %xmm0, 16(%eax)
936 ; CHECK-AVX512-NEXT: movl 32(%ecx), %edx
937 ; CHECK-AVX512-NEXT: movl %edx, 32(%eax)
938 ; CHECK-AVX512-NEXT: movl 36(%ecx), %edx
939 ; CHECK-AVX512-NEXT: movl %edx, 36(%eax)
940 ; CHECK-AVX512-NEXT: movl 40(%ecx), %edx
941 ; CHECK-AVX512-NEXT: movl %edx, 40(%eax)
942 ; CHECK-AVX512-NEXT: movl 44(%ecx), %edx
943 ; CHECK-AVX512-NEXT: movl %edx, 44(%eax)
944 ; CHECK-AVX512-NEXT: movl (%ecx), %edx
945 ; CHECK-AVX512-NEXT: movl %edx, (%eax)
946 ; CHECK-AVX512-NEXT: movl 4(%ecx), %edx
947 ; CHECK-AVX512-NEXT: movl %edx, 4(%eax)
948 ; CHECK-AVX512-NEXT: vmovups 8(%ecx), %xmm0
949 ; CHECK-AVX512-NEXT: vmovups %xmm0, 8(%eax)
950 ; CHECK-AVX512-NEXT: movl 24(%ecx), %edx
951 ; CHECK-AVX512-NEXT: movl %edx, 24(%eax)
952 ; CHECK-AVX512-NEXT: movl 28(%ecx), %ecx
953 ; CHECK-AVX512-NEXT: movl %ecx, 28(%eax)
954 ; CHECK-AVX512-NEXT: retl
955 entry:
956 %b = getelementptr inbounds %struct.S4, %struct.S4* %s1, i64 0, i32 1
957 store i32 0, i32* %b, align 4
958 %b3 = getelementptr inbounds %struct.S4, %struct.S4* %s1, i64 0, i32 9
959 store i32 0, i32* %b3, align 4
960 %0 = bitcast %struct.S4* %s2 to i8*
961 %1 = bitcast %struct.S4* %s1 to i8*
962 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 48, i32 4, i1 false)
963 ret void
964 }
965 %struct.S5 = type { i16, i16, i16, i16, i16, i16, i16, i16 }
966
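; i16 elements: only the s1->s2 copy is preceded by the conditional i16
; store, so it is split into 2/2/4/4/4-byte moves; the unrelated s4->s3
; copy keeps its movups pair.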
967 ; Function Attrs: nounwind uwtable
968 define void @test_type16(%struct.S5* nocapture %s1, %struct.S5* nocapture %s2, i32 %x, %struct.S5* nocapture %s3, %struct.S5* nocapture readonly %s4) local_unnamed_addr #0 {
969 ; CHECK-LABEL: test_type16:
970 ; CHECK: # %bb.0: # %entry
971 ; CHECK-NEXT: pushl %edi
972 ; CHECK-NEXT: .cfi_def_cfa_offset 8
973 ; CHECK-NEXT: pushl %esi
974 ; CHECK-NEXT: .cfi_def_cfa_offset 12
975 ; CHECK-NEXT: .cfi_offset %esi, -12
976 ; CHECK-NEXT: .cfi_offset %edi, -8
977 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
978 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
979 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
980 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
981 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
982 ; CHECK-NEXT: cmpl $18, %edi
983 ; CHECK-NEXT: jl .LBB7_2
984 ; CHECK-NEXT: # %bb.1: # %if.then
985 ; CHECK-NEXT: movw %di, 2(%ecx)
986 ; CHECK-NEXT: .LBB7_2: # %if.end
987 ; CHECK-NEXT: movups (%esi), %xmm0
988 ; CHECK-NEXT: movups %xmm0, (%edx)
989 ; CHECK-NEXT: movzwl (%ecx), %edx
990 ; CHECK-NEXT: movw %dx, (%eax)
991 ; CHECK-NEXT: movzwl 2(%ecx), %edx
992 ; CHECK-NEXT: movw %dx, 2(%eax)
993 ; CHECK-NEXT: movl 4(%ecx), %edx
994 ; CHECK-NEXT: movl %edx, 4(%eax)
995 ; CHECK-NEXT: movl 8(%ecx), %edx
996 ; CHECK-NEXT: movl %edx, 8(%eax)
997 ; CHECK-NEXT: movl 12(%ecx), %ecx
998 ; CHECK-NEXT: movl %ecx, 12(%eax)
999 ; CHECK-NEXT: popl %esi
1000 ; CHECK-NEXT: popl %edi
1001 ; CHECK-NEXT: retl
1002 ;
1003 ; DISABLED-LABEL: test_type16:
1004 ; DISABLED: # %bb.0: # %entry
1005 ; DISABLED-NEXT: pushl %edi
1006 ; DISABLED-NEXT: .cfi_def_cfa_offset 8
1007 ; DISABLED-NEXT: pushl %esi
1008 ; DISABLED-NEXT: .cfi_def_cfa_offset 12
1009 ; DISABLED-NEXT: .cfi_offset %esi, -12
1010 ; DISABLED-NEXT: .cfi_offset %edi, -8
1011 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edx
1012 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx
1013 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edi
1014 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
1015 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %esi
1016 ; DISABLED-NEXT: cmpl $18, %edi
1017 ; DISABLED-NEXT: jl .LBB7_2
1018 ; DISABLED-NEXT: # %bb.1: # %if.then
1019 ; DISABLED-NEXT: movw %di, 2(%esi)
1020 ; DISABLED-NEXT: .LBB7_2: # %if.end
1021 ; DISABLED-NEXT: movups (%edx), %xmm0
1022 ; DISABLED-NEXT: movups %xmm0, (%ecx)
1023 ; DISABLED-NEXT: movups (%esi), %xmm0
1024 ; DISABLED-NEXT: movups %xmm0, (%eax)
1025 ; DISABLED-NEXT: popl %esi
1026 ; DISABLED-NEXT: popl %edi
1027 ; DISABLED-NEXT: retl
1028 ;
1029 ; CHECK-AVX2-LABEL: test_type16:
1030 ; CHECK-AVX2: # %bb.0: # %entry
1031 ; CHECK-AVX2-NEXT: pushl %edi
1032 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
1033 ; CHECK-AVX2-NEXT: pushl %esi
1034 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 12
1035 ; CHECK-AVX2-NEXT: .cfi_offset %esi, -12
1036 ; CHECK-AVX2-NEXT: .cfi_offset %edi, -8
1037 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
1038 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
1039 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edi
1040 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1041 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
1042 ; CHECK-AVX2-NEXT: cmpl $18, %edi
1043 ; CHECK-AVX2-NEXT: jl .LBB7_2
1044 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
1045 ; CHECK-AVX2-NEXT: movw %di, 2(%ecx)
1046 ; CHECK-AVX2-NEXT: .LBB7_2: # %if.end
1047 ; CHECK-AVX2-NEXT: movups (%esi), %xmm0
1048 ; CHECK-AVX2-NEXT: movups %xmm0, (%edx)
1049 ; CHECK-AVX2-NEXT: movzwl (%ecx), %edx
1050 ; CHECK-AVX2-NEXT: movw %dx, (%eax)
1051 ; CHECK-AVX2-NEXT: movzwl 2(%ecx), %edx
1052 ; CHECK-AVX2-NEXT: movw %dx, 2(%eax)
1053 ; CHECK-AVX2-NEXT: movl 4(%ecx), %edx
1054 ; CHECK-AVX2-NEXT: movl %edx, 4(%eax)
1055 ; CHECK-AVX2-NEXT: movl 8(%ecx), %edx
1056 ; CHECK-AVX2-NEXT: movl %edx, 8(%eax)
1057 ; CHECK-AVX2-NEXT: movl 12(%ecx), %ecx
1058 ; CHECK-AVX2-NEXT: movl %ecx, 12(%eax)
1059 ; CHECK-AVX2-NEXT: popl %esi
1060 ; CHECK-AVX2-NEXT: popl %edi
1061 ; CHECK-AVX2-NEXT: retl
1062 ;
1063 ; CHECK-AVX512-LABEL: test_type16:
1064 ; CHECK-AVX512: # %bb.0: # %entry
1065 ; CHECK-AVX512-NEXT: pushl %edi
1066 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 8
1067 ; CHECK-AVX512-NEXT: pushl %esi
1068 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 12
1069 ; CHECK-AVX512-NEXT: .cfi_offset %esi, -12
1070 ; CHECK-AVX512-NEXT: .cfi_offset %edi, -8
1071 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi
1072 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
1073 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edi
1074 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1075 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
1076 ; CHECK-AVX512-NEXT: cmpl $18, %edi
1077 ; CHECK-AVX512-NEXT: jl .LBB7_2
1078 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
1079 ; CHECK-AVX512-NEXT: movw %di, 2(%ecx)
1080 ; CHECK-AVX512-NEXT: .LBB7_2: # %if.end
1081 ; CHECK-AVX512-NEXT: vmovups (%esi), %xmm0
1082 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%edx)
1083 ; CHECK-AVX512-NEXT: movzwl (%ecx), %edx
1084 ; CHECK-AVX512-NEXT: movw %dx, (%eax)
1085 ; CHECK-AVX512-NEXT: movzwl 2(%ecx), %edx
1086 ; CHECK-AVX512-NEXT: movw %dx, 2(%eax)
1087 ; CHECK-AVX512-NEXT: movl 4(%ecx), %edx
1088 ; CHECK-AVX512-NEXT: movl %edx, 4(%eax)
1089 ; CHECK-AVX512-NEXT: movl 8(%ecx), %edx
1090 ; CHECK-AVX512-NEXT: movl %edx, 8(%eax)
1091 ; CHECK-AVX512-NEXT: movl 12(%ecx), %ecx
1092 ; CHECK-AVX512-NEXT: movl %ecx, 12(%eax)
1093 ; CHECK-AVX512-NEXT: popl %esi
1094 ; CHECK-AVX512-NEXT: popl %edi
1095 ; CHECK-AVX512-NEXT: retl
1096 entry:
1097 %cmp = icmp sgt i32 %x, 17
1098 br i1 %cmp, label %if.then, label %if.end
1099
1100 if.then: ; preds = %entry
1101 %conv = trunc i32 %x to i16
1102 %b = getelementptr inbounds %struct.S5, %struct.S5* %s1, i64 0, i32 1
1103 store i16 %conv, i16* %b, align 2
1104 br label %if.end
1105
1106 if.end: ; preds = %if.then, %entry
1107 %0 = bitcast %struct.S5* %s3 to i8*
1108 %1 = bitcast %struct.S5* %s4 to i8*
1109 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 2, i1 false)
1110 %2 = bitcast %struct.S5* %s2 to i8*
1111 %3 = bitcast %struct.S5* %s1 to i8*
1112 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 2, i1 false)
1113 ret void
1114 }
1115
1116 %struct.S6 = type { [4 x i32], i32, i32, i32, i32 }
1117
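; Blocking store into a byval stack argument: %x is written at offset 24
; of %s2, so the upper half of the 32-byte copy into the sret result is
; split into i32 moves while the lower half stays a single movups.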
1118 ; Function Attrs: nounwind uwtable
1119 define void @test_stack(%struct.S6* noalias nocapture sret %agg.result, %struct.S6* byval nocapture readnone align 8 %s1, %struct.S6* byval nocapture align 8 %s2, i32 %x) local_unnamed_addr #0 {
1120 ; CHECK-LABEL: test_stack:
1121 ; CHECK: # %bb.0: # %entry
1122 ; CHECK-NEXT: pushl %eax
1123 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1124 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
1125 ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
1126 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
1127 ; CHECK-NEXT: movups {{[0-9]+}}(%esp), %xmm0
1128 ; CHECK-NEXT: movups %xmm0, (%eax)
1129 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
1130 ; CHECK-NEXT: movl %ecx, 16(%eax)
1131 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
1132 ; CHECK-NEXT: movl %ecx, 20(%eax)
1133 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
1134 ; CHECK-NEXT: movl %ecx, 24(%eax)
1135 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
1136 ; CHECK-NEXT: movl %ecx, 28(%eax)
1137 ; CHECK-NEXT: popl %ecx
1138 ; CHECK-NEXT: retl $4
1139 ;
1140 ; DISABLED-LABEL: test_stack:
1141 ; DISABLED: # %bb.0: # %entry
1142 ; DISABLED-NEXT: pushl %eax
1143 ; DISABLED-NEXT: .cfi_def_cfa_offset 8
1144 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
1145 ; DISABLED-NEXT: movl %eax, {{[0-9]+}}(%esp)
1146 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
1147 ; DISABLED-NEXT: movups {{[0-9]+}}(%esp), %xmm0
1148 ; DISABLED-NEXT: movups %xmm0, (%eax)
1149 ; DISABLED-NEXT: movups {{[0-9]+}}(%esp), %xmm0
1150 ; DISABLED-NEXT: movups %xmm0, 16(%eax)
1151 ; DISABLED-NEXT: popl %ecx
1152 ; DISABLED-NEXT: retl $4
1153 ;
1154 ; CHECK-AVX2-LABEL: test_stack:
1155 ; CHECK-AVX2: # %bb.0: # %entry
1156 ; CHECK-AVX2-NEXT: pushl %eax
1157 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
1158 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1159 ; CHECK-AVX2-NEXT: movl %eax, {{[0-9]+}}(%esp)
1160 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1161 ; CHECK-AVX2-NEXT: movups {{[0-9]+}}(%esp), %xmm0
1162 ; CHECK-AVX2-NEXT: movups %xmm0, (%eax)
1163 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
1164 ; CHECK-AVX2-NEXT: movl %ecx, 16(%eax)
1165 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
1166 ; CHECK-AVX2-NEXT: movl %ecx, 20(%eax)
1167 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
1168 ; CHECK-AVX2-NEXT: movl %ecx, 24(%eax)
1169 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
1170 ; CHECK-AVX2-NEXT: movl %ecx, 28(%eax)
1171 ; CHECK-AVX2-NEXT: popl %ecx
1172 ; CHECK-AVX2-NEXT: retl $4
1173 ;
1174 ; CHECK-AVX512-LABEL: test_stack:
1175 ; CHECK-AVX512: # %bb.0: # %entry
1176 ; CHECK-AVX512-NEXT: pushl %eax
1177 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 8
1178 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1179 ; CHECK-AVX512-NEXT: movl %eax, {{[0-9]+}}(%esp)
1180 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1181 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
1182 ; CHECK-AVX512-NEXT: movl %ecx, 16(%eax)
1183 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
1184 ; CHECK-AVX512-NEXT: movl %ecx, 20(%eax)
1185 ; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
1186 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
1187 ; CHECK-AVX512-NEXT: movl %ecx, 24(%eax)
1188 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%eax)
1189 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
1190 ; CHECK-AVX512-NEXT: movl %ecx, 28(%eax)
1191 ; CHECK-AVX512-NEXT: popl %ecx
1192 ; CHECK-AVX512-NEXT: retl $4
1193 entry:
1194 %s6.sroa.0.0..sroa_cast1 = bitcast %struct.S6* %s2 to i8*
1195 %s6.sroa.3.0..sroa_idx4 = getelementptr inbounds %struct.S6, %struct.S6* %s2, i64 0, i32 3
1196 store i32 %x, i32* %s6.sroa.3.0..sroa_idx4, align 8
1197 %0 = bitcast %struct.S6* %agg.result to i8*
1198 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* nonnull %s6.sroa.0.0..sroa_cast1, i64 32, i32 4, i1 false)
1199 ret void
1200 }
1201
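; A call to @bar follows the store in each predecessor block; with the
; calls sitting between the stores and the copies, the CHECK run leaves
; both 16-byte copies as plain movups, identical to the DISABLED run.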
1202 ; Function Attrs: nounwind uwtable
1203 define void @test_limit_all(%struct.S* %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
1204 ; CHECK-LABEL: test_limit_all:
1205 ; CHECK: # %bb.0: # %entry
1206 ; CHECK-NEXT: pushl %ebp
1207 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1208 ; CHECK-NEXT: pushl %ebx
1209 ; CHECK-NEXT: .cfi_def_cfa_offset 12
1210 ; CHECK-NEXT: pushl %edi
1211 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1212 ; CHECK-NEXT: pushl %esi
1213 ; CHECK-NEXT: .cfi_def_cfa_offset 20
1214 ; CHECK-NEXT: subl $12, %esp
1215 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1216 ; CHECK-NEXT: .cfi_offset %esi, -20
1217 ; CHECK-NEXT: .cfi_offset %edi, -16
1218 ; CHECK-NEXT: .cfi_offset %ebx, -12
1219 ; CHECK-NEXT: .cfi_offset %ebp, -8
1220 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
1221 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
1222 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
1223 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp
1224 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
1225 ; CHECK-NEXT: movl %eax, 12(%ebp)
1226 ; CHECK-NEXT: movl %ebp, (%esp)
1227 ; CHECK-NEXT: calll bar
1228 ; CHECK-NEXT: cmpl $18, %esi
1229 ; CHECK-NEXT: jl .LBB9_2
1230 ; CHECK-NEXT: # %bb.1: # %if.then
1231 ; CHECK-NEXT: movl %esi, 4(%ebp)
1232 ; CHECK-NEXT: movl %ebp, (%esp)
1233 ; CHECK-NEXT: calll bar
1234 ; CHECK-NEXT: .LBB9_2: # %if.end
1235 ; CHECK-NEXT: movups (%ebx), %xmm0
1236 ; CHECK-NEXT: movups %xmm0, (%edi)
1237 ; CHECK-NEXT: movups (%ebp), %xmm0
1238 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
1239 ; CHECK-NEXT: movups %xmm0, (%eax)
1240 ; CHECK-NEXT: addl $12, %esp
1241 ; CHECK-NEXT: popl %esi
1242 ; CHECK-NEXT: popl %edi
1243 ; CHECK-NEXT: popl %ebx
1244 ; CHECK-NEXT: popl %ebp
1245 ; CHECK-NEXT: retl
1246 ;
1247 ; DISABLED-LABEL: test_limit_all:
1248 ; DISABLED: # %bb.0: # %entry
1249 ; DISABLED-NEXT: pushl %ebp
1250 ; DISABLED-NEXT: .cfi_def_cfa_offset 8
1251 ; DISABLED-NEXT: pushl %ebx
1252 ; DISABLED-NEXT: .cfi_def_cfa_offset 12
1253 ; DISABLED-NEXT: pushl %edi
1254 ; DISABLED-NEXT: .cfi_def_cfa_offset 16
1255 ; DISABLED-NEXT: pushl %esi
1256 ; DISABLED-NEXT: .cfi_def_cfa_offset 20
1257 ; DISABLED-NEXT: subl $12, %esp
1258 ; DISABLED-NEXT: .cfi_def_cfa_offset 32
1259 ; DISABLED-NEXT: .cfi_offset %esi, -20
1260 ; DISABLED-NEXT: .cfi_offset %edi, -16
1261 ; DISABLED-NEXT: .cfi_offset %ebx, -12
1262 ; DISABLED-NEXT: .cfi_offset %ebp, -8
1263 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ebx
1264 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edi
1265 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %esi
1266 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ebp
1267 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
1268 ; DISABLED-NEXT: movl %eax, 12(%ebp)
1269 ; DISABLED-NEXT: movl %ebp, (%esp)
1270 ; DISABLED-NEXT: calll bar
1271 ; DISABLED-NEXT: cmpl $18, %esi
1272 ; DISABLED-NEXT: jl .LBB9_2
1273 ; DISABLED-NEXT: # %bb.1: # %if.then
1274 ; DISABLED-NEXT: movl %esi, 4(%ebp)
1275 ; DISABLED-NEXT: movl %ebp, (%esp)
1276 ; DISABLED-NEXT: calll bar
1277 ; DISABLED-NEXT: .LBB9_2: # %if.end
1278 ; DISABLED-NEXT: movups (%ebx), %xmm0
1279 ; DISABLED-NEXT: movups %xmm0, (%edi)
1280 ; DISABLED-NEXT: movups (%ebp), %xmm0
1281 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
1282 ; DISABLED-NEXT: movups %xmm0, (%eax)
1283 ; DISABLED-NEXT: addl $12, %esp
1284 ; DISABLED-NEXT: popl %esi
1285 ; DISABLED-NEXT: popl %edi
1286 ; DISABLED-NEXT: popl %ebx
1287 ; DISABLED-NEXT: popl %ebp
1288 ; DISABLED-NEXT: retl
1289 ;
1290 ; CHECK-AVX2-LABEL: test_limit_all:
1291 ; CHECK-AVX2: # %bb.0: # %entry
1292 ; CHECK-AVX2-NEXT: pushl %ebp
1293 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
1294 ; CHECK-AVX2-NEXT: pushl %ebx
1295 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 12
1296 ; CHECK-AVX2-NEXT: pushl %edi
1297 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16
1298 ; CHECK-AVX2-NEXT: pushl %esi
1299 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 20
1300 ; CHECK-AVX2-NEXT: subl $12, %esp
1301 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 32
1302 ; CHECK-AVX2-NEXT: .cfi_offset %esi, -20
1303 ; CHECK-AVX2-NEXT: .cfi_offset %edi, -16
1304 ; CHECK-AVX2-NEXT: .cfi_offset %ebx, -12
1305 ; CHECK-AVX2-NEXT: .cfi_offset %ebp, -8
1306 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ebx
1307 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edi
1308 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
1309 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ebp
1310 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1311 ; CHECK-AVX2-NEXT: movl %eax, 12(%ebp)
1312 ; CHECK-AVX2-NEXT: movl %ebp, (%esp)
1313 ; CHECK-AVX2-NEXT: calll bar
1314 ; CHECK-AVX2-NEXT: cmpl $18, %esi
1315 ; CHECK-AVX2-NEXT: jl .LBB9_2
1316 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
1317 ; CHECK-AVX2-NEXT: movl %esi, 4(%ebp)
1318 ; CHECK-AVX2-NEXT: movl %ebp, (%esp)
1319 ; CHECK-AVX2-NEXT: calll bar
1320 ; CHECK-AVX2-NEXT: .LBB9_2: # %if.end
1321 ; CHECK-AVX2-NEXT: movups (%ebx), %xmm0
1322 ; CHECK-AVX2-NEXT: movups %xmm0, (%edi)
1323 ; CHECK-AVX2-NEXT: movups (%ebp), %xmm0
1324 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1325 ; CHECK-AVX2-NEXT: movups %xmm0, (%eax)
1326 ; CHECK-AVX2-NEXT: addl $12, %esp
1327 ; CHECK-AVX2-NEXT: popl %esi
1328 ; CHECK-AVX2-NEXT: popl %edi
1329 ; CHECK-AVX2-NEXT: popl %ebx
1330 ; CHECK-AVX2-NEXT: popl %ebp
1331 ; CHECK-AVX2-NEXT: retl
1332 ;
1333 ; CHECK-AVX512-LABEL: test_limit_all:
1334 ; CHECK-AVX512: # %bb.0: # %entry
1335 ; CHECK-AVX512-NEXT: pushl %ebp
1336 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 8
1337 ; CHECK-AVX512-NEXT: pushl %ebx
1338 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 12
1339 ; CHECK-AVX512-NEXT: pushl %edi
1340 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 16
1341 ; CHECK-AVX512-NEXT: pushl %esi
1342 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 20
1343 ; CHECK-AVX512-NEXT: subl $12, %esp
1344 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 32
1345 ; CHECK-AVX512-NEXT: .cfi_offset %esi, -20
1346 ; CHECK-AVX512-NEXT: .cfi_offset %edi, -16
1347 ; CHECK-AVX512-NEXT: .cfi_offset %ebx, -12
1348 ; CHECK-AVX512-NEXT: .cfi_offset %ebp, -8
1349 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebx
1350 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edi
1351 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi
1352 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp
1353 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1354 ; CHECK-AVX512-NEXT: movl %eax, 12(%ebp)
1355 ; CHECK-AVX512-NEXT: movl %ebp, (%esp)
1356 ; CHECK-AVX512-NEXT: calll bar
1357 ; CHECK-AVX512-NEXT: cmpl $18, %esi
1358 ; CHECK-AVX512-NEXT: jl .LBB9_2
1359 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
1360 ; CHECK-AVX512-NEXT: movl %esi, 4(%ebp)
1361 ; CHECK-AVX512-NEXT: movl %ebp, (%esp)
1362 ; CHECK-AVX512-NEXT: calll bar
1363 ; CHECK-AVX512-NEXT: .LBB9_2: # %if.end
1364 ; CHECK-AVX512-NEXT: vmovups (%ebx), %xmm0
1365 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%edi)
1366 ; CHECK-AVX512-NEXT: vmovups (%ebp), %xmm0
1367 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1368 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%eax)
1369 ; CHECK-AVX512-NEXT: addl $12, %esp
1370 ; CHECK-AVX512-NEXT: popl %esi
1371 ; CHECK-AVX512-NEXT: popl %edi
1372 ; CHECK-AVX512-NEXT: popl %ebx
1373 ; CHECK-AVX512-NEXT: popl %ebp
1374 ; CHECK-AVX512-NEXT: retl
1375 entry:
1376 %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
1377 store i32 %x2, i32* %d, align 4
1378 tail call void @bar(%struct.S* %s1) #3
1379 %cmp = icmp sgt i32 %x, 17
1380 br i1 %cmp, label %if.then, label %if.end
1381
1382 if.then: ; preds = %entry
1383 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
1384 store i32 %x, i32* %b, align 4
1385 tail call void @bar(%struct.S* nonnull %s1) #3
1386 br label %if.end
1387
1388 if.end: ; preds = %if.then, %entry
1389 %0 = bitcast %struct.S* %s3 to i8*
1390 %1 = bitcast %struct.S* %s4 to i8*
1391 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
1392 %2 = bitcast %struct.S* %s2 to i8*
1393 %3 = bitcast %struct.S* %s1 to i8*
1394 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
1395 ret void
1396 }
1397
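; Same as test_limit_all, but only %if.then contains a call: the
; entry-block store of %x2 still reaches the copy directly, and the
; s1->s2 copy is split into four i32 moves (s4->s3 is untouched).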
1398 ; Function Attrs: nounwind uwtable
1399 define void @test_limit_one_pred(%struct.S* %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
1400 ; CHECK-LABEL: test_limit_one_pred:
1401 ; CHECK: # %bb.0: # %entry
1402 ; CHECK-NEXT: pushl %ebp
1403 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1404 ; CHECK-NEXT: pushl %ebx
1405 ; CHECK-NEXT: .cfi_def_cfa_offset 12
1406 ; CHECK-NEXT: pushl %edi
1407 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1408 ; CHECK-NEXT: pushl %esi
1409 ; CHECK-NEXT: .cfi_def_cfa_offset 20
1410 ; CHECK-NEXT: subl $12, %esp
1411 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1412 ; CHECK-NEXT: .cfi_offset %esi, -20
1413 ; CHECK-NEXT: .cfi_offset %edi, -16
1414 ; CHECK-NEXT: .cfi_offset %ebx, -12
1415 ; CHECK-NEXT: .cfi_offset %ebp, -8
1416 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp
1417 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
1418 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
1419 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
1420 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
1421 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
1422 ; CHECK-NEXT: movl %ecx, 12(%edi)
1423 ; CHECK-NEXT: cmpl $18, %eax
1424 ; CHECK-NEXT: jl .LBB10_2
1425 ; CHECK-NEXT: # %bb.1: # %if.then
1426 ; CHECK-NEXT: movl %eax, 4(%edi)
1427 ; CHECK-NEXT: movl %edi, (%esp)
1428 ; CHECK-NEXT: calll bar
1429 ; CHECK-NEXT: .LBB10_2: # %if.end
1430 ; CHECK-NEXT: movups (%ebp), %xmm0
1431 ; CHECK-NEXT: movups %xmm0, (%ebx)
1432 ; CHECK-NEXT: movl (%edi), %eax
1433 ; CHECK-NEXT: movl %eax, (%esi)
1434 ; CHECK-NEXT: movl 4(%edi), %eax
1435 ; CHECK-NEXT: movl %eax, 4(%esi)
1436 ; CHECK-NEXT: movl 8(%edi), %eax
1437 ; CHECK-NEXT: movl %eax, 8(%esi)
1438 ; CHECK-NEXT: movl 12(%edi), %eax
1439 ; CHECK-NEXT: movl %eax, 12(%esi)
1440 ; CHECK-NEXT: addl $12, %esp
1441 ; CHECK-NEXT: popl %esi
1442 ; CHECK-NEXT: popl %edi
1443 ; CHECK-NEXT: popl %ebx
1444 ; CHECK-NEXT: popl %ebp
1445 ; CHECK-NEXT: retl
1446 ;
1447 ; DISABLED-LABEL: test_limit_one_pred:
1448 ; DISABLED: # %bb.0: # %entry
1449 ; DISABLED-NEXT: pushl %ebp
1450 ; DISABLED-NEXT: .cfi_def_cfa_offset 8
1451 ; DISABLED-NEXT: pushl %ebx
1452 ; DISABLED-NEXT: .cfi_def_cfa_offset 12
1453 ; DISABLED-NEXT: pushl %edi
1454 ; DISABLED-NEXT: .cfi_def_cfa_offset 16
1455 ; DISABLED-NEXT: pushl %esi
1456 ; DISABLED-NEXT: .cfi_def_cfa_offset 20
1457 ; DISABLED-NEXT: subl $12, %esp
1458 ; DISABLED-NEXT: .cfi_def_cfa_offset 32
1459 ; DISABLED-NEXT: .cfi_offset %esi, -20
1460 ; DISABLED-NEXT: .cfi_offset %edi, -16
1461 ; DISABLED-NEXT: .cfi_offset %ebx, -12
1462 ; DISABLED-NEXT: .cfi_offset %ebp, -8
1463 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ebx
1464 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edi
1465 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
1466 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %esi
1467 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ebp
1468 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx
1469 ; DISABLED-NEXT: movl %ecx, 12(%ebp)
1470 ; DISABLED-NEXT: cmpl $18, %eax
1471 ; DISABLED-NEXT: jl .LBB10_2
1472 ; DISABLED-NEXT: # %bb.1: # %if.then
1473 ; DISABLED-NEXT: movl %eax, 4(%ebp)
1474 ; DISABLED-NEXT: movl %ebp, (%esp)
1475 ; DISABLED-NEXT: calll bar
1476 ; DISABLED-NEXT: .LBB10_2: # %if.end
1477 ; DISABLED-NEXT: movups (%ebx), %xmm0
1478 ; DISABLED-NEXT: movups %xmm0, (%edi)
1479 ; DISABLED-NEXT: movups (%ebp), %xmm0
1480 ; DISABLED-NEXT: movups %xmm0, (%esi)
1481 ; DISABLED-NEXT: addl $12, %esp
1482 ; DISABLED-NEXT: popl %esi
1483 ; DISABLED-NEXT: popl %edi
1484 ; DISABLED-NEXT: popl %ebx
1485 ; DISABLED-NEXT: popl %ebp
1486 ; DISABLED-NEXT: retl
1487 ;
1488 ; CHECK-AVX2-LABEL: test_limit_one_pred:
1489 ; CHECK-AVX2: # %bb.0: # %entry
1490 ; CHECK-AVX2-NEXT: pushl %ebp
1491 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
1492 ; CHECK-AVX2-NEXT: pushl %ebx
1493 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 12
1494 ; CHECK-AVX2-NEXT: pushl %edi
1495 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16
1496 ; CHECK-AVX2-NEXT: pushl %esi
1497 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 20
1498 ; CHECK-AVX2-NEXT: subl $12, %esp
1499 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 32
1500 ; CHECK-AVX2-NEXT: .cfi_offset %esi, -20
1501 ; CHECK-AVX2-NEXT: .cfi_offset %edi, -16
1502 ; CHECK-AVX2-NEXT: .cfi_offset %ebx, -12
1503 ; CHECK-AVX2-NEXT: .cfi_offset %ebp, -8
1504 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ebp
1505 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ebx
1506 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1507 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
1508 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edi
1509 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
1510 ; CHECK-AVX2-NEXT: movl %ecx, 12(%edi)
1511 ; CHECK-AVX2-NEXT: cmpl $18, %eax
1512 ; CHECK-AVX2-NEXT: jl .LBB10_2
1513 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
1514 ; CHECK-AVX2-NEXT: movl %eax, 4(%edi)
1515 ; CHECK-AVX2-NEXT: movl %edi, (%esp)
1516 ; CHECK-AVX2-NEXT: calll bar
1517 ; CHECK-AVX2-NEXT: .LBB10_2: # %if.end
1518 ; CHECK-AVX2-NEXT: movups (%ebp), %xmm0
1519 ; CHECK-AVX2-NEXT: movups %xmm0, (%ebx)
1520 ; CHECK-AVX2-NEXT: movl (%edi), %eax
1521 ; CHECK-AVX2-NEXT: movl %eax, (%esi)
1522 ; CHECK-AVX2-NEXT: movl 4(%edi), %eax
1523 ; CHECK-AVX2-NEXT: movl %eax, 4(%esi)
1524 ; CHECK-AVX2-NEXT: movl 8(%edi), %eax
1525 ; CHECK-AVX2-NEXT: movl %eax, 8(%esi)
1526 ; CHECK-AVX2-NEXT: movl 12(%edi), %eax
1527 ; CHECK-AVX2-NEXT: movl %eax, 12(%esi)
1528 ; CHECK-AVX2-NEXT: addl $12, %esp
1529 ; CHECK-AVX2-NEXT: popl %esi
1530 ; CHECK-AVX2-NEXT: popl %edi
1531 ; CHECK-AVX2-NEXT: popl %ebx
1532 ; CHECK-AVX2-NEXT: popl %ebp
1533 ; CHECK-AVX2-NEXT: retl
1534 ;
1535 ; CHECK-AVX512-LABEL: test_limit_one_pred:
1536 ; CHECK-AVX512: # %bb.0: # %entry
1537 ; CHECK-AVX512-NEXT: pushl %ebp
1538 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 8
1539 ; CHECK-AVX512-NEXT: pushl %ebx
1540 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 12
1541 ; CHECK-AVX512-NEXT: pushl %edi
1542 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 16
1543 ; CHECK-AVX512-NEXT: pushl %esi
1544 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 20
1545 ; CHECK-AVX512-NEXT: subl $12, %esp
1546 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 32
1547 ; CHECK-AVX512-NEXT: .cfi_offset %esi, -20
1548 ; CHECK-AVX512-NEXT: .cfi_offset %edi, -16
1549 ; CHECK-AVX512-NEXT: .cfi_offset %ebx, -12
1550 ; CHECK-AVX512-NEXT: .cfi_offset %ebp, -8
1551 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp
1552 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebx
1553 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1554 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi
1555 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edi
1556 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
1557 ; CHECK-AVX512-NEXT: movl %ecx, 12(%edi)
1558 ; CHECK-AVX512-NEXT: cmpl $18, %eax
1559 ; CHECK-AVX512-NEXT: jl .LBB10_2
1560 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
1561 ; CHECK-AVX512-NEXT: movl %eax, 4(%edi)
1562 ; CHECK-AVX512-NEXT: movl %edi, (%esp)
1563 ; CHECK-AVX512-NEXT: calll bar
1564 ; CHECK-AVX512-NEXT: .LBB10_2: # %if.end
1565 ; CHECK-AVX512-NEXT: vmovups (%ebp), %xmm0
1566 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%ebx)
1567 ; CHECK-AVX512-NEXT: movl (%edi), %eax
1568 ; CHECK-AVX512-NEXT: movl %eax, (%esi)
1569 ; CHECK-AVX512-NEXT: movl 4(%edi), %eax
1570 ; CHECK-AVX512-NEXT: movl %eax, 4(%esi)
1571 ; CHECK-AVX512-NEXT: movl 8(%edi), %eax
1572 ; CHECK-AVX512-NEXT: movl %eax, 8(%esi)
1573 ; CHECK-AVX512-NEXT: movl 12(%edi), %eax
1574 ; CHECK-AVX512-NEXT: movl %eax, 12(%esi)
1575 ; CHECK-AVX512-NEXT: addl $12, %esp
1576 ; CHECK-AVX512-NEXT: popl %esi
1577 ; CHECK-AVX512-NEXT: popl %edi
1578 ; CHECK-AVX512-NEXT: popl %ebx
1579 ; CHECK-AVX512-NEXT: popl %ebp
1580 ; CHECK-AVX512-NEXT: retl
1581 entry:
1582 %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
1583 store i32 %x2, i32* %d, align 4
1584 %cmp = icmp sgt i32 %x, 17
1585 br i1 %cmp, label %if.then, label %if.end
1586
1587 if.then: ; preds = %entry
1588 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
1589 store i32 %x, i32* %b, align 4
1590 tail call void @bar(%struct.S* nonnull %s1) #3
1591 br label %if.end
1592
1593 if.end: ; preds = %if.then, %entry
1594 %0 = bitcast %struct.S* %s3 to i8*
1595 %1 = bitcast %struct.S* %s4 to i8*
1596 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
1597 %2 = bitcast %struct.S* %s2 to i8*
1598 %3 = bitcast %struct.S* %s1 to i8*
1599 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
1600 ret void
1601 }
1602
1604 declare void @bar(%struct.S*) local_unnamed_addr #1
1605
1607 ; Function Attrs: argmemonly nounwind
1608 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
1609
1610 attributes #0 = { nounwind uwtable "target-cpu"="x86-64" }
1611
1612 %struct.S7 = type { float, float, float, float, float, float, float, float }
1613
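; Float members and 32-byte copies: the conditional store of 1.0 at
; offset 4 causes the s1->s2 copy to be rewritten as a mix of scalar and
; vector moves around the stored float; the s4->s3 copy stays entirely
; in vector registers.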
1614 ; Function Attrs: nounwind uwtable
1615 define void @test_conditional_block_float(%struct.S7* nocapture %s1, %struct.S7* nocapture %s2, i32 %x, %struct.S7* nocapture %s3, %struct.S7* nocapture readonly %s4, float %y) local_unnamed_addr #0 {
1616 ; CHECK-LABEL: test_conditional_block_float:
1617 ; CHECK: # %bb.0: # %entry
1618 ; CHECK-NEXT: pushl %ebx
1619 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1620 ; CHECK-NEXT: pushl %edi
1621 ; CHECK-NEXT: .cfi_def_cfa_offset 12
1622 ; CHECK-NEXT: pushl %esi
1623 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1624 ; CHECK-NEXT: .cfi_offset %esi, -16
1625 ; CHECK-NEXT: .cfi_offset %edi, -12
1626 ; CHECK-NEXT: .cfi_offset %ebx, -8
1627 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
1628 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
1629 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
1630 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
1631 ; CHECK-NEXT: cmpl $18, {{[0-9]+}}(%esp)
1632 ; CHECK-NEXT: jl .LBB11_2
1633 ; CHECK-NEXT: # %bb.1: # %if.then
1634 ; CHECK-NEXT: movl $1065353216, 4(%ecx) # imm = 0x3F800000
1635 ; CHECK-NEXT: .LBB11_2: # %if.end
1636 ; CHECK-NEXT: movups (%esi), %xmm0
1637 ; CHECK-NEXT: movups 16(%esi), %xmm1
1638 ; CHECK-NEXT: movups %xmm1, 16(%edx)
1639 ; CHECK-NEXT: movups %xmm0, (%edx)
1640 ; CHECK-NEXT: movl (%ecx), %edx
1641 ; CHECK-NEXT: movl 4(%ecx), %esi
1642 ; CHECK-NEXT: movl 8(%ecx), %edi
1643 ; CHECK-NEXT: movl 12(%ecx), %ebx
1644 ; CHECK-NEXT: movups 16(%ecx), %xmm0
1645 ; CHECK-NEXT: movups %xmm0, 16(%eax)
1646 ; CHECK-NEXT: movl %edx, (%eax)
1647 ; CHECK-NEXT: movl %esi, 4(%eax)
1648 ; CHECK-NEXT: movl %edi, 8(%eax)
1649 ; CHECK-NEXT: movl %ebx, 12(%eax)
1650 ; CHECK-NEXT: popl %esi
1651 ; CHECK-NEXT: popl %edi
1652 ; CHECK-NEXT: popl %ebx
1653 ; CHECK-NEXT: retl
1654 ;
1655 ; DISABLED-LABEL: test_conditional_block_float:
1656 ; DISABLED: # %bb.0: # %entry
1657 ; DISABLED-NEXT: pushl %esi
1658 ; DISABLED-NEXT: .cfi_def_cfa_offset 8
1659 ; DISABLED-NEXT: .cfi_offset %esi, -8
1660 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %esi
1661 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edx
1662 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
1663 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx
1664 ; DISABLED-NEXT: cmpl $18, {{[0-9]+}}(%esp)
1665 ; DISABLED-NEXT: jl .LBB11_2
1666 ; DISABLED-NEXT: # %bb.1: # %if.then
1667 ; DISABLED-NEXT: movl $1065353216, 4(%ecx) # imm = 0x3F800000
1668 ; DISABLED-NEXT: .LBB11_2: # %if.end
1669 ; DISABLED-NEXT: movups (%esi), %xmm0
1670 ; DISABLED-NEXT: movups 16(%esi), %xmm1
1671 ; DISABLED-NEXT: movups %xmm1, 16(%edx)
1672 ; DISABLED-NEXT: movups %xmm0, (%edx)
1673 ; DISABLED-NEXT: movups (%ecx), %xmm0
1674 ; DISABLED-NEXT: movups 16(%ecx), %xmm1
1675 ; DISABLED-NEXT: movups %xmm1, 16(%eax)
1676 ; DISABLED-NEXT: movups %xmm0, (%eax)
1677 ; DISABLED-NEXT: popl %esi
1678 ; DISABLED-NEXT: retl
1679 ;
1680 ; CHECK-AVX2-LABEL: test_conditional_block_float:
1681 ; CHECK-AVX2: # %bb.0: # %entry
1682 ; CHECK-AVX2-NEXT: pushl %ebx
1683 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
1684 ; CHECK-AVX2-NEXT: pushl %edi
1685 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 12
1686 ; CHECK-AVX2-NEXT: pushl %esi
1687 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16
1688 ; CHECK-AVX2-NEXT: .cfi_offset %esi, -16
1689 ; CHECK-AVX2-NEXT: .cfi_offset %edi, -12
1690 ; CHECK-AVX2-NEXT: .cfi_offset %ebx, -8
1691 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
1692 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
1693 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1694 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
1695 ; CHECK-AVX2-NEXT: cmpl $18, {{[0-9]+}}(%esp)
1696 ; CHECK-AVX2-NEXT: jl .LBB11_2
1697 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
1698 ; CHECK-AVX2-NEXT: movl $1065353216, 4(%ecx) # imm = 0x3F800000
1699 ; CHECK-AVX2-NEXT: .LBB11_2: # %if.end
1700 ; CHECK-AVX2-NEXT: movups (%esi), %xmm0
1701 ; CHECK-AVX2-NEXT: movups 16(%esi), %xmm1
1702 ; CHECK-AVX2-NEXT: movups %xmm1, 16(%edx)
1703 ; CHECK-AVX2-NEXT: movups %xmm0, (%edx)
1704 ; CHECK-AVX2-NEXT: movl (%ecx), %edx
1705 ; CHECK-AVX2-NEXT: movl 4(%ecx), %esi
1706 ; CHECK-AVX2-NEXT: movl 8(%ecx), %edi
1707 ; CHECK-AVX2-NEXT: movl 12(%ecx), %ebx
1708 ; CHECK-AVX2-NEXT: movups 16(%ecx), %xmm0
1709 ; CHECK-AVX2-NEXT: movups %xmm0, 16(%eax)
1710 ; CHECK-AVX2-NEXT: movl %edx, (%eax)
1711 ; CHECK-AVX2-NEXT: movl %esi, 4(%eax)
1712 ; CHECK-AVX2-NEXT: movl %edi, 8(%eax)
1713 ; CHECK-AVX2-NEXT: movl %ebx, 12(%eax)
1714 ; CHECK-AVX2-NEXT: popl %esi
1715 ; CHECK-AVX2-NEXT: popl %edi
1716 ; CHECK-AVX2-NEXT: popl %ebx
1717 ; CHECK-AVX2-NEXT: retl
1718 ;
1719 ; CHECK-AVX512-LABEL: test_conditional_block_float:
1720 ; CHECK-AVX512: # %bb.0: # %entry
1721 ; CHECK-AVX512-NEXT: pushl %esi
1722 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 8
1723 ; CHECK-AVX512-NEXT: .cfi_offset %esi, -8
1724 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi
1725 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
1726 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1727 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
1728 ; CHECK-AVX512-NEXT: cmpl $18, {{[0-9]+}}(%esp)
1729 ; CHECK-AVX512-NEXT: jl .LBB11_2
1730 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
1731 ; CHECK-AVX512-NEXT: movl $1065353216, 4(%ecx) # imm = 0x3F800000
1732 ; CHECK-AVX512-NEXT: .LBB11_2: # %if.end
1733 ; CHECK-AVX512-NEXT: vmovups (%esi), %ymm0
1734 ; CHECK-AVX512-NEXT: vmovups %ymm0, (%edx)
1735 ; CHECK-AVX512-NEXT: movl (%ecx), %edx
1736 ; CHECK-AVX512-NEXT: movl %edx, (%eax)
1737 ; CHECK-AVX512-NEXT: movl 4(%ecx), %edx
1738 ; CHECK-AVX512-NEXT: movl %edx, 4(%eax)
1739 ; CHECK-AVX512-NEXT: vmovups 8(%ecx), %xmm0
1740 ; CHECK-AVX512-NEXT: vmovups %xmm0, 8(%eax)
1741 ; CHECK-AVX512-NEXT: movl 24(%ecx), %edx
1742 ; CHECK-AVX512-NEXT: movl %edx, 24(%eax)
1743 ; CHECK-AVX512-NEXT: movl 28(%ecx), %ecx
1744 ; CHECK-AVX512-NEXT: movl %ecx, 28(%eax)
1745 ; CHECK-AVX512-NEXT: popl %esi
1746 ; CHECK-AVX512-NEXT: vzeroupper
1747 ; CHECK-AVX512-NEXT: retl
1748 entry:
1749 %cmp = icmp sgt i32 %x, 17
1750 br i1 %cmp, label %if.then, label %if.end
1751
1752 if.then: ; preds = %entry
1753 %b = getelementptr inbounds %struct.S7, %struct.S7* %s1, i64 0, i32 1
1754 store float 1.0, float* %b, align 4
1755 br label %if.end
1756
1757 if.end: ; preds = %if.then, %entry
1758 %0 = bitcast %struct.S7* %s3 to i8*
1759 %1 = bitcast %struct.S7* %s4 to i8*
1760 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 32, i32 4, i1 false)
1761 %2 = bitcast %struct.S7* %s2 to i8*
1762 %3 = bitcast %struct.S7* %s1 to i8*
1763 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 32, i32 4, i1 false)
1764 ret void
1765 }
1766
1767 %struct.S8 = type { i64, i64, i64, i64, i64, i64 }
1768
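; Conditional i64 store at offset 8 against 32-byte copies: the blocked
; first 16 bytes of the s1->s2 copy are split into i32 moves and the
; upper 16 bytes stay a vector move in every enabled run.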
1769 ; Function Attrs: nounwind uwtable
1770 define void @test_conditional_block_ymm(%struct.S8* nocapture %s1, %struct.S8* nocapture %s2, i32 %x, %struct.S8* nocapture %s3, %struct.S8* nocapture readonly %s4) local_unnamed_addr #0 {
1771 ; CHECK-LABEL: test_conditional_block_ymm:
1772 ; CHECK: # %bb.0: # %entry
1773 ; CHECK-NEXT: pushl %ebx
1774 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1775 ; CHECK-NEXT: pushl %edi
1776 ; CHECK-NEXT: .cfi_def_cfa_offset 12
1777 ; CHECK-NEXT: pushl %esi
1778 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1779 ; CHECK-NEXT: .cfi_offset %esi, -16
1780 ; CHECK-NEXT: .cfi_offset %edi, -12
1781 ; CHECK-NEXT: .cfi_offset %ebx, -8
1782 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
1783 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
1784 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
1785 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
1786 ; CHECK-NEXT: cmpl $18, {{[0-9]+}}(%esp)
1787 ; CHECK-NEXT: jl .LBB12_2
1788 ; CHECK-NEXT: # %bb.1: # %if.then
1789 ; CHECK-NEXT: movl $0, 12(%ecx)
1790 ; CHECK-NEXT: movl $1, 8(%ecx)
1791 ; CHECK-NEXT: .LBB12_2: # %if.end
1792 ; CHECK-NEXT: movups (%esi), %xmm0
1793 ; CHECK-NEXT: movups 16(%esi), %xmm1
1794 ; CHECK-NEXT: movups %xmm1, 16(%edx)
1795 ; CHECK-NEXT: movups %xmm0, (%edx)
1796 ; CHECK-NEXT: movl (%ecx), %edx
1797 ; CHECK-NEXT: movl 4(%ecx), %esi
1798 ; CHECK-NEXT: movl 8(%ecx), %edi
1799 ; CHECK-NEXT: movl 12(%ecx), %ebx
1800 ; CHECK-NEXT: movups 16(%ecx), %xmm0
1801 ; CHECK-NEXT: movups %xmm0, 16(%eax)
1802 ; CHECK-NEXT: movl %edx, (%eax)
1803 ; CHECK-NEXT: movl %esi, 4(%eax)
1804 ; CHECK-NEXT: movl %edi, 8(%eax)
1805 ; CHECK-NEXT: movl %ebx, 12(%eax)
1806 ; CHECK-NEXT: popl %esi
1807 ; CHECK-NEXT: popl %edi
1808 ; CHECK-NEXT: popl %ebx
1809 ; CHECK-NEXT: retl
1810 ;
1811 ; DISABLED-LABEL: test_conditional_block_ymm:
1812 ; DISABLED: # %bb.0: # %entry
1813 ; DISABLED-NEXT: pushl %esi
1814 ; DISABLED-NEXT: .cfi_def_cfa_offset 8
1815 ; DISABLED-NEXT: .cfi_offset %esi, -8
1816 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %esi
1817 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %edx
1818 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax
1819 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx
1820 ; DISABLED-NEXT: cmpl $18, {{[0-9]+}}(%esp)
1821 ; DISABLED-NEXT: jl .LBB12_2
1822 ; DISABLED-NEXT: # %bb.1: # %if.then
1823 ; DISABLED-NEXT: movl $0, 12(%ecx)
1824 ; DISABLED-NEXT: movl $1, 8(%ecx)
1825 ; DISABLED-NEXT: .LBB12_2: # %if.end
1826 ; DISABLED-NEXT: movups (%esi), %xmm0
1827 ; DISABLED-NEXT: movups 16(%esi), %xmm1
1828 ; DISABLED-NEXT: movups %xmm1, 16(%edx)
1829 ; DISABLED-NEXT: movups %xmm0, (%edx)
1830 ; DISABLED-NEXT: movups (%ecx), %xmm0
1831 ; DISABLED-NEXT: movups 16(%ecx), %xmm1
1832 ; DISABLED-NEXT: movups %xmm1, 16(%eax)
1833 ; DISABLED-NEXT: movups %xmm0, (%eax)
1834 ; DISABLED-NEXT: popl %esi
1835 ; DISABLED-NEXT: retl
1836 ;
1837 ; CHECK-AVX2-LABEL: test_conditional_block_ymm:
1838 ; CHECK-AVX2: # %bb.0: # %entry
1839 ; CHECK-AVX2-NEXT: pushl %ebx
1840 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
1841 ; CHECK-AVX2-NEXT: pushl %edi
1842 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 12
1843 ; CHECK-AVX2-NEXT: pushl %esi
1844 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16
1845 ; CHECK-AVX2-NEXT: .cfi_offset %esi, -16
1846 ; CHECK-AVX2-NEXT: .cfi_offset %edi, -12
1847 ; CHECK-AVX2-NEXT: .cfi_offset %ebx, -8
1848 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
1849 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
1850 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1851 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
1852 ; CHECK-AVX2-NEXT: cmpl $18, {{[0-9]+}}(%esp)
1853 ; CHECK-AVX2-NEXT: jl .LBB12_2
1854 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
1855 ; CHECK-AVX2-NEXT: movl $0, 12(%ecx)
1856 ; CHECK-AVX2-NEXT: movl $1, 8(%ecx)
1857 ; CHECK-AVX2-NEXT: .LBB12_2: # %if.end
1858 ; CHECK-AVX2-NEXT: movups (%esi), %xmm0
1859 ; CHECK-AVX2-NEXT: movups 16(%esi), %xmm1
1860 ; CHECK-AVX2-NEXT: movups %xmm1, 16(%edx)
1861 ; CHECK-AVX2-NEXT: movups %xmm0, (%edx)
1862 ; CHECK-AVX2-NEXT: movl (%ecx), %edx
1863 ; CHECK-AVX2-NEXT: movl 4(%ecx), %esi
1864 ; CHECK-AVX2-NEXT: movl 8(%ecx), %edi
1865 ; CHECK-AVX2-NEXT: movl 12(%ecx), %ebx
1866 ; CHECK-AVX2-NEXT: movups 16(%ecx), %xmm0
1867 ; CHECK-AVX2-NEXT: movups %xmm0, 16(%eax)
1868 ; CHECK-AVX2-NEXT: movl %edx, (%eax)
1869 ; CHECK-AVX2-NEXT: movl %esi, 4(%eax)
1870 ; CHECK-AVX2-NEXT: movl %edi, 8(%eax)
1871 ; CHECK-AVX2-NEXT: movl %ebx, 12(%eax)
1872 ; CHECK-AVX2-NEXT: popl %esi
1873 ; CHECK-AVX2-NEXT: popl %edi
1874 ; CHECK-AVX2-NEXT: popl %ebx
1875 ; CHECK-AVX2-NEXT: retl
1876 ;
1877 ; CHECK-AVX512-LABEL: test_conditional_block_ymm:
1878 ; CHECK-AVX512: # %bb.0: # %entry
1879 ; CHECK-AVX512-NEXT: pushl %esi
1880 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 8
1881 ; CHECK-AVX512-NEXT: .cfi_offset %esi, -8
1882 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi
1883 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
1884 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1885 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
1886 ; CHECK-AVX512-NEXT: cmpl $18, {{[0-9]+}}(%esp)
1887 ; CHECK-AVX512-NEXT: jl .LBB12_2
1888 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
1889 ; CHECK-AVX512-NEXT: movl $0, 12(%ecx)
1890 ; CHECK-AVX512-NEXT: movl $1, 8(%ecx)
1891 ; CHECK-AVX512-NEXT: .LBB12_2: # %if.end
1892 ; CHECK-AVX512-NEXT: vmovups (%esi), %ymm0
1893 ; CHECK-AVX512-NEXT: vmovups %ymm0, (%edx)
1894 ; CHECK-AVX512-NEXT: movl (%ecx), %edx
1895 ; CHECK-AVX512-NEXT: movl %edx, (%eax)
1896 ; CHECK-AVX512-NEXT: movl 4(%ecx), %edx
1897 ; CHECK-AVX512-NEXT: movl %edx, 4(%eax)
1898 ; CHECK-AVX512-NEXT: movl 8(%ecx), %edx
1899 ; CHECK-AVX512-NEXT: movl %edx, 8(%eax)
1900 ; CHECK-AVX512-NEXT: movl 12(%ecx), %edx
1901 ; CHECK-AVX512-NEXT: movl %edx, 12(%eax)
1902 ; CHECK-AVX512-NEXT: vmovups 16(%ecx), %xmm0
1903 ; CHECK-AVX512-NEXT: vmovups %xmm0, 16(%eax)
1904 ; CHECK-AVX512-NEXT: popl %esi
1905 ; CHECK-AVX512-NEXT: vzeroupper
1906 ; CHECK-AVX512-NEXT: retl
1907 entry:
1908 %cmp = icmp sgt i32 %x, 17
1909 br i1 %cmp, label %if.then, label %if.end
1910
1911 if.then: ; preds = %entry
1912 %b = getelementptr inbounds %struct.S8, %struct.S8* %s1, i64 0, i32 1
1913 store i64 1, i64* %b, align 4
1914 br label %if.end
1915
1916 if.end: ; preds = %if.then, %entry
1917 %0 = bitcast %struct.S8* %s3 to i8*
1918 %1 = bitcast %struct.S8* %s4 to i8*
1919 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 32, i32 4, i1 false)
1920 %2 = bitcast %struct.S8* %s2 to i8*
1921 %3 = bitcast %struct.S8* %s1 to i8*
1922 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 32, i32 4, i1 false)
1923 ret void
1924 }
1925