[X86] Reduce Store Forward Block issues in HW If a load follows a store and reloads data that the store has written to memory, Intel microarchitectures can in many cases forward the data directly from the store to the load. This "store forwarding" saves cycles by enabling the load to obtain the data directly instead of accessing it from cache or memory. A "store forward block" occurs when a store cannot be forwarded to the load. The most typical case of a store forward block on the Intel Core microarchitecture is a small store that cannot be forwarded to a large load. The estimated penalty for a store forward block is ~13 cycles. This pass tries to recognize and handle cases where a "store forward block" is created by the compiler when lowering memcpy calls to a sequence of a load and a store. The pass currently only handles cases where the memcpy is lowered to XMM/YMM registers; it tries to break the memcpy into smaller copies. Breaking the memcpy is possible since there is no atomicity guarantee for loads and stores to XMM/YMM. Change-Id: I620b6dc91583ad9a1444591e3ddc00dd25d81748 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@324835 91177308-0d34-0410-b5e6-96231b3b80d8 Lama Saba 2 years ago
5 changed files with 1936 additions and 0 deletions.
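To illustrate the transformation, consider the test_conditional_block case from the added test file (the control flow around the small store is omitted here): a 4-byte store into the struct may be followed by a 16-byte vector reload of the same memory, so the pass splits the 16-byte copy into 4-, 4-, and 8-byte copies whose loads the store buffer can forward to.

    # Before (pass disabled): the 4-byte store blocks the 16-byte reload.
    movl %edx, 4(%rdi)
    movups (%rdi), %xmm0
    movups %xmm0, (%rsi)

    # After: each reload is no larger than the potentially blocking store.
    movl %edx, 4(%rdi)
    movl (%rdi), %eax
    movl %eax, (%rsi)
    movl 4(%rdi), %eax
    movl %eax, 4(%rsi)
    movq 8(%rdi), %rax
    movq %rax, 8(%rsi)

The split slices line up with the blocking store's displacement and size, so the 4-byte load at offset 4 exactly matches the earlier movl and can be forwarded, while the remaining bytes are copied with the widest loads that fit.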
30 30 X86FastISel.cpp
31 31 X86FixupBWInsts.cpp
32 32 X86FixupLEAs.cpp
   33 X86FixupSFB.cpp
33 34 X86FixupSetCC.cpp
34 35 X86FloatingPoint.cpp
35 36 X86FrameLowering.cpp
69 69 /// Return a pass that transforms setcc + movzx pairs into xor + setcc.
70 70 FunctionPass *createX86FixupSetCC();
71 71 
   72 /// Return a pass that avoids creating store forward block issues in the hardware.
   73 FunctionPass *createX86FixupSFB();
   74 
72 75 /// Return a pass that expands WinAlloca pseudo-instructions.
73 76 FunctionPass *createX86WinAllocaExpander();
74 77 
0 //===- X86FixupSFB.cpp - Avoid HW Store Forward Block issues -----------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // If a load follows a store and reloads data that the store has written to
10 // memory, Intel microarchitectures can in many cases forward the data directly
11 // from the store to the load. This "store forwarding" saves cycles by enabling
12 // the load to directly obtain the data instead of accessing the data from
13 // cache or memory.
14 // A "store forward block" occurs in cases that a store cannot be forwarded to
15 // the load. The most typical case of store forward block on Intel Core
16 // microarchitecture that a small store cannot be forwarded to a large load.
17 // The estimated penalty for a store forward block is ~13 cycles.
18 //
19 // This pass tries to recognize and handle cases where "store forward block"
20 // is created by the compiler when lowering memcpy calls to a sequence
21 // of a load and a store.
22 //
23 // The pass currently only handles cases where memcpy is lowered to
24 // XMM/YMM registers; it tries to break the memcpy into smaller copies.
25 // Breaking the memcpy is possible since there is no atomicity
26 // guarantee for loads and stores to XMM/YMM.
27 //
28 // It could be better for performance to solve the problem by loading
29 // to XMM/YMM then inserting the partial store before storing back from XMM/YMM
30 // to memory, but this would result in a more conservative optimization since it
31 // requires proving that all memory accesses between the blocking store and the
32 // load must alias/don't alias before we can move the store, whereas the
33 // transformation done here is correct regardless of other memory accesses.
34 //===----------------------------------------------------------------------===//
35
36 #include "X86InstrInfo.h"
37 #include "X86Subtarget.h"
38 #include "llvm/CodeGen/MachineBasicBlock.h"
39 #include "llvm/CodeGen/MachineFunction.h"
40 #include "llvm/CodeGen/MachineFunctionPass.h"
41 #include "llvm/CodeGen/MachineInstr.h"
42 #include "llvm/CodeGen/MachineInstrBuilder.h"
43 #include "llvm/CodeGen/MachineOperand.h"
44 #include "llvm/CodeGen/MachineRegisterInfo.h"
45 #include "llvm/IR/DebugInfoMetadata.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/IR/Function.h"
48 #include "llvm/MC/MCInstrDesc.h"
49
50 using namespace llvm;
51
52 #define DEBUG_TYPE "x86-fixup-SFB"
53
54 static cl::opt<bool> DisableX86FixupSFB("disable-fixup-SFB", cl::Hidden,
55 cl::desc("X86: Disable SFB fixup."),
56 cl::init(false));
57 namespace {
58
59 class FixupSFBPass : public MachineFunctionPass {
60 public:
61 FixupSFBPass() : MachineFunctionPass(ID) {}
62
63 StringRef getPassName() const override {
64 return "X86 Fixup Store Forward Block";
65 }
66
67 bool runOnMachineFunction(MachineFunction &MF) override;
68
69 private:
70 MachineRegisterInfo *MRI;
71 const X86InstrInfo *TII;
72 const X86RegisterInfo *TRI;
73 SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2> BlockedLoadsStores;
74 SmallVector<MachineInstr *, 2> ForRemoval;
75
76 /// \brief Returns pairs of loads followed by stores to memory which look
77 /// like a memcpy.
78 void findPotentiallyBlockedCopies(MachineFunction &MF);
79 /// \brief Break the memcpy's load and store into smaller copies
80 /// such that each memory load that was blocked by a smaller store
81 /// would now be copied separately.
82 void
83 breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
84 const std::map<int64_t, unsigned> &BlockingStoresDisp);
85 /// \brief Break a copy of size Size to smaller copies.
86 void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
87 MachineInstr *StoreInst, int64_t StDispImm);
88
89 void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
90 MachineInstr *StoreInst, unsigned NStoreOpcode,
91 int64_t StoreDisp, unsigned Size);
92
93 unsigned getRegSizeInBytes(MachineInstr *Inst);
94 static char ID;
95 };
96
97 } // end anonymous namespace
98
99 char FixupSFBPass::ID = 0;
100
101 FunctionPass *llvm::createX86FixupSFB() { return new FixupSFBPass(); }
102
103 static bool isXMMLoadOpcode(unsigned Opcode) {
104 return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
105 Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
106 Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
107 Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
108 Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
109 Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
110 Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
111 Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
112 }
113 static bool isYMMLoadOpcode(unsigned Opcode) {
114 return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
115 Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
116 Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
117 Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
118 Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
119 Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
120 Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
121 }
122
123 static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
124 return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
125 }
126
127 std::map<unsigned, std::pair<unsigned, unsigned>> PotentialBlockedMemCpy{
128 {X86::MOVUPSrm, {X86::MOVUPSmr, X86::MOVAPSmr}},
129 {X86::MOVAPSrm, {X86::MOVUPSmr, X86::MOVAPSmr}},
130 {X86::VMOVUPSrm, {X86::VMOVUPSmr, X86::VMOVAPSmr}},
131 {X86::VMOVAPSrm, {X86::VMOVUPSmr, X86::VMOVAPSmr}},
132 {X86::VMOVUPDrm, {X86::VMOVUPDmr, X86::VMOVAPDmr}},
133 {X86::VMOVAPDrm, {X86::VMOVUPDmr, X86::VMOVAPDmr}},
134 {X86::VMOVDQUrm, {X86::VMOVDQUmr, X86::VMOVDQAmr}},
135 {X86::VMOVDQArm, {X86::VMOVDQUmr, X86::VMOVDQAmr}},
136 {X86::VMOVUPSZ128rm, {X86::VMOVUPSZ128mr, X86::VMOVAPSZ128mr}},
137 {X86::VMOVAPSZ128rm, {X86::VMOVUPSZ128mr, X86::VMOVAPSZ128mr}},
138 {X86::VMOVUPDZ128rm, {X86::VMOVUPDZ128mr, X86::VMOVAPDZ128mr}},
139 {X86::VMOVAPDZ128rm, {X86::VMOVUPDZ128mr, X86::VMOVAPDZ128mr}},
140 {X86::VMOVUPSYrm, {X86::VMOVUPSYmr, X86::VMOVAPSYmr}},
141 {X86::VMOVAPSYrm, {X86::VMOVUPSYmr, X86::VMOVAPSYmr}},
142 {X86::VMOVUPDYrm, {X86::VMOVUPDYmr, X86::VMOVAPDYmr}},
143 {X86::VMOVAPDYrm, {X86::VMOVUPDYmr, X86::VMOVAPDYmr}},
144 {X86::VMOVDQUYrm, {X86::VMOVDQUYmr, X86::VMOVDQAYmr}},
145 {X86::VMOVDQAYrm, {X86::VMOVDQUYmr, X86::VMOVDQAYmr}},
146 {X86::VMOVUPSZ256rm, {X86::VMOVUPSZ256mr, X86::VMOVAPSZ256mr}},
147 {X86::VMOVAPSZ256rm, {X86::VMOVUPSZ256mr, X86::VMOVAPSZ256mr}},
148 {X86::VMOVUPDZ256rm, {X86::VMOVUPDZ256mr, X86::VMOVAPDZ256mr}},
149 {X86::VMOVAPDZ256rm, {X86::VMOVUPDZ256mr, X86::VMOVAPDZ256mr}},
150 {X86::VMOVDQU64Z128rm, {X86::VMOVDQU64Z128mr, X86::VMOVDQA64Z128mr}},
151 {X86::VMOVDQA64Z128rm, {X86::VMOVDQU64Z128mr, X86::VMOVDQA64Z128mr}},
152 {X86::VMOVDQU32Z128rm, {X86::VMOVDQU32Z128mr, X86::VMOVDQA32Z128mr}},
153 {X86::VMOVDQA32Z128rm, {X86::VMOVDQU32Z128mr, X86::VMOVDQA32Z128mr}},
154 {X86::VMOVDQU64Z256rm, {X86::VMOVDQU64Z256mr, X86::VMOVDQA64Z256mr}},
155 {X86::VMOVDQA64Z256rm, {X86::VMOVDQU64Z256mr, X86::VMOVDQA64Z256mr}},
156 {X86::VMOVDQU32Z256rm, {X86::VMOVDQU32Z256mr, X86::VMOVDQA32Z256mr}},
157 {X86::VMOVDQA32Z256rm, {X86::VMOVDQU32Z256mr, X86::VMOVDQA32Z256mr}},
158 };
159
160 static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
161 auto PotentialStores = PotentialBlockedMemCpy.at(LdOpcode);
162 return PotentialStores.first == StOpcode ||
163 PotentialStores.second == StOpcode;
164 }
165
166 static bool isPotentialBlockingStoreInst(int Opcode, int LoadOpcode) {
167 bool PBlock = false;
168 PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
169 Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
170 Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
171 Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
172 if (isYMMLoadOpcode(LoadOpcode))
173 PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
174 Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
175 Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
176 Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
177 Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
178 Opcode == X86::VMOVDQU64Z128mr ||
179 Opcode == X86::VMOVDQA64Z128mr ||
180 Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
181 return PBlock;
182 }
183
184 static const int MOV128SZ = 16;
185 static const int MOV64SZ = 8;
186 static const int MOV32SZ = 4;
187 static const int MOV16SZ = 2;
188 static const int MOV8SZ = 1;
189
190 std::map<unsigned, unsigned> YMMtoXMMLoadMap = {
191 {X86::VMOVUPSYrm, X86::VMOVUPSrm},
192 {X86::VMOVAPSYrm, X86::VMOVUPSrm},
193 {X86::VMOVUPDYrm, X86::VMOVUPDrm},
194 {X86::VMOVAPDYrm, X86::VMOVUPDrm},
195 {X86::VMOVDQUYrm, X86::VMOVDQUrm},
196 {X86::VMOVDQAYrm, X86::VMOVDQUrm},
197 {X86::VMOVUPSZ256rm, X86::VMOVUPSZ128rm},
198 {X86::VMOVAPSZ256rm, X86::VMOVUPSZ128rm},
199 {X86::VMOVUPDZ256rm, X86::VMOVUPDZ128rm},
200 {X86::VMOVAPDZ256rm, X86::VMOVUPDZ128rm},
201 {X86::VMOVDQU64Z256rm, X86::VMOVDQU64Z128rm},
202 {X86::VMOVDQA64Z256rm, X86::VMOVDQU64Z128rm},
203 {X86::VMOVDQU32Z256rm, X86::VMOVDQU32Z128rm},
204 {X86::VMOVDQA32Z256rm, X86::VMOVDQU32Z128rm},
205 };
206
207 std::map<unsigned, unsigned> YMMtoXMMStoreMap = {
208 {X86::VMOVUPSYmr, X86::VMOVUPSmr},
209 {X86::VMOVAPSYmr, X86::VMOVUPSmr},
210 {X86::VMOVUPDYmr, X86::VMOVUPDmr},
211 {X86::VMOVAPDYmr, X86::VMOVUPDmr},
212 {X86::VMOVDQUYmr, X86::VMOVDQUmr},
213 {X86::VMOVDQAYmr, X86::VMOVDQUmr},
214 {X86::VMOVUPSZ256mr, X86::VMOVUPSZ128mr},
215 {X86::VMOVAPSZ256mr, X86::VMOVUPSZ128mr},
216 {X86::VMOVUPDZ256mr, X86::VMOVUPDZ128mr},
217 {X86::VMOVAPDZ256mr, X86::VMOVUPDZ128mr},
218 {X86::VMOVDQU64Z256mr, X86::VMOVDQU64Z128mr},
219 {X86::VMOVDQA64Z256mr, X86::VMOVDQU64Z128mr},
220 {X86::VMOVDQU32Z256mr, X86::VMOVDQU32Z128mr},
221 {X86::VMOVDQA32Z256mr, X86::VMOVDQU32Z128mr},
222 };
223
224 static int getAddrOffset(MachineInstr *MI) {
225 const MCInstrDesc &Descl = MI->getDesc();
226 int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
227 assert(AddrOffset != -1 && "Expected Memory Operand");
228 AddrOffset += X86II::getOperandBias(Descl);
229 return AddrOffset;
230 }
231
232 static MachineOperand &getBaseOperand(MachineInstr *MI) {
233 int AddrOffset = getAddrOffset(MI);
234 return MI->getOperand(AddrOffset + X86::AddrBaseReg);
235 }
236
237 static MachineOperand &getDispOperand(MachineInstr *MI) {
238 int AddrOffset = getAddrOffset(MI);
239 return MI->getOperand(AddrOffset + X86::AddrDisp);
240 }
241
242 // Relevant addressing modes contain only base register and immediate
243 // displacement or frameindex and immediate displacement.
244 // TODO: Consider expanding to other addressing modes in the future
245 static bool isRelevantAddressingMode(MachineInstr *MI) {
246 int AddrOffset = getAddrOffset(MI);
247 MachineOperand &Base = MI->getOperand(AddrOffset + X86::AddrBaseReg);
248 MachineOperand &Disp = MI->getOperand(AddrOffset + X86::AddrDisp);
249 MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
250 MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
251 MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);
252
253 if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
254 return false;
255 if (!Disp.isImm())
256 return false;
257 if (Scale.getImm() != 1)
258 return false;
259 if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
260 return false;
261 if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
262 return false;
263 return true;
264 }
265
266 // Collect potentially blocking stores.
267 // Limit the number of instructions we are willing to inspect backwards,
268 // since the effect of a store block won't be visible if the store
269 // and load instructions have enough instructions in between to
270 // keep the core busy.
271 static const unsigned LIMIT = 20;
272 static SmallVector<MachineInstr *, 2>
273 findPotentialBlockers(MachineInstr *LoadInst) {
274 SmallVector<MachineInstr *, 2> PotentialBlockers;
275 unsigned BlockLimit = 0;
276 for (MachineBasicBlock::iterator LI = LoadInst,
277 BB = LoadInst->getParent()->begin();
278 LI != BB; --LI) {
279 BlockLimit++;
280 if (BlockLimit >= LIMIT)
281 break;
282 MachineInstr &MI = *LI;
283 if (MI.getDesc().isCall())
284 break;
285 PotentialBlockers.push_back(&MI);
286 }
287 // If we didn't reach the instruction limit, try the predecessor blocks.
288 // Ideally we should traverse the predecessor blocks in depth with some
289 // coloring algorithm, but for now let's just look at the first order
290 // predecessors.
291 if (BlockLimit < LIMIT) {
292 MachineBasicBlock *MBB = LoadInst->getParent();
293 int LimitLeft = LIMIT - BlockLimit;
294 for (MachineBasicBlock::pred_iterator PB = MBB->pred_begin(),
295 PE = MBB->pred_end();
296 PB != PE; ++PB) {
297 MachineBasicBlock *PMBB = *PB;
298 int PredLimit = 0;
299 for (MachineBasicBlock::reverse_iterator PMI = PMBB->rbegin(),
300 PME = PMBB->rend();
301 PMI != PME; ++PMI) {
302 PredLimit++;
303 if (PredLimit >= LimitLeft)
304 break;
305 if (PMI->getDesc().isCall())
306 break;
307 PotentialBlockers.push_back(&*PMI);
308 }
309 }
310 }
311 return PotentialBlockers;
312 }
313
314 void FixupSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
315 int64_t LoadDisp, MachineInstr *StoreInst,
316 unsigned NStoreOpcode, int64_t StoreDisp,
317 unsigned Size) {
318 MachineOperand &LoadBase = getBaseOperand(LoadInst);
319 MachineOperand &StoreBase = getBaseOperand(StoreInst);
320 MachineBasicBlock *MBB = LoadInst->getParent();
321 MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
322 MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
323
324 unsigned Reg1 = MRI->createVirtualRegister(
325 TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
326 BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode), Reg1)
327 .add(LoadBase)
328 .addImm(1)
329 .addReg(X86::NoRegister)
330 .addImm(LoadDisp)
331 .addReg(X86::NoRegister)
332 .addMemOperand(MBB->getParent()->getMachineMemOperand(
333 LMMO->getPointerInfo(), LMMO->getFlags(), Size, 0));
334 DEBUG(LoadInst->getPrevNode()->dump());
335 // If the load and store are consecutive, use the loadInst location to
336 // reduce register pressure.
337 MachineInstr *StInst = StoreInst;
338 if (StoreInst->getPrevNode() == LoadInst)
339 StInst = LoadInst;
340 BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
341 .add(StoreBase)
342 .addImm(1)
343 .addReg(X86::NoRegister)
344 .addImm(StoreDisp)
345 .addReg(X86::NoRegister)
346 .addReg(Reg1)
347 .addMemOperand(MBB->getParent()->getMachineMemOperand(
348 SMMO->getPointerInfo(), SMMO->getFlags(), Size, 0));
349 DEBUG(StInst->getPrevNode()->dump());
350 }
351
352 void FixupSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
353 int64_t LdDispImm, MachineInstr *StoreInst,
354 int64_t StDispImm) {
355 int LdDisp = LdDispImm;
356 int StDisp = StDispImm;
357 while (Size > 0) {
358 if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
359 Size = Size - MOV128SZ;
360 buildCopy(LoadInst, YMMtoXMMLoadMap.at(LoadInst->getOpcode()), LdDisp,
361 StoreInst, YMMtoXMMStoreMap.at(StoreInst->getOpcode()), StDisp,
362 MOV128SZ);
363 LdDisp += MOV128SZ;
364 StDisp += MOV128SZ;
365 continue;
366 }
367 if (Size - MOV64SZ >= 0) {
368 Size = Size - MOV64SZ;
369 buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
370 MOV64SZ);
371 LdDisp += MOV64SZ;
372 StDisp += MOV64SZ;
373 continue;
374 }
375 if (Size - MOV32SZ >= 0) {
376 Size = Size - MOV32SZ;
377 buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
378 MOV32SZ);
379 LdDisp += MOV32SZ;
380 StDisp += MOV32SZ;
381 continue;
382 }
383 if (Size - MOV16SZ >= 0) {
384 Size = Size - MOV16SZ;
385 buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
386 MOV16SZ);
387 LdDisp += MOV16SZ;
388 StDisp += MOV16SZ;
389 continue;
390 }
391 if (Size - MOV8SZ >= 0) {
392 Size = Size - MOV8SZ;
393 buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
394 MOV8SZ);
395 LdDisp += MOV8SZ;
396 StDisp += MOV8SZ;
397 continue;
398 }
399 }
400 assert(Size == 0 && "Wrong size division");
401 }
402
403 static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
404 MachineOperand &LoadBase = getBaseOperand(LoadInst);
405 MachineOperand &StoreBase = getBaseOperand(StoreInst);
406 if (LoadBase.isReg()) {
407 MachineInstr *LastLoad = LoadInst->getPrevNode();
408 // If the original load and store to xmm/ymm were consecutive
409 // then the partial copies were also created in
410 // a consecutive order to reduce register pressure,
411 // and the location of the last load is before the last store.
412 if (StoreInst->getPrevNode() == LoadInst)
413 LastLoad = LoadInst->getPrevNode()->getPrevNode();
414 getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
415 }
416 if (StoreBase.isReg()) {
417 MachineInstr *StInst = StoreInst;
418 if (StoreInst->getPrevNode() == LoadInst)
419 StInst = LoadInst;
420 getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
421 }
422 }
423
424 void FixupSFBPass::findPotentiallyBlockedCopies(MachineFunction &MF) {
425 for (auto &MBB : MF)
426 for (auto &MI : MBB)
427 if (isPotentialBlockedMemCpyLd(MI.getOpcode())) {
428 int DefVR = MI.getOperand(0).getReg();
429 if (MRI->hasOneUse(DefVR))
430 for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
431 UI != UE;) {
432 MachineOperand &StoreMO = *UI++;
433 MachineInstr &StoreMI = *StoreMO.getParent();
434 if (isPotentialBlockedMemCpyPair(MI.getOpcode(),
435 StoreMI.getOpcode()) &&
436 (StoreMI.getParent() == MI.getParent()))
437 if (isRelevantAddressingMode(&MI) &&
438 isRelevantAddressingMode(&StoreMI))
439 BlockedLoadsStores.push_back(
440 std::pair<MachineInstr *, MachineInstr *>(&MI, &StoreMI));
441 }
442 }
443 }
444 unsigned FixupSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
445 auto TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
446 *LoadInst->getParent()->getParent());
447 return TRI->getRegSizeInBits(*TRC) / 8;
448 }
449
450 void FixupSFBPass::breakBlockedCopies(
451 MachineInstr *LoadInst, MachineInstr *StoreInst,
452 const std::map<int64_t, unsigned> &BlockingStoresDisp) {
453 int64_t LdDispImm = getDispOperand(LoadInst).getImm();
454 int64_t StDispImm = getDispOperand(StoreInst).getImm();
455
456 int64_t LdDisp1 = LdDispImm;
457 int64_t LdDisp2 = 0;
458 int64_t StDisp1 = StDispImm;
459 int64_t StDisp2 = 0;
460 unsigned Size1 = 0;
461 unsigned Size2 = 0;
462 int64_t LdStDelta = StDispImm - LdDispImm;
463 for (auto inst : BlockingStoresDisp) {
464 LdDisp2 = inst.first;
465 StDisp2 = inst.first + LdStDelta;
466 Size1 = std::abs(std::abs(LdDisp2) - std::abs(LdDisp1));
467 Size2 = inst.second;
468 buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1);
469 buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2);
470 LdDisp1 = LdDisp2 + Size2;
471 StDisp1 = StDisp2 + Size2;
472 }
473 unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
474 buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1);
475 }
476
477 bool FixupSFBPass::runOnMachineFunction(MachineFunction &MF) {
478 bool Changed = false;
479
480 if (DisableX86FixupSFB || skipFunction(MF.getFunction()))
481 return false;
482
483 MRI = &MF.getRegInfo();
484 assert(MRI->isSSA() && "Expected MIR to be in SSA form");
485 TII = MF.getSubtarget().getInstrInfo();
486 TRI = MF.getSubtarget().getRegisterInfo();
487
488 DEBUG(dbgs() << "Start X86FixupSFB\n";);
489 // Look for a load then a store to XMM/YMM which look like a memcpy
490 findPotentiallyBlockedCopies(MF);
491
492 for (auto LoadStoreInst : BlockedLoadsStores) {
493 MachineInstr *LoadInst = LoadStoreInst.first;
494 SmallVector<MachineInstr *, 2> PotentialBlockers =
495 findPotentialBlockers(LoadInst);
496
497 MachineOperand &LoadBase = getBaseOperand(LoadInst);
498 int64_t LdDispImm = getDispOperand(LoadInst).getImm();
499 std::map<int64_t, unsigned> BlockingStoresDisp;
500 int LdBaseReg = LoadBase.isReg() ? LoadBase.getReg() : LoadBase.getIndex();
501
502 for (auto PBInst : PotentialBlockers) {
503 if (isPotentialBlockingStoreInst(PBInst->getOpcode(),
504 LoadInst->getOpcode())) {
505 if (!isRelevantAddressingMode(PBInst))
506 continue;
507 MachineOperand &PBstoreBase = getBaseOperand(PBInst);
508 int64_t PBstDispImm = getDispOperand(PBInst).getImm();
509 assert(PBInst->hasOneMemOperand() && "Expected One Memory Operand");
510 unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
511 int PBstBaseReg =
512 PBstoreBase.isReg() ? PBstoreBase.getReg() : PBstoreBase.getIndex();
513 // This check doesn't cover all cases, but it will suffice for now.
514 // TODO: Take branch probability into consideration; if the blocking
515 // store is in a rarely reached block, breaking the memcpy could lose
516 // performance.
517 if (((LoadBase.isReg() && PBstoreBase.isReg()) ||
518 (LoadBase.isFI() && PBstoreBase.isFI())) &&
519 LdBaseReg == PBstBaseReg &&
520 ((PBstDispImm >= LdDispImm) &&
521 (PBstDispImm <=
522 LdDispImm + (getRegSizeInBytes(LoadInst) - PBstSize)))) {
523 if (BlockingStoresDisp.count(PBstDispImm)) {
524 if (BlockingStoresDisp[PBstDispImm] > PBstSize)
525 BlockingStoresDisp[PBstDispImm] = PBstSize;
526
527 } else
528 BlockingStoresDisp[PBstDispImm] = PBstSize;
529 }
530 }
531 }
532
533 if (BlockingStoresDisp.size() == 0)
534 continue;
535
536 // We found a store forward block, break the memcpy's load and store
537 // into smaller copies such that each smaller store that was causing
538 // a store block would now be copied separately.
539 MachineInstr *StoreInst = LoadStoreInst.second;
540 DEBUG(dbgs() << "Blocked load and store instructions: \n");
541 DEBUG(LoadInst->dump());
542 DEBUG(StoreInst->dump());
543 DEBUG(dbgs() << "Replaced with:\n");
544 breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDisp);
545 updateKillStatus(LoadInst, StoreInst);
546 ForRemoval.push_back(LoadInst);
547 ForRemoval.push_back(StoreInst);
548 }
549 for (auto RemovedInst : ForRemoval) {
550 RemovedInst->eraseFromParent();
551 Changed = true;
552 }
553 ForRemoval.clear();
554 BlockedLoadsStores.clear();
555 DEBUG(dbgs() << "End X86FixupSFB\n";);
556 
557 return Changed;
558 }
448 448 addPass(createX86FixupSetCC());
449 449 addPass(createX86OptimizeLEAs());
450 450 addPass(createX86CallFrameOptimization());
    451 addPass(createX86FixupSFB());
451 452 }
452 453 
453 454 addPass(createX86WinAllocaExpander());
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=CHECK
2 ; RUN: llc < %s -mtriple=x86_64-linux --disable-fixup-SFB | FileCheck %s --check-prefix=DISABLED
3 ; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK-AVX2
4 ; RUN: llc < %s -mtriple=x86_64-linux -mcpu=skx | FileCheck %s -check-prefix=CHECK-AVX512
5
6 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
7 target triple = "x86_64-unknown-linux-gnu"
8
9 %struct.S = type { i32, i32, i32, i32 }
10
11 ; Function Attrs: nounwind uwtable
12 define void @test_conditional_block(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4) local_unnamed_addr #0 {
13 ; CHECK-LABEL: test_conditional_block:
14 ; CHECK: # %bb.0: # %entry
15 ; CHECK-NEXT: cmpl $18, %edx
16 ; CHECK-NEXT: jl .LBB0_2
17 ; CHECK-NEXT: # %bb.1: # %if.then
18 ; CHECK-NEXT: movl %edx, 4(%rdi)
19 ; CHECK-NEXT: .LBB0_2: # %if.end
20 ; CHECK-NEXT: movups (%r8), %xmm0
21 ; CHECK-NEXT: movups %xmm0, (%rcx)
22 ; CHECK-NEXT: movl (%rdi), %eax
23 ; CHECK-NEXT: movl %eax, (%rsi)
24 ; CHECK-NEXT: movl 4(%rdi), %eax
25 ; CHECK-NEXT: movl %eax, 4(%rsi)
26 ; CHECK-NEXT: movq 8(%rdi), %rax
27 ; CHECK-NEXT: movq %rax, 8(%rsi)
28 ; CHECK-NEXT: retq
29 ;
30 ; DISABLED-LABEL: test_conditional_block:
31 ; DISABLED: # %bb.0: # %entry
32 ; DISABLED-NEXT: cmpl $18, %edx
33 ; DISABLED-NEXT: jl .LBB0_2
34 ; DISABLED-NEXT: # %bb.1: # %if.then
35 ; DISABLED-NEXT: movl %edx, 4(%rdi)
36 ; DISABLED-NEXT: .LBB0_2: # %if.end
37 ; DISABLED-NEXT: movups (%r8), %xmm0
38 ; DISABLED-NEXT: movups %xmm0, (%rcx)
39 ; DISABLED-NEXT: movups (%rdi), %xmm0
40 ; DISABLED-NEXT: movups %xmm0, (%rsi)
41 ; DISABLED-NEXT: retq
42 ;
43 ; CHECK-AVX2-LABEL: test_conditional_block:
44 ; CHECK-AVX2: # %bb.0: # %entry
45 ; CHECK-AVX2-NEXT: cmpl $18, %edx
46 ; CHECK-AVX2-NEXT: jl .LBB0_2
47 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
48 ; CHECK-AVX2-NEXT: movl %edx, 4(%rdi)
49 ; CHECK-AVX2-NEXT: .LBB0_2: # %if.end
50 ; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
51 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
52 ; CHECK-AVX2-NEXT: movl (%rdi), %eax
53 ; CHECK-AVX2-NEXT: movl %eax, (%rsi)
54 ; CHECK-AVX2-NEXT: movl 4(%rdi), %eax
55 ; CHECK-AVX2-NEXT: movl %eax, 4(%rsi)
56 ; CHECK-AVX2-NEXT: movq 8(%rdi), %rax
57 ; CHECK-AVX2-NEXT: movq %rax, 8(%rsi)
58 ; CHECK-AVX2-NEXT: retq
59 ;
60 ; CHECK-AVX512-LABEL: test_conditional_block:
61 ; CHECK-AVX512: # %bb.0: # %entry
62 ; CHECK-AVX512-NEXT: cmpl $18, %edx
63 ; CHECK-AVX512-NEXT: jl .LBB0_2
64 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
65 ; CHECK-AVX512-NEXT: movl %edx, 4(%rdi)
66 ; CHECK-AVX512-NEXT: .LBB0_2: # %if.end
67 ; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
68 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
69 ; CHECK-AVX512-NEXT: movl (%rdi), %eax
70 ; CHECK-AVX512-NEXT: movl %eax, (%rsi)
71 ; CHECK-AVX512-NEXT: movl 4(%rdi), %eax
72 ; CHECK-AVX512-NEXT: movl %eax, 4(%rsi)
73 ; CHECK-AVX512-NEXT: movq 8(%rdi), %rax
74 ; CHECK-AVX512-NEXT: movq %rax, 8(%rsi)
75 ; CHECK-AVX512-NEXT: retq
76 entry:
77 %cmp = icmp sgt i32 %x, 17
78 br i1 %cmp, label %if.then, label %if.end
79
80 if.then: ; preds = %entry
81 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
82 store i32 %x, i32* %b, align 4
83 br label %if.end
84
85 if.end: ; preds = %if.then, %entry
86 %0 = bitcast %struct.S* %s3 to i8*
87 %1 = bitcast %struct.S* %s4 to i8*
88 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
89 %2 = bitcast %struct.S* %s2 to i8*
90 %3 = bitcast %struct.S* %s1 to i8*
91 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
92 ret void
93 }
94
95 ; Function Attrs: nounwind uwtable
96 define void @test_imm_store(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3) local_unnamed_addr #0 {
97 ; CHECK-LABEL: test_imm_store:
98 ; CHECK: # %bb.0: # %entry
99 ; CHECK-NEXT: movl $0, (%rdi)
100 ; CHECK-NEXT: movl $1, (%rcx)
101 ; CHECK-NEXT: movl (%rdi), %eax
102 ; CHECK-NEXT: movl %eax, (%rsi)
103 ; CHECK-NEXT: movq 4(%rdi), %rax
104 ; CHECK-NEXT: movq %rax, 4(%rsi)
105 ; CHECK-NEXT: movl 12(%rdi), %eax
106 ; CHECK-NEXT: movl %eax, 12(%rsi)
107 ; CHECK-NEXT: retq
108 ;
109 ; DISABLED-LABEL: test_imm_store:
110 ; DISABLED: # %bb.0: # %entry
111 ; DISABLED-NEXT: movl $0, (%rdi)
112 ; DISABLED-NEXT: movl $1, (%rcx)
113 ; DISABLED-NEXT: movups (%rdi), %xmm0
114 ; DISABLED-NEXT: movups %xmm0, (%rsi)
115 ; DISABLED-NEXT: retq
116 ;
117 ; CHECK-AVX2-LABEL: test_imm_store:
118 ; CHECK-AVX2: # %bb.0: # %entry
119 ; CHECK-AVX2-NEXT: movl $0, (%rdi)
120 ; CHECK-AVX2-NEXT: movl $1, (%rcx)
121 ; CHECK-AVX2-NEXT: movl (%rdi), %eax
122 ; CHECK-AVX2-NEXT: movl %eax, (%rsi)
123 ; CHECK-AVX2-NEXT: movq 4(%rdi), %rax
124 ; CHECK-AVX2-NEXT: movq %rax, 4(%rsi)
125 ; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
126 ; CHECK-AVX2-NEXT: movl %eax, 12(%rsi)
127 ; CHECK-AVX2-NEXT: retq
128 ;
129 ; CHECK-AVX512-LABEL: test_imm_store:
130 ; CHECK-AVX512: # %bb.0: # %entry
131 ; CHECK-AVX512-NEXT: movl $0, (%rdi)
132 ; CHECK-AVX512-NEXT: movl $1, (%rcx)
133 ; CHECK-AVX512-NEXT: movl (%rdi), %eax
134 ; CHECK-AVX512-NEXT: movl %eax, (%rsi)
135 ; CHECK-AVX512-NEXT: movq 4(%rdi), %rax
136 ; CHECK-AVX512-NEXT: movq %rax, 4(%rsi)
137 ; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
138 ; CHECK-AVX512-NEXT: movl %eax, 12(%rsi)
139 ; CHECK-AVX512-NEXT: retq
140 entry:
141 %a = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 0
142 store i32 0, i32* %a, align 4
143 %a1 = getelementptr inbounds %struct.S, %struct.S* %s3, i64 0, i32 0
144 store i32 1, i32* %a1, align 4
145 %0 = bitcast %struct.S* %s2 to i8*
146 %1 = bitcast %struct.S* %s1 to i8*
147 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
148 ret void
149 }
150
151 ; Function Attrs: nounwind uwtable
152 define void @test_nondirect_br(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
153 ; CHECK-LABEL: test_nondirect_br:
154 ; CHECK: # %bb.0: # %entry
155 ; CHECK-NEXT: cmpl $18, %edx
156 ; CHECK-NEXT: jl .LBB2_2
157 ; CHECK-NEXT: # %bb.1: # %if.then
158 ; CHECK-NEXT: movl %edx, 4(%rdi)
159 ; CHECK-NEXT: .LBB2_2: # %if.end
160 ; CHECK-NEXT: cmpl $14, %r9d
161 ; CHECK-NEXT: jl .LBB2_4
162 ; CHECK-NEXT: # %bb.3: # %if.then2
163 ; CHECK-NEXT: movl %r9d, 12(%rdi)
164 ; CHECK-NEXT: .LBB2_4: # %if.end3
165 ; CHECK-NEXT: movups (%r8), %xmm0
166 ; CHECK-NEXT: movups %xmm0, (%rcx)
167 ; CHECK-NEXT: movq (%rdi), %rax
168 ; CHECK-NEXT: movq %rax, (%rsi)
169 ; CHECK-NEXT: movl 8(%rdi), %eax
170 ; CHECK-NEXT: movl %eax, 8(%rsi)
171 ; CHECK-NEXT: movl 12(%rdi), %eax
172 ; CHECK-NEXT: movl %eax, 12(%rsi)
173 ; CHECK-NEXT: retq
174 ;
175 ; DISABLED-LABEL: test_nondirect_br:
176 ; DISABLED: # %bb.0: # %entry
177 ; DISABLED-NEXT: cmpl $18, %edx
178 ; DISABLED-NEXT: jl .LBB2_2
179 ; DISABLED-NEXT: # %bb.1: # %if.then
180 ; DISABLED-NEXT: movl %edx, 4(%rdi)
181 ; DISABLED-NEXT: .LBB2_2: # %if.end
182 ; DISABLED-NEXT: cmpl $14, %r9d
183 ; DISABLED-NEXT: jl .LBB2_4
184 ; DISABLED-NEXT: # %bb.3: # %if.then2
185 ; DISABLED-NEXT: movl %r9d, 12(%rdi)
186 ; DISABLED-NEXT: .LBB2_4: # %if.end3
187 ; DISABLED-NEXT: movups (%r8), %xmm0
188 ; DISABLED-NEXT: movups %xmm0, (%rcx)
189 ; DISABLED-NEXT: movups (%rdi), %xmm0
190 ; DISABLED-NEXT: movups %xmm0, (%rsi)
191 ; DISABLED-NEXT: retq
192 ;
193 ; CHECK-AVX2-LABEL: test_nondirect_br:
194 ; CHECK-AVX2: # %bb.0: # %entry
195 ; CHECK-AVX2-NEXT: cmpl $18, %edx
196 ; CHECK-AVX2-NEXT: jl .LBB2_2
197 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
198 ; CHECK-AVX2-NEXT: movl %edx, 4(%rdi)
199 ; CHECK-AVX2-NEXT: .LBB2_2: # %if.end
200 ; CHECK-AVX2-NEXT: cmpl $14, %r9d
201 ; CHECK-AVX2-NEXT: jl .LBB2_4
202 ; CHECK-AVX2-NEXT: # %bb.3: # %if.then2
203 ; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi)
204 ; CHECK-AVX2-NEXT: .LBB2_4: # %if.end3
205 ; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
206 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
207 ; CHECK-AVX2-NEXT: movq (%rdi), %rax
208 ; CHECK-AVX2-NEXT: movq %rax, (%rsi)
209 ; CHECK-AVX2-NEXT: movl 8(%rdi), %eax
210 ; CHECK-AVX2-NEXT: movl %eax, 8(%rsi)
211 ; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
212 ; CHECK-AVX2-NEXT: movl %eax, 12(%rsi)
213 ; CHECK-AVX2-NEXT: retq
214 ;
215 ; CHECK-AVX512-LABEL: test_nondirect_br:
216 ; CHECK-AVX512: # %bb.0: # %entry
217 ; CHECK-AVX512-NEXT: cmpl $18, %edx
218 ; CHECK-AVX512-NEXT: jl .LBB2_2
219 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
220 ; CHECK-AVX512-NEXT: movl %edx, 4(%rdi)
221 ; CHECK-AVX512-NEXT: .LBB2_2: # %if.end
222 ; CHECK-AVX512-NEXT: cmpl $14, %r9d
223 ; CHECK-AVX512-NEXT: jl .LBB2_4
224 ; CHECK-AVX512-NEXT: # %bb.3: # %if.then2
225 ; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi)
226 ; CHECK-AVX512-NEXT: .LBB2_4: # %if.end3
227 ; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
228 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
229 ; CHECK-AVX512-NEXT: movq (%rdi), %rax
230 ; CHECK-AVX512-NEXT: movq %rax, (%rsi)
231 ; CHECK-AVX512-NEXT: movl 8(%rdi), %eax
232 ; CHECK-AVX512-NEXT: movl %eax, 8(%rsi)
233 ; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
234 ; CHECK-AVX512-NEXT: movl %eax, 12(%rsi)
235 ; CHECK-AVX512-NEXT: retq
236 entry:
237 %cmp = icmp sgt i32 %x, 17
238 br i1 %cmp, label %if.then, label %if.end
239
240 if.then: ; preds = %entry
241 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
242 store i32 %x, i32* %b, align 4
243 br label %if.end
244
245 if.end: ; preds = %if.then, %entry
246 %cmp1 = icmp sgt i32 %x2, 13
247 br i1 %cmp1, label %if.then2, label %if.end3
248
249 if.then2: ; preds = %if.end
250 %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
251 store i32 %x2, i32* %d, align 4
252 br label %if.end3
253
254 if.end3: ; preds = %if.then2, %if.end
255 %0 = bitcast %struct.S* %s3 to i8*
256 %1 = bitcast %struct.S* %s4 to i8*
257 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
258 %2 = bitcast %struct.S* %s2 to i8*
259 %3 = bitcast %struct.S* %s1 to i8*
260 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
261 ret void
262 }
263
264 ; Function Attrs: nounwind uwtable
265 define void @test_2preds_block(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
266 ; CHECK-LABEL: test_2preds_block:
267 ; CHECK: # %bb.0: # %entry
268 ; CHECK-NEXT: movl %r9d, 12(%rdi)
269 ; CHECK-NEXT: cmpl $18, %edx
270 ; CHECK-NEXT: jl .LBB3_2
271 ; CHECK-NEXT: # %bb.1: # %if.then
272 ; CHECK-NEXT: movl %edx, 4(%rdi)
273 ; CHECK-NEXT: .LBB3_2: # %if.end
274 ; CHECK-NEXT: movups (%r8), %xmm0
275 ; CHECK-NEXT: movups %xmm0, (%rcx)
276 ; CHECK-NEXT: movl (%rdi), %eax
277 ; CHECK-NEXT: movl %eax, (%rsi)
278 ; CHECK-NEXT: movl 4(%rdi), %eax
279 ; CHECK-NEXT: movl %eax, 4(%rsi)
280 ; CHECK-NEXT: movl 8(%rdi), %eax
281 ; CHECK-NEXT: movl %eax, 8(%rsi)
282 ; CHECK-NEXT: movl 12(%rdi), %eax
283 ; CHECK-NEXT: movl %eax, 12(%rsi)
284 ; CHECK-NEXT: retq
285 ;
286 ; DISABLED-LABEL: test_2preds_block:
287 ; DISABLED: # %bb.0: # %entry
288 ; DISABLED-NEXT: movl %r9d, 12(%rdi)
289 ; DISABLED-NEXT: cmpl $18, %edx
290 ; DISABLED-NEXT: jl .LBB3_2
291 ; DISABLED-NEXT: # %bb.1: # %if.then
292 ; DISABLED-NEXT: movl %edx, 4(%rdi)
293 ; DISABLED-NEXT: .LBB3_2: # %if.end
294 ; DISABLED-NEXT: movups (%r8), %xmm0
295 ; DISABLED-NEXT: movups %xmm0, (%rcx)
296 ; DISABLED-NEXT: movups (%rdi), %xmm0
297 ; DISABLED-NEXT: movups %xmm0, (%rsi)
298 ; DISABLED-NEXT: retq
299 ;
300 ; CHECK-AVX2-LABEL: test_2preds_block:
301 ; CHECK-AVX2: # %bb.0: # %entry
302 ; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi)
303 ; CHECK-AVX2-NEXT: cmpl $18, %edx
304 ; CHECK-AVX2-NEXT: jl .LBB3_2
305 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
306 ; CHECK-AVX2-NEXT: movl %edx, 4(%rdi)
307 ; CHECK-AVX2-NEXT: .LBB3_2: # %if.end
308 ; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
309 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
310 ; CHECK-AVX2-NEXT: movl (%rdi), %eax
311 ; CHECK-AVX2-NEXT: movl %eax, (%rsi)
312 ; CHECK-AVX2-NEXT: movl 4(%rdi), %eax
313 ; CHECK-AVX2-NEXT: movl %eax, 4(%rsi)
314 ; CHECK-AVX2-NEXT: movl 8(%rdi), %eax
315 ; CHECK-AVX2-NEXT: movl %eax, 8(%rsi)
316 ; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
317 ; CHECK-AVX2-NEXT: movl %eax, 12(%rsi)
318 ; CHECK-AVX2-NEXT: retq
319 ;
320 ; CHECK-AVX512-LABEL: test_2preds_block:
321 ; CHECK-AVX512: # %bb.0: # %entry
322 ; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi)
323 ; CHECK-AVX512-NEXT: cmpl $18, %edx
324 ; CHECK-AVX512-NEXT: jl .LBB3_2
325 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
326 ; CHECK-AVX512-NEXT: movl %edx, 4(%rdi)
327 ; CHECK-AVX512-NEXT: .LBB3_2: # %if.end
328 ; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
329 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
330 ; CHECK-AVX512-NEXT: movl (%rdi), %eax
331 ; CHECK-AVX512-NEXT: movl %eax, (%rsi)
332 ; CHECK-AVX512-NEXT: movl 4(%rdi), %eax
333 ; CHECK-AVX512-NEXT: movl %eax, 4(%rsi)
334 ; CHECK-AVX512-NEXT: movl 8(%rdi), %eax
335 ; CHECK-AVX512-NEXT: movl %eax, 8(%rsi)
336 ; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
337 ; CHECK-AVX512-NEXT: movl %eax, 12(%rsi)
338 ; CHECK-AVX512-NEXT: retq
339 entry:
340 %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
341 store i32 %x2, i32* %d, align 4
342 %cmp = icmp sgt i32 %x, 17
343 br i1 %cmp, label %if.then, label %if.end
344
345 if.then: ; preds = %entry
346 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
347 store i32 %x, i32* %b, align 4
348 br label %if.end
349
350 if.end: ; preds = %if.then, %entry
351 %0 = bitcast %struct.S* %s3 to i8*
352 %1 = bitcast %struct.S* %s4 to i8*
353 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
354 %2 = bitcast %struct.S* %s2 to i8*
355 %3 = bitcast %struct.S* %s1 to i8*
356 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
357 ret void
358 }
359 %struct.S2 = type { i64, i64 }
360
361 ; Function Attrs: nounwind uwtable
362 define void @test_type64(%struct.S2* nocapture %s1, %struct.S2* nocapture %s2, i32 %x, %struct.S2* nocapture %s3, %struct.S2* nocapture readonly %s4) local_unnamed_addr #0 {
363 ; CHECK-LABEL: test_type64:
364 ; CHECK: # %bb.0: # %entry
365 ; CHECK-NEXT: cmpl $18, %edx
366 ; CHECK-NEXT: jl .LBB4_2
367 ; CHECK-NEXT: # %bb.1: # %if.then
368 ; CHECK-NEXT: movslq %edx, %rax
369 ; CHECK-NEXT: movq %rax, 8(%rdi)
370 ; CHECK-NEXT: .LBB4_2: # %if.end
371 ; CHECK-NEXT: movups (%r8), %xmm0
372 ; CHECK-NEXT: movups %xmm0, (%rcx)
373 ; CHECK-NEXT: movq (%rdi), %rax
374 ; CHECK-NEXT: movq %rax, (%rsi)
375 ; CHECK-NEXT: movq 8(%rdi), %rax
376 ; CHECK-NEXT: movq %rax, 8(%rsi)
377 ; CHECK-NEXT: retq
378 ;
379 ; DISABLED-LABEL: test_type64:
380 ; DISABLED: # %bb.0: # %entry
381 ; DISABLED-NEXT: cmpl $18, %edx
382 ; DISABLED-NEXT: jl .LBB4_2
383 ; DISABLED-NEXT: # %bb.1: # %if.then
384 ; DISABLED-NEXT: movslq %edx, %rax
385 ; DISABLED-NEXT: movq %rax, 8(%rdi)
386 ; DISABLED-NEXT: .LBB4_2: # %if.end
387 ; DISABLED-NEXT: movups (%r8), %xmm0
388 ; DISABLED-NEXT: movups %xmm0, (%rcx)
389 ; DISABLED-NEXT: movups (%rdi), %xmm0
390 ; DISABLED-NEXT: movups %xmm0, (%rsi)
391 ; DISABLED-NEXT: retq
392 ;
393 ; CHECK-AVX2-LABEL: test_type64:
394 ; CHECK-AVX2: # %bb.0: # %entry
395 ; CHECK-AVX2-NEXT: cmpl $18, %edx
396 ; CHECK-AVX2-NEXT: jl .LBB4_2
397 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
398 ; CHECK-AVX2-NEXT: movslq %edx, %rax
399 ; CHECK-AVX2-NEXT: movq %rax, 8(%rdi)
400 ; CHECK-AVX2-NEXT: .LBB4_2: # %if.end
401 ; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
402 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
403 ; CHECK-AVX2-NEXT: movq (%rdi), %rax
404 ; CHECK-AVX2-NEXT: movq %rax, (%rsi)
405 ; CHECK-AVX2-NEXT: movq 8(%rdi), %rax
406 ; CHECK-AVX2-NEXT: movq %rax, 8(%rsi)
407 ; CHECK-AVX2-NEXT: retq
408 ;
409 ; CHECK-AVX512-LABEL: test_type64:
410 ; CHECK-AVX512: # %bb.0: # %entry
411 ; CHECK-AVX512-NEXT: cmpl $18, %edx
412 ; CHECK-AVX512-NEXT: jl .LBB4_2
413 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
414 ; CHECK-AVX512-NEXT: movslq %edx, %rax
415 ; CHECK-AVX512-NEXT: movq %rax, 8(%rdi)
416 ; CHECK-AVX512-NEXT: .LBB4_2: # %if.end
417 ; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
418 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
419 ; CHECK-AVX512-NEXT: movq (%rdi), %rax
420 ; CHECK-AVX512-NEXT: movq %rax, (%rsi)
421 ; CHECK-AVX512-NEXT: movq 8(%rdi), %rax
422 ; CHECK-AVX512-NEXT: movq %rax, 8(%rsi)
423 ; CHECK-AVX512-NEXT: retq
424 entry:
425 %cmp = icmp sgt i32 %x, 17
426 br i1 %cmp, label %if.then, label %if.end
427
428 if.then: ; preds = %entry
429 %conv = sext i32 %x to i64
430 %b = getelementptr inbounds %struct.S2, %struct.S2* %s1, i64 0, i32 1
431 store i64 %conv, i64* %b, align 8
432 br label %if.end
433
434 if.end: ; preds = %if.then, %entry
435 %0 = bitcast %struct.S2* %s3 to i8*
436 %1 = bitcast %struct.S2* %s4 to i8*
437 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 8, i1 false)
438 %2 = bitcast %struct.S2* %s2 to i8*
439 %3 = bitcast %struct.S2* %s1 to i8*
440 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 8, i1 false)
441 ret void
442 }
443 %struct.S3 = type { i64, i8, i8, i16, i32 }
444
445 ; Function Attrs: noinline nounwind uwtable
446 define void @test_mixed_type(%struct.S3* nocapture %s1, %struct.S3* nocapture %s2, i32 %x, %struct.S3* nocapture readnone %s3, %struct.S3* nocapture readnone %s4) local_unnamed_addr #0 {
447 ; CHECK-LABEL: test_mixed_type:
448 ; CHECK: # %bb.0: # %entry
449 ; CHECK-NEXT: cmpl $18, %edx
450 ; CHECK-NEXT: jl .LBB5_2
451 ; CHECK-NEXT: # %bb.1: # %if.then
452 ; CHECK-NEXT: movslq %edx, %rax
453 ; CHECK-NEXT: movq %rax, (%rdi)
454 ; CHECK-NEXT: movb %dl, 8(%rdi)
455 ; CHECK-NEXT: .LBB5_2: # %if.end
456 ; CHECK-NEXT: movq (%rdi), %rax
457 ; CHECK-NEXT: movq %rax, (%rsi)
458 ; CHECK-NEXT: movb 8(%rdi), %al
459 ; CHECK-NEXT: movb %al, 8(%rsi)
460 ; CHECK-NEXT: movl 9(%rdi), %eax
461 ; CHECK-NEXT: movl %eax, 9(%rsi)
462 ; CHECK-NEXT: movzwl 13(%rdi), %eax
463 ; CHECK-NEXT: movw %ax, 13(%rsi)
464 ; CHECK-NEXT: movb 15(%rdi), %al
465 ; CHECK-NEXT: movb %al, 15(%rsi)
466 ; CHECK-NEXT: retq
467 ;
468 ; DISABLED-LABEL: test_mixed_type:
469 ; DISABLED: # %bb.0: # %entry
470 ; DISABLED-NEXT: cmpl $18, %edx
471 ; DISABLED-NEXT: jl .LBB5_2
472 ; DISABLED-NEXT: # %bb.1: # %if.then
473 ; DISABLED-NEXT: movslq %edx, %rax
474 ; DISABLED-NEXT: movq %rax, (%rdi)
475 ; DISABLED-NEXT: movb %dl, 8(%rdi)
476 ; DISABLED-NEXT: .LBB5_2: # %if.end
477 ; DISABLED-NEXT: movups (%rdi), %xmm0
478 ; DISABLED-NEXT: movups %xmm0, (%rsi)
479 ; DISABLED-NEXT: retq
480 ;
481 ; CHECK-AVX2-LABEL: test_mixed_type:
482 ; CHECK-AVX2: # %bb.0: # %entry
483 ; CHECK-AVX2-NEXT: cmpl $18, %edx
484 ; CHECK-AVX2-NEXT: jl .LBB5_2
485 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
486 ; CHECK-AVX2-NEXT: movslq %edx, %rax
487 ; CHECK-AVX2-NEXT: movq %rax, (%rdi)
488 ; CHECK-AVX2-NEXT: movb %dl, 8(%rdi)
489 ; CHECK-AVX2-NEXT: .LBB5_2: # %if.end
490 ; CHECK-AVX2-NEXT: movq (%rdi), %rax
491 ; CHECK-AVX2-NEXT: movq %rax, (%rsi)
492 ; CHECK-AVX2-NEXT: movb 8(%rdi), %al
493 ; CHECK-AVX2-NEXT: movb %al, 8(%rsi)
494 ; CHECK-AVX2-NEXT: movl 9(%rdi), %eax
495 ; CHECK-AVX2-NEXT: movl %eax, 9(%rsi)
496 ; CHECK-AVX2-NEXT: movzwl 13(%rdi), %eax
497 ; CHECK-AVX2-NEXT: movw %ax, 13(%rsi)
498 ; CHECK-AVX2-NEXT: movb 15(%rdi), %al
499 ; CHECK-AVX2-NEXT: movb %al, 15(%rsi)
500 ; CHECK-AVX2-NEXT: retq
501 ;
502 ; CHECK-AVX512-LABEL: test_mixed_type:
503 ; CHECK-AVX512: # %bb.0: # %entry
504 ; CHECK-AVX512-NEXT: cmpl $18, %edx
505 ; CHECK-AVX512-NEXT: jl .LBB5_2
506 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
507 ; CHECK-AVX512-NEXT: movslq %edx, %rax
508 ; CHECK-AVX512-NEXT: movq %rax, (%rdi)
509 ; CHECK-AVX512-NEXT: movb %dl, 8(%rdi)
510 ; CHECK-AVX512-NEXT: .LBB5_2: # %if.end
511 ; CHECK-AVX512-NEXT: movq (%rdi), %rax
512 ; CHECK-AVX512-NEXT: movq %rax, (%rsi)
513 ; CHECK-AVX512-NEXT: movb 8(%rdi), %al
514 ; CHECK-AVX512-NEXT: movb %al, 8(%rsi)
515 ; CHECK-AVX512-NEXT: movl 9(%rdi), %eax
516 ; CHECK-AVX512-NEXT: movl %eax, 9(%rsi)
517 ; CHECK-AVX512-NEXT: movzwl 13(%rdi), %eax
518 ; CHECK-AVX512-NEXT: movw %ax, 13(%rsi)
519 ; CHECK-AVX512-NEXT: movb 15(%rdi), %al
520 ; CHECK-AVX512-NEXT: movb %al, 15(%rsi)
521 ; CHECK-AVX512-NEXT: retq
522 entry:
523 %cmp = icmp sgt i32 %x, 17
524 br i1 %cmp, label %if.then, label %if.end
525
526 if.then: ; preds = %entry
527 %conv = sext i32 %x to i64
528 %a = getelementptr inbounds %struct.S3, %struct.S3* %s1, i64 0, i32 0
529 store i64 %conv, i64* %a, align 8
530 %conv1 = trunc i32 %x to i8
531 %b = getelementptr inbounds %struct.S3, %struct.S3* %s1, i64 0, i32 1
532 store i8 %conv1, i8* %b, align 8
533 br label %if.end
534
535 if.end: ; preds = %if.then, %entry
536 %0 = bitcast %struct.S3* %s2 to i8*
537 %1 = bitcast %struct.S3* %s1 to i8*
538 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 8, i1 false)
539 ret void
540 }
541 %struct.S4 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
542
543 ; Function Attrs: nounwind uwtable
544 define void @test_multiple_blocks(%struct.S4* nocapture %s1, %struct.S4* nocapture %s2) local_unnamed_addr #0 {
545 ; CHECK-LABEL: test_multiple_blocks:
546 ; CHECK: # %bb.0: # %entry
547 ; CHECK-NEXT: movl $0, 4(%rdi)
548 ; CHECK-NEXT: movl $0, 36(%rdi)
549 ; CHECK-NEXT: movups 16(%rdi), %xmm0
550 ; CHECK-NEXT: movups %xmm0, 16(%rsi)
551 ; CHECK-NEXT: movl 32(%rdi), %eax
552 ; CHECK-NEXT: movl %eax, 32(%rsi)
553 ; CHECK-NEXT: movl 36(%rdi), %eax
554 ; CHECK-NEXT: movl %eax, 36(%rsi)
555 ; CHECK-NEXT: movq 40(%rdi), %rax
556 ; CHECK-NEXT: movq %rax, 40(%rsi)
557 ; CHECK-NEXT: movl (%rdi), %eax
558 ; CHECK-NEXT: movl %eax, (%rsi)
559 ; CHECK-NEXT: movl 4(%rdi), %eax
560 ; CHECK-NEXT: movl %eax, 4(%rsi)
561 ; CHECK-NEXT: movq 8(%rdi), %rax
562 ; CHECK-NEXT: movq %rax, 8(%rsi)
563 ; CHECK-NEXT: retq
564 ;
565 ; DISABLED-LABEL: test_multiple_blocks:
566 ; DISABLED: # %bb.0: # %entry
567 ; DISABLED-NEXT: movl $0, 4(%rdi)
568 ; DISABLED-NEXT: movl $0, 36(%rdi)
569 ; DISABLED-NEXT: movups 16(%rdi), %xmm0
570 ; DISABLED-NEXT: movups %xmm0, 16(%rsi)
571 ; DISABLED-NEXT: movups 32(%rdi), %xmm0
572 ; DISABLED-NEXT: movups %xmm0, 32(%rsi)
573 ; DISABLED-NEXT: movups (%rdi), %xmm0
574 ; DISABLED-NEXT: movups %xmm0, (%rsi)
575 ; DISABLED-NEXT: retq
576 ;
577 ; CHECK-AVX2-LABEL: test_multiple_blocks:
578 ; CHECK-AVX2: # %bb.0: # %entry
579 ; CHECK-AVX2-NEXT: movl $0, 4(%rdi)
580 ; CHECK-AVX2-NEXT: movl $0, 36(%rdi)
581 ; CHECK-AVX2-NEXT: vmovups 16(%rdi), %xmm0
582 ; CHECK-AVX2-NEXT: vmovups %xmm0, 16(%rsi)
583 ; CHECK-AVX2-NEXT: movl 32(%rdi), %eax
584 ; CHECK-AVX2-NEXT: movl %eax, 32(%rsi)
585 ; CHECK-AVX2-NEXT: movl 36(%rdi), %eax
586 ; CHECK-AVX2-NEXT: movl %eax, 36(%rsi)
587 ; CHECK-AVX2-NEXT: movq 40(%rdi), %rax
588 ; CHECK-AVX2-NEXT: movq %rax, 40(%rsi)
589 ; CHECK-AVX2-NEXT: movl (%rdi), %eax
590 ; CHECK-AVX2-NEXT: movl %eax, (%rsi)
591 ; CHECK-AVX2-NEXT: movl 4(%rdi), %eax
592 ; CHECK-AVX2-NEXT: movl %eax, 4(%rsi)
593 ; CHECK-AVX2-NEXT: vmovups 8(%rdi), %xmm0
594 ; CHECK-AVX2-NEXT: vmovups %xmm0, 8(%rsi)
595 ; CHECK-AVX2-NEXT: movq 24(%rdi), %rax
596 ; CHECK-AVX2-NEXT: movq %rax, 24(%rsi)
597 ; CHECK-AVX2-NEXT: retq
598 ;
599 ; CHECK-AVX512-LABEL: test_multiple_blocks:
600 ; CHECK-AVX512: # %bb.0: # %entry
601 ; CHECK-AVX512-NEXT: movl $0, 4(%rdi)
602 ; CHECK-AVX512-NEXT: movl $0, 36(%rdi)
603 ; CHECK-AVX512-NEXT: vmovups 16(%rdi), %xmm0
604 ; CHECK-AVX512-NEXT: vmovups %xmm0, 16(%rsi)
605 ; CHECK-AVX512-NEXT: movl 32(%rdi), %eax
606 ; CHECK-AVX512-NEXT: movl %eax, 32(%rsi)
607 ; CHECK-AVX512-NEXT: movl 36(%rdi), %eax
608 ; CHECK-AVX512-NEXT: movl %eax, 36(%rsi)
609 ; CHECK-AVX512-NEXT: movq 40(%rdi), %rax
610 ; CHECK-AVX512-NEXT: movq %rax, 40(%rsi)
611 ; CHECK-AVX512-NEXT: movl (%rdi), %eax
612 ; CHECK-AVX512-NEXT: movl %eax, (%rsi)
613 ; CHECK-AVX512-NEXT: movl 4(%rdi), %eax
614 ; CHECK-AVX512-NEXT: movl %eax, 4(%rsi)
615 ; CHECK-AVX512-NEXT: vmovups 8(%rdi), %xmm0
616 ; CHECK-AVX512-NEXT: vmovups %xmm0, 8(%rsi)
617 ; CHECK-AVX512-NEXT: movq 24(%rdi), %rax
618 ; CHECK-AVX512-NEXT: movq %rax, 24(%rsi)
619 ; CHECK-AVX512-NEXT: retq
620 entry:
621 %b = getelementptr inbounds %struct.S4, %struct.S4* %s1, i64 0, i32 1
622 store i32 0, i32* %b, align 4
623 %b3 = getelementptr inbounds %struct.S4, %struct.S4* %s1, i64 0, i32 9
624 store i32 0, i32* %b3, align 4
625 %0 = bitcast %struct.S4* %s2 to i8*
626 %1 = bitcast %struct.S4* %s1 to i8*
627 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 48, i32 4, i1 false)
628 ret void
629 }
630 %struct.S5 = type { i16, i16, i16, i16, i16, i16, i16, i16 }
631
632 ; Function Attrs: nounwind uwtable
633 define void @test_type16(%struct.S5* nocapture %s1, %struct.S5* nocapture %s2, i32 %x, %struct.S5* nocapture %s3, %struct.S5* nocapture readonly %s4) local_unnamed_addr #0 {
634 ; CHECK-LABEL: test_type16:
635 ; CHECK: # %bb.0: # %entry
636 ; CHECK-NEXT: cmpl $18, %edx
637 ; CHECK-NEXT: jl .LBB7_2
638 ; CHECK-NEXT: # %bb.1: # %if.then
639 ; CHECK-NEXT: movw %dx, 2(%rdi)
640 ; CHECK-NEXT: .LBB7_2: # %if.end
641 ; CHECK-NEXT: movups (%r8), %xmm0
642 ; CHECK-NEXT: movups %xmm0, (%rcx)
643 ; CHECK-NEXT: movzwl (%rdi), %eax
644 ; CHECK-NEXT: movw %ax, (%rsi)
645 ; CHECK-NEXT: movzwl 2(%rdi), %eax
646 ; CHECK-NEXT: movw %ax, 2(%rsi)
647 ; CHECK-NEXT: movq 4(%rdi), %rax
648 ; CHECK-NEXT: movq %rax, 4(%rsi)
649 ; CHECK-NEXT: movl 12(%rdi), %eax
650 ; CHECK-NEXT: movl %eax, 12(%rsi)
651 ; CHECK-NEXT: retq
652 ;
653 ; DISABLED-LABEL: test_type16:
654 ; DISABLED: # %bb.0: # %entry
655 ; DISABLED-NEXT: cmpl $18, %edx
656 ; DISABLED-NEXT: jl .LBB7_2
657 ; DISABLED-NEXT: # %bb.1: # %if.then
658 ; DISABLED-NEXT: movw %dx, 2(%rdi)
659 ; DISABLED-NEXT: .LBB7_2: # %if.end
660 ; DISABLED-NEXT: movups (%r8), %xmm0
661 ; DISABLED-NEXT: movups %xmm0, (%rcx)
662 ; DISABLED-NEXT: movups (%rdi), %xmm0
663 ; DISABLED-NEXT: movups %xmm0, (%rsi)
664 ; DISABLED-NEXT: retq
665 ;
666 ; CHECK-AVX2-LABEL: test_type16:
667 ; CHECK-AVX2: # %bb.0: # %entry
668 ; CHECK-AVX2-NEXT: cmpl $18, %edx
669 ; CHECK-AVX2-NEXT: jl .LBB7_2
670 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
671 ; CHECK-AVX2-NEXT: movw %dx, 2(%rdi)
672 ; CHECK-AVX2-NEXT: .LBB7_2: # %if.end
673 ; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
674 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
675 ; CHECK-AVX2-NEXT: movzwl (%rdi), %eax
676 ; CHECK-AVX2-NEXT: movw %ax, (%rsi)
677 ; CHECK-AVX2-NEXT: movzwl 2(%rdi), %eax
678 ; CHECK-AVX2-NEXT: movw %ax, 2(%rsi)
679 ; CHECK-AVX2-NEXT: movq 4(%rdi), %rax
680 ; CHECK-AVX2-NEXT: movq %rax, 4(%rsi)
681 ; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
682 ; CHECK-AVX2-NEXT: movl %eax, 12(%rsi)
683 ; CHECK-AVX2-NEXT: retq
684 ;
685 ; CHECK-AVX512-LABEL: test_type16:
686 ; CHECK-AVX512: # %bb.0: # %entry
687 ; CHECK-AVX512-NEXT: cmpl $18, %edx
688 ; CHECK-AVX512-NEXT: jl .LBB7_2
689 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
690 ; CHECK-AVX512-NEXT: movw %dx, 2(%rdi)
691 ; CHECK-AVX512-NEXT: .LBB7_2: # %if.end
692 ; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
693 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
694 ; CHECK-AVX512-NEXT: movzwl (%rdi), %eax
695 ; CHECK-AVX512-NEXT: movw %ax, (%rsi)
696 ; CHECK-AVX512-NEXT: movzwl 2(%rdi), %eax
697 ; CHECK-AVX512-NEXT: movw %ax, 2(%rsi)
698 ; CHECK-AVX512-NEXT: movq 4(%rdi), %rax
699 ; CHECK-AVX512-NEXT: movq %rax, 4(%rsi)
700 ; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
701 ; CHECK-AVX512-NEXT: movl %eax, 12(%rsi)
702 ; CHECK-AVX512-NEXT: retq
703 entry:
704 %cmp = icmp sgt i32 %x, 17
705 br i1 %cmp, label %if.then, label %if.end
706
707 if.then: ; preds = %entry
708 %conv = trunc i32 %x to i16
709 %b = getelementptr inbounds %struct.S5, %struct.S5* %s1, i64 0, i32 1
710 store i16 %conv, i16* %b, align 2
711 br label %if.end
712
713 if.end: ; preds = %if.then, %entry
714 %0 = bitcast %struct.S5* %s3 to i8*
715 %1 = bitcast %struct.S5* %s4 to i8*
716 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 2, i1 false)
717 %2 = bitcast %struct.S5* %s2 to i8*
718 %3 = bitcast %struct.S5* %s1 to i8*
719 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 2, i1 false)
720 ret void
721 }
722
723 %struct.S6 = type { [4 x i32], i32, i32, i32, i32 }
724
725 ; Function Attrs: nounwind uwtable
726 define void @test_stack(%struct.S6* noalias nocapture sret %agg.result, %struct.S6* byval nocapture readnone align 8 %s1, %struct.S6* byval nocapture align 8 %s2, i32 %x) local_unnamed_addr #0 {
727 ; CHECK-LABEL: test_stack:
728 ; CHECK: # %bb.0: # %entry
729 ; CHECK-NEXT: movl %esi, {{[0-9]+}}(%rsp)
730 ; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
731 ; CHECK-NEXT: movups %xmm0, (%rdi)
732 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
733 ; CHECK-NEXT: movq %rax, 16(%rdi)
734 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
735 ; CHECK-NEXT: movl %eax, 24(%rdi)
736 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
737 ; CHECK-NEXT: movl %eax, 28(%rdi)
738 ; CHECK-NEXT: movq %rdi, %rax
739 ; CHECK-NEXT: retq
740 ;
741 ; DISABLED-LABEL: test_stack:
742 ; DISABLED: # %bb.0: # %entry
743 ; DISABLED-NEXT: movl %esi, {{[0-9]+}}(%rsp)
744 ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
745 ; DISABLED-NEXT: movups %xmm0, (%rdi)
746 ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
747 ; DISABLED-NEXT: movups %xmm0, 16(%rdi)
748 ; DISABLED-NEXT: movq %rdi, %rax
749 ; DISABLED-NEXT: retq
750 ;
751 ; CHECK-AVX2-LABEL: test_stack:
752 ; CHECK-AVX2: # %bb.0: # %entry
753 ; CHECK-AVX2-NEXT: movl %esi, {{[0-9]+}}(%rsp)
754 ; CHECK-AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
755 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rdi)
756 ; CHECK-AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
757 ; CHECK-AVX2-NEXT: movq %rax, 16(%rdi)
758 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
759 ; CHECK-AVX2-NEXT: movl %eax, 24(%rdi)
760 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
761 ; CHECK-AVX2-NEXT: movl %eax, 28(%rdi)
762 ; CHECK-AVX2-NEXT: movq %rdi, %rax
763 ; CHECK-AVX2-NEXT: retq
764 ;
765 ; CHECK-AVX512-LABEL: test_stack:
766 ; CHECK-AVX512: # %bb.0: # %entry
767 ; CHECK-AVX512-NEXT: movl %esi, {{[0-9]+}}(%rsp)
768 ; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
769 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rdi)
770 ; CHECK-AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
771 ; CHECK-AVX512-NEXT: movq %rax, 16(%rdi)
772 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %eax
773 ; CHECK-AVX512-NEXT: movl %eax, 24(%rdi)
774 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %eax
775 ; CHECK-AVX512-NEXT: movl %eax, 28(%rdi)
776 ; CHECK-AVX512-NEXT: movq %rdi, %rax
777 ; CHECK-AVX512-NEXT: retq
778 entry:
779 %s6.sroa.0.0..sroa_cast1 = bitcast %struct.S6* %s2 to i8*
780 %s6.sroa.3.0..sroa_idx4 = getelementptr inbounds %struct.S6, %struct.S6* %s2, i64 0, i32 3
781 store i32 %x, i32* %s6.sroa.3.0..sroa_idx4, align 8
782 %0 = bitcast %struct.S6* %agg.result to i8*
783 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* nonnull %s6.sroa.0.0..sroa_cast1, i64 32, i32 4, i1 false)
784 ret void
785 }
786
787 ; Function Attrs: nounwind uwtable
788 define void @test_limit_all(%struct.S* %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
789 ; CHECK-LABEL: test_limit_all:
790 ; CHECK: # %bb.0: # %entry
791 ; CHECK-NEXT: pushq %rbp
792 ; CHECK-NEXT: .cfi_def_cfa_offset 16
793 ; CHECK-NEXT: pushq %r15
794 ; CHECK-NEXT: .cfi_def_cfa_offset 24
795 ; CHECK-NEXT: pushq %r14
796 ; CHECK-NEXT: .cfi_def_cfa_offset 32
797 ; CHECK-NEXT: pushq %r12
798 ; CHECK-NEXT: .cfi_def_cfa_offset 40
799 ; CHECK-NEXT: pushq %rbx
800 ; CHECK-NEXT: .cfi_def_cfa_offset 48
801 ; CHECK-NEXT: .cfi_offset %rbx, -48
802 ; CHECK-NEXT: .cfi_offset %r12, -40
803 ; CHECK-NEXT: .cfi_offset %r14, -32
804 ; CHECK-NEXT: .cfi_offset %r15, -24
805 ; CHECK-NEXT: .cfi_offset %rbp, -16
806 ; CHECK-NEXT: movq %r8, %r15
807 ; CHECK-NEXT: movq %rcx, %r14
808 ; CHECK-NEXT: movl %edx, %ebp
809 ; CHECK-NEXT: movq %rsi, %r12
810 ; CHECK-NEXT: movq %rdi, %rbx
811 ; CHECK-NEXT: movl %r9d, 12(%rbx)
812 ; CHECK-NEXT: callq bar
813 ; CHECK-NEXT: cmpl $18, %ebp
814 ; CHECK-NEXT: jl .LBB9_2
815 ; CHECK-NEXT: # %bb.1: # %if.then
816 ; CHECK-NEXT: movl %ebp, 4(%rbx)
817 ; CHECK-NEXT: movq %rbx, %rdi
818 ; CHECK-NEXT: callq bar
819 ; CHECK-NEXT: .LBB9_2: # %if.end
820 ; CHECK-NEXT: movups (%r15), %xmm0
821 ; CHECK-NEXT: movups %xmm0, (%r14)
822 ; CHECK-NEXT: movups (%rbx), %xmm0
823 ; CHECK-NEXT: movups %xmm0, (%r12)
824 ; CHECK-NEXT: popq %rbx
825 ; CHECK-NEXT: popq %r12
826 ; CHECK-NEXT: popq %r14
827 ; CHECK-NEXT: popq %r15
828 ; CHECK-NEXT: popq %rbp
829 ; CHECK-NEXT: retq
830 ;
831 ; DISABLED-LABEL: test_limit_all:
832 ; DISABLED: # %bb.0: # %entry
833 ; DISABLED-NEXT: pushq %rbp
834 ; DISABLED-NEXT: .cfi_def_cfa_offset 16
835 ; DISABLED-NEXT: pushq %r15
836 ; DISABLED-NEXT: .cfi_def_cfa_offset 24
837 ; DISABLED-NEXT: pushq %r14
838 ; DISABLED-NEXT: .cfi_def_cfa_offset 32
839 ; DISABLED-NEXT: pushq %r12
840 ; DISABLED-NEXT: .cfi_def_cfa_offset 40
841 ; DISABLED-NEXT: pushq %rbx
842 ; DISABLED-NEXT: .cfi_def_cfa_offset 48
843 ; DISABLED-NEXT: .cfi_offset %rbx, -48
844 ; DISABLED-NEXT: .cfi_offset %r12, -40
845 ; DISABLED-NEXT: .cfi_offset %r14, -32
846 ; DISABLED-NEXT: .cfi_offset %r15, -24
847 ; DISABLED-NEXT: .cfi_offset %rbp, -16
848 ; DISABLED-NEXT: movq %r8, %r15
849 ; DISABLED-NEXT: movq %rcx, %r14
850 ; DISABLED-NEXT: movl %edx, %ebp
851 ; DISABLED-NEXT: movq %rsi, %r12
852 ; DISABLED-NEXT: movq %rdi, %rbx
853 ; DISABLED-NEXT: movl %r9d, 12(%rbx)
854 ; DISABLED-NEXT: callq bar
855 ; DISABLED-NEXT: cmpl $18, %ebp
856 ; DISABLED-NEXT: jl .LBB9_2
857 ; DISABLED-NEXT: # %bb.1: # %if.then
858 ; DISABLED-NEXT: movl %ebp, 4(%rbx)
859 ; DISABLED-NEXT: movq %rbx, %rdi
860 ; DISABLED-NEXT: callq bar
861 ; DISABLED-NEXT: .LBB9_2: # %if.end
862 ; DISABLED-NEXT: movups (%r15), %xmm0
863 ; DISABLED-NEXT: movups %xmm0, (%r14)
864 ; DISABLED-NEXT: movups (%rbx), %xmm0
865 ; DISABLED-NEXT: movups %xmm0, (%r12)
866 ; DISABLED-NEXT: popq %rbx
867 ; DISABLED-NEXT: popq %r12
868 ; DISABLED-NEXT: popq %r14
869 ; DISABLED-NEXT: popq %r15
870 ; DISABLED-NEXT: popq %rbp
871 ; DISABLED-NEXT: retq
872 ;
873 ; CHECK-AVX2-LABEL: test_limit_all:
874 ; CHECK-AVX2: # %bb.0: # %entry
875 ; CHECK-AVX2-NEXT: pushq %rbp
876 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16
877 ; CHECK-AVX2-NEXT: pushq %r15
878 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 24
879 ; CHECK-AVX2-NEXT: pushq %r14
880 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 32
881 ; CHECK-AVX2-NEXT: pushq %r12
882 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 40
883 ; CHECK-AVX2-NEXT: pushq %rbx
884 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48
885 ; CHECK-AVX2-NEXT: .cfi_offset %rbx, -48
886 ; CHECK-AVX2-NEXT: .cfi_offset %r12, -40
887 ; CHECK-AVX2-NEXT: .cfi_offset %r14, -32
888 ; CHECK-AVX2-NEXT: .cfi_offset %r15, -24
889 ; CHECK-AVX2-NEXT: .cfi_offset %rbp, -16
890 ; CHECK-AVX2-NEXT: movq %r8, %r15
891 ; CHECK-AVX2-NEXT: movq %rcx, %r14
892 ; CHECK-AVX2-NEXT: movl %edx, %ebp
893 ; CHECK-AVX2-NEXT: movq %rsi, %r12
894 ; CHECK-AVX2-NEXT: movq %rdi, %rbx
895 ; CHECK-AVX2-NEXT: movl %r9d, 12(%rbx)
896 ; CHECK-AVX2-NEXT: callq bar
897 ; CHECK-AVX2-NEXT: cmpl $18, %ebp
898 ; CHECK-AVX2-NEXT: jl .LBB9_2
899 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
900 ; CHECK-AVX2-NEXT: movl %ebp, 4(%rbx)
901 ; CHECK-AVX2-NEXT: movq %rbx, %rdi
902 ; CHECK-AVX2-NEXT: callq bar
903 ; CHECK-AVX2-NEXT: .LBB9_2: # %if.end
904 ; CHECK-AVX2-NEXT: vmovups (%r15), %xmm0
905 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%r14)
906 ; CHECK-AVX2-NEXT: vmovups (%rbx), %xmm0
907 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%r12)
908 ; CHECK-AVX2-NEXT: popq %rbx
909 ; CHECK-AVX2-NEXT: popq %r12
910 ; CHECK-AVX2-NEXT: popq %r14
911 ; CHECK-AVX2-NEXT: popq %r15
912 ; CHECK-AVX2-NEXT: popq %rbp
913 ; CHECK-AVX2-NEXT: retq
914 ;
915 ; CHECK-AVX512-LABEL: test_limit_all:
916 ; CHECK-AVX512: # %bb.0: # %entry
917 ; CHECK-AVX512-NEXT: pushq %rbp
918 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 16
919 ; CHECK-AVX512-NEXT: pushq %r15
920 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 24
921 ; CHECK-AVX512-NEXT: pushq %r14
922 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 32
923 ; CHECK-AVX512-NEXT: pushq %r12
924 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 40
925 ; CHECK-AVX512-NEXT: pushq %rbx
926 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 48
927 ; CHECK-AVX512-NEXT: .cfi_offset %rbx, -48
928 ; CHECK-AVX512-NEXT: .cfi_offset %r12, -40
929 ; CHECK-AVX512-NEXT: .cfi_offset %r14, -32
930 ; CHECK-AVX512-NEXT: .cfi_offset %r15, -24
931 ; CHECK-AVX512-NEXT: .cfi_offset %rbp, -16
932 ; CHECK-AVX512-NEXT: movq %r8, %r15
933 ; CHECK-AVX512-NEXT: movq %rcx, %r14
934 ; CHECK-AVX512-NEXT: movl %edx, %ebp
935 ; CHECK-AVX512-NEXT: movq %rsi, %r12
936 ; CHECK-AVX512-NEXT: movq %rdi, %rbx
937 ; CHECK-AVX512-NEXT: movl %r9d, 12(%rbx)
938 ; CHECK-AVX512-NEXT: callq bar
939 ; CHECK-AVX512-NEXT: cmpl $18, %ebp
940 ; CHECK-AVX512-NEXT: jl .LBB9_2
941 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
942 ; CHECK-AVX512-NEXT: movl %ebp, 4(%rbx)
943 ; CHECK-AVX512-NEXT: movq %rbx, %rdi
944 ; CHECK-AVX512-NEXT: callq bar
945 ; CHECK-AVX512-NEXT: .LBB9_2: # %if.end
946 ; CHECK-AVX512-NEXT: vmovups (%r15), %xmm0
947 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%r14)
948 ; CHECK-AVX512-NEXT: vmovups (%rbx), %xmm0
949 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%r12)
950 ; CHECK-AVX512-NEXT: popq %rbx
951 ; CHECK-AVX512-NEXT: popq %r12
952 ; CHECK-AVX512-NEXT: popq %r14
953 ; CHECK-AVX512-NEXT: popq %r15
954 ; CHECK-AVX512-NEXT: popq %rbp
955 ; CHECK-AVX512-NEXT: retq
956 entry:
957 %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
958 store i32 %x2, i32* %d, align 4
959 tail call void @bar(%struct.S* %s1) #3
960 %cmp = icmp sgt i32 %x, 17
961 br i1 %cmp, label %if.then, label %if.end
962
963 if.then: ; preds = %entry
964 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
965 store i32 %x, i32* %b, align 4
966 tail call void @bar(%struct.S* nonnull %s1) #3
967 br label %if.end
968
969 if.end: ; preds = %if.then, %entry
970 %0 = bitcast %struct.S* %s3 to i8*
971 %1 = bitcast %struct.S* %s4 to i8*
972 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
973 %2 = bitcast %struct.S* %s2 to i8*
974 %3 = bitcast %struct.S* %s1 to i8*
975 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
976 ret void
977 }
978
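; Here only the %if.then predecessor contains a call to @bar. The copy of
; %s4 into %s3 is left as a single XMM copy (nothing stores into %s4),
; while the 16-byte copy of %s1 into %s2 is broken into an 8-byte and two
; 4-byte copies, presumably so the i32 store at offset 12 in %entry can be
; forwarded; DISABLED keeps the single XMM copy for both.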
979 ; Function Attrs: nounwind uwtable
980 define void @test_limit_one_pred(%struct.S* %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
981 ; CHECK-LABEL: test_limit_one_pred:
982 ; CHECK: # %bb.0: # %entry
983 ; CHECK-NEXT: pushq %r15
984 ; CHECK-NEXT: .cfi_def_cfa_offset 16
985 ; CHECK-NEXT: pushq %r14
986 ; CHECK-NEXT: .cfi_def_cfa_offset 24
987 ; CHECK-NEXT: pushq %r12
988 ; CHECK-NEXT: .cfi_def_cfa_offset 32
989 ; CHECK-NEXT: pushq %rbx
990 ; CHECK-NEXT: .cfi_def_cfa_offset 40
991 ; CHECK-NEXT: pushq %rax
992 ; CHECK-NEXT: .cfi_def_cfa_offset 48
993 ; CHECK-NEXT: .cfi_offset %rbx, -40
994 ; CHECK-NEXT: .cfi_offset %r12, -32
995 ; CHECK-NEXT: .cfi_offset %r14, -24
996 ; CHECK-NEXT: .cfi_offset %r15, -16
997 ; CHECK-NEXT: movq %r8, %r12
998 ; CHECK-NEXT: movq %rcx, %r15
999 ; CHECK-NEXT: movq %rsi, %r14
1000 ; CHECK-NEXT: movq %rdi, %rbx
1001 ; CHECK-NEXT: movl %r9d, 12(%rbx)
1002 ; CHECK-NEXT: cmpl $18, %edx
1003 ; CHECK-NEXT: jl .LBB10_2
1004 ; CHECK-NEXT: # %bb.1: # %if.then
1005 ; CHECK-NEXT: movl %edx, 4(%rbx)
1006 ; CHECK-NEXT: movq %rbx, %rdi
1007 ; CHECK-NEXT: callq bar
1008 ; CHECK-NEXT: .LBB10_2: # %if.end
1009 ; CHECK-NEXT: movups (%r12), %xmm0
1010 ; CHECK-NEXT: movups %xmm0, (%r15)
1011 ; CHECK-NEXT: movq (%rbx), %rax
1012 ; CHECK-NEXT: movq %rax, (%r14)
1013 ; CHECK-NEXT: movl 8(%rbx), %eax
1014 ; CHECK-NEXT: movl %eax, 8(%r14)
1015 ; CHECK-NEXT: movl 12(%rbx), %eax
1016 ; CHECK-NEXT: movl %eax, 12(%r14)
1017 ; CHECK-NEXT: addq $8, %rsp
1018 ; CHECK-NEXT: popq %rbx
1019 ; CHECK-NEXT: popq %r12
1020 ; CHECK-NEXT: popq %r14
1021 ; CHECK-NEXT: popq %r15
1022 ; CHECK-NEXT: retq
1023 ;
1024 ; DISABLED-LABEL: test_limit_one_pred:
1025 ; DISABLED: # %bb.0: # %entry
1026 ; DISABLED-NEXT: pushq %r15
1027 ; DISABLED-NEXT: .cfi_def_cfa_offset 16
1028 ; DISABLED-NEXT: pushq %r14
1029 ; DISABLED-NEXT: .cfi_def_cfa_offset 24
1030 ; DISABLED-NEXT: pushq %r12
1031 ; DISABLED-NEXT: .cfi_def_cfa_offset 32
1032 ; DISABLED-NEXT: pushq %rbx
1033 ; DISABLED-NEXT: .cfi_def_cfa_offset 40
1034 ; DISABLED-NEXT: pushq %rax
1035 ; DISABLED-NEXT: .cfi_def_cfa_offset 48
1036 ; DISABLED-NEXT: .cfi_offset %rbx, -40
1037 ; DISABLED-NEXT: .cfi_offset %r12, -32
1038 ; DISABLED-NEXT: .cfi_offset %r14, -24
1039 ; DISABLED-NEXT: .cfi_offset %r15, -16
1040 ; DISABLED-NEXT: movq %r8, %r15
1041 ; DISABLED-NEXT: movq %rcx, %r14
1042 ; DISABLED-NEXT: movq %rsi, %r12
1043 ; DISABLED-NEXT: movq %rdi, %rbx
1044 ; DISABLED-NEXT: movl %r9d, 12(%rbx)
1045 ; DISABLED-NEXT: cmpl $18, %edx
1046 ; DISABLED-NEXT: jl .LBB10_2
1047 ; DISABLED-NEXT: # %bb.1: # %if.then
1048 ; DISABLED-NEXT: movl %edx, 4(%rbx)
1049 ; DISABLED-NEXT: movq %rbx, %rdi
1050 ; DISABLED-NEXT: callq bar
1051 ; DISABLED-NEXT: .LBB10_2: # %if.end
1052 ; DISABLED-NEXT: movups (%r15), %xmm0
1053 ; DISABLED-NEXT: movups %xmm0, (%r14)
1054 ; DISABLED-NEXT: movups (%rbx), %xmm0
1055 ; DISABLED-NEXT: movups %xmm0, (%r12)
1056 ; DISABLED-NEXT: addq $8, %rsp
1057 ; DISABLED-NEXT: popq %rbx
1058 ; DISABLED-NEXT: popq %r12
1059 ; DISABLED-NEXT: popq %r14
1060 ; DISABLED-NEXT: popq %r15
1061 ; DISABLED-NEXT: retq
1062 ;
1063 ; CHECK-AVX2-LABEL: test_limit_one_pred:
1064 ; CHECK-AVX2: # %bb.0: # %entry
1065 ; CHECK-AVX2-NEXT: pushq %r15
1066 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16
1067 ; CHECK-AVX2-NEXT: pushq %r14
1068 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 24
1069 ; CHECK-AVX2-NEXT: pushq %r12
1070 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 32
1071 ; CHECK-AVX2-NEXT: pushq %rbx
1072 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 40
1073 ; CHECK-AVX2-NEXT: pushq %rax
1074 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48
1075 ; CHECK-AVX2-NEXT: .cfi_offset %rbx, -40
1076 ; CHECK-AVX2-NEXT: .cfi_offset %r12, -32
1077 ; CHECK-AVX2-NEXT: .cfi_offset %r14, -24
1078 ; CHECK-AVX2-NEXT: .cfi_offset %r15, -16
1079 ; CHECK-AVX2-NEXT: movq %r8, %r12
1080 ; CHECK-AVX2-NEXT: movq %rcx, %r15
1081 ; CHECK-AVX2-NEXT: movq %rsi, %r14
1082 ; CHECK-AVX2-NEXT: movq %rdi, %rbx
1083 ; CHECK-AVX2-NEXT: movl %r9d, 12(%rbx)
1084 ; CHECK-AVX2-NEXT: cmpl $18, %edx
1085 ; CHECK-AVX2-NEXT: jl .LBB10_2
1086 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
1087 ; CHECK-AVX2-NEXT: movl %edx, 4(%rbx)
1088 ; CHECK-AVX2-NEXT: movq %rbx, %rdi
1089 ; CHECK-AVX2-NEXT: callq bar
1090 ; CHECK-AVX2-NEXT: .LBB10_2: # %if.end
1091 ; CHECK-AVX2-NEXT: vmovups (%r12), %xmm0
1092 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%r15)
1093 ; CHECK-AVX2-NEXT: movq (%rbx), %rax
1094 ; CHECK-AVX2-NEXT: movq %rax, (%r14)
1095 ; CHECK-AVX2-NEXT: movl 8(%rbx), %eax
1096 ; CHECK-AVX2-NEXT: movl %eax, 8(%r14)
1097 ; CHECK-AVX2-NEXT: movl 12(%rbx), %eax
1098 ; CHECK-AVX2-NEXT: movl %eax, 12(%r14)
1099 ; CHECK-AVX2-NEXT: addq $8, %rsp
1100 ; CHECK-AVX2-NEXT: popq %rbx
1101 ; CHECK-AVX2-NEXT: popq %r12
1102 ; CHECK-AVX2-NEXT: popq %r14
1103 ; CHECK-AVX2-NEXT: popq %r15
1104 ; CHECK-AVX2-NEXT: retq
1105 ;
1106 ; CHECK-AVX512-LABEL: test_limit_one_pred:
1107 ; CHECK-AVX512: # %bb.0: # %entry
1108 ; CHECK-AVX512-NEXT: pushq %r15
1109 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 16
1110 ; CHECK-AVX512-NEXT: pushq %r14
1111 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 24
1112 ; CHECK-AVX512-NEXT: pushq %r12
1113 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 32
1114 ; CHECK-AVX512-NEXT: pushq %rbx
1115 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 40
1116 ; CHECK-AVX512-NEXT: pushq %rax
1117 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 48
1118 ; CHECK-AVX512-NEXT: .cfi_offset %rbx, -40
1119 ; CHECK-AVX512-NEXT: .cfi_offset %r12, -32
1120 ; CHECK-AVX512-NEXT: .cfi_offset %r14, -24
1121 ; CHECK-AVX512-NEXT: .cfi_offset %r15, -16
1122 ; CHECK-AVX512-NEXT: movq %r8, %r12
1123 ; CHECK-AVX512-NEXT: movq %rcx, %r15
1124 ; CHECK-AVX512-NEXT: movq %rsi, %r14
1125 ; CHECK-AVX512-NEXT: movq %rdi, %rbx
1126 ; CHECK-AVX512-NEXT: movl %r9d, 12(%rbx)
1127 ; CHECK-AVX512-NEXT: cmpl $18, %edx
1128 ; CHECK-AVX512-NEXT: jl .LBB10_2
1129 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
1130 ; CHECK-AVX512-NEXT: movl %edx, 4(%rbx)
1131 ; CHECK-AVX512-NEXT: movq %rbx, %rdi
1132 ; CHECK-AVX512-NEXT: callq bar
1133 ; CHECK-AVX512-NEXT: .LBB10_2: # %if.end
1134 ; CHECK-AVX512-NEXT: vmovups (%r12), %xmm0
1135 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%r15)
1136 ; CHECK-AVX512-NEXT: movq (%rbx), %rax
1137 ; CHECK-AVX512-NEXT: movq %rax, (%r14)
1138 ; CHECK-AVX512-NEXT: movl 8(%rbx), %eax
1139 ; CHECK-AVX512-NEXT: movl %eax, 8(%r14)
1140 ; CHECK-AVX512-NEXT: movl 12(%rbx), %eax
1141 ; CHECK-AVX512-NEXT: movl %eax, 12(%r14)
1142 ; CHECK-AVX512-NEXT: addq $8, %rsp
1143 ; CHECK-AVX512-NEXT: popq %rbx
1144 ; CHECK-AVX512-NEXT: popq %r12
1145 ; CHECK-AVX512-NEXT: popq %r14
1146 ; CHECK-AVX512-NEXT: popq %r15
1147 ; CHECK-AVX512-NEXT: retq
1148 entry:
1149 %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
1150 store i32 %x2, i32* %d, align 4
1151 %cmp = icmp sgt i32 %x, 17
1152 br i1 %cmp, label %if.then, label %if.end
1153
1154 if.then: ; preds = %entry
1155 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
1156 store i32 %x, i32* %b, align 4
1157 tail call void @bar(%struct.S* nonnull %s1) #3
1158 br label %if.end
1159
1160 if.end: ; preds = %if.then, %entry
1161 %0 = bitcast %struct.S* %s3 to i8*
1162 %1 = bitcast %struct.S* %s4 to i8*
1163 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
1164 %2 = bitcast %struct.S* %s2 to i8*
1165 %3 = bitcast %struct.S* %s1 to i8*
1166 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
1167 ret void
1168 }
1169
1170
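; @bar has no body in this file, so each call site above acts as an opaque
; call that may read or write the struct passed to it.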
1171 declare void @bar(%struct.S*) local_unnamed_addr #1
1172
1173
1174 ; Function Attrs: argmemonly nounwind
1175 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
1176
1177 attributes #0 = { nounwind uwtable "target-cpu"="x86-64" }
1178
1179 %struct.S7 = type { float, float, float, float, float, float, float, float }
1180
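; The conditionally executed 4-byte float store at offset 4 of %s1 forces
; the 32-byte copy of %s1 into %s2 to be split so that offset 4 gets its
; own 4-byte move; the split differs between the SSE (4+4+8+16 bytes) and
; AVX (4+4+16+8 bytes) configurations, and the unrelated copy of %s4 into
; %s3 stays vectorized.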
1181 ; Function Attrs: nounwind uwtable
1182 define void @test_conditional_block_float(%struct.S7* nocapture %s1, %struct.S7* nocapture %s2, i32 %x, %struct.S7* nocapture %s3, %struct.S7* nocapture readonly %s4, float %y) local_unnamed_addr #0 {
1183 ; CHECK-LABEL: test_conditional_block_float:
1184 ; CHECK: # %bb.0: # %entry
1185 ; CHECK-NEXT: cmpl $18, %edx
1186 ; CHECK-NEXT: jl .LBB11_2
1187 ; CHECK-NEXT: # %bb.1: # %if.then
1188 ; CHECK-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000
1189 ; CHECK-NEXT: .LBB11_2: # %if.end
1190 ; CHECK-NEXT: movups (%r8), %xmm0
1191 ; CHECK-NEXT: movups 16(%r8), %xmm1
1192 ; CHECK-NEXT: movups %xmm1, 16(%rcx)
1193 ; CHECK-NEXT: movups %xmm0, (%rcx)
1194 ; CHECK-NEXT: movl (%rdi), %eax
1195 ; CHECK-NEXT: movl 4(%rdi), %ecx
1196 ; CHECK-NEXT: movq 8(%rdi), %rdx
1197 ; CHECK-NEXT: movups 16(%rdi), %xmm0
1198 ; CHECK-NEXT: movups %xmm0, 16(%rsi)
1199 ; CHECK-NEXT: movl %eax, (%rsi)
1200 ; CHECK-NEXT: movl %ecx, 4(%rsi)
1201 ; CHECK-NEXT: movq %rdx, 8(%rsi)
1202 ; CHECK-NEXT: retq
1203 ;
1204 ; DISABLED-LABEL: test_conditional_block_float:
1205 ; DISABLED: # %bb.0: # %entry
1206 ; DISABLED-NEXT: cmpl $18, %edx
1207 ; DISABLED-NEXT: jl .LBB11_2
1208 ; DISABLED-NEXT: # %bb.1: # %if.then
1209 ; DISABLED-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000
1210 ; DISABLED-NEXT: .LBB11_2: # %if.end
1211 ; DISABLED-NEXT: movups (%r8), %xmm0
1212 ; DISABLED-NEXT: movups 16(%r8), %xmm1
1213 ; DISABLED-NEXT: movups %xmm1, 16(%rcx)
1214 ; DISABLED-NEXT: movups %xmm0, (%rcx)
1215 ; DISABLED-NEXT: movups (%rdi), %xmm0
1216 ; DISABLED-NEXT: movups 16(%rdi), %xmm1
1217 ; DISABLED-NEXT: movups %xmm1, 16(%rsi)
1218 ; DISABLED-NEXT: movups %xmm0, (%rsi)
1219 ; DISABLED-NEXT: retq
1220 ;
1221 ; CHECK-AVX2-LABEL: test_conditional_block_float:
1222 ; CHECK-AVX2: # %bb.0: # %entry
1223 ; CHECK-AVX2-NEXT: cmpl $18, %edx
1224 ; CHECK-AVX2-NEXT: jl .LBB11_2
1225 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
1226 ; CHECK-AVX2-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000
1227 ; CHECK-AVX2-NEXT: .LBB11_2: # %if.end
1228 ; CHECK-AVX2-NEXT: vmovups (%r8), %ymm0
1229 ; CHECK-AVX2-NEXT: vmovups %ymm0, (%rcx)
1230 ; CHECK-AVX2-NEXT: movl (%rdi), %eax
1231 ; CHECK-AVX2-NEXT: movl %eax, (%rsi)
1232 ; CHECK-AVX2-NEXT: movl 4(%rdi), %eax
1233 ; CHECK-AVX2-NEXT: movl %eax, 4(%rsi)
1234 ; CHECK-AVX2-NEXT: vmovups 8(%rdi), %xmm0
1235 ; CHECK-AVX2-NEXT: vmovups %xmm0, 8(%rsi)
1236 ; CHECK-AVX2-NEXT: movq 24(%rdi), %rax
1237 ; CHECK-AVX2-NEXT: movq %rax, 24(%rsi)
1238 ; CHECK-AVX2-NEXT: vzeroupper
1239 ; CHECK-AVX2-NEXT: retq
1240 ;
1241 ; CHECK-AVX512-LABEL: test_conditional_block_float:
1242 ; CHECK-AVX512: # %bb.0: # %entry
1243 ; CHECK-AVX512-NEXT: cmpl $18, %edx
1244 ; CHECK-AVX512-NEXT: jl .LBB11_2
1245 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
1246 ; CHECK-AVX512-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000
1247 ; CHECK-AVX512-NEXT: .LBB11_2: # %if.end
1248 ; CHECK-AVX512-NEXT: vmovups (%r8), %ymm0
1249 ; CHECK-AVX512-NEXT: vmovups %ymm0, (%rcx)
1250 ; CHECK-AVX512-NEXT: movl (%rdi), %eax
1251 ; CHECK-AVX512-NEXT: movl %eax, (%rsi)
1252 ; CHECK-AVX512-NEXT: movl 4(%rdi), %eax
1253 ; CHECK-AVX512-NEXT: movl %eax, 4(%rsi)
1254 ; CHECK-AVX512-NEXT: vmovups 8(%rdi), %xmm0
1255 ; CHECK-AVX512-NEXT: vmovups %xmm0, 8(%rsi)
1256 ; CHECK-AVX512-NEXT: movq 24(%rdi), %rax
1257 ; CHECK-AVX512-NEXT: movq %rax, 24(%rsi)
1258 ; CHECK-AVX512-NEXT: vzeroupper
1259 ; CHECK-AVX512-NEXT: retq
1260 entry:
1261 %cmp = icmp sgt i32 %x, 17
1262 br i1 %cmp, label %if.then, label %if.end
1263
1264 if.then: ; preds = %entry
1265 %b = getelementptr inbounds %struct.S7, %struct.S7* %s1, i64 0, i32 1
1266 store float 1.0, float* %b, align 4
1267 br label %if.end
1268
1269 if.end: ; preds = %if.then, %entry
1270 %0 = bitcast %struct.S7* %s3 to i8*
1271 %1 = bitcast %struct.S7* %s4 to i8*
1272 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 32, i32 4, i1 false)
1273 %2 = bitcast %struct.S7* %s2 to i8*
1274 %3 = bitcast %struct.S7* %s1 to i8*
1275 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 32, i32 4, i1 false)
1276 ret void
1277 }
1278
1279 %struct.S8 = type { i64, i64, i64, i64, i64, i64 }
1280
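; The conditional 8-byte store at offset 8 of %s1 splits the 32-byte copy
; of %s1 into two 8-byte copies plus a 16-byte copy (8+8+16); with AVX the
; unrelated copy of %s4 is done through a single YMM register, which is why
; vzeroupper appears before the return.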
1281 ; Function Attrs: nounwind uwtable
1282 define void @test_conditional_block_ymm(%struct.S8* nocapture %s1, %struct.S8* nocapture %s2, i32 %x, %struct.S8* nocapture %s3, %struct.S8* nocapture readonly %s4) local_unnamed_addr #0 {
1283 ; CHECK-LABEL: test_conditional_block_ymm:
1284 ; CHECK: # %bb.0: # %entry
1285 ; CHECK-NEXT: cmpl $18, %edx
1286 ; CHECK-NEXT: jl .LBB12_2
1287 ; CHECK-NEXT: # %bb.1: # %if.then
1288 ; CHECK-NEXT: movq $1, 8(%rdi)
1289 ; CHECK-NEXT: .LBB12_2: # %if.end
1290 ; CHECK-NEXT: movups (%r8), %xmm0
1291 ; CHECK-NEXT: movups 16(%r8), %xmm1
1292 ; CHECK-NEXT: movups %xmm1, 16(%rcx)
1293 ; CHECK-NEXT: movups %xmm0, (%rcx)
1294 ; CHECK-NEXT: movq (%rdi), %rax
1295 ; CHECK-NEXT: movq 8(%rdi), %rcx
1296 ; CHECK-NEXT: movups 16(%rdi), %xmm0
1297 ; CHECK-NEXT: movups %xmm0, 16(%rsi)
1298 ; CHECK-NEXT: movq %rax, (%rsi)
1299 ; CHECK-NEXT: movq %rcx, 8(%rsi)
1300 ; CHECK-NEXT: retq
1301 ;
1302 ; DISABLED-LABEL: test_conditional_block_ymm:
1303 ; DISABLED: # %bb.0: # %entry
1304 ; DISABLED-NEXT: cmpl $18, %edx
1305 ; DISABLED-NEXT: jl .LBB12_2
1306 ; DISABLED-NEXT: # %bb.1: # %if.then
1307 ; DISABLED-NEXT: movq $1, 8(%rdi)
1308 ; DISABLED-NEXT: .LBB12_2: # %if.end
1309 ; DISABLED-NEXT: movups (%r8), %xmm0
1310 ; DISABLED-NEXT: movups 16(%r8), %xmm1
1311 ; DISABLED-NEXT: movups %xmm1, 16(%rcx)
1312 ; DISABLED-NEXT: movups %xmm0, (%rcx)
1313 ; DISABLED-NEXT: movups (%rdi), %xmm0
1314 ; DISABLED-NEXT: movups 16(%rdi), %xmm1
1315 ; DISABLED-NEXT: movups %xmm1, 16(%rsi)
1316 ; DISABLED-NEXT: movups %xmm0, (%rsi)
1317 ; DISABLED-NEXT: retq
1318 ;
1319 ; CHECK-AVX2-LABEL: test_conditional_block_ymm:
1320 ; CHECK-AVX2: # %bb.0: # %entry
1321 ; CHECK-AVX2-NEXT: cmpl $18, %edx
1322 ; CHECK-AVX2-NEXT: jl .LBB12_2
1323 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
1324 ; CHECK-AVX2-NEXT: movq $1, 8(%rdi)
1325 ; CHECK-AVX2-NEXT: .LBB12_2: # %if.end
1326 ; CHECK-AVX2-NEXT: vmovups (%r8), %ymm0
1327 ; CHECK-AVX2-NEXT: vmovups %ymm0, (%rcx)
1328 ; CHECK-AVX2-NEXT: movq (%rdi), %rax
1329 ; CHECK-AVX2-NEXT: movq %rax, (%rsi)
1330 ; CHECK-AVX2-NEXT: movq 8(%rdi), %rax
1331 ; CHECK-AVX2-NEXT: movq %rax, 8(%rsi)
1332 ; CHECK-AVX2-NEXT: vmovups 16(%rdi), %xmm0
1333 ; CHECK-AVX2-NEXT: vmovups %xmm0, 16(%rsi)
1334 ; CHECK-AVX2-NEXT: vzeroupper
1335 ; CHECK-AVX2-NEXT: retq
1336 ;
1337 ; CHECK-AVX512-LABEL: test_conditional_block_ymm:
1338 ; CHECK-AVX512: # %bb.0: # %entry
1339 ; CHECK-AVX512-NEXT: cmpl $18, %edx
1340 ; CHECK-AVX512-NEXT: jl .LBB12_2
1341 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
1342 ; CHECK-AVX512-NEXT: movq $1, 8(%rdi)
1343 ; CHECK-AVX512-NEXT: .LBB12_2: # %if.end
1344 ; CHECK-AVX512-NEXT: vmovups (%r8), %ymm0
1345 ; CHECK-AVX512-NEXT: vmovups %ymm0, (%rcx)
1346 ; CHECK-AVX512-NEXT: movq (%rdi), %rax
1347 ; CHECK-AVX512-NEXT: movq %rax, (%rsi)
1348 ; CHECK-AVX512-NEXT: movq 8(%rdi), %rax
1349 ; CHECK-AVX512-NEXT: movq %rax, 8(%rsi)
1350 ; CHECK-AVX512-NEXT: vmovups 16(%rdi), %xmm0
1351 ; CHECK-AVX512-NEXT: vmovups %xmm0, 16(%rsi)
1352 ; CHECK-AVX512-NEXT: vzeroupper
1353 ; CHECK-AVX512-NEXT: retq
1354 entry:
1355 %cmp = icmp sgt i32 %x, 17
1356 br i1 %cmp, label %if.then, label %if.end
1357
1358 if.then: ; preds = %entry
1359 %b = getelementptr inbounds %struct.S8, %struct.S8* %s1, i64 0, i32 1
1360 store i64 1, i64* %b, align 4
1361 br label %if.end
1362
1363 if.end: ; preds = %if.then, %entry
1364 %0 = bitcast %struct.S8* %s3 to i8*
1365 %1 = bitcast %struct.S8* %s4 to i8*
1366 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 32, i32 4, i1 false)
1367 %2 = bitcast %struct.S8* %s2 to i8*
1368 %3 = bitcast %struct.S8* %s1 to i8*
1369 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 32, i32 4, i1 false)
1370 ret void
1371 }
1372