llvm.org GIT mirror llvm / 3c03a2a
[X86] Reduce Store Forward Block issues in HW - Recommit after fixing Bug 36346 If a load follows a store and reloads data that the store has written to memory, Intel microarchitectures can in many cases forward the data directly from the store to the load. This "store forwarding" saves cycles by enabling the load to obtain the data directly instead of accessing it from cache or memory. A "store forward block" occurs in cases where a store cannot be forwarded to the load. The most typical case of a store forward block on the Intel Core microarchitecture is a small store that cannot be forwarded to a large load. The estimated penalty for a store forward block is ~13 cycles. This pass tries to recognize and handle cases where a "store forward block" is created by the compiler when lowering memcpy calls to a sequence of a load and a store. The pass currently only handles cases where the memcpy is lowered to XMM/YMM registers; it tries to break the memcpy into smaller copies. Breaking the memcpy should be possible since there is no atomicity guarantee for loads and stores to XMM/YMM. Differential revision: https://reviews.llvm.org/D41330 Change-Id: Ib48836ccdf6005989f7d4466fa2035b7b04415d9 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@328973 91177308-0d34-0410-b5e6-96231b3b80d8 Lama Saba 2 years ago
7 changed file(s) with 2742 addition(s) and 0 deletion(s).
3030 X86FastISel.cpp
3131 X86FixupBWInsts.cpp
3232 X86FixupLEAs.cpp
33 X86AvoidStoreForwardingBlocks.cpp
3334 X86FixupSetCC.cpp
3435 X86FloatingPoint.cpp
3536 X86FrameLowering.cpp
6969 /// Return a pass that transforms setcc + movzx pairs into xor + setcc.
7070 FunctionPass *createX86FixupSetCC();
7171
72 /// Return a pass that avoids creating store forward block issues in the hardware.
73 FunctionPass *createX86AvoidStoreForwardingBlocks();
74
7275 /// Return a pass that expands WinAlloca pseudo-instructions.
7376 FunctionPass *createX86WinAllocaExpander();
7477
0 //===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block --===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // If a load follows a store and reloads data that the store has written to
10 // memory, Intel microarchitectures can in many cases forward the data directly
11 // from the store to the load. This "store forwarding" saves cycles by enabling
12 // the load to directly obtain the data instead of accessing the data from
13 // cache or memory.
14 // A "store forward block" occurs in cases where a store cannot be forwarded
15 // to the load. The most typical case of a store forward block on the Intel
16 // Core microarchitecture is a small store that cannot be forwarded to a large load.
17 // The estimated penalty for a store forward block is ~13 cycles.
18 //
19 // This pass tries to recognize and handle cases where "store forward block"
20 // is created by the compiler when lowering memcpy calls to a sequence
21 // of a load and a store.
22 //
23 // The pass currently only handles cases where memcpy is lowered to
24 // XMM/YMM registers; it tries to break the memcpy into smaller copies.
25 // Breaking the memcpy should be possible since there is no atomicity
26 // guarantee for loads and stores to XMM/YMM.
27 //
28 // It could be better for performance to solve the problem by loading
29 // to XMM/YMM then inserting the partial store before storing back from XMM/YMM
30 // to memory, but this will result in a more conservative optimization since it
31 // requires proving that all memory accesses between the blocking store and the
32 // load either must alias or don't alias before we can move the store, whereas
33 // the transformation done here is correct regardless of other memory accesses.
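//
// A minimal before/after sketch (illustrative; it mirrors the
// test_conditional_block CHECK lines in the tests below). A 4-byte store
// followed by an overlapping 16-byte load cannot be forwarded:
//
//   movl   %edx, 4(%rdi)     ; small blocking store
//   movups (%rdi), %xmm0     ; 16-byte load overlapping the store
//   movups %xmm0, (%rsi)     ; store back (memcpy lowering)
//
// The pass breaks the copy into pieces whose loads are each either exactly
// the stored region or disjoint from it:
//
//   movl   (%rdi), %eax      ; bytes [0,4):  not blocked
//   movl   %eax, (%rsi)
//   movl   4(%rdi), %eax     ; bytes [4,8):  exactly the blocking store,
//   movl   %eax, 4(%rsi)     ;               now forwardable
//   movq   8(%rdi), %rax     ; bytes [8,16): not blocked
//   movq   %rax, 8(%rsi)
//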
34 //===----------------------------------------------------------------------===//
35
36 #include "X86InstrInfo.h"
37 #include "X86Subtarget.h"
38 #include "llvm/CodeGen/MachineBasicBlock.h"
39 #include "llvm/CodeGen/MachineFunction.h"
40 #include "llvm/CodeGen/MachineFunctionPass.h"
41 #include "llvm/CodeGen/MachineInstr.h"
42 #include "llvm/CodeGen/MachineInstrBuilder.h"
43 #include "llvm/CodeGen/MachineOperand.h"
44 #include "llvm/CodeGen/MachineRegisterInfo.h"
45 #include "llvm/IR/DebugInfoMetadata.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/IR/Function.h"
48 #include "llvm/MC/MCInstrDesc.h"
49
50 using namespace llvm;
51
52 #define DEBUG_TYPE "x86-avoid-SFB"
53
54 namespace llvm {
55 void initializeX86AvoidSFBPassPass(PassRegistry &);
56 } // end namespace llvm
57
58 static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
59 "x86-disable-avoid-SFB", cl::Hidden,
60 cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));
61
62 static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
63 "x86-sfb-inspection-limit",
64 cl::desc("X86: Number of instructions backward to "
65 "inspect for store forwarding blocks."),
66 cl::init(20), cl::Hidden);
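
// Both knobs are plain llc command-line flags; for example ("input.ll" is a
// placeholder input file): "llc -x86-disable-avoid-SFB input.ll" turns the
// pass off (the DISABLED RUN lines in the tests below use this flag), and
// "llc -x86-sfb-inspection-limit=30 input.ll" widens the backward scan.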
67
68 namespace {
69
70 using DisplacementSizeMap = std::map<int64_t, unsigned>;
71
72 class X86AvoidSFBPass : public MachineFunctionPass {
73 public:
74 static char ID;
75 X86AvoidSFBPass() : MachineFunctionPass(ID) {
76 initializeX86AvoidSFBPassPass(*PassRegistry::getPassRegistry());
77 }
78
79 StringRef getPassName() const override {
80 return "X86 Avoid Store Forwarding Blocks";
81 }
82
83 bool runOnMachineFunction(MachineFunction &MF) override;
84
85 void getAnalysisUsage(AnalysisUsage &AU) const override {
86 MachineFunctionPass::getAnalysisUsage(AU);
87 AU.addRequired<AAResultsWrapperPass>();
88 }
89
90 private:
91 MachineRegisterInfo *MRI;
92 const X86InstrInfo *TII;
93 const X86RegisterInfo *TRI;
94 SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
95 BlockedLoadsStoresPairs;
96 SmallVector<MachineInstr *, 2> ForRemoval;
97 AliasAnalysis *AA;
98
99 /// \brief Returns pairs of a load followed by a store to memory which
100 /// together look like a memcpy.
101 void findPotentiallylBlockedCopies(MachineFunction &MF);
102 /// \brief Break the memcpy's load and store into smaller copies
103 /// such that each memory load that was blocked by a smaller store
104 /// would now be copied separately.
105 void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
106 const DisplacementSizeMap &BlockingStoresDispSizeMap);
107 /// \brief Break a copy of size Size to smaller copies.
108 void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
109 MachineInstr *StoreInst, int64_t StDispImm,
110 int64_t LMMOffset, int64_t SMMOffset);
111
112 void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
113 MachineInstr *StoreInst, unsigned NStoreOpcode,
114 int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
115 int64_t SMMOffset);
116
117 bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;
118
119 unsigned getRegSizeInBytes(MachineInstr *Inst);
120 };
121
122 } // end anonymous namespace
123
124 char X86AvoidSFBPass::ID = 0;
125
126 INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE,
127                       "X86 Avoid Store Forwarding Blocks", false, false)
128 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
129 INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE,
130                     "X86 Avoid Store Forwarding Blocks", false, false)
131
132 FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
133 return new X86AvoidSFBPass();
134 }
135
136 static bool isXMMLoadOpcode(unsigned Opcode) {
137 return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
138 Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
139 Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
140 Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
141 Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
142 Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
143 Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
144 Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
145 }
146 static bool isYMMLoadOpcode(unsigned Opcode) {
147 return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
148 Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
149 Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
150 Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
151 Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
152 Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
153 Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
154 }
155
156 static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
157 return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
158 }
159
160 static bool isPotentialBlockedMemCpyPair(int LdOpcode, int StOpcode) {
161 switch (LdOpcode) {
162 case X86::MOVUPSrm:
163 case X86::MOVAPSrm:
164 return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
165 case X86::VMOVUPSrm:
166 case X86::VMOVAPSrm:
167 return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
168 case X86::VMOVUPDrm:
169 case X86::VMOVAPDrm:
170 return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
171 case X86::VMOVDQUrm:
172 case X86::VMOVDQArm:
173 return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
174 case X86::VMOVUPSZ128rm:
175 case X86::VMOVAPSZ128rm:
176 return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
177 case X86::VMOVUPDZ128rm:
178 case X86::VMOVAPDZ128rm:
179 return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
180 case X86::VMOVUPSYrm:
181 case X86::VMOVAPSYrm:
182 return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
183 case X86::VMOVUPDYrm:
184 case X86::VMOVAPDYrm:
185 return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
186 case X86::VMOVDQUYrm:
187 case X86::VMOVDQAYrm:
188 return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
189 case X86::VMOVUPSZ256rm:
190 case X86::VMOVAPSZ256rm:
191 return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
192 case X86::VMOVUPDZ256rm:
193 case X86::VMOVAPDZ256rm:
194 return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
195 case X86::VMOVDQU64Z128rm:
196 case X86::VMOVDQA64Z128rm:
197 return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
198 case X86::VMOVDQU32Z128rm:
199 case X86::VMOVDQA32Z128rm:
200 return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
201 case X86::VMOVDQU64Z256rm:
202 case X86::VMOVDQA64Z256rm:
203 return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
204 case X86::VMOVDQU32Z256rm:
205 case X86::VMOVDQA32Z256rm:
206 return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
207 default:
208 return false;
209 }
210 }
211
212 static bool isPotentialBlockingStoreInst(int Opcode, int LoadOpcode) {
213 bool PBlock = false;
214 PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
215 Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
216 Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
217 Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
218 if (isYMMLoadOpcode(LoadOpcode))
219 PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
220 Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
221 Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
222 Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
223 Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
224 Opcode == X86::VMOVDQU64Z128mr ||
225 Opcode == X86::VMOVDQA64Z128mr ||
226 Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
227 return PBlock;
228 }
229
230 static const int MOV128SZ = 16;
231 static const int MOV64SZ = 8;
232 static const int MOV32SZ = 4;
233 static const int MOV16SZ = 2;
234 static const int MOV8SZ = 1;
235
236 static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
237 switch (LoadOpcode) {
238 case X86::VMOVUPSYrm:
239 case X86::VMOVAPSYrm:
240 return X86::VMOVUPSrm;
241 case X86::VMOVUPDYrm:
242 case X86::VMOVAPDYrm:
243 return X86::VMOVUPDrm;
244 case X86::VMOVDQUYrm:
245 case X86::VMOVDQAYrm:
246 return X86::VMOVDQUrm;
247 case X86::VMOVUPSZ256rm:
248 case X86::VMOVAPSZ256rm:
249 return X86::VMOVUPSZ128rm;
250 case X86::VMOVUPDZ256rm:
251 case X86::VMOVAPDZ256rm:
252 return X86::VMOVUPDZ128rm;
253 case X86::VMOVDQU64Z256rm:
254 case X86::VMOVDQA64Z256rm:
255 return X86::VMOVDQU64Z128rm;
256 case X86::VMOVDQU32Z256rm:
257 case X86::VMOVDQA32Z256rm:
258 return X86::VMOVDQU32Z128rm;
259 default:
260 llvm_unreachable("Unexpected Load Instruction Opcode");
261 }
262 return 0;
263 }
264
265 static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
266 switch (StoreOpcode) {
267 case X86::VMOVUPSYmr:
268 case X86::VMOVAPSYmr:
269 return X86::VMOVUPSmr;
270 case X86::VMOVUPDYmr:
271 case X86::VMOVAPDYmr:
272 return X86::VMOVUPDmr;
273 case X86::VMOVDQUYmr:
274 case X86::VMOVDQAYmr:
275 return X86::VMOVDQUmr;
276 case X86::VMOVUPSZ256mr:
277 case X86::VMOVAPSZ256mr:
278 return X86::VMOVUPSZ128mr;
279 case X86::VMOVUPDZ256mr:
280 case X86::VMOVAPDZ256mr:
281 return X86::VMOVUPDZ128mr;
282 case X86::VMOVDQU64Z256mr:
283 case X86::VMOVDQA64Z256mr:
284 return X86::VMOVDQU64Z128mr;
285 case X86::VMOVDQU32Z256mr:
286 case X86::VMOVDQA32Z256mr:
287 return X86::VMOVDQU32Z128mr;
288 default:
289 llvm_unreachable("Unexpected Store Instruction Opcode");
290 }
291 return 0;
292 }
293
294 static int getAddrOffset(MachineInstr *MI) {
295 const MCInstrDesc &Descl = MI->getDesc();
296 int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
297 assert(AddrOffset != -1 && "Expected Memory Operand");
298 AddrOffset += X86II::getOperandBias(Descl);
299 return AddrOffset;
300 }
301
302 static MachineOperand &getBaseOperand(MachineInstr *MI) {
303 int AddrOffset = getAddrOffset(MI);
304 return MI->getOperand(AddrOffset + X86::AddrBaseReg);
305 }
306
307 static MachineOperand &getDispOperand(MachineInstr *MI) {
308 int AddrOffset = getAddrOffset(MI);
309 return MI->getOperand(AddrOffset + X86::AddrDisp);
310 }
311
312 // Relevant addressing modes contain only base register and immediate
313 // displacement or frameindex and immediate displacement.
314 // TODO: Consider expanding to other addressing modes in the future
315 static bool isRelevantAddressingMode(MachineInstr *MI) {
316 int AddrOffset = getAddrOffset(MI);
317 MachineOperand &Base = getBaseOperand(MI);
318 MachineOperand &Disp = getDispOperand(MI);
319 MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
320 MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
321 MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);
322
323 if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
324 return false;
325 if (!Disp.isImm())
326 return false;
327 if (Scale.getImm() != 1)
328 return false;
329 if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
330 return false;
331 if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
332 return false;
333 return true;
334 }
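
// For example (illustrative): "movups 16(%rdi), %xmm0" uses base register
// plus immediate displacement and is relevant; "movups (%rdi,%rcx,4), %xmm0"
// is rejected (index register present and scale != 1), as is any access with
// a segment override.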
335
336 // Collect potentially blocking stores.
337 // Limit the number of instructions we inspect backwards, since the
338 // effect of a store-forward block won't be visible if the store and
339 // load instructions have enough instructions in between to keep the
340 // core busy.
341 static SmallVector<MachineInstr *, 2>
342 findPotentialBlockers(MachineInstr *LoadInst) {
343 SmallVector<MachineInstr *, 2> PotentialBlockers;
344 unsigned BlockCount = 0;
345 const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
346 for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
347 E = LoadInst->getParent()->rend();
348 PBInst != E; ++PBInst) {
349 BlockCount++;
350 if (BlockCount >= InspectionLimit)
351 break;
352 MachineInstr &MI = *PBInst;
353 if (MI.getDesc().isCall())
354 return PotentialBlockers;
355 PotentialBlockers.push_back(&MI);
356 }
357 // If we didn't reach the instruction limit, try the predecessor blocks.
358 // Ideally we should traverse the predecessor blocks in depth with some
359 // coloring algorithm, but for now let's just look at the first order
360 // predecessors.
361 if (BlockCount < InspectionLimit) {
362 MachineBasicBlock *MBB = LoadInst->getParent();
363 int LimitLeft = InspectionLimit - BlockCount;
364 for (MachineBasicBlock::pred_iterator PB = MBB->pred_begin(),
365 PE = MBB->pred_end();
366 PB != PE; ++PB) {
367 MachineBasicBlock *PMBB = *PB;
368 int PredCount = 0;
369 for (MachineBasicBlock::reverse_iterator PBInst = PMBB->rbegin(),
370 PME = PMBB->rend();
371 PBInst != PME; ++PBInst) {
372 PredCount++;
373 if (PredCount >= LimitLeft)
374 break;
375 if (PBInst->getDesc().isCall())
376 break;
377 PotentialBlockers.push_back(&*PBInst);
378 }
379 }
380 }
381 return PotentialBlockers;
382 }
383
384 void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
385 int64_t LoadDisp, MachineInstr *StoreInst,
386 unsigned NStoreOpcode, int64_t StoreDisp,
387 unsigned Size, int64_t LMMOffset,
388 int64_t SMMOffset) {
389 MachineOperand &LoadBase = getBaseOperand(LoadInst);
390 MachineOperand &StoreBase = getBaseOperand(StoreInst);
391 MachineBasicBlock *MBB = LoadInst->getParent();
392 MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
393 MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
394
395 unsigned Reg1 = MRI->createVirtualRegister(
396 TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
397 BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode), Reg1)
398 .add(LoadBase)
399 .addImm(1)
400 .addReg(X86::NoRegister)
401 .addImm(LoadDisp)
402 .addReg(X86::NoRegister)
403 .addMemOperand(
404 MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
405 DEBUG(LoadInst->getPrevNode()->dump());
406 // If the load and store are consecutive, use the loadInst location to
407 // reduce register pressure.
408 MachineInstr *StInst = StoreInst;
409 if (StoreInst->getPrevNode() == LoadInst)
410 StInst = LoadInst;
411 BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
412 .add(StoreBase)
413 .addImm(1)
414 .addReg(X86::NoRegister)
415 .addImm(StoreDisp)
416 .addReg(X86::NoRegister)
417 .addReg(Reg1)
418 .addMemOperand(
419 MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
420 DEBUG(StInst->getPrevNode()->dump());
421 }
422
423 void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
424 int64_t LdDispImm, MachineInstr *StoreInst,
425 int64_t StDispImm, int64_t LMMOffset,
426 int64_t SMMOffset) {
427 int LdDisp = LdDispImm;
428 int StDisp = StDispImm;
429 while (Size > 0) {
430 if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
431 Size = Size - MOV128SZ;
432 buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
433 StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
434 StDisp, MOV128SZ, LMMOffset, SMMOffset);
435 LdDisp += MOV128SZ;
436 StDisp += MOV128SZ;
437 LMMOffset += MOV128SZ;
438 SMMOffset += MOV128SZ;
439 continue;
440 }
441 if (Size - MOV64SZ >= 0) {
442 Size = Size - MOV64SZ;
443 buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
444 MOV64SZ, LMMOffset, SMMOffset);
445 LdDisp += MOV64SZ;
446 StDisp += MOV64SZ;
447 LMMOffset += MOV64SZ;
448 SMMOffset += MOV64SZ;
449 continue;
450 }
451 if (Size - MOV32SZ >= 0) {
452 Size = Size - MOV32SZ;
453 buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
454 MOV32SZ, LMMOffset, SMMOffset);
455 LdDisp += MOV32SZ;
456 StDisp += MOV32SZ;
457 LMMOffset += MOV32SZ;
458 SMMOffset += MOV32SZ;
459 continue;
460 }
461 if (Size - MOV16SZ >= 0) {
462 Size = Size - MOV16SZ;
463 buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
464 MOV16SZ, LMMOffset, SMMOffset);
465 LdDisp += MOV16SZ;
466 StDisp += MOV16SZ;
467 LMMOffset += MOV16SZ;
468 SMMOffset += MOV16SZ;
469 continue;
470 }
471 if (Size - MOV8SZ >= 0) {
472 Size = Size - MOV8SZ;
473 buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
474 MOV8SZ, LMMOffset, SMMOffset);
475 LdDisp += MOV8SZ;
476 StDisp += MOV8SZ;
477 LMMOffset += MOV8SZ;
478 SMMOffset += MOV8SZ;
479 continue;
480 }
481 }
482 assert(Size == 0 && "Wrong size division");
483 }
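
// For example, the greedy widest-first peeling above turns a 15-byte chunk at
// displacement D into four copies: 8 bytes at D (MOV64rm/MOV64mr), 4 bytes at
// D+8, 2 bytes at D+12, and 1 byte at D+14, while a 32-byte YMM copy becomes
// two 16-byte XMM copies.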
484
485 static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
486 MachineOperand &LoadBase = getBaseOperand(LoadInst);
487 MachineOperand &StoreBase = getBaseOperand(StoreInst);
488 if (LoadBase.isReg()) {
489 MachineInstr *LastLoad = LoadInst->getPrevNode();
490 // If the original load and store to xmm/ymm were consecutive
491 // then the partial copies were also created in
492 // a consecutive order to reduce register pressure,
493 // and the location of the last load is before the last store.
494 if (StoreInst->getPrevNode() == LoadInst)
495 LastLoad = LoadInst->getPrevNode()->getPrevNode();
496 getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
497 }
498 if (StoreBase.isReg()) {
499 MachineInstr *StInst = StoreInst;
500 if (StoreInst->getPrevNode() == LoadInst)
501 StInst = LoadInst;
502 getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
503 }
504 }
505
506 bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
507 const MachineMemOperand &Op2) const {
508 if (!Op1.getValue() || !Op2.getValue())
509 return true;
510
511 int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
512 int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
513 int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;
514
515 AliasResult AAResult =
516 AA->alias(MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
517 MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
518 return AAResult != NoAlias;
519 }
520
521 void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
522 for (auto &MBB : MF)
523 for (auto &MI : MBB) {
524 if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
525 continue;
526 int DefVR = MI.getOperand(0).getReg();
527 if (!MRI->hasOneUse(DefVR))
528 continue;
529 for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
530 UI != UE;) {
531 MachineOperand &StoreMO = *UI++;
532 MachineInstr &StoreMI = *StoreMO.getParent();
533 // Skip cases where the memcpy may overlap.
534 if (StoreMI.getParent() == MI.getParent() &&
535 isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
536 isRelevantAddressingMode(&MI) &&
537 isRelevantAddressingMode(&StoreMI)) {
538 assert(MI.hasOneMemOperand() &&
539 "Expected one memory operand for load instruction");
540 assert(StoreMI.hasOneMemOperand() &&
541 "Expected one memory operand for store instruction");
542 if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
543 BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
544 }
545 }
546 }
547 }
548
549 unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
550 auto TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
551 *LoadInst->getParent()->getParent());
552 return TRI->getRegSizeInBits(*TRC) / 8;
553 }
554
555 void X86AvoidSFBPass::breakBlockedCopies(
556 MachineInstr *LoadInst, MachineInstr *StoreInst,
557 const DisplacementSizeMap &BlockingStoresDispSizeMap) {
558 int64_t LdDispImm = getDispOperand(LoadInst).getImm();
559 int64_t StDispImm = getDispOperand(StoreInst).getImm();
560 int64_t LMMOffset = (*LoadInst->memoperands_begin())->getOffset();
561 int64_t SMMOffset = (*StoreInst->memoperands_begin())->getOffset();
562
563 int64_t LdDisp1 = LdDispImm;
564 int64_t LdDisp2 = 0;
565 int64_t StDisp1 = StDispImm;
566 int64_t StDisp2 = 0;
567 unsigned Size1 = 0;
568 unsigned Size2 = 0;
569 int64_t LdStDelta = StDispImm - LdDispImm;
570
571 for (auto DispSizePair : BlockingStoresDispSizeMap) {
572 LdDisp2 = DispSizePair.first;
573 StDisp2 = DispSizePair.first + LdStDelta;
574 Size2 = DispSizePair.second;
575 // Avoid copying overlapping areas.
576 if (LdDisp2 < LdDisp1) {
577 int OverlapDelta = LdDisp1 - LdDisp2;
578 LdDisp2 += OverlapDelta;
579 StDisp2 += OverlapDelta;
580 Size2 -= OverlapDelta;
581 }
582 Size1 = std::abs(std::abs(LdDisp2) - std::abs(LdDisp1));
583
584 // Build a copy for the point until the current blocking store's
585 // displacement.
586 buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
587 SMMOffset);
588 // Build a copy for the current blocking store.
589 buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
590 SMMOffset + Size1);
591 LdDisp1 = LdDisp2 + Size2;
592 StDisp1 = StDisp2 + Size2;
593 LMMOffset += Size1 + Size2;
594 SMMOffset += Size1 + Size2;
595 }
596 unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
597 buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
598             SMMOffset);
599 }
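
// A worked instance of the splitting above, assuming a 16-byte load at
// displacement 0 and a single 4-byte blocking store at displacement 4:
// the first buildCopies call emits the [0,4) prefix (Size1 = 4), the second
// emits the blocked [4,8) region (Size2 = 4), and the trailing call emits
// the [8,16) remainder (Size3 = 8), matching the movl/movl/movq sequence in
// the test_conditional_block CHECK lines below.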
600
601 static bool hasSameBaseOpValue(MachineInstr *LoadInst,
602 MachineInstr *StoreInst) {
603 MachineOperand &LoadBase = getBaseOperand(LoadInst);
604 MachineOperand &StoreBase = getBaseOperand(StoreInst);
605 if (LoadBase.isReg() != StoreBase.isReg())
606 return false;
607 if (LoadBase.isReg())
608 return LoadBase.getReg() == StoreBase.getReg();
609 return LoadBase.getIndex() == StoreBase.getIndex();
610 }
611
612 static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
613 int64_t StoreDispImm, unsigned StoreSize) {
614 return ((StoreDispImm >= LoadDispImm) &&
615 (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
616 }
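
// For instance, for a 16-byte load at displacement 0, a 4-byte store at
// displacement 4 satisfies 0 <= 4 <= 0 + (16 - 4) and is blocking; the same
// store at displacement 14 is not, since it straddles the end of the loaded
// region (14 > 12).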
617
618 // Keep track of all stores blocking a load
619 static void
620 updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
621 int64_t DispImm, unsigned Size) {
622 if (BlockingStoresDispSizeMap.count(DispImm)) {
623 // Choose the smallest blocking store starting at this displacement.
624 if (BlockingStoresDispSizeMap[DispImm] > Size)
625 BlockingStoresDispSizeMap[DispImm] = Size;
626
627 } else
628 BlockingStoresDispSizeMap[DispImm] = Size;
629 }
630
631 // Remove blocking stores contained in each other.
632 static void
633 removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
634 if (BlockingStoresDispSizeMap.size() <= 1)
635 return;
636
637 int64_t PrevDisp = BlockingStoresDispSizeMap.begin()->first;
638 unsigned PrevSize = BlockingStoresDispSizeMap.begin()->second;
639 SmallVector<int64_t, 2> ForRemoval;
640 for (auto DispSizePair = std::next(BlockingStoresDispSizeMap.begin());
641 DispSizePair != BlockingStoresDispSizeMap.end(); ++DispSizePair) {
642 int64_t CurrDisp = DispSizePair->first;
643 unsigned CurrSize = DispSizePair->second;
644 if (CurrDisp + CurrSize <= PrevDisp + PrevSize) {
645 ForRemoval.push_back(PrevDisp);
646 }
647 PrevDisp = CurrDisp;
648 PrevSize = CurrSize;
649 }
650 for (auto Disp : ForRemoval)
651 BlockingStoresDispSizeMap.erase(Disp);
652 }
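
// For example, given the ordered entries {0: 8} and {4: 4}, the second range
// [4,8) ends no later than the first range [0,8), so the entry at
// displacement 0 is erased and only {4: 4} remains.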
653
654 bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
655 bool Changed = false;
656
657 if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
658 !MF.getSubtarget<X86Subtarget>().is64Bit())
659 return false;
660
661 MRI = &MF.getRegInfo();
662 assert(MRI->isSSA() && "Expected MIR to be in SSA form");
663 TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
664 TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
665 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
666 DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
667 // Look for a load then a store to XMM/YMM which look like a memcpy
668 findPotentiallylBlockedCopies(MF);
669
670 for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
671 MachineInstr *LoadInst = LoadStoreInstPair.first;
672 int64_t LdDispImm = getDispOperand(LoadInst).getImm();
673 DisplacementSizeMap BlockingStoresDispSizeMap;
674
675 SmallVector<MachineInstr *, 2> PotentialBlockers =
676 findPotentialBlockers(LoadInst);
677 for (auto PBInst : PotentialBlockers) {
678 if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
679 LoadInst->getOpcode()) ||
680 !isRelevantAddressingMode(PBInst))
681 continue;
682 int64_t PBstDispImm = getDispOperand(PBInst).getImm();
683 assert(PBInst->hasOneMemOperand() && "Expected One Memory Operand");
684 unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
685 // This check doesn't cover all cases, but it will suffice for now.
686 // TODO: take branch probability into consideration; if the blocking
687 // store is in a rarely reached block, breaking the memcpy could cost
688 // performance.
689 if (hasSameBaseOpValue(LoadInst, PBInst) &&
690 isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
691 PBstSize))
692 updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
693 PBstSize);
694 }
695
696 if (BlockingStoresDispSizeMap.empty())
697 continue;
698
699 // We found a store forward block; break the memcpy's load and store
700 // into smaller copies such that each region written by a blocking
701 // store is now copied separately.
702 MachineInstr *StoreInst = LoadStoreInstPair.second;
703 DEBUG(dbgs() << "Blocked load and store instructions: \n");
704 DEBUG(LoadInst->dump());
705 DEBUG(StoreInst->dump());
706 DEBUG(dbgs() << "Replaced with:\n");
707 removeRedundantBlockingStores(BlockingStoresDispSizeMap);
708 breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
709 updateKillStatus(LoadInst, StoreInst);
710 ForRemoval.push_back(LoadInst);
711 ForRemoval.push_back(StoreInst);
712 }
713 for (auto RemovedInst : ForRemoval) {
714 RemovedInst->eraseFromParent();
715 }
716 ForRemoval.clear();
717 BlockedLoadsStoresPairs.clear();
718 DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);
719
720 return Changed;
721 }
6161 void initializeX86CmovConverterPassPass(PassRegistry &);
6262 void initializeX86ExecutionDomainFixPass(PassRegistry &);
6363 void initializeX86DomainReassignmentPass(PassRegistry &);
64 void initializeX86AvoidSFBPassPass(PassRegistry &);
6465
6566 } // end namespace llvm
6667
7980 initializeX86CmovConverterPassPass(PR);
8081 initializeX86ExecutionDomainFixPass(PR);
8182 initializeX86DomainReassignmentPass(PR);
83 initializeX86AvoidSFBPassPass(PR);
8284 }
8385
8486 static std::unique_ptr createTLOF(const Triple &TT) {
448450 addPass(createX86FixupSetCC());
449451 addPass(createX86OptimizeLEAs());
450452 addPass(createX86CallFrameOptimization());
453 addPass(createX86AvoidStoreForwardingBlocks());
451454 }
452455
453456 addPass(createX86WinAllocaExpander());
8888 ; CHECK-NEXT: X86 Fixup SetCC
8989 ; CHECK-NEXT: X86 LEA Optimize
9090 ; CHECK-NEXT: X86 Optimize Call Frame
91 ; CHECK-NEXT: X86 Avoid Store Forwarding Block
9192 ; CHECK-NEXT: X86 WinAlloca Expander
9293 ; CHECK-NEXT: Detect Dead Lanes
9394 ; CHECK-NEXT: Process Implicit Definitions
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=CHECK
2 ; RUN: llc < %s -mtriple=x86_64-linux --x86-disable-avoid-SFB | FileCheck %s --check-prefix=DISABLED
3 ; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK-AVX2
4 ; RUN: llc < %s -mtriple=x86_64-linux -mcpu=skx | FileCheck %s -check-prefix=CHECK-AVX512
5
6 ; ModuleID = '../testSFB/testOverlapBlocks.c'
7 source_filename = "../testSFB/testOverlapBlocks.c"
8 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
9 target triple = "x86_64-unknown-linux-gnu"
10
11 ; Function Attrs: nounwind uwtable
12 define dso_local void @test_overlap_1(i8* nocapture %A, i32 %x) local_unnamed_addr #0 {
13 ; CHECK-LABEL: test_overlap_1:
14 ; CHECK: # %bb.0: # %entry
15 ; CHECK-NEXT: movl $7, -8(%rdi)
16 ; CHECK-NEXT: movq -16(%rdi), %rax
17 ; CHECK-NEXT: movq %rax, (%rdi)
18 ; CHECK-NEXT: movl -8(%rdi), %eax
19 ; CHECK-NEXT: movl %eax, 8(%rdi)
20 ; CHECK-NEXT: movl -4(%rdi), %eax
21 ; CHECK-NEXT: movl %eax, 12(%rdi)
22 ; CHECK-NEXT: movslq %esi, %rax
23 ; CHECK-NEXT: movq %rax, -9(%rdi)
24 ; CHECK-NEXT: movq %rax, -16(%rdi)
25 ; CHECK-NEXT: movb $0, -1(%rdi)
26 ; CHECK-NEXT: movq -16(%rdi), %rax
27 ; CHECK-NEXT: movq %rax, 16(%rdi)
28 ; CHECK-NEXT: movl -8(%rdi), %eax
29 ; CHECK-NEXT: movl %eax, 24(%rdi)
30 ; CHECK-NEXT: movzwl -4(%rdi), %eax
31 ; CHECK-NEXT: movw %ax, 28(%rdi)
32 ; CHECK-NEXT: movb -2(%rdi), %al
33 ; CHECK-NEXT: movb %al, 30(%rdi)
34 ; CHECK-NEXT: movb -1(%rdi), %al
35 ; CHECK-NEXT: movb %al, 31(%rdi)
36 ; CHECK-NEXT: retq
37 ;
38 ; DISABLED-LABEL: test_overlap_1:
39 ; DISABLED: # %bb.0: # %entry
40 ; DISABLED-NEXT: movl $7, -8(%rdi)
41 ; DISABLED-NEXT: movups -16(%rdi), %xmm0
42 ; DISABLED-NEXT: movups %xmm0, (%rdi)
43 ; DISABLED-NEXT: movslq %esi, %rax
44 ; DISABLED-NEXT: movq %rax, -9(%rdi)
45 ; DISABLED-NEXT: movq %rax, -16(%rdi)
46 ; DISABLED-NEXT: movb $0, -1(%rdi)
47 ; DISABLED-NEXT: movups -16(%rdi), %xmm0
48 ; DISABLED-NEXT: movups %xmm0, 16(%rdi)
49 ; DISABLED-NEXT: retq
50 ;
51 ; CHECK-AVX2-LABEL: test_overlap_1:
52 ; CHECK-AVX2: # %bb.0: # %entry
53 ; CHECK-AVX2-NEXT: movl $7, -8(%rdi)
54 ; CHECK-AVX2-NEXT: movq -16(%rdi), %rax
55 ; CHECK-AVX2-NEXT: movq %rax, (%rdi)
56 ; CHECK-AVX2-NEXT: movl -8(%rdi), %eax
57 ; CHECK-AVX2-NEXT: movl %eax, 8(%rdi)
58 ; CHECK-AVX2-NEXT: movl -4(%rdi), %eax
59 ; CHECK-AVX2-NEXT: movl %eax, 12(%rdi)
60 ; CHECK-AVX2-NEXT: movslq %esi, %rax
61 ; CHECK-AVX2-NEXT: movq %rax, -9(%rdi)
62 ; CHECK-AVX2-NEXT: movq %rax, -16(%rdi)
63 ; CHECK-AVX2-NEXT: movb $0, -1(%rdi)
64 ; CHECK-AVX2-NEXT: movq -16(%rdi), %rax
65 ; CHECK-AVX2-NEXT: movq %rax, 16(%rdi)
66 ; CHECK-AVX2-NEXT: movl -8(%rdi), %eax
67 ; CHECK-AVX2-NEXT: movl %eax, 24(%rdi)
68 ; CHECK-AVX2-NEXT: movzwl -4(%rdi), %eax
69 ; CHECK-AVX2-NEXT: movw %ax, 28(%rdi)
70 ; CHECK-AVX2-NEXT: movb -2(%rdi), %al
71 ; CHECK-AVX2-NEXT: movb %al, 30(%rdi)
72 ; CHECK-AVX2-NEXT: movb -1(%rdi), %al
73 ; CHECK-AVX2-NEXT: movb %al, 31(%rdi)
74 ; CHECK-AVX2-NEXT: retq
75 ;
76 ; CHECK-AVX512-LABEL: test_overlap_1:
77 ; CHECK-AVX512: # %bb.0: # %entry
78 ; CHECK-AVX512-NEXT: movl $7, -8(%rdi)
79 ; CHECK-AVX512-NEXT: movq -16(%rdi), %rax
80 ; CHECK-AVX512-NEXT: movq %rax, (%rdi)
81 ; CHECK-AVX512-NEXT: movl -8(%rdi), %eax
82 ; CHECK-AVX512-NEXT: movl %eax, 8(%rdi)
83 ; CHECK-AVX512-NEXT: movl -4(%rdi), %eax
84 ; CHECK-AVX512-NEXT: movl %eax, 12(%rdi)
85 ; CHECK-AVX512-NEXT: movslq %esi, %rax
86 ; CHECK-AVX512-NEXT: movq %rax, -9(%rdi)
87 ; CHECK-AVX512-NEXT: movq %rax, -16(%rdi)
88 ; CHECK-AVX512-NEXT: movb $0, -1(%rdi)
89 ; CHECK-AVX512-NEXT: movq -16(%rdi), %rax
90 ; CHECK-AVX512-NEXT: movq %rax, 16(%rdi)
91 ; CHECK-AVX512-NEXT: movl -8(%rdi), %eax
92 ; CHECK-AVX512-NEXT: movl %eax, 24(%rdi)
93 ; CHECK-AVX512-NEXT: movzwl -4(%rdi), %eax
94 ; CHECK-AVX512-NEXT: movw %ax, 28(%rdi)
95 ; CHECK-AVX512-NEXT: movb -2(%rdi), %al
96 ; CHECK-AVX512-NEXT: movb %al, 30(%rdi)
97 ; CHECK-AVX512-NEXT: movb -1(%rdi), %al
98 ; CHECK-AVX512-NEXT: movb %al, 31(%rdi)
99 ; CHECK-AVX512-NEXT: retq
100 entry:
101 %add.ptr = getelementptr inbounds i8, i8* %A, i64 -16
102 %add.ptr1 = getelementptr inbounds i8, i8* %A, i64 -8
103 %0 = bitcast i8* %add.ptr1 to i32*
104 store i32 7, i32* %0, align 4
105 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %A, i8* nonnull align 4 %add.ptr, i64 16, i1 false)
106 %conv = sext i32 %x to i64
107 %add.ptr2 = getelementptr inbounds i8, i8* %A, i64 -9
108 %1 = bitcast i8* %add.ptr2 to i64*
109 store i64 %conv, i64* %1, align 8
110 %2 = bitcast i8* %add.ptr to i64*
111 store i64 %conv, i64* %2, align 8
112 %add.ptr5 = getelementptr inbounds i8, i8* %A, i64 -1
113 store i8 0, i8* %add.ptr5, align 1
114 %add.ptr6 = getelementptr inbounds i8, i8* %A, i64 16
115 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %add.ptr6, i8* nonnull align 4 %add.ptr, i64 16, i1 false)
116 ret void
117 }
118
119 ; Function Attrs: argmemonly nounwind
120 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1
121
122 ; Function Attrs: nounwind uwtable
123 define dso_local void @test_overlap_2(i8* nocapture %A, i32 %x) local_unnamed_addr #0 {
124 ; CHECK-LABEL: test_overlap_2:
125 ; CHECK: # %bb.0: # %entry
126 ; CHECK-NEXT: movslq %esi, %rax
127 ; CHECK-NEXT: movq %rax, -16(%rdi)
128 ; CHECK-NEXT: movq -16(%rdi), %rcx
129 ; CHECK-NEXT: movq %rcx, (%rdi)
130 ; CHECK-NEXT: movq -8(%rdi), %rcx
131 ; CHECK-NEXT: movq %rcx, 8(%rdi)
132 ; CHECK-NEXT: movq %rax, -8(%rdi)
133 ; CHECK-NEXT: movl $7, -12(%rdi)
134 ; CHECK-NEXT: movl -16(%rdi), %eax
135 ; CHECK-NEXT: movl %eax, 16(%rdi)
136 ; CHECK-NEXT: movl -12(%rdi), %eax
137 ; CHECK-NEXT: movl %eax, 20(%rdi)
138 ; CHECK-NEXT: movq -8(%rdi), %rax
139 ; CHECK-NEXT: movq %rax, 24(%rdi)
140 ; CHECK-NEXT: retq
141 ;
142 ; DISABLED-LABEL: test_overlap_2:
143 ; DISABLED: # %bb.0: # %entry
144 ; DISABLED-NEXT: movslq %esi, %rax
145 ; DISABLED-NEXT: movq %rax, -16(%rdi)
146 ; DISABLED-NEXT: movups -16(%rdi), %xmm0
147 ; DISABLED-NEXT: movups %xmm0, (%rdi)
148 ; DISABLED-NEXT: movq %rax, -8(%rdi)
149 ; DISABLED-NEXT: movl $7, -12(%rdi)
150 ; DISABLED-NEXT: movups -16(%rdi), %xmm0
151 ; DISABLED-NEXT: movups %xmm0, 16(%rdi)
152 ; DISABLED-NEXT: retq
153 ;
154 ; CHECK-AVX2-LABEL: test_overlap_2:
155 ; CHECK-AVX2: # %bb.0: # %entry
156 ; CHECK-AVX2-NEXT: movslq %esi, %rax
157 ; CHECK-AVX2-NEXT: movq %rax, -16(%rdi)
158 ; CHECK-AVX2-NEXT: movq -16(%rdi), %rcx
159 ; CHECK-AVX2-NEXT: movq %rcx, (%rdi)
160 ; CHECK-AVX2-NEXT: movq -8(%rdi), %rcx
161 ; CHECK-AVX2-NEXT: movq %rcx, 8(%rdi)
162 ; CHECK-AVX2-NEXT: movq %rax, -8(%rdi)
163 ; CHECK-AVX2-NEXT: movl $7, -12(%rdi)
164 ; CHECK-AVX2-NEXT: movl -16(%rdi), %eax
165 ; CHECK-AVX2-NEXT: movl %eax, 16(%rdi)
166 ; CHECK-AVX2-NEXT: movl -12(%rdi), %eax
167 ; CHECK-AVX2-NEXT: movl %eax, 20(%rdi)
168 ; CHECK-AVX2-NEXT: movq -8(%rdi), %rax
169 ; CHECK-AVX2-NEXT: movq %rax, 24(%rdi)
170 ; CHECK-AVX2-NEXT: retq
171 ;
172 ; CHECK-AVX512-LABEL: test_overlap_2:
173 ; CHECK-AVX512: # %bb.0: # %entry
174 ; CHECK-AVX512-NEXT: movslq %esi, %rax
175 ; CHECK-AVX512-NEXT: movq %rax, -16(%rdi)
176 ; CHECK-AVX512-NEXT: movq -16(%rdi), %rcx
177 ; CHECK-AVX512-NEXT: movq %rcx, (%rdi)
178 ; CHECK-AVX512-NEXT: movq -8(%rdi), %rcx
179 ; CHECK-AVX512-NEXT: movq %rcx, 8(%rdi)
180 ; CHECK-AVX512-NEXT: movq %rax, -8(%rdi)
181 ; CHECK-AVX512-NEXT: movl $7, -12(%rdi)
182 ; CHECK-AVX512-NEXT: movl -16(%rdi), %eax
183 ; CHECK-AVX512-NEXT: movl %eax, 16(%rdi)
184 ; CHECK-AVX512-NEXT: movl -12(%rdi), %eax
185 ; CHECK-AVX512-NEXT: movl %eax, 20(%rdi)
186 ; CHECK-AVX512-NEXT: movq -8(%rdi), %rax
187 ; CHECK-AVX512-NEXT: movq %rax, 24(%rdi)
188 ; CHECK-AVX512-NEXT: retq
189 entry:
190 %add.ptr = getelementptr inbounds i8, i8* %A, i64 -16
191 %conv = sext i32 %x to i64
192 %0 = bitcast i8* %add.ptr to i64*
193 store i64 %conv, i64* %0, align 8
194 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %A, i8* nonnull align 4 %add.ptr, i64 16, i1 false)
195 %add.ptr3 = getelementptr inbounds i8, i8* %A, i64 -8
196 %1 = bitcast i8* %add.ptr3 to i64*
197 store i64 %conv, i64* %1, align 8
198 %add.ptr4 = getelementptr inbounds i8, i8* %A, i64 -12
199 %2 = bitcast i8* %add.ptr4 to i32*
200 store i32 7, i32* %2, align 4
201 %add.ptr5 = getelementptr inbounds i8, i8* %A, i64 16
202 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %add.ptr5, i8* nonnull align 4 %add.ptr, i64 16, i1 false)
203 ret void
204 }
205
206 ; Function Attrs: nounwind uwtable
207 define dso_local void @test_overlap_3(i8* nocapture %A, i32 %x) local_unnamed_addr #0 {
208 ; CHECK-LABEL: test_overlap_3:
209 ; CHECK: # %bb.0: # %entry
210 ; CHECK-NEXT: movl $7, -10(%rdi)
211 ; CHECK-NEXT: movl -16(%rdi), %eax
212 ; CHECK-NEXT: movl %eax, (%rdi)
213 ; CHECK-NEXT: movzwl -12(%rdi), %eax
214 ; CHECK-NEXT: movw %ax, 4(%rdi)
215 ; CHECK-NEXT: movl -10(%rdi), %eax
216 ; CHECK-NEXT: movl %eax, 6(%rdi)
217 ; CHECK-NEXT: movl -6(%rdi), %eax
218 ; CHECK-NEXT: movl %eax, 10(%rdi)
219 ; CHECK-NEXT: movzwl -2(%rdi), %eax
220 ; CHECK-NEXT: movw %ax, 14(%rdi)
221 ; CHECK-NEXT: movslq %esi, %rax
222 ; CHECK-NEXT: movq %rax, -9(%rdi)
223 ; CHECK-NEXT: movq %rax, -16(%rdi)
224 ; CHECK-NEXT: movb $0, -1(%rdi)
225 ; CHECK-NEXT: movq -16(%rdi), %rax
226 ; CHECK-NEXT: movq %rax, 16(%rdi)
227 ; CHECK-NEXT: movzwl -8(%rdi), %eax
228 ; CHECK-NEXT: movw %ax, 24(%rdi)
229 ; CHECK-NEXT: movl -6(%rdi), %eax
230 ; CHECK-NEXT: movl %eax, 26(%rdi)
231 ; CHECK-NEXT: movb -2(%rdi), %al
232 ; CHECK-NEXT: movb %al, 30(%rdi)
233 ; CHECK-NEXT: movb -1(%rdi), %al
234 ; CHECK-NEXT: movb %al, 31(%rdi)
235 ; CHECK-NEXT: retq
236 ;
237 ; DISABLED-LABEL: test_overlap_3:
238 ; DISABLED: # %bb.0: # %entry
239 ; DISABLED-NEXT: movl $7, -10(%rdi)
240 ; DISABLED-NEXT: movups -16(%rdi), %xmm0
241 ; DISABLED-NEXT: movups %xmm0, (%rdi)
242 ; DISABLED-NEXT: movslq %esi, %rax
243 ; DISABLED-NEXT: movq %rax, -9(%rdi)
244 ; DISABLED-NEXT: movq %rax, -16(%rdi)
245 ; DISABLED-NEXT: movb $0, -1(%rdi)
246 ; DISABLED-NEXT: movups -16(%rdi), %xmm0
247 ; DISABLED-NEXT: movups %xmm0, 16(%rdi)
248 ; DISABLED-NEXT: retq
249 ;
250 ; CHECK-AVX2-LABEL: test_overlap_3:
251 ; CHECK-AVX2: # %bb.0: # %entry
252 ; CHECK-AVX2-NEXT: movl $7, -10(%rdi)
253 ; CHECK-AVX2-NEXT: movl -16(%rdi), %eax
254 ; CHECK-AVX2-NEXT: movl %eax, (%rdi)
255 ; CHECK-AVX2-NEXT: movzwl -12(%rdi), %eax
256 ; CHECK-AVX2-NEXT: movw %ax, 4(%rdi)
257 ; CHECK-AVX2-NEXT: movl -10(%rdi), %eax
258 ; CHECK-AVX2-NEXT: movl %eax, 6(%rdi)
259 ; CHECK-AVX2-NEXT: movl -6(%rdi), %eax
260 ; CHECK-AVX2-NEXT: movl %eax, 10(%rdi)
261 ; CHECK-AVX2-NEXT: movzwl -2(%rdi), %eax
262 ; CHECK-AVX2-NEXT: movw %ax, 14(%rdi)
263 ; CHECK-AVX2-NEXT: movslq %esi, %rax
264 ; CHECK-AVX2-NEXT: movq %rax, -9(%rdi)
265 ; CHECK-AVX2-NEXT: movq %rax, -16(%rdi)
266 ; CHECK-AVX2-NEXT: movb $0, -1(%rdi)
267 ; CHECK-AVX2-NEXT: movq -16(%rdi), %rax
268 ; CHECK-AVX2-NEXT: movq %rax, 16(%rdi)
269 ; CHECK-AVX2-NEXT: movzwl -8(%rdi), %eax
270 ; CHECK-AVX2-NEXT: movw %ax, 24(%rdi)
271 ; CHECK-AVX2-NEXT: movl -6(%rdi), %eax
272 ; CHECK-AVX2-NEXT: movl %eax, 26(%rdi)
273 ; CHECK-AVX2-NEXT: movb -2(%rdi), %al
274 ; CHECK-AVX2-NEXT: movb %al, 30(%rdi)
275 ; CHECK-AVX2-NEXT: movb -1(%rdi), %al
276 ; CHECK-AVX2-NEXT: movb %al, 31(%rdi)
277 ; CHECK-AVX2-NEXT: retq
278 ;
279 ; CHECK-AVX512-LABEL: test_overlap_3:
280 ; CHECK-AVX512: # %bb.0: # %entry
281 ; CHECK-AVX512-NEXT: movl $7, -10(%rdi)
282 ; CHECK-AVX512-NEXT: movl -16(%rdi), %eax
283 ; CHECK-AVX512-NEXT: movl %eax, (%rdi)
284 ; CHECK-AVX512-NEXT: movzwl -12(%rdi), %eax
285 ; CHECK-AVX512-NEXT: movw %ax, 4(%rdi)
286 ; CHECK-AVX512-NEXT: movl -10(%rdi), %eax
287 ; CHECK-AVX512-NEXT: movl %eax, 6(%rdi)
288 ; CHECK-AVX512-NEXT: movl -6(%rdi), %eax
289 ; CHECK-AVX512-NEXT: movl %eax, 10(%rdi)
290 ; CHECK-AVX512-NEXT: movzwl -2(%rdi), %eax
291 ; CHECK-AVX512-NEXT: movw %ax, 14(%rdi)
292 ; CHECK-AVX512-NEXT: movslq %esi, %rax
293 ; CHECK-AVX512-NEXT: movq %rax, -9(%rdi)
294 ; CHECK-AVX512-NEXT: movq %rax, -16(%rdi)
295 ; CHECK-AVX512-NEXT: movb $0, -1(%rdi)
296 ; CHECK-AVX512-NEXT: movq -16(%rdi), %rax
297 ; CHECK-AVX512-NEXT: movq %rax, 16(%rdi)
298 ; CHECK-AVX512-NEXT: movzwl -8(%rdi), %eax
299 ; CHECK-AVX512-NEXT: movw %ax, 24(%rdi)
300 ; CHECK-AVX512-NEXT: movl -6(%rdi), %eax
301 ; CHECK-AVX512-NEXT: movl %eax, 26(%rdi)
302 ; CHECK-AVX512-NEXT: movb -2(%rdi), %al
303 ; CHECK-AVX512-NEXT: movb %al, 30(%rdi)
304 ; CHECK-AVX512-NEXT: movb -1(%rdi), %al
305 ; CHECK-AVX512-NEXT: movb %al, 31(%rdi)
306 ; CHECK-AVX512-NEXT: retq
307 entry:
308 %add.ptr = getelementptr inbounds i8, i8* %A, i64 -16
309 %add.ptr1 = getelementptr inbounds i8, i8* %A, i64 -10
310 %0 = bitcast i8* %add.ptr1 to i32*
311 store i32 7, i32* %0, align 4
312 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %A, i8* nonnull align 4 %add.ptr, i64 16, i1 false)
313 %conv = sext i32 %x to i64
314 %add.ptr2 = getelementptr inbounds i8, i8* %A, i64 -9
315 %1 = bitcast i8* %add.ptr2 to i64*
316 store i64 %conv, i64* %1, align 8
317 %2 = bitcast i8* %add.ptr to i64*
318 store i64 %conv, i64* %2, align 8
319 %add.ptr5 = getelementptr inbounds i8, i8* %A, i64 -1
320 store i8 0, i8* %add.ptr5, align 1
321 %add.ptr6 = getelementptr inbounds i8, i8* %A, i64 16
322 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %add.ptr6, i8* nonnull align 4 %add.ptr, i64 16, i1 false)
323 ret void
324 }
325
326 ; Function Attrs: nounwind uwtable
327 define dso_local void @test_overlap_4(i8* nocapture %A, i32 %x) local_unnamed_addr #0 {
328 ; CHECK-LABEL: test_overlap_4:
329 ; CHECK: # %bb.0: # %entry
330 ; CHECK-NEXT: movups -16(%rdi), %xmm0
331 ; CHECK-NEXT: movups %xmm0, (%rdi)
332 ; CHECK-NEXT: movslq %esi, %rax
333 ; CHECK-NEXT: movq %rax, -8(%rdi)
334 ; CHECK-NEXT: movl %eax, -16(%rdi)
335 ; CHECK-NEXT: movl $0, -11(%rdi)
336 ; CHECK-NEXT: movl -16(%rdi), %eax
337 ; CHECK-NEXT: movl %eax, 16(%rdi)
338 ; CHECK-NEXT: movb -12(%rdi), %al
339 ; CHECK-NEXT: movb %al, 20(%rdi)
340 ; CHECK-NEXT: movl -11(%rdi), %eax
341 ; CHECK-NEXT: movl %eax, 21(%rdi)
342 ; CHECK-NEXT: movl -7(%rdi), %eax
343 ; CHECK-NEXT: movl %eax, 25(%rdi)
344 ; CHECK-NEXT: movzwl -3(%rdi), %eax
345 ; CHECK-NEXT: movw %ax, 29(%rdi)
346 ; CHECK-NEXT: movb -1(%rdi), %al
347 ; CHECK-NEXT: movb %al, 31(%rdi)
348 ; CHECK-NEXT: retq
349 ;
350 ; DISABLED-LABEL: test_overlap_4:
351 ; DISABLED: # %bb.0: # %entry
352 ; DISABLED-NEXT: movups -16(%rdi), %xmm0
353 ; DISABLED-NEXT: movups %xmm0, (%rdi)
354 ; DISABLED-NEXT: movslq %esi, %rax
355 ; DISABLED-NEXT: movq %rax, -8(%rdi)
356 ; DISABLED-NEXT: movl %eax, -16(%rdi)
357 ; DISABLED-NEXT: movl $0, -11(%rdi)
358 ; DISABLED-NEXT: movups -16(%rdi), %xmm0
359 ; DISABLED-NEXT: movups %xmm0, 16(%rdi)
360 ; DISABLED-NEXT: retq
361 ;
362 ; CHECK-AVX2-LABEL: test_overlap_4:
363 ; CHECK-AVX2: # %bb.0: # %entry
364 ; CHECK-AVX2-NEXT: vmovups -16(%rdi), %xmm0
365 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rdi)
366 ; CHECK-AVX2-NEXT: movslq %esi, %rax
367 ; CHECK-AVX2-NEXT: movq %rax, -8(%rdi)
368 ; CHECK-AVX2-NEXT: movl %eax, -16(%rdi)
369 ; CHECK-AVX2-NEXT: movl $0, -11(%rdi)
370 ; CHECK-AVX2-NEXT: movl -16(%rdi), %eax
371 ; CHECK-AVX2-NEXT: movl %eax, 16(%rdi)
372 ; CHECK-AVX2-NEXT: movb -12(%rdi), %al
373 ; CHECK-AVX2-NEXT: movb %al, 20(%rdi)
374 ; CHECK-AVX2-NEXT: movl -11(%rdi), %eax
375 ; CHECK-AVX2-NEXT: movl %eax, 21(%rdi)
376 ; CHECK-AVX2-NEXT: movl -7(%rdi), %eax
377 ; CHECK-AVX2-NEXT: movl %eax, 25(%rdi)
378 ; CHECK-AVX2-NEXT: movzwl -3(%rdi), %eax
379 ; CHECK-AVX2-NEXT: movw %ax, 29(%rdi)
380 ; CHECK-AVX2-NEXT: movb -1(%rdi), %al
381 ; CHECK-AVX2-NEXT: movb %al, 31(%rdi)
382 ; CHECK-AVX2-NEXT: retq
383 ;
384 ; CHECK-AVX512-LABEL: test_overlap_4:
385 ; CHECK-AVX512: # %bb.0: # %entry
386 ; CHECK-AVX512-NEXT: vmovups -16(%rdi), %xmm0
387 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rdi)
388 ; CHECK-AVX512-NEXT: movslq %esi, %rax
389 ; CHECK-AVX512-NEXT: movq %rax, -8(%rdi)
390 ; CHECK-AVX512-NEXT: movl %eax, -16(%rdi)
391 ; CHECK-AVX512-NEXT: movl $0, -11(%rdi)
392 ; CHECK-AVX512-NEXT: movl -16(%rdi), %eax
393 ; CHECK-AVX512-NEXT: movl %eax, 16(%rdi)
394 ; CHECK-AVX512-NEXT: movb -12(%rdi), %al
395 ; CHECK-AVX512-NEXT: movb %al, 20(%rdi)
396 ; CHECK-AVX512-NEXT: movl -11(%rdi), %eax
397 ; CHECK-AVX512-NEXT: movl %eax, 21(%rdi)
398 ; CHECK-AVX512-NEXT: movl -7(%rdi), %eax
399 ; CHECK-AVX512-NEXT: movl %eax, 25(%rdi)
400 ; CHECK-AVX512-NEXT: movzwl -3(%rdi), %eax
401 ; CHECK-AVX512-NEXT: movw %ax, 29(%rdi)
402 ; CHECK-AVX512-NEXT: movb -1(%rdi), %al
403 ; CHECK-AVX512-NEXT: movb %al, 31(%rdi)
404 ; CHECK-AVX512-NEXT: retq
405 entry:
406 %add.ptr = getelementptr inbounds i8, i8* %A, i64 -16
407 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %A, i8* nonnull align 4 %add.ptr, i64 16, i1 false)
408 %conv = sext i32 %x to i64
409 %add.ptr1 = getelementptr inbounds i8, i8* %A, i64 -8
410 %0 = bitcast i8* %add.ptr1 to i64*
411 store i64 %conv, i64* %0, align 8
412 %1 = bitcast i8* %add.ptr to i32*
413 store i32 %x, i32* %1, align 4
414 %add.ptr3 = getelementptr inbounds i8, i8* %A, i64 -11
415 %2 = bitcast i8* %add.ptr3 to i32*
416 store i32 0, i32* %2, align 4
417 %add.ptr4 = getelementptr inbounds i8, i8* %A, i64 16
418 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %add.ptr4, i8* nonnull align 4 %add.ptr, i64 16, i1 false)
419 ret void
420 }
421
422 ; Function Attrs: nounwind uwtable
423 define dso_local void @test_overlap_5(i8* nocapture %A, i32 %x) local_unnamed_addr #0 {
424 ; CHECK-LABEL: test_overlap_5:
425 ; CHECK: # %bb.0: # %entry
426 ; CHECK-NEXT: movups -16(%rdi), %xmm0
427 ; CHECK-NEXT: movups %xmm0, (%rdi)
428 ; CHECK-NEXT: movslq %esi, %rax
429 ; CHECK-NEXT: movq %rax, -16(%rdi)
430 ; CHECK-NEXT: movb %al, -14(%rdi)
431 ; CHECK-NEXT: movb $0, -11(%rdi)
432 ; CHECK-NEXT: movzwl -16(%rdi), %eax
433 ; CHECK-NEXT: movw %ax, 16(%rdi)
434 ; CHECK-NEXT: movb -14(%rdi), %al
435 ; CHECK-NEXT: movb %al, 18(%rdi)
436 ; CHECK-NEXT: movzwl -13(%rdi), %eax
437 ; CHECK-NEXT: movw %ax, 19(%rdi)
438 ; CHECK-NEXT: movb -11(%rdi), %al
439 ; CHECK-NEXT: movb %al, 21(%rdi)
440 ; CHECK-NEXT: movq -10(%rdi), %rax
441 ; CHECK-NEXT: movq %rax, 22(%rdi)
442 ; CHECK-NEXT: movzwl -2(%rdi), %eax
443 ; CHECK-NEXT: movw %ax, 30(%rdi)
444 ; CHECK-NEXT: retq
445 ;
446 ; DISABLED-LABEL: test_overlap_5:
447 ; DISABLED: # %bb.0: # %entry
448 ; DISABLED-NEXT: movups -16(%rdi), %xmm0
449 ; DISABLED-NEXT: movups %xmm0, (%rdi)
450 ; DISABLED-NEXT: movslq %esi, %rax
451 ; DISABLED-NEXT: movq %rax, -16(%rdi)
452 ; DISABLED-NEXT: movb %al, -14(%rdi)
453 ; DISABLED-NEXT: movb $0, -11(%rdi)
454 ; DISABLED-NEXT: movups -16(%rdi), %xmm0
455 ; DISABLED-NEXT: movups %xmm0, 16(%rdi)
456 ; DISABLED-NEXT: retq
457 ;
458 ; CHECK-AVX2-LABEL: test_overlap_5:
459 ; CHECK-AVX2: # %bb.0: # %entry
460 ; CHECK-AVX2-NEXT: vmovups -16(%rdi), %xmm0
461 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rdi)
462 ; CHECK-AVX2-NEXT: movslq %esi, %rax
463 ; CHECK-AVX2-NEXT: movq %rax, -16(%rdi)
464 ; CHECK-AVX2-NEXT: movb %al, -14(%rdi)
465 ; CHECK-AVX2-NEXT: movb $0, -11(%rdi)
466 ; CHECK-AVX2-NEXT: movzwl -16(%rdi), %eax
467 ; CHECK-AVX2-NEXT: movw %ax, 16(%rdi)
468 ; CHECK-AVX2-NEXT: movb -14(%rdi), %al
469 ; CHECK-AVX2-NEXT: movb %al, 18(%rdi)
470 ; CHECK-AVX2-NEXT: movzwl -13(%rdi), %eax
471 ; CHECK-AVX2-NEXT: movw %ax, 19(%rdi)
472 ; CHECK-AVX2-NEXT: movb -11(%rdi), %al
473 ; CHECK-AVX2-NEXT: movb %al, 21(%rdi)
474 ; CHECK-AVX2-NEXT: movq -10(%rdi), %rax
475 ; CHECK-AVX2-NEXT: movq %rax, 22(%rdi)
476 ; CHECK-AVX2-NEXT: movzwl -2(%rdi), %eax
477 ; CHECK-AVX2-NEXT: movw %ax, 30(%rdi)
478 ; CHECK-AVX2-NEXT: retq
479 ;
480 ; CHECK-AVX512-LABEL: test_overlap_5:
481 ; CHECK-AVX512: # %bb.0: # %entry
482 ; CHECK-AVX512-NEXT: vmovups -16(%rdi), %xmm0
483 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rdi)
484 ; CHECK-AVX512-NEXT: movslq %esi, %rax
485 ; CHECK-AVX512-NEXT: movq %rax, -16(%rdi)
486 ; CHECK-AVX512-NEXT: movb %al, -14(%rdi)
487 ; CHECK-AVX512-NEXT: movb $0, -11(%rdi)
488 ; CHECK-AVX512-NEXT: movzwl -16(%rdi), %eax
489 ; CHECK-AVX512-NEXT: movw %ax, 16(%rdi)
490 ; CHECK-AVX512-NEXT: movb -14(%rdi), %al
491 ; CHECK-AVX512-NEXT: movb %al, 18(%rdi)
492 ; CHECK-AVX512-NEXT: movzwl -13(%rdi), %eax
493 ; CHECK-AVX512-NEXT: movw %ax, 19(%rdi)
494 ; CHECK-AVX512-NEXT: movb -11(%rdi), %al
495 ; CHECK-AVX512-NEXT: movb %al, 21(%rdi)
496 ; CHECK-AVX512-NEXT: movq -10(%rdi), %rax
497 ; CHECK-AVX512-NEXT: movq %rax, 22(%rdi)
498 ; CHECK-AVX512-NEXT: movzwl -2(%rdi), %eax
499 ; CHECK-AVX512-NEXT: movw %ax, 30(%rdi)
500 ; CHECK-AVX512-NEXT: retq
501 entry:
502 %add.ptr = getelementptr inbounds i8, i8* %A, i64 -16
503 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %A, i8* nonnull align 4 %add.ptr, i64 16, i1 false)
504 %conv = sext i32 %x to i64
505 %0 = bitcast i8* %add.ptr to i64*
506 store i64 %conv, i64* %0, align 8
507 %conv2 = trunc i32 %x to i8
508 %add.ptr3 = getelementptr inbounds i8, i8* %A, i64 -14
509 store i8 %conv2, i8* %add.ptr3, align 1
510 %add.ptr4 = getelementptr inbounds i8, i8* %A, i64 -11
511 store i8 0, i8* %add.ptr4, align 1
512 %add.ptr5 = getelementptr inbounds i8, i8* %A, i64 16
513 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %add.ptr5, i8* nonnull align 4 %add.ptr, i64 16, i1 false)
514 ret void
515 }
516
517 attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
518 attributes #1 = { argmemonly nounwind }
519
520
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=CHECK
2 ; RUN: llc < %s -mtriple=x86_64-linux --x86-disable-avoid-SFB | FileCheck %s --check-prefix=DISABLED
3 ; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK-AVX2
4 ; RUN: llc < %s -mtriple=x86_64-linux -mcpu=skx | FileCheck %s -check-prefix=CHECK-AVX512
5
6 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
7 target triple = "x86_64-unknown-linux-gnu"
8
9 %struct.S = type { i32, i32, i32, i32 }
10
11 ; Function Attrs: nounwind uwtable
12 define void @test_conditional_block(%struct.S* nocapture noalias %s1 , %struct.S* nocapture noalias %s2, i32 %x, %struct.S* nocapture noalias %s3, %struct.S* nocapture noalias readonly %s4) local_unnamed_addr #0 {
13 ; CHECK-LABEL: test_conditional_block:
14 ; CHECK: # %bb.0: # %entry
15 ; CHECK-NEXT: cmpl $18, %edx
16 ; CHECK-NEXT: jl .LBB0_2
17 ; CHECK-NEXT: # %bb.1: # %if.then
18 ; CHECK-NEXT: movl %edx, 4(%rdi)
19 ; CHECK-NEXT: .LBB0_2: # %if.end
20 ; CHECK-NEXT: movups (%r8), %xmm0
21 ; CHECK-NEXT: movups %xmm0, (%rcx)
22 ; CHECK-NEXT: movl (%rdi), %eax
23 ; CHECK-NEXT: movl %eax, (%rsi)
24 ; CHECK-NEXT: movl 4(%rdi), %eax
25 ; CHECK-NEXT: movl %eax, 4(%rsi)
26 ; CHECK-NEXT: movq 8(%rdi), %rax
27 ; CHECK-NEXT: movq %rax, 8(%rsi)
28 ; CHECK-NEXT: retq
29 ;
30 ; DISABLED-LABEL: test_conditional_block:
31 ; DISABLED: # %bb.0: # %entry
32 ; DISABLED-NEXT: cmpl $18, %edx
33 ; DISABLED-NEXT: jl .LBB0_2
34 ; DISABLED-NEXT: # %bb.1: # %if.then
35 ; DISABLED-NEXT: movl %edx, 4(%rdi)
36 ; DISABLED-NEXT: .LBB0_2: # %if.end
37 ; DISABLED-NEXT: movups (%r8), %xmm0
38 ; DISABLED-NEXT: movups %xmm0, (%rcx)
39 ; DISABLED-NEXT: movups (%rdi), %xmm0
40 ; DISABLED-NEXT: movups %xmm0, (%rsi)
41 ; DISABLED-NEXT: retq
42 ;
43 ; CHECK-AVX2-LABEL: test_conditional_block:
44 ; CHECK-AVX2: # %bb.0: # %entry
45 ; CHECK-AVX2-NEXT: cmpl $18, %edx
46 ; CHECK-AVX2-NEXT: jl .LBB0_2
47 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
48 ; CHECK-AVX2-NEXT: movl %edx, 4(%rdi)
49 ; CHECK-AVX2-NEXT: .LBB0_2: # %if.end
50 ; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
51 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
52 ; CHECK-AVX2-NEXT: movl (%rdi), %eax
53 ; CHECK-AVX2-NEXT: movl %eax, (%rsi)
54 ; CHECK-AVX2-NEXT: movl 4(%rdi), %eax
55 ; CHECK-AVX2-NEXT: movl %eax, 4(%rsi)
56 ; CHECK-AVX2-NEXT: movq 8(%rdi), %rax
57 ; CHECK-AVX2-NEXT: movq %rax, 8(%rsi)
58 ; CHECK-AVX2-NEXT: retq
59 ;
60 ; CHECK-AVX512-LABEL: test_conditional_block:
61 ; CHECK-AVX512: # %bb.0: # %entry
62 ; CHECK-AVX512-NEXT: cmpl $18, %edx
63 ; CHECK-AVX512-NEXT: jl .LBB0_2
64 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
65 ; CHECK-AVX512-NEXT: movl %edx, 4(%rdi)
66 ; CHECK-AVX512-NEXT: .LBB0_2: # %if.end
67 ; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
68 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
69 ; CHECK-AVX512-NEXT: movl (%rdi), %eax
70 ; CHECK-AVX512-NEXT: movl %eax, (%rsi)
71 ; CHECK-AVX512-NEXT: movl 4(%rdi), %eax
72 ; CHECK-AVX512-NEXT: movl %eax, 4(%rsi)
73 ; CHECK-AVX512-NEXT: movq 8(%rdi), %rax
74 ; CHECK-AVX512-NEXT: movq %rax, 8(%rsi)
75 ; CHECK-AVX512-NEXT: retq
76 entry:
77 %cmp = icmp sgt i32 %x, 17
78 br i1 %cmp, label %if.then, label %if.end
79
80 if.then: ; preds = %entry
81 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
82 store i32 %x, i32* %b, align 4
83 br label %if.end
84
85 if.end: ; preds = %if.then, %entry
86 %0 = bitcast %struct.S* %s3 to i8*
87 %1 = bitcast %struct.S* %s4 to i8*
88 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
89 %2 = bitcast %struct.S* %s2 to i8*
90 %3 = bitcast %struct.S* %s1 to i8*
91 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
92 ret void
93 }
94
95 ; Function Attrs: nounwind uwtable
96 define void @test_imm_store(%struct.S* nocapture noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3) local_unnamed_addr #0 {
97 ; CHECK-LABEL: test_imm_store:
98 ; CHECK: # %bb.0: # %entry
99 ; CHECK-NEXT: movl $0, (%rdi)
100 ; CHECK-NEXT: movl $1, (%rcx)
101 ; CHECK-NEXT: movl (%rdi), %eax
102 ; CHECK-NEXT: movl %eax, (%rsi)
103 ; CHECK-NEXT: movq 4(%rdi), %rax
104 ; CHECK-NEXT: movq %rax, 4(%rsi)
105 ; CHECK-NEXT: movl 12(%rdi), %eax
106 ; CHECK-NEXT: movl %eax, 12(%rsi)
107 ; CHECK-NEXT: retq
108 ;
109 ; DISABLED-LABEL: test_imm_store:
110 ; DISABLED: # %bb.0: # %entry
111 ; DISABLED-NEXT: movl $0, (%rdi)
112 ; DISABLED-NEXT: movl $1, (%rcx)
113 ; DISABLED-NEXT: movups (%rdi), %xmm0
114 ; DISABLED-NEXT: movups %xmm0, (%rsi)
115 ; DISABLED-NEXT: retq
116 ;
117 ; CHECK-AVX2-LABEL: test_imm_store:
118 ; CHECK-AVX2: # %bb.0: # %entry
119 ; CHECK-AVX2-NEXT: movl $0, (%rdi)
120 ; CHECK-AVX2-NEXT: movl $1, (%rcx)
121 ; CHECK-AVX2-NEXT: movl (%rdi), %eax
122 ; CHECK-AVX2-NEXT: movl %eax, (%rsi)
123 ; CHECK-AVX2-NEXT: movq 4(%rdi), %rax
124 ; CHECK-AVX2-NEXT: movq %rax, 4(%rsi)
125 ; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
126 ; CHECK-AVX2-NEXT: movl %eax, 12(%rsi)
127 ; CHECK-AVX2-NEXT: retq
128 ;
129 ; CHECK-AVX512-LABEL: test_imm_store:
130 ; CHECK-AVX512: # %bb.0: # %entry
131 ; CHECK-AVX512-NEXT: movl $0, (%rdi)
132 ; CHECK-AVX512-NEXT: movl $1, (%rcx)
133 ; CHECK-AVX512-NEXT: movl (%rdi), %eax
134 ; CHECK-AVX512-NEXT: movl %eax, (%rsi)
135 ; CHECK-AVX512-NEXT: movq 4(%rdi), %rax
136 ; CHECK-AVX512-NEXT: movq %rax, 4(%rsi)
137 ; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
138 ; CHECK-AVX512-NEXT: movl %eax, 12(%rsi)
139 ; CHECK-AVX512-NEXT: retq
140 entry:
141 %a = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 0
142 store i32 0, i32* %a, align 4
143 %a1 = getelementptr inbounds %struct.S, %struct.S* %s3, i64 0, i32 0
144 store i32 1, i32* %a1, align 4
145 %0 = bitcast %struct.S* %s2 to i8*
146 %1 = bitcast %struct.S* %s1 to i8*
147 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
148 ret void
149 }
150
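; Conditional stores at offsets 4 and 12 reach the copy through separate
; branches; the 16-byte copy of s1 is still decomposed (8/4/4 bytes) instead
; of staying a single movups pair.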
151 ; Function Attrs: nounwind uwtable
152 define void @test_nondirect_br(%struct.S* nocapture noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
153 ; CHECK-LABEL: test_nondirect_br:
154 ; CHECK: # %bb.0: # %entry
155 ; CHECK-NEXT: cmpl $18, %edx
156 ; CHECK-NEXT: jl .LBB2_2
157 ; CHECK-NEXT: # %bb.1: # %if.then
158 ; CHECK-NEXT: movl %edx, 4(%rdi)
159 ; CHECK-NEXT: .LBB2_2: # %if.end
160 ; CHECK-NEXT: cmpl $14, %r9d
161 ; CHECK-NEXT: jl .LBB2_4
162 ; CHECK-NEXT: # %bb.3: # %if.then2
163 ; CHECK-NEXT: movl %r9d, 12(%rdi)
164 ; CHECK-NEXT: .LBB2_4: # %if.end3
165 ; CHECK-NEXT: movups (%r8), %xmm0
166 ; CHECK-NEXT: movups %xmm0, (%rcx)
167 ; CHECK-NEXT: movq (%rdi), %rax
168 ; CHECK-NEXT: movq %rax, (%rsi)
169 ; CHECK-NEXT: movl 8(%rdi), %eax
170 ; CHECK-NEXT: movl %eax, 8(%rsi)
171 ; CHECK-NEXT: movl 12(%rdi), %eax
172 ; CHECK-NEXT: movl %eax, 12(%rsi)
173 ; CHECK-NEXT: retq
174 ;
175 ; DISABLED-LABEL: test_nondirect_br:
176 ; DISABLED: # %bb.0: # %entry
177 ; DISABLED-NEXT: cmpl $18, %edx
178 ; DISABLED-NEXT: jl .LBB2_2
179 ; DISABLED-NEXT: # %bb.1: # %if.then
180 ; DISABLED-NEXT: movl %edx, 4(%rdi)
181 ; DISABLED-NEXT: .LBB2_2: # %if.end
182 ; DISABLED-NEXT: cmpl $14, %r9d
183 ; DISABLED-NEXT: jl .LBB2_4
184 ; DISABLED-NEXT: # %bb.3: # %if.then2
185 ; DISABLED-NEXT: movl %r9d, 12(%rdi)
186 ; DISABLED-NEXT: .LBB2_4: # %if.end3
187 ; DISABLED-NEXT: movups (%r8), %xmm0
188 ; DISABLED-NEXT: movups %xmm0, (%rcx)
189 ; DISABLED-NEXT: movups (%rdi), %xmm0
190 ; DISABLED-NEXT: movups %xmm0, (%rsi)
191 ; DISABLED-NEXT: retq
192 ;
193 ; CHECK-AVX2-LABEL: test_nondirect_br:
194 ; CHECK-AVX2: # %bb.0: # %entry
195 ; CHECK-AVX2-NEXT: cmpl $18, %edx
196 ; CHECK-AVX2-NEXT: jl .LBB2_2
197 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
198 ; CHECK-AVX2-NEXT: movl %edx, 4(%rdi)
199 ; CHECK-AVX2-NEXT: .LBB2_2: # %if.end
200 ; CHECK-AVX2-NEXT: cmpl $14, %r9d
201 ; CHECK-AVX2-NEXT: jl .LBB2_4
202 ; CHECK-AVX2-NEXT: # %bb.3: # %if.then2
203 ; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi)
204 ; CHECK-AVX2-NEXT: .LBB2_4: # %if.end3
205 ; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
206 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
207 ; CHECK-AVX2-NEXT: movq (%rdi), %rax
208 ; CHECK-AVX2-NEXT: movq %rax, (%rsi)
209 ; CHECK-AVX2-NEXT: movl 8(%rdi), %eax
210 ; CHECK-AVX2-NEXT: movl %eax, 8(%rsi)
211 ; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
212 ; CHECK-AVX2-NEXT: movl %eax, 12(%rsi)
213 ; CHECK-AVX2-NEXT: retq
214 ;
215 ; CHECK-AVX512-LABEL: test_nondirect_br:
216 ; CHECK-AVX512: # %bb.0: # %entry
217 ; CHECK-AVX512-NEXT: cmpl $18, %edx
218 ; CHECK-AVX512-NEXT: jl .LBB2_2
219 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
220 ; CHECK-AVX512-NEXT: movl %edx, 4(%rdi)
221 ; CHECK-AVX512-NEXT: .LBB2_2: # %if.end
222 ; CHECK-AVX512-NEXT: cmpl $14, %r9d
223 ; CHECK-AVX512-NEXT: jl .LBB2_4
224 ; CHECK-AVX512-NEXT: # %bb.3: # %if.then2
225 ; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi)
226 ; CHECK-AVX512-NEXT: .LBB2_4: # %if.end3
227 ; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
228 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
229 ; CHECK-AVX512-NEXT: movq (%rdi), %rax
230 ; CHECK-AVX512-NEXT: movq %rax, (%rsi)
231 ; CHECK-AVX512-NEXT: movl 8(%rdi), %eax
232 ; CHECK-AVX512-NEXT: movl %eax, 8(%rsi)
233 ; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
234 ; CHECK-AVX512-NEXT: movl %eax, 12(%rsi)
235 ; CHECK-AVX512-NEXT: retq
236 entry:
237 %cmp = icmp sgt i32 %x, 17
238 br i1 %cmp, label %if.then, label %if.end
239
240 if.then: ; preds = %entry
241 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
242 store i32 %x, i32* %b, align 4
243 br label %if.end
244
245 if.end: ; preds = %if.then, %entry
246 %cmp1 = icmp sgt i32 %x2, 13
247 br i1 %cmp1, label %if.then2, label %if.end3
248
249 if.then2: ; preds = %if.end
250 %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
251 store i32 %x2, i32* %d, align 4
252 br label %if.end3
253
254 if.end3: ; preds = %if.then2, %if.end
255 %0 = bitcast %struct.S* %s3 to i8*
256 %1 = bitcast %struct.S* %s4 to i8*
257 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
258 %2 = bitcast %struct.S* %s2 to i8*
259 %3 = bitcast %struct.S* %s1 to i8*
260 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
261 ret void
262 }
263
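; The dword store at offset 12 is unconditional and the one at offset 4 is
; conditional; with both feeding if.end, the copy of s1 is split into four
; dword copies.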
264 ; Function Attrs: nounwind uwtable
265 define void @test_2preds_block(%struct.S* nocapture noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
266 ; CHECK-LABEL: test_2preds_block:
267 ; CHECK: # %bb.0: # %entry
268 ; CHECK-NEXT: movl %r9d, 12(%rdi)
269 ; CHECK-NEXT: cmpl $18, %edx
270 ; CHECK-NEXT: jl .LBB3_2
271 ; CHECK-NEXT: # %bb.1: # %if.then
272 ; CHECK-NEXT: movl %edx, 4(%rdi)
273 ; CHECK-NEXT: .LBB3_2: # %if.end
274 ; CHECK-NEXT: movups (%r8), %xmm0
275 ; CHECK-NEXT: movups %xmm0, (%rcx)
276 ; CHECK-NEXT: movl (%rdi), %eax
277 ; CHECK-NEXT: movl %eax, (%rsi)
278 ; CHECK-NEXT: movl 4(%rdi), %eax
279 ; CHECK-NEXT: movl %eax, 4(%rsi)
280 ; CHECK-NEXT: movl 8(%rdi), %eax
281 ; CHECK-NEXT: movl %eax, 8(%rsi)
282 ; CHECK-NEXT: movl 12(%rdi), %eax
283 ; CHECK-NEXT: movl %eax, 12(%rsi)
284 ; CHECK-NEXT: retq
285 ;
286 ; DISABLED-LABEL: test_2preds_block:
287 ; DISABLED: # %bb.0: # %entry
288 ; DISABLED-NEXT: movl %r9d, 12(%rdi)
289 ; DISABLED-NEXT: cmpl $18, %edx
290 ; DISABLED-NEXT: jl .LBB3_2
291 ; DISABLED-NEXT: # %bb.1: # %if.then
292 ; DISABLED-NEXT: movl %edx, 4(%rdi)
293 ; DISABLED-NEXT: .LBB3_2: # %if.end
294 ; DISABLED-NEXT: movups (%r8), %xmm0
295 ; DISABLED-NEXT: movups %xmm0, (%rcx)
296 ; DISABLED-NEXT: movups (%rdi), %xmm0
297 ; DISABLED-NEXT: movups %xmm0, (%rsi)
298 ; DISABLED-NEXT: retq
299 ;
300 ; CHECK-AVX2-LABEL: test_2preds_block:
301 ; CHECK-AVX2: # %bb.0: # %entry
302 ; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi)
303 ; CHECK-AVX2-NEXT: cmpl $18, %edx
304 ; CHECK-AVX2-NEXT: jl .LBB3_2
305 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
306 ; CHECK-AVX2-NEXT: movl %edx, 4(%rdi)
307 ; CHECK-AVX2-NEXT: .LBB3_2: # %if.end
308 ; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
309 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
310 ; CHECK-AVX2-NEXT: movl (%rdi), %eax
311 ; CHECK-AVX2-NEXT: movl %eax, (%rsi)
312 ; CHECK-AVX2-NEXT: movl 4(%rdi), %eax
313 ; CHECK-AVX2-NEXT: movl %eax, 4(%rsi)
314 ; CHECK-AVX2-NEXT: movl 8(%rdi), %eax
315 ; CHECK-AVX2-NEXT: movl %eax, 8(%rsi)
316 ; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
317 ; CHECK-AVX2-NEXT: movl %eax, 12(%rsi)
318 ; CHECK-AVX2-NEXT: retq
319 ;
320 ; CHECK-AVX512-LABEL: test_2preds_block:
321 ; CHECK-AVX512: # %bb.0: # %entry
322 ; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi)
323 ; CHECK-AVX512-NEXT: cmpl $18, %edx
324 ; CHECK-AVX512-NEXT: jl .LBB3_2
325 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
326 ; CHECK-AVX512-NEXT: movl %edx, 4(%rdi)
327 ; CHECK-AVX512-NEXT: .LBB3_2: # %if.end
328 ; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
329 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
330 ; CHECK-AVX512-NEXT: movl (%rdi), %eax
331 ; CHECK-AVX512-NEXT: movl %eax, (%rsi)
332 ; CHECK-AVX512-NEXT: movl 4(%rdi), %eax
333 ; CHECK-AVX512-NEXT: movl %eax, 4(%rsi)
334 ; CHECK-AVX512-NEXT: movl 8(%rdi), %eax
335 ; CHECK-AVX512-NEXT: movl %eax, 8(%rsi)
336 ; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
337 ; CHECK-AVX512-NEXT: movl %eax, 12(%rsi)
338 ; CHECK-AVX512-NEXT: retq
339 entry:
340 %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
341 store i32 %x2, i32* %d, align 4
342 %cmp = icmp sgt i32 %x, 17
343 br i1 %cmp, label %if.then, label %if.end
344
345 if.then: ; preds = %entry
346 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
347 store i32 %x, i32* %b, align 4
348 br label %if.end
349
350 if.end: ; preds = %if.then, %entry
351 %0 = bitcast %struct.S* %s3 to i8*
352 %1 = bitcast %struct.S* %s4 to i8*
353 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
354 %2 = bitcast %struct.S* %s2 to i8*
355 %3 = bitcast %struct.S* %s1 to i8*
356 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
357 ret void
358 }
359 %struct.S2 = type { i64, i64 }
360
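; i64 variant: a conditional qword store at offset 8 splits the 16-byte copy
; into two qword copies.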
361 ; Function Attrs: nounwind uwtable
362 define void @test_type64(%struct.S2* nocapture noalias %s1, %struct.S2* nocapture %s2, i32 %x, %struct.S2* nocapture %s3, %struct.S2* nocapture readonly %s4) local_unnamed_addr #0 {
363 ; CHECK-LABEL: test_type64:
364 ; CHECK: # %bb.0: # %entry
365 ; CHECK-NEXT: cmpl $18, %edx
366 ; CHECK-NEXT: jl .LBB4_2
367 ; CHECK-NEXT: # %bb.1: # %if.then
368 ; CHECK-NEXT: movslq %edx, %rax
369 ; CHECK-NEXT: movq %rax, 8(%rdi)
370 ; CHECK-NEXT: .LBB4_2: # %if.end
371 ; CHECK-NEXT: movups (%r8), %xmm0
372 ; CHECK-NEXT: movups %xmm0, (%rcx)
373 ; CHECK-NEXT: movq (%rdi), %rax
374 ; CHECK-NEXT: movq %rax, (%rsi)
375 ; CHECK-NEXT: movq 8(%rdi), %rax
376 ; CHECK-NEXT: movq %rax, 8(%rsi)
377 ; CHECK-NEXT: retq
378 ;
379 ; DISABLED-LABEL: test_type64:
380 ; DISABLED: # %bb.0: # %entry
381 ; DISABLED-NEXT: cmpl $18, %edx
382 ; DISABLED-NEXT: jl .LBB4_2
383 ; DISABLED-NEXT: # %bb.1: # %if.then
384 ; DISABLED-NEXT: movslq %edx, %rax
385 ; DISABLED-NEXT: movq %rax, 8(%rdi)
386 ; DISABLED-NEXT: .LBB4_2: # %if.end
387 ; DISABLED-NEXT: movups (%r8), %xmm0
388 ; DISABLED-NEXT: movups %xmm0, (%rcx)
389 ; DISABLED-NEXT: movups (%rdi), %xmm0
390 ; DISABLED-NEXT: movups %xmm0, (%rsi)
391 ; DISABLED-NEXT: retq
392 ;
393 ; CHECK-AVX2-LABEL: test_type64:
394 ; CHECK-AVX2: # %bb.0: # %entry
395 ; CHECK-AVX2-NEXT: cmpl $18, %edx
396 ; CHECK-AVX2-NEXT: jl .LBB4_2
397 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
398 ; CHECK-AVX2-NEXT: movslq %edx, %rax
399 ; CHECK-AVX2-NEXT: movq %rax, 8(%rdi)
400 ; CHECK-AVX2-NEXT: .LBB4_2: # %if.end
401 ; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
402 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
403 ; CHECK-AVX2-NEXT: movq (%rdi), %rax
404 ; CHECK-AVX2-NEXT: movq %rax, (%rsi)
405 ; CHECK-AVX2-NEXT: movq 8(%rdi), %rax
406 ; CHECK-AVX2-NEXT: movq %rax, 8(%rsi)
407 ; CHECK-AVX2-NEXT: retq
408 ;
409 ; CHECK-AVX512-LABEL: test_type64:
410 ; CHECK-AVX512: # %bb.0: # %entry
411 ; CHECK-AVX512-NEXT: cmpl $18, %edx
412 ; CHECK-AVX512-NEXT: jl .LBB4_2
413 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
414 ; CHECK-AVX512-NEXT: movslq %edx, %rax
415 ; CHECK-AVX512-NEXT: movq %rax, 8(%rdi)
416 ; CHECK-AVX512-NEXT: .LBB4_2: # %if.end
417 ; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
418 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
419 ; CHECK-AVX512-NEXT: movq (%rdi), %rax
420 ; CHECK-AVX512-NEXT: movq %rax, (%rsi)
421 ; CHECK-AVX512-NEXT: movq 8(%rdi), %rax
422 ; CHECK-AVX512-NEXT: movq %rax, 8(%rsi)
423 ; CHECK-AVX512-NEXT: retq
424 entry:
425 %cmp = icmp sgt i32 %x, 17
426 br i1 %cmp, label %if.then, label %if.end
427
428 if.then: ; preds = %entry
429 %conv = sext i32 %x to i64
430 %b = getelementptr inbounds %struct.S2, %struct.S2* %s1, i64 0, i32 1
431 store i64 %conv, i64* %b, align 8
432 br label %if.end
433
434 if.end: ; preds = %if.then, %entry
435 %0 = bitcast %struct.S2* %s3 to i8*
436 %1 = bitcast %struct.S2* %s4 to i8*
437 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 8, i1 false)
438 %2 = bitcast %struct.S2* %s2 to i8*
439 %3 = bitcast %struct.S2* %s1 to i8*
440 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 8, i1 false)
441 ret void
442 }
443 %struct.S3 = type { i64, i8, i8, i16, i32 }
444
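; Mixed-width stores (i64 at offset 0, i8 at offset 8): the copy is broken
; into 8/1/4/2/1-byte pieces, matching the field layout of struct.S3.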
445 ; Function Attrs: noinline nounwind uwtable
446 define void @test_mixed_type(%struct.S3* nocapture noalias %s1, %struct.S3* nocapture %s2, i32 %x, %struct.S3* nocapture readnone %s3, %struct.S3* nocapture readnone %s4) local_unnamed_addr #0 {
447 ; CHECK-LABEL: test_mixed_type:
448 ; CHECK: # %bb.0: # %entry
449 ; CHECK-NEXT: cmpl $18, %edx
450 ; CHECK-NEXT: jl .LBB5_2
451 ; CHECK-NEXT: # %bb.1: # %if.then
452 ; CHECK-NEXT: movslq %edx, %rax
453 ; CHECK-NEXT: movq %rax, (%rdi)
454 ; CHECK-NEXT: movb %dl, 8(%rdi)
455 ; CHECK-NEXT: .LBB5_2: # %if.end
456 ; CHECK-NEXT: movq (%rdi), %rax
457 ; CHECK-NEXT: movq %rax, (%rsi)
458 ; CHECK-NEXT: movb 8(%rdi), %al
459 ; CHECK-NEXT: movb %al, 8(%rsi)
460 ; CHECK-NEXT: movl 9(%rdi), %eax
461 ; CHECK-NEXT: movl %eax, 9(%rsi)
462 ; CHECK-NEXT: movzwl 13(%rdi), %eax
463 ; CHECK-NEXT: movw %ax, 13(%rsi)
464 ; CHECK-NEXT: movb 15(%rdi), %al
465 ; CHECK-NEXT: movb %al, 15(%rsi)
466 ; CHECK-NEXT: retq
467 ;
468 ; DISABLED-LABEL: test_mixed_type:
469 ; DISABLED: # %bb.0: # %entry
470 ; DISABLED-NEXT: cmpl $18, %edx
471 ; DISABLED-NEXT: jl .LBB5_2
472 ; DISABLED-NEXT: # %bb.1: # %if.then
473 ; DISABLED-NEXT: movslq %edx, %rax
474 ; DISABLED-NEXT: movq %rax, (%rdi)
475 ; DISABLED-NEXT: movb %dl, 8(%rdi)
476 ; DISABLED-NEXT: .LBB5_2: # %if.end
477 ; DISABLED-NEXT: movups (%rdi), %xmm0
478 ; DISABLED-NEXT: movups %xmm0, (%rsi)
479 ; DISABLED-NEXT: retq
480 ;
481 ; CHECK-AVX2-LABEL: test_mixed_type:
482 ; CHECK-AVX2: # %bb.0: # %entry
483 ; CHECK-AVX2-NEXT: cmpl $18, %edx
484 ; CHECK-AVX2-NEXT: jl .LBB5_2
485 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
486 ; CHECK-AVX2-NEXT: movslq %edx, %rax
487 ; CHECK-AVX2-NEXT: movq %rax, (%rdi)
488 ; CHECK-AVX2-NEXT: movb %dl, 8(%rdi)
489 ; CHECK-AVX2-NEXT: .LBB5_2: # %if.end
490 ; CHECK-AVX2-NEXT: movq (%rdi), %rax
491 ; CHECK-AVX2-NEXT: movq %rax, (%rsi)
492 ; CHECK-AVX2-NEXT: movb 8(%rdi), %al
493 ; CHECK-AVX2-NEXT: movb %al, 8(%rsi)
494 ; CHECK-AVX2-NEXT: movl 9(%rdi), %eax
495 ; CHECK-AVX2-NEXT: movl %eax, 9(%rsi)
496 ; CHECK-AVX2-NEXT: movzwl 13(%rdi), %eax
497 ; CHECK-AVX2-NEXT: movw %ax, 13(%rsi)
498 ; CHECK-AVX2-NEXT: movb 15(%rdi), %al
499 ; CHECK-AVX2-NEXT: movb %al, 15(%rsi)
500 ; CHECK-AVX2-NEXT: retq
501 ;
502 ; CHECK-AVX512-LABEL: test_mixed_type:
503 ; CHECK-AVX512: # %bb.0: # %entry
504 ; CHECK-AVX512-NEXT: cmpl $18, %edx
505 ; CHECK-AVX512-NEXT: jl .LBB5_2
506 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
507 ; CHECK-AVX512-NEXT: movslq %edx, %rax
508 ; CHECK-AVX512-NEXT: movq %rax, (%rdi)
509 ; CHECK-AVX512-NEXT: movb %dl, 8(%rdi)
510 ; CHECK-AVX512-NEXT: .LBB5_2: # %if.end
511 ; CHECK-AVX512-NEXT: movq (%rdi), %rax
512 ; CHECK-AVX512-NEXT: movq %rax, (%rsi)
513 ; CHECK-AVX512-NEXT: movb 8(%rdi), %al
514 ; CHECK-AVX512-NEXT: movb %al, 8(%rsi)
515 ; CHECK-AVX512-NEXT: movl 9(%rdi), %eax
516 ; CHECK-AVX512-NEXT: movl %eax, 9(%rsi)
517 ; CHECK-AVX512-NEXT: movzwl 13(%rdi), %eax
518 ; CHECK-AVX512-NEXT: movw %ax, 13(%rsi)
519 ; CHECK-AVX512-NEXT: movb 15(%rdi), %al
520 ; CHECK-AVX512-NEXT: movb %al, 15(%rsi)
521 ; CHECK-AVX512-NEXT: retq
522 entry:
523 %cmp = icmp sgt i32 %x, 17
524 br i1 %cmp, label %if.then, label %if.end
525
526 if.then: ; preds = %entry
527 %conv = sext i32 %x to i64
528 %a = getelementptr inbounds %struct.S3, %struct.S3* %s1, i64 0, i32 0
529 store i64 %conv, i64* %a, align 8
530 %conv1 = trunc i32 %x to i8
531 %b = getelementptr inbounds %struct.S3, %struct.S3* %s1, i64 0, i32 1
532 store i8 %conv1, i8* %b, align 8
533 br label %if.end
534
535 if.end: ; preds = %if.then, %entry
536 %0 = bitcast %struct.S3* %s2 to i8*
537 %1 = bitcast %struct.S3* %s1 to i8*
538 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 8, i1 false)
539 ret void
540 }
541 %struct.S4 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
542
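; 48-byte copy with dword stores at offsets 4 and 36: only the 16-byte chunks
; that overlap a store are decomposed; the untouched middle chunk remains a
; full vector copy.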
543 ; Function Attrs: nounwind uwtable
544 define void @test_multiple_blocks(%struct.S4* nocapture noalias %s1, %struct.S4* nocapture %s2) local_unnamed_addr #0 {
545 ; CHECK-LABEL: test_multiple_blocks:
546 ; CHECK: # %bb.0: # %entry
547 ; CHECK-NEXT: movl $0, 4(%rdi)
548 ; CHECK-NEXT: movl $0, 36(%rdi)
549 ; CHECK-NEXT: movups 16(%rdi), %xmm0
550 ; CHECK-NEXT: movups %xmm0, 16(%rsi)
551 ; CHECK-NEXT: movl 32(%rdi), %eax
552 ; CHECK-NEXT: movl %eax, 32(%rsi)
553 ; CHECK-NEXT: movl 36(%rdi), %eax
554 ; CHECK-NEXT: movl %eax, 36(%rsi)
555 ; CHECK-NEXT: movq 40(%rdi), %rax
556 ; CHECK-NEXT: movq %rax, 40(%rsi)
557 ; CHECK-NEXT: movl (%rdi), %eax
558 ; CHECK-NEXT: movl %eax, (%rsi)
559 ; CHECK-NEXT: movl 4(%rdi), %eax
560 ; CHECK-NEXT: movl %eax, 4(%rsi)
561 ; CHECK-NEXT: movq 8(%rdi), %rax
562 ; CHECK-NEXT: movq %rax, 8(%rsi)
563 ; CHECK-NEXT: retq
564 ;
565 ; DISABLED-LABEL: test_multiple_blocks:
566 ; DISABLED: # %bb.0: # %entry
567 ; DISABLED-NEXT: movl $0, 4(%rdi)
568 ; DISABLED-NEXT: movl $0, 36(%rdi)
569 ; DISABLED-NEXT: movups 16(%rdi), %xmm0
570 ; DISABLED-NEXT: movups %xmm0, 16(%rsi)
571 ; DISABLED-NEXT: movups 32(%rdi), %xmm0
572 ; DISABLED-NEXT: movups %xmm0, 32(%rsi)
573 ; DISABLED-NEXT: movups (%rdi), %xmm0
574 ; DISABLED-NEXT: movups %xmm0, (%rsi)
575 ; DISABLED-NEXT: retq
576 ;
577 ; CHECK-AVX2-LABEL: test_multiple_blocks:
578 ; CHECK-AVX2: # %bb.0: # %entry
579 ; CHECK-AVX2-NEXT: movl $0, 4(%rdi)
580 ; CHECK-AVX2-NEXT: movl $0, 36(%rdi)
581 ; CHECK-AVX2-NEXT: vmovups 16(%rdi), %xmm0
582 ; CHECK-AVX2-NEXT: vmovups %xmm0, 16(%rsi)
583 ; CHECK-AVX2-NEXT: movl 32(%rdi), %eax
584 ; CHECK-AVX2-NEXT: movl %eax, 32(%rsi)
585 ; CHECK-AVX2-NEXT: movl 36(%rdi), %eax
586 ; CHECK-AVX2-NEXT: movl %eax, 36(%rsi)
587 ; CHECK-AVX2-NEXT: movq 40(%rdi), %rax
588 ; CHECK-AVX2-NEXT: movq %rax, 40(%rsi)
589 ; CHECK-AVX2-NEXT: movl (%rdi), %eax
590 ; CHECK-AVX2-NEXT: movl %eax, (%rsi)
591 ; CHECK-AVX2-NEXT: movl 4(%rdi), %eax
592 ; CHECK-AVX2-NEXT: movl %eax, 4(%rsi)
593 ; CHECK-AVX2-NEXT: vmovups 8(%rdi), %xmm0
594 ; CHECK-AVX2-NEXT: vmovups %xmm0, 8(%rsi)
595 ; CHECK-AVX2-NEXT: movq 24(%rdi), %rax
596 ; CHECK-AVX2-NEXT: movq %rax, 24(%rsi)
597 ; CHECK-AVX2-NEXT: retq
598 ;
599 ; CHECK-AVX512-LABEL: test_multiple_blocks:
600 ; CHECK-AVX512: # %bb.0: # %entry
601 ; CHECK-AVX512-NEXT: movl $0, 4(%rdi)
602 ; CHECK-AVX512-NEXT: movl $0, 36(%rdi)
603 ; CHECK-AVX512-NEXT: vmovups 16(%rdi), %xmm0
604 ; CHECK-AVX512-NEXT: vmovups %xmm0, 16(%rsi)
605 ; CHECK-AVX512-NEXT: movl 32(%rdi), %eax
606 ; CHECK-AVX512-NEXT: movl %eax, 32(%rsi)
607 ; CHECK-AVX512-NEXT: movl 36(%rdi), %eax
608 ; CHECK-AVX512-NEXT: movl %eax, 36(%rsi)
609 ; CHECK-AVX512-NEXT: movq 40(%rdi), %rax
610 ; CHECK-AVX512-NEXT: movq %rax, 40(%rsi)
611 ; CHECK-AVX512-NEXT: movl (%rdi), %eax
612 ; CHECK-AVX512-NEXT: movl %eax, (%rsi)
613 ; CHECK-AVX512-NEXT: movl 4(%rdi), %eax
614 ; CHECK-AVX512-NEXT: movl %eax, 4(%rsi)
615 ; CHECK-AVX512-NEXT: vmovups 8(%rdi), %xmm0
616 ; CHECK-AVX512-NEXT: vmovups %xmm0, 8(%rsi)
617 ; CHECK-AVX512-NEXT: movq 24(%rdi), %rax
618 ; CHECK-AVX512-NEXT: movq %rax, 24(%rsi)
619 ; CHECK-AVX512-NEXT: retq
620 entry:
621 %b = getelementptr inbounds %struct.S4, %struct.S4* %s1, i64 0, i32 1
622 store i32 0, i32* %b, align 4
623 %b3 = getelementptr inbounds %struct.S4, %struct.S4* %s1, i64 0, i32 9
624 store i32 0, i32* %b3, align 4
625 %0 = bitcast %struct.S4* %s2 to i8*
626 %1 = bitcast %struct.S4* %s1 to i8*
627 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 48, i32 4, i1 false)
628 ret void
629 }
630 %struct.S5 = type { i16, i16, i16, i16, i16, i16, i16, i16 }
631
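; i16 variant: a conditional word store at offset 2 splits the copy into
; 2/2/8/4-byte pieces.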
632 ; Function Attrs: nounwind uwtable
633 define void @test_type16(%struct.S5* nocapture noalias %s1, %struct.S5* nocapture %s2, i32 %x, %struct.S5* nocapture %s3, %struct.S5* nocapture readonly %s4) local_unnamed_addr #0 {
634 ; CHECK-LABEL: test_type16:
635 ; CHECK: # %bb.0: # %entry
636 ; CHECK-NEXT: cmpl $18, %edx
637 ; CHECK-NEXT: jl .LBB7_2
638 ; CHECK-NEXT: # %bb.1: # %if.then
639 ; CHECK-NEXT: movw %dx, 2(%rdi)
640 ; CHECK-NEXT: .LBB7_2: # %if.end
641 ; CHECK-NEXT: movups (%r8), %xmm0
642 ; CHECK-NEXT: movups %xmm0, (%rcx)
643 ; CHECK-NEXT: movzwl (%rdi), %eax
644 ; CHECK-NEXT: movw %ax, (%rsi)
645 ; CHECK-NEXT: movzwl 2(%rdi), %eax
646 ; CHECK-NEXT: movw %ax, 2(%rsi)
647 ; CHECK-NEXT: movq 4(%rdi), %rax
648 ; CHECK-NEXT: movq %rax, 4(%rsi)
649 ; CHECK-NEXT: movl 12(%rdi), %eax
650 ; CHECK-NEXT: movl %eax, 12(%rsi)
651 ; CHECK-NEXT: retq
652 ;
653 ; DISABLED-LABEL: test_type16:
654 ; DISABLED: # %bb.0: # %entry
655 ; DISABLED-NEXT: cmpl $18, %edx
656 ; DISABLED-NEXT: jl .LBB7_2
657 ; DISABLED-NEXT: # %bb.1: # %if.then
658 ; DISABLED-NEXT: movw %dx, 2(%rdi)
659 ; DISABLED-NEXT: .LBB7_2: # %if.end
660 ; DISABLED-NEXT: movups (%r8), %xmm0
661 ; DISABLED-NEXT: movups %xmm0, (%rcx)
662 ; DISABLED-NEXT: movups (%rdi), %xmm0
663 ; DISABLED-NEXT: movups %xmm0, (%rsi)
664 ; DISABLED-NEXT: retq
665 ;
666 ; CHECK-AVX2-LABEL: test_type16:
667 ; CHECK-AVX2: # %bb.0: # %entry
668 ; CHECK-AVX2-NEXT: cmpl $18, %edx
669 ; CHECK-AVX2-NEXT: jl .LBB7_2
670 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
671 ; CHECK-AVX2-NEXT: movw %dx, 2(%rdi)
672 ; CHECK-AVX2-NEXT: .LBB7_2: # %if.end
673 ; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
674 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
675 ; CHECK-AVX2-NEXT: movzwl (%rdi), %eax
676 ; CHECK-AVX2-NEXT: movw %ax, (%rsi)
677 ; CHECK-AVX2-NEXT: movzwl 2(%rdi), %eax
678 ; CHECK-AVX2-NEXT: movw %ax, 2(%rsi)
679 ; CHECK-AVX2-NEXT: movq 4(%rdi), %rax
680 ; CHECK-AVX2-NEXT: movq %rax, 4(%rsi)
681 ; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
682 ; CHECK-AVX2-NEXT: movl %eax, 12(%rsi)
683 ; CHECK-AVX2-NEXT: retq
684 ;
685 ; CHECK-AVX512-LABEL: test_type16:
686 ; CHECK-AVX512: # %bb.0: # %entry
687 ; CHECK-AVX512-NEXT: cmpl $18, %edx
688 ; CHECK-AVX512-NEXT: jl .LBB7_2
689 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
690 ; CHECK-AVX512-NEXT: movw %dx, 2(%rdi)
691 ; CHECK-AVX512-NEXT: .LBB7_2: # %if.end
692 ; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
693 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
694 ; CHECK-AVX512-NEXT: movzwl (%rdi), %eax
695 ; CHECK-AVX512-NEXT: movw %ax, (%rsi)
696 ; CHECK-AVX512-NEXT: movzwl 2(%rdi), %eax
697 ; CHECK-AVX512-NEXT: movw %ax, 2(%rsi)
698 ; CHECK-AVX512-NEXT: movq 4(%rdi), %rax
699 ; CHECK-AVX512-NEXT: movq %rax, 4(%rsi)
700 ; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
701 ; CHECK-AVX512-NEXT: movl %eax, 12(%rsi)
702 ; CHECK-AVX512-NEXT: retq
703 entry:
704 %cmp = icmp sgt i32 %x, 17
705 br i1 %cmp, label %if.then, label %if.end
706
707 if.then: ; preds = %entry
708 %conv = trunc i32 %x to i16
709 %b = getelementptr inbounds %struct.S5, %struct.S5* %s1, i64 0, i32 1
710 store i16 %conv, i16* %b, align 2
711 br label %if.end
712
713 if.end: ; preds = %if.then, %entry
714 %0 = bitcast %struct.S5* %s3 to i8*
715 %1 = bitcast %struct.S5* %s4 to i8*
716 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 2, i1 false)
717 %2 = bitcast %struct.S5* %s2 to i8*
718 %3 = bitcast %struct.S5* %s1 to i8*
719 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 2, i1 false)
720 ret void
721 }
722
723 %struct.S6 = type { [4 x i32], i32, i32, i32, i32 }
724
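; byval/sret case: the blocking dword store goes to the stack slot of s2 at
; offset 24, and both 32-byte copies out of that slot are split around it
; (xmm/qword/dword/dword) instead of using paired 16-byte copies.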
725 ; Function Attrs: nounwind uwtable
726 define void @test_stack(%struct.S6* noalias nocapture sret %agg.result, %struct.S6* byval nocapture readnone align 8 %s1, %struct.S6* byval nocapture align 8 %s2, i32 %x) local_unnamed_addr #0 {
727 ; CHECK-LABEL: test_stack:
728 ; CHECK: # %bb.0: # %entry
729 ; CHECK-NEXT: movl %esi, {{[0-9]+}}(%rsp)
730 ; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
731 ; CHECK-NEXT: movups %xmm0, (%rdi)
732 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
733 ; CHECK-NEXT: movq %rax, 16(%rdi)
734 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
735 ; CHECK-NEXT: movl %eax, 24(%rdi)
736 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
737 ; CHECK-NEXT: movl %eax, 28(%rdi)
738 ; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
739 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
740 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
741 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx
742 ; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
743 ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
744 ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp)
745 ; CHECK-NEXT: movl %edx, {{[0-9]+}}(%rsp)
746 ; CHECK-NEXT: movq %rdi, %rax
747 ; CHECK-NEXT: retq
748 ;
749 ; DISABLED-LABEL: test_stack:
750 ; DISABLED: # %bb.0: # %entry
751 ; DISABLED-NEXT: movl %esi, {{[0-9]+}}(%rsp)
752 ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
753 ; DISABLED-NEXT: movups %xmm0, (%rdi)
754 ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
755 ; DISABLED-NEXT: movups %xmm0, 16(%rdi)
756 ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
757 ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
758 ; DISABLED-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
759 ; DISABLED-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
760 ; DISABLED-NEXT: movq %rdi, %rax
761 ; DISABLED-NEXT: retq
762 ;
763 ; CHECK-AVX2-LABEL: test_stack:
764 ; CHECK-AVX2: # %bb.0: # %entry
765 ; CHECK-AVX2-NEXT: movl %esi, {{[0-9]+}}(%rsp)
766 ; CHECK-AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
767 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%rdi)
768 ; CHECK-AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
769 ; CHECK-AVX2-NEXT: movq %rax, 16(%rdi)
770 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
771 ; CHECK-AVX2-NEXT: movl %eax, 24(%rdi)
772 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
773 ; CHECK-AVX2-NEXT: movl %eax, 28(%rdi)
774 ; CHECK-AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
775 ; CHECK-AVX2-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
776 ; CHECK-AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
777 ; CHECK-AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
778 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
779 ; CHECK-AVX2-NEXT: movl %eax, {{[0-9]+}}(%rsp)
780 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
781 ; CHECK-AVX2-NEXT: movl %eax, {{[0-9]+}}(%rsp)
782 ; CHECK-AVX2-NEXT: movq %rdi, %rax
783 ; CHECK-AVX2-NEXT: retq
784 ;
785 ; CHECK-AVX512-LABEL: test_stack:
786 ; CHECK-AVX512: # %bb.0: # %entry
787 ; CHECK-AVX512-NEXT: movl %esi, {{[0-9]+}}(%rsp)
788 ; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
789 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%rdi)
790 ; CHECK-AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
791 ; CHECK-AVX512-NEXT: movq %rax, 16(%rdi)
792 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %eax
793 ; CHECK-AVX512-NEXT: movl %eax, 24(%rdi)
794 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %eax
795 ; CHECK-AVX512-NEXT: movl %eax, 28(%rdi)
796 ; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
797 ; CHECK-AVX512-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
798 ; CHECK-AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
799 ; CHECK-AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
800 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %eax
801 ; CHECK-AVX512-NEXT: movl %eax, {{[0-9]+}}(%rsp)
802 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %eax
803 ; CHECK-AVX512-NEXT: movl %eax, {{[0-9]+}}(%rsp)
804 ; CHECK-AVX512-NEXT: movq %rdi, %rax
805 ; CHECK-AVX512-NEXT: retq
806 entry:
807 %s6.sroa.0.0..sroa_cast1 = bitcast %struct.S6* %s2 to i8*
808 %s6.sroa.3.0..sroa_idx4 = getelementptr inbounds %struct.S6, %struct.S6* %s2, i64 0, i32 3
809 store i32 %x, i32* %s6.sroa.3.0..sroa_idx4, align 8
810 %0 = bitcast %struct.S6* %agg.result to i8*
811 %s6.sroa.0.0..sroa_cast2 = bitcast %struct.S6* %s1 to i8*
812 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* nonnull %s6.sroa.0.0..sroa_cast1, i64 32, i32 4, i1 false)
813 call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %s6.sroa.0.0..sroa_cast2, i8* nonnull %s6.sroa.0.0..sroa_cast1, i64 32, i32 4, i1 false)
815 ret void
816 }
817
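; A call to @bar separates the stores from the copy on every path, so no
; splitting is done and the CHECK output matches the DISABLED run.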
818 ; Function Attrs: nounwind uwtable
819 define void @test_limit_all(%struct.S* noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
820 ; CHECK-LABEL: test_limit_all:
821 ; CHECK: # %bb.0: # %entry
822 ; CHECK-NEXT: pushq %rbp
823 ; CHECK-NEXT: .cfi_def_cfa_offset 16
824 ; CHECK-NEXT: pushq %r15
825 ; CHECK-NEXT: .cfi_def_cfa_offset 24
826 ; CHECK-NEXT: pushq %r14
827 ; CHECK-NEXT: .cfi_def_cfa_offset 32
828 ; CHECK-NEXT: pushq %r12
829 ; CHECK-NEXT: .cfi_def_cfa_offset 40
830 ; CHECK-NEXT: pushq %rbx
831 ; CHECK-NEXT: .cfi_def_cfa_offset 48
832 ; CHECK-NEXT: .cfi_offset %rbx, -48
833 ; CHECK-NEXT: .cfi_offset %r12, -40
834 ; CHECK-NEXT: .cfi_offset %r14, -32
835 ; CHECK-NEXT: .cfi_offset %r15, -24
836 ; CHECK-NEXT: .cfi_offset %rbp, -16
837 ; CHECK-NEXT: movq %r8, %r15
838 ; CHECK-NEXT: movq %rcx, %r14
839 ; CHECK-NEXT: movl %edx, %ebp
840 ; CHECK-NEXT: movq %rsi, %r12
841 ; CHECK-NEXT: movq %rdi, %rbx
842 ; CHECK-NEXT: movl %r9d, 12(%rdi)
843 ; CHECK-NEXT: callq bar
844 ; CHECK-NEXT: cmpl $18, %ebp
845 ; CHECK-NEXT: jl .LBB9_2
846 ; CHECK-NEXT: # %bb.1: # %if.then
847 ; CHECK-NEXT: movl %ebp, 4(%rbx)
848 ; CHECK-NEXT: movq %rbx, %rdi
849 ; CHECK-NEXT: callq bar
850 ; CHECK-NEXT: .LBB9_2: # %if.end
851 ; CHECK-NEXT: movups (%r15), %xmm0
852 ; CHECK-NEXT: movups %xmm0, (%r14)
853 ; CHECK-NEXT: movups (%rbx), %xmm0
854 ; CHECK-NEXT: movups %xmm0, (%r12)
855 ; CHECK-NEXT: popq %rbx
856 ; CHECK-NEXT: popq %r12
857 ; CHECK-NEXT: popq %r14
858 ; CHECK-NEXT: popq %r15
859 ; CHECK-NEXT: popq %rbp
860 ; CHECK-NEXT: retq
861 ;
862 ; DISABLED-LABEL: test_limit_all:
863 ; DISABLED: # %bb.0: # %entry
864 ; DISABLED-NEXT: pushq %rbp
865 ; DISABLED-NEXT: .cfi_def_cfa_offset 16
866 ; DISABLED-NEXT: pushq %r15
867 ; DISABLED-NEXT: .cfi_def_cfa_offset 24
868 ; DISABLED-NEXT: pushq %r14
869 ; DISABLED-NEXT: .cfi_def_cfa_offset 32
870 ; DISABLED-NEXT: pushq %r12
871 ; DISABLED-NEXT: .cfi_def_cfa_offset 40
872 ; DISABLED-NEXT: pushq %rbx
873 ; DISABLED-NEXT: .cfi_def_cfa_offset 48
874 ; DISABLED-NEXT: .cfi_offset %rbx, -48
875 ; DISABLED-NEXT: .cfi_offset %r12, -40
876 ; DISABLED-NEXT: .cfi_offset %r14, -32
877 ; DISABLED-NEXT: .cfi_offset %r15, -24
878 ; DISABLED-NEXT: .cfi_offset %rbp, -16
879 ; DISABLED-NEXT: movq %r8, %r15
880 ; DISABLED-NEXT: movq %rcx, %r14
881 ; DISABLED-NEXT: movl %edx, %ebp
882 ; DISABLED-NEXT: movq %rsi, %r12
883 ; DISABLED-NEXT: movq %rdi, %rbx
884 ; DISABLED-NEXT: movl %r9d, 12(%rdi)
885 ; DISABLED-NEXT: callq bar
886 ; DISABLED-NEXT: cmpl $18, %ebp
887 ; DISABLED-NEXT: jl .LBB9_2
888 ; DISABLED-NEXT: # %bb.1: # %if.then
889 ; DISABLED-NEXT: movl %ebp, 4(%rbx)
890 ; DISABLED-NEXT: movq %rbx, %rdi
891 ; DISABLED-NEXT: callq bar
892 ; DISABLED-NEXT: .LBB9_2: # %if.end
893 ; DISABLED-NEXT: movups (%r15), %xmm0
894 ; DISABLED-NEXT: movups %xmm0, (%r14)
895 ; DISABLED-NEXT: movups (%rbx), %xmm0
896 ; DISABLED-NEXT: movups %xmm0, (%r12)
897 ; DISABLED-NEXT: popq %rbx
898 ; DISABLED-NEXT: popq %r12
899 ; DISABLED-NEXT: popq %r14
900 ; DISABLED-NEXT: popq %r15
901 ; DISABLED-NEXT: popq %rbp
902 ; DISABLED-NEXT: retq
903 ;
904 ; CHECK-AVX2-LABEL: test_limit_all:
905 ; CHECK-AVX2: # %bb.0: # %entry
906 ; CHECK-AVX2-NEXT: pushq %rbp
907 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16
908 ; CHECK-AVX2-NEXT: pushq %r15
909 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 24
910 ; CHECK-AVX2-NEXT: pushq %r14
911 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 32
912 ; CHECK-AVX2-NEXT: pushq %r12
913 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 40
914 ; CHECK-AVX2-NEXT: pushq %rbx
915 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48
916 ; CHECK-AVX2-NEXT: .cfi_offset %rbx, -48
917 ; CHECK-AVX2-NEXT: .cfi_offset %r12, -40
918 ; CHECK-AVX2-NEXT: .cfi_offset %r14, -32
919 ; CHECK-AVX2-NEXT: .cfi_offset %r15, -24
920 ; CHECK-AVX2-NEXT: .cfi_offset %rbp, -16
921 ; CHECK-AVX2-NEXT: movq %r8, %r15
922 ; CHECK-AVX2-NEXT: movq %rcx, %r14
923 ; CHECK-AVX2-NEXT: movl %edx, %ebp
924 ; CHECK-AVX2-NEXT: movq %rsi, %r12
925 ; CHECK-AVX2-NEXT: movq %rdi, %rbx
926 ; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi)
927 ; CHECK-AVX2-NEXT: callq bar
928 ; CHECK-AVX2-NEXT: cmpl $18, %ebp
929 ; CHECK-AVX2-NEXT: jl .LBB9_2
930 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
931 ; CHECK-AVX2-NEXT: movl %ebp, 4(%rbx)
932 ; CHECK-AVX2-NEXT: movq %rbx, %rdi
933 ; CHECK-AVX2-NEXT: callq bar
934 ; CHECK-AVX2-NEXT: .LBB9_2: # %if.end
935 ; CHECK-AVX2-NEXT: vmovups (%r15), %xmm0
936 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%r14)
937 ; CHECK-AVX2-NEXT: vmovups (%rbx), %xmm0
938 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%r12)
939 ; CHECK-AVX2-NEXT: popq %rbx
940 ; CHECK-AVX2-NEXT: popq %r12
941 ; CHECK-AVX2-NEXT: popq %r14
942 ; CHECK-AVX2-NEXT: popq %r15
943 ; CHECK-AVX2-NEXT: popq %rbp
944 ; CHECK-AVX2-NEXT: retq
945 ;
946 ; CHECK-AVX512-LABEL: test_limit_all:
947 ; CHECK-AVX512: # %bb.0: # %entry
948 ; CHECK-AVX512-NEXT: pushq %rbp
949 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 16
950 ; CHECK-AVX512-NEXT: pushq %r15
951 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 24
952 ; CHECK-AVX512-NEXT: pushq %r14
953 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 32
954 ; CHECK-AVX512-NEXT: pushq %r12
955 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 40
956 ; CHECK-AVX512-NEXT: pushq %rbx
957 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 48
958 ; CHECK-AVX512-NEXT: .cfi_offset %rbx, -48
959 ; CHECK-AVX512-NEXT: .cfi_offset %r12, -40
960 ; CHECK-AVX512-NEXT: .cfi_offset %r14, -32
961 ; CHECK-AVX512-NEXT: .cfi_offset %r15, -24
962 ; CHECK-AVX512-NEXT: .cfi_offset %rbp, -16
963 ; CHECK-AVX512-NEXT: movq %r8, %r15
964 ; CHECK-AVX512-NEXT: movq %rcx, %r14
965 ; CHECK-AVX512-NEXT: movl %edx, %ebp
966 ; CHECK-AVX512-NEXT: movq %rsi, %r12
967 ; CHECK-AVX512-NEXT: movq %rdi, %rbx
968 ; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi)
969 ; CHECK-AVX512-NEXT: callq bar
970 ; CHECK-AVX512-NEXT: cmpl $18, %ebp
971 ; CHECK-AVX512-NEXT: jl .LBB9_2
972 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
973 ; CHECK-AVX512-NEXT: movl %ebp, 4(%rbx)
974 ; CHECK-AVX512-NEXT: movq %rbx, %rdi
975 ; CHECK-AVX512-NEXT: callq bar
976 ; CHECK-AVX512-NEXT: .LBB9_2: # %if.end
977 ; CHECK-AVX512-NEXT: vmovups (%r15), %xmm0
978 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%r14)
979 ; CHECK-AVX512-NEXT: vmovups (%rbx), %xmm0
980 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%r12)
981 ; CHECK-AVX512-NEXT: popq %rbx
982 ; CHECK-AVX512-NEXT: popq %r12
983 ; CHECK-AVX512-NEXT: popq %r14
984 ; CHECK-AVX512-NEXT: popq %r15
985 ; CHECK-AVX512-NEXT: popq %rbp
986 ; CHECK-AVX512-NEXT: retq
987 entry:
988 %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
989 store i32 %x2, i32* %d, align 4
990 tail call void @bar(%struct.S* %s1) #3
991 %cmp = icmp sgt i32 %x, 17
992 br i1 %cmp, label %if.then, label %if.end
993
994 if.then: ; preds = %entry
995 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
996 store i32 %x, i32* %b, align 4
997 tail call void @bar(%struct.S* nonnull %s1) #3
998 br label %if.end
999
1000 if.end: ; preds = %if.then, %entry
1001 %0 = bitcast %struct.S* %s3 to i8*
1002 %1 = bitcast %struct.S* %s4 to i8*
1003 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
1004 %2 = bitcast %struct.S* %s2 to i8*
1005 %3 = bitcast %struct.S* %s1 to i8*
1006 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
1007 ret void
1008 }
1009
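; A call intervenes on only one predecessor; the copy is split around the
; unconditional store at offset 12 (8/4/4 bytes) but not around the store in
; the block that contains the call.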
1010 ; Function Attrs: nounwind uwtable
1011 define void @test_limit_one_pred(%struct.S* noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
1012 ; CHECK-LABEL: test_limit_one_pred:
1013 ; CHECK: # %bb.0: # %entry
1014 ; CHECK-NEXT: pushq %r15
1015 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1016 ; CHECK-NEXT: pushq %r14
1017 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1018 ; CHECK-NEXT: pushq %r12
1019 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1020 ; CHECK-NEXT: pushq %rbx
1021 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1022 ; CHECK-NEXT: pushq %rax
1023 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1024 ; CHECK-NEXT: .cfi_offset %rbx, -40
1025 ; CHECK-NEXT: .cfi_offset %r12, -32
1026 ; CHECK-NEXT: .cfi_offset %r14, -24
1027 ; CHECK-NEXT: .cfi_offset %r15, -16
1028 ; CHECK-NEXT: movq %r8, %r12
1029 ; CHECK-NEXT: movq %rcx, %r15
1030 ; CHECK-NEXT: movq %rsi, %r14
1031 ; CHECK-NEXT: movq %rdi, %rbx
1032 ; CHECK-NEXT: movl %r9d, 12(%rdi)
1033 ; CHECK-NEXT: cmpl $18, %edx
1034 ; CHECK-NEXT: jl .LBB10_2
1035 ; CHECK-NEXT: # %bb.1: # %if.then
1036 ; CHECK-NEXT: movl %edx, 4(%rbx)
1037 ; CHECK-NEXT: movq %rbx, %rdi
1038 ; CHECK-NEXT: callq bar
1039 ; CHECK-NEXT: .LBB10_2: # %if.end
1040 ; CHECK-NEXT: movups (%r12), %xmm0
1041 ; CHECK-NEXT: movups %xmm0, (%r15)
1042 ; CHECK-NEXT: movq (%rbx), %rax
1043 ; CHECK-NEXT: movq %rax, (%r14)
1044 ; CHECK-NEXT: movl 8(%rbx), %eax
1045 ; CHECK-NEXT: movl %eax, 8(%r14)
1046 ; CHECK-NEXT: movl 12(%rbx), %eax
1047 ; CHECK-NEXT: movl %eax, 12(%r14)
1048 ; CHECK-NEXT: addq $8, %rsp
1049 ; CHECK-NEXT: popq %rbx
1050 ; CHECK-NEXT: popq %r12
1051 ; CHECK-NEXT: popq %r14
1052 ; CHECK-NEXT: popq %r15
1053 ; CHECK-NEXT: retq
1054 ;
1055 ; DISABLED-LABEL: test_limit_one_pred:
1056 ; DISABLED: # %bb.0: # %entry
1057 ; DISABLED-NEXT: pushq %r15
1058 ; DISABLED-NEXT: .cfi_def_cfa_offset 16
1059 ; DISABLED-NEXT: pushq %r14
1060 ; DISABLED-NEXT: .cfi_def_cfa_offset 24
1061 ; DISABLED-NEXT: pushq %r12
1062 ; DISABLED-NEXT: .cfi_def_cfa_offset 32
1063 ; DISABLED-NEXT: pushq %rbx
1064 ; DISABLED-NEXT: .cfi_def_cfa_offset 40
1065 ; DISABLED-NEXT: pushq %rax
1066 ; DISABLED-NEXT: .cfi_def_cfa_offset 48
1067 ; DISABLED-NEXT: .cfi_offset %rbx, -40
1068 ; DISABLED-NEXT: .cfi_offset %r12, -32
1069 ; DISABLED-NEXT: .cfi_offset %r14, -24
1070 ; DISABLED-NEXT: .cfi_offset %r15, -16
1071 ; DISABLED-NEXT: movq %r8, %r15
1072 ; DISABLED-NEXT: movq %rcx, %r14
1073 ; DISABLED-NEXT: movq %rsi, %r12
1074 ; DISABLED-NEXT: movq %rdi, %rbx
1075 ; DISABLED-NEXT: movl %r9d, 12(%rdi)
1076 ; DISABLED-NEXT: cmpl $18, %edx
1077 ; DISABLED-NEXT: jl .LBB10_2
1078 ; DISABLED-NEXT: # %bb.1: # %if.then
1079 ; DISABLED-NEXT: movl %edx, 4(%rbx)
1080 ; DISABLED-NEXT: movq %rbx, %rdi
1081 ; DISABLED-NEXT: callq bar
1082 ; DISABLED-NEXT: .LBB10_2: # %if.end
1083 ; DISABLED-NEXT: movups (%r15), %xmm0
1084 ; DISABLED-NEXT: movups %xmm0, (%r14)
1085 ; DISABLED-NEXT: movups (%rbx), %xmm0
1086 ; DISABLED-NEXT: movups %xmm0, (%r12)
1087 ; DISABLED-NEXT: addq $8, %rsp
1088 ; DISABLED-NEXT: popq %rbx
1089 ; DISABLED-NEXT: popq %r12
1090 ; DISABLED-NEXT: popq %r14
1091 ; DISABLED-NEXT: popq %r15
1092 ; DISABLED-NEXT: retq
1093 ;
1094 ; CHECK-AVX2-LABEL: test_limit_one_pred:
1095 ; CHECK-AVX2: # %bb.0: # %entry
1096 ; CHECK-AVX2-NEXT: pushq %r15
1097 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16
1098 ; CHECK-AVX2-NEXT: pushq %r14
1099 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 24
1100 ; CHECK-AVX2-NEXT: pushq %r12
1101 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 32
1102 ; CHECK-AVX2-NEXT: pushq %rbx
1103 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 40
1104 ; CHECK-AVX2-NEXT: pushq %rax
1105 ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48
1106 ; CHECK-AVX2-NEXT: .cfi_offset %rbx, -40
1107 ; CHECK-AVX2-NEXT: .cfi_offset %r12, -32
1108 ; CHECK-AVX2-NEXT: .cfi_offset %r14, -24
1109 ; CHECK-AVX2-NEXT: .cfi_offset %r15, -16
1110 ; CHECK-AVX2-NEXT: movq %r8, %r12
1111 ; CHECK-AVX2-NEXT: movq %rcx, %r15
1112 ; CHECK-AVX2-NEXT: movq %rsi, %r14
1113 ; CHECK-AVX2-NEXT: movq %rdi, %rbx
1114 ; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi)
1115 ; CHECK-AVX2-NEXT: cmpl $18, %edx
1116 ; CHECK-AVX2-NEXT: jl .LBB10_2
1117 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
1118 ; CHECK-AVX2-NEXT: movl %edx, 4(%rbx)
1119 ; CHECK-AVX2-NEXT: movq %rbx, %rdi
1120 ; CHECK-AVX2-NEXT: callq bar
1121 ; CHECK-AVX2-NEXT: .LBB10_2: # %if.end
1122 ; CHECK-AVX2-NEXT: vmovups (%r12), %xmm0
1123 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%r15)
1124 ; CHECK-AVX2-NEXT: movq (%rbx), %rax
1125 ; CHECK-AVX2-NEXT: movq %rax, (%r14)
1126 ; CHECK-AVX2-NEXT: movl 8(%rbx), %eax
1127 ; CHECK-AVX2-NEXT: movl %eax, 8(%r14)
1128 ; CHECK-AVX2-NEXT: movl 12(%rbx), %eax
1129 ; CHECK-AVX2-NEXT: movl %eax, 12(%r14)
1130 ; CHECK-AVX2-NEXT: addq $8, %rsp
1131 ; CHECK-AVX2-NEXT: popq %rbx
1132 ; CHECK-AVX2-NEXT: popq %r12
1133 ; CHECK-AVX2-NEXT: popq %r14
1134 ; CHECK-AVX2-NEXT: popq %r15
1135 ; CHECK-AVX2-NEXT: retq
1136 ;
1137 ; CHECK-AVX512-LABEL: test_limit_one_pred:
1138 ; CHECK-AVX512: # %bb.0: # %entry
1139 ; CHECK-AVX512-NEXT: pushq %r15
1140 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 16
1141 ; CHECK-AVX512-NEXT: pushq %r14
1142 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 24
1143 ; CHECK-AVX512-NEXT: pushq %r12
1144 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 32
1145 ; CHECK-AVX512-NEXT: pushq %rbx
1146 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 40
1147 ; CHECK-AVX512-NEXT: pushq %rax
1148 ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 48
1149 ; CHECK-AVX512-NEXT: .cfi_offset %rbx, -40
1150 ; CHECK-AVX512-NEXT: .cfi_offset %r12, -32
1151 ; CHECK-AVX512-NEXT: .cfi_offset %r14, -24
1152 ; CHECK-AVX512-NEXT: .cfi_offset %r15, -16
1153 ; CHECK-AVX512-NEXT: movq %r8, %r12
1154 ; CHECK-AVX512-NEXT: movq %rcx, %r15
1155 ; CHECK-AVX512-NEXT: movq %rsi, %r14
1156 ; CHECK-AVX512-NEXT: movq %rdi, %rbx
1157 ; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi)
1158 ; CHECK-AVX512-NEXT: cmpl $18, %edx
1159 ; CHECK-AVX512-NEXT: jl .LBB10_2
1160 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
1161 ; CHECK-AVX512-NEXT: movl %edx, 4(%rbx)
1162 ; CHECK-AVX512-NEXT: movq %rbx, %rdi
1163 ; CHECK-AVX512-NEXT: callq bar
1164 ; CHECK-AVX512-NEXT: .LBB10_2: # %if.end
1165 ; CHECK-AVX512-NEXT: vmovups (%r12), %xmm0
1166 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%r15)
1167 ; CHECK-AVX512-NEXT: movq (%rbx), %rax
1168 ; CHECK-AVX512-NEXT: movq %rax, (%r14)
1169 ; CHECK-AVX512-NEXT: movl 8(%rbx), %eax
1170 ; CHECK-AVX512-NEXT: movl %eax, 8(%r14)
1171 ; CHECK-AVX512-NEXT: movl 12(%rbx), %eax
1172 ; CHECK-AVX512-NEXT: movl %eax, 12(%r14)
1173 ; CHECK-AVX512-NEXT: addq $8, %rsp
1174 ; CHECK-AVX512-NEXT: popq %rbx
1175 ; CHECK-AVX512-NEXT: popq %r12
1176 ; CHECK-AVX512-NEXT: popq %r14
1177 ; CHECK-AVX512-NEXT: popq %r15
1178 ; CHECK-AVX512-NEXT: retq
1179 entry:
1180 %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
1181 store i32 %x2, i32* %d, align 4
1182 %cmp = icmp sgt i32 %x, 17
1183 br i1 %cmp, label %if.then, label %if.end
1184
1185 if.then: ; preds = %entry
1186 %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
1187 store i32 %x, i32* %b, align 4
1188 tail call void @bar(%struct.S* nonnull %s1) #3
1189 br label %if.end
1190
1191 if.end: ; preds = %if.then, %entry
1192 %0 = bitcast %struct.S* %s3 to i8*
1193 %1 = bitcast %struct.S* %s4 to i8*
1194 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
1195 %2 = bitcast %struct.S* %s2 to i8*
1196 %3 = bitcast %struct.S* %s1 to i8*
1197 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
1198 ret void
1199 }
1200
1201
1202 declare void @bar(%struct.S*) local_unnamed_addr #1
1203
1204
1205 ; Function Attrs: argmemonly nounwind
1206 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
1207
1208 attributes #0 = { nounwind uwtable "target-cpu"="x86-64" }
1209
1210 %struct.S7 = type { float, float, float, float, float, float, float, float }
1211
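; 32-byte float struct with a conditional store of 1.0 at offset 4: each RUN
; splits the copy into scalar and vector pieces around the stored float; see
; that RUN's CHECK lines for the exact split.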
1212 ; Function Attrs: nounwind uwtable
1213 define void @test_conditional_block_float(%struct.S7* nocapture noalias %s1, %struct.S7* nocapture %s2, i32 %x, %struct.S7* nocapture %s3, %struct.S7* nocapture readonly %s4, float %y) local_unnamed_addr #0 {
1214 ; CHECK-LABEL: test_conditional_block_float:
1215 ; CHECK: # %bb.0: # %entry
1216 ; CHECK-NEXT: cmpl $18, %edx
1217 ; CHECK-NEXT: jl .LBB11_2
1218 ; CHECK-NEXT: # %bb.1: # %if.then
1219 ; CHECK-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000
1220 ; CHECK-NEXT: .LBB11_2: # %if.end
1221 ; CHECK-NEXT: movups (%r8), %xmm0
1222 ; CHECK-NEXT: movups 16(%r8), %xmm1
1223 ; CHECK-NEXT: movups %xmm1, 16(%rcx)
1224 ; CHECK-NEXT: movups %xmm0, (%rcx)
1225 ; CHECK-NEXT: movl (%rdi), %eax
1226 ; CHECK-NEXT: movl 4(%rdi), %ecx
1227 ; CHECK-NEXT: movq 8(%rdi), %rdx
1228 ; CHECK-NEXT: movups 16(%rdi), %xmm0
1229 ; CHECK-NEXT: movups %xmm0, 16(%rsi)
1230 ; CHECK-NEXT: movl %eax, (%rsi)
1231 ; CHECK-NEXT: movl %ecx, 4(%rsi)
1232 ; CHECK-NEXT: movq %rdx, 8(%rsi)
1233 ; CHECK-NEXT: retq
1234 ;
1235 ; DISABLED-LABEL: test_conditional_block_float:
1236 ; DISABLED: # %bb.0: # %entry
1237 ; DISABLED-NEXT: cmpl $18, %edx
1238 ; DISABLED-NEXT: jl .LBB11_2
1239 ; DISABLED-NEXT: # %bb.1: # %if.then
1240 ; DISABLED-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000
1241 ; DISABLED-NEXT: .LBB11_2: # %if.end
1242 ; DISABLED-NEXT: movups (%r8), %xmm0
1243 ; DISABLED-NEXT: movups 16(%r8), %xmm1
1244 ; DISABLED-NEXT: movups %xmm1, 16(%rcx)
1245 ; DISABLED-NEXT: movups %xmm0, (%rcx)
1246 ; DISABLED-NEXT: movups (%rdi), %xmm0
1247 ; DISABLED-NEXT: movups 16(%rdi), %xmm1
1248 ; DISABLED-NEXT: movups %xmm1, 16(%rsi)
1249 ; DISABLED-NEXT: movups %xmm0, (%rsi)
1250 ; DISABLED-NEXT: retq
1251 ;
1252 ; CHECK-AVX2-LABEL: test_conditional_block_float:
1253 ; CHECK-AVX2: # %bb.0: # %entry
1254 ; CHECK-AVX2-NEXT: cmpl $18, %edx
1255 ; CHECK-AVX2-NEXT: jl .LBB11_2
1256 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
1257 ; CHECK-AVX2-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000
1258 ; CHECK-AVX2-NEXT: .LBB11_2: # %if.end
1259 ; CHECK-AVX2-NEXT: vmovups (%r8), %ymm0
1260 ; CHECK-AVX2-NEXT: vmovups %ymm0, (%rcx)
1261 ; CHECK-AVX2-NEXT: movl (%rdi), %eax
1262 ; CHECK-AVX2-NEXT: movl %eax, (%rsi)
1263 ; CHECK-AVX2-NEXT: movl 4(%rdi), %eax
1264 ; CHECK-AVX2-NEXT: movl %eax, 4(%rsi)
1265 ; CHECK-AVX2-NEXT: vmovups 8(%rdi), %xmm0
1266 ; CHECK-AVX2-NEXT: vmovups %xmm0, 8(%rsi)
1267 ; CHECK-AVX2-NEXT: movq 24(%rdi), %rax
1268 ; CHECK-AVX2-NEXT: movq %rax, 24(%rsi)
1269 ; CHECK-AVX2-NEXT: vzeroupper
1270 ; CHECK-AVX2-NEXT: retq
1271 ;
1272 ; CHECK-AVX512-LABEL: test_conditional_block_float:
1273 ; CHECK-AVX512: # %bb.0: # %entry
1274 ; CHECK-AVX512-NEXT: cmpl $18, %edx
1275 ; CHECK-AVX512-NEXT: jl .LBB11_2
1276 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
1277 ; CHECK-AVX512-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000
1278 ; CHECK-AVX512-NEXT: .LBB11_2: # %if.end
1279 ; CHECK-AVX512-NEXT: vmovups (%r8), %ymm0
1280 ; CHECK-AVX512-NEXT: vmovups %ymm0, (%rcx)
1281 ; CHECK-AVX512-NEXT: movl (%rdi), %eax
1282 ; CHECK-AVX512-NEXT: movl %eax, (%rsi)
1283 ; CHECK-AVX512-NEXT: movl 4(%rdi), %eax
1284 ; CHECK-AVX512-NEXT: movl %eax, 4(%rsi)
1285 ; CHECK-AVX512-NEXT: vmovups 8(%rdi), %xmm0
1286 ; CHECK-AVX512-NEXT: vmovups %xmm0, 8(%rsi)
1287 ; CHECK-AVX512-NEXT: movq 24(%rdi), %rax
1288 ; CHECK-AVX512-NEXT: movq %rax, 24(%rsi)
1289 ; CHECK-AVX512-NEXT: vzeroupper
1290 ; CHECK-AVX512-NEXT: retq
1291 entry:
1292 %cmp = icmp sgt i32 %x, 17
1293 br i1 %cmp, label %if.then, label %if.end
1294
1295 if.then: ; preds = %entry
1296 %b = getelementptr inbounds %struct.S7, %struct.S7* %s1, i64 0, i32 1
1297 store float 1.0, float* %b, align 4
1298 br label %if.end
1299
1300 if.end: ; preds = %if.then, %entry
1301 %0 = bitcast %struct.S7* %s3 to i8*
1302 %1 = bitcast %struct.S7* %s4 to i8*
1303 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 32, i32 4, i1 false)
1304 %2 = bitcast %struct.S7* %s2 to i8*
1305 %3 = bitcast %struct.S7* %s1 to i8*
1306 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 32, i32 4, i1 false)
1307 ret void
1308 }
1309
1310 %struct.S8 = type { i64, i64, i64, i64, i64, i64 }
1311
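; i64 fields with a conditional qword store at offset 8: the 32-byte copy is
; split into qword/qword/xmm pieces, and on AVX the blocked ymm copy is
; broken up the same way.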
1312 ; Function Attrs: nounwind uwtable
1313 define void @test_conditional_block_ymm(%struct.S8* nocapture noalias %s1, %struct.S8* nocapture %s2, i32 %x, %struct.S8* nocapture %s3, %struct.S8* nocapture readonly %s4) local_unnamed_addr #0 {
1314 ; CHECK-LABEL: test_conditional_block_ymm:
1315 ; CHECK: # %bb.0: # %entry
1316 ; CHECK-NEXT: cmpl $18, %edx
1317 ; CHECK-NEXT: jl .LBB12_2
1318 ; CHECK-NEXT: # %bb.1: # %if.then
1319 ; CHECK-NEXT: movq $1, 8(%rdi)
1320 ; CHECK-NEXT: .LBB12_2: # %if.end
1321 ; CHECK-NEXT: movups (%r8), %xmm0
1322 ; CHECK-NEXT: movups 16(%r8), %xmm1
1323 ; CHECK-NEXT: movups %xmm1, 16(%rcx)
1324 ; CHECK-NEXT: movups %xmm0, (%rcx)
1325 ; CHECK-NEXT: movq (%rdi), %rax
1326 ; CHECK-NEXT: movq 8(%rdi), %rcx
1327 ; CHECK-NEXT: movups 16(%rdi), %xmm0
1328 ; CHECK-NEXT: movups %xmm0, 16(%rsi)
1329 ; CHECK-NEXT: movq %rax, (%rsi)
1330 ; CHECK-NEXT: movq %rcx, 8(%rsi)
1331 ; CHECK-NEXT: retq
1332 ;
1333 ; DISABLED-LABEL: test_conditional_block_ymm:
1334 ; DISABLED: # %bb.0: # %entry
1335 ; DISABLED-NEXT: cmpl $18, %edx
1336 ; DISABLED-NEXT: jl .LBB12_2
1337 ; DISABLED-NEXT: # %bb.1: # %if.then
1338 ; DISABLED-NEXT: movq $1, 8(%rdi)
1339 ; DISABLED-NEXT: .LBB12_2: # %if.end
1340 ; DISABLED-NEXT: movups (%r8), %xmm0
1341 ; DISABLED-NEXT: movups 16(%r8), %xmm1
1342 ; DISABLED-NEXT: movups %xmm1, 16(%rcx)
1343 ; DISABLED-NEXT: movups %xmm0, (%rcx)
1344 ; DISABLED-NEXT: movups (%rdi), %xmm0
1345 ; DISABLED-NEXT: movups 16(%rdi), %xmm1
1346 ; DISABLED-NEXT: movups %xmm1, 16(%rsi)
1347 ; DISABLED-NEXT: movups %xmm0, (%rsi)
1348 ; DISABLED-NEXT: retq
1349 ;
1350 ; CHECK-AVX2-LABEL: test_conditional_block_ymm:
1351 ; CHECK-AVX2: # %bb.0: # %entry
1352 ; CHECK-AVX2-NEXT: cmpl $18, %edx
1353 ; CHECK-AVX2-NEXT: jl .LBB12_2
1354 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then
1355 ; CHECK-AVX2-NEXT: movq $1, 8(%rdi)
1356 ; CHECK-AVX2-NEXT: .LBB12_2: # %if.end
1357 ; CHECK-AVX2-NEXT: vmovups (%r8), %ymm0
1358 ; CHECK-AVX2-NEXT: vmovups %ymm0, (%rcx)
1359 ; CHECK-AVX2-NEXT: movq (%rdi), %rax
1360 ; CHECK-AVX2-NEXT: movq %rax, (%rsi)
1361 ; CHECK-AVX2-NEXT: movq 8(%rdi), %rax
1362 ; CHECK-AVX2-NEXT: movq %rax, 8(%rsi)
1363 ; CHECK-AVX2-NEXT: vmovups 16(%rdi), %xmm0
1364 ; CHECK-AVX2-NEXT: vmovups %xmm0, 16(%rsi)
1365 ; CHECK-AVX2-NEXT: vzeroupper
1366 ; CHECK-AVX2-NEXT: retq
1367 ;
1368 ; CHECK-AVX512-LABEL: test_conditional_block_ymm:
1369 ; CHECK-AVX512: # %bb.0: # %entry
1370 ; CHECK-AVX512-NEXT: cmpl $18, %edx
1371 ; CHECK-AVX512-NEXT: jl .LBB12_2
1372 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then
1373 ; CHECK-AVX512-NEXT: movq $1, 8(%rdi)
1374 ; CHECK-AVX512-NEXT: .LBB12_2: # %if.end
1375 ; CHECK-AVX512-NEXT: vmovups (%r8), %ymm0
1376 ; CHECK-AVX512-NEXT: vmovups %ymm0, (%rcx)
1377 ; CHECK-AVX512-NEXT: movq (%rdi), %rax
1378 ; CHECK-AVX512-NEXT: movq %rax, (%rsi)
1379 ; CHECK-AVX512-NEXT: movq 8(%rdi), %rax
1380 ; CHECK-AVX512-NEXT: movq %rax, 8(%rsi)
1381 ; CHECK-AVX512-NEXT: vmovups 16(%rdi), %xmm0
1382 ; CHECK-AVX512-NEXT: vmovups %xmm0, 16(%rsi)
1383 ; CHECK-AVX512-NEXT: vzeroupper
1384 ; CHECK-AVX512-NEXT: retq
1385 entry:
1386 %cmp = icmp sgt i32 %x, 17
1387 br i1 %cmp, label %if.then, label %if.end
1388
1389 if.then: ; preds = %entry
1390 %b = getelementptr inbounds %struct.S8, %struct.S8* %s1, i64 0, i32 1
1391 store i64 1, i64* %b, align 4
1392 br label %if.end
1393
1394 if.end: ; preds = %if.then, %entry
1395 %0 = bitcast %struct.S8* %s3 to i8*
1396 %1 = bitcast %struct.S8* %s4 to i8*
1397 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 32, i32 4, i1 false)
1398 %2 = bitcast %struct.S8* %s2 to i8*
1399 %3 = bitcast %struct.S8* %s1 to i8*
1400 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 32, i32 4, i1 false)
1401 ret void
1402 }
1403
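; The copy destination (A+4) overlaps the 16-byte source at A: no splitting
; is performed and CHECK matches DISABLED.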
1404 define dso_local void @test_alias(i8* nocapture %A, i32 %x) local_unnamed_addr #0 {
1405 ; CHECK-LABEL: test_alias:
1406 ; CHECK: # %bb.0: # %entry
1407 ; CHECK-NEXT: movl %esi, (%rdi)
1408 ; CHECK-NEXT: movups (%rdi), %xmm0
1409 ; CHECK-NEXT: movups %xmm0, 4(%rdi)
1410 ; CHECK-NEXT: retq
1411 ;
1412 ; DISABLED-LABEL: test_alias:
1413 ; DISABLED: # %bb.0: # %entry
1414 ; DISABLED-NEXT: movl %esi, (%rdi)
1415 ; DISABLED-NEXT: movups (%rdi), %xmm0
1416 ; DISABLED-NEXT: movups %xmm0, 4(%rdi)
1417 ; DISABLED-NEXT: retq
1418 ;
1419 ; CHECK-AVX2-LABEL: test_alias:
1420 ; CHECK-AVX2: # %bb.0: # %entry
1421 ; CHECK-AVX2-NEXT: movl %esi, (%rdi)
1422 ; CHECK-AVX2-NEXT: vmovups (%rdi), %xmm0
1423 ; CHECK-AVX2-NEXT: vmovups %xmm0, 4(%rdi)
1424 ; CHECK-AVX2-NEXT: retq
1425 ;
1426 ; CHECK-AVX512-LABEL: test_alias:
1427 ; CHECK-AVX512: # %bb.0: # %entry
1428 ; CHECK-AVX512-NEXT: movl %esi, (%rdi)
1429 ; CHECK-AVX512-NEXT: vmovups (%rdi), %xmm0
1430 ; CHECK-AVX512-NEXT: vmovups %xmm0, 4(%rdi)
1431 ; CHECK-AVX512-NEXT: retq
1432 entry:
1433 %a = bitcast i8* %A to i32*
1434 store i32 %x, i32* %a, align 4
1435 %add.ptr = getelementptr inbounds i8, i8* %A, i64 4
1436 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %add.ptr, i8* align 4 %A, i64 16, i32 4, i1 false)
1437 ret void
1438 }
1439
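; The copy destination (A+20) does not overlap the 16-byte source, so the
; copy is split into 4/8/4-byte pieces around the dword stored at offset 0.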
1440 ; Function Attrs: nounwind uwtable
1441 define dso_local void @test_noalias(i8* nocapture %A, i32 %x) local_unnamed_addr #0 {
1442 ; CHECK-LABEL: test_noalias:
1443 ; CHECK: # %bb.0: # %entry
1444 ; CHECK-NEXT: movl %esi, (%rdi)
1445 ; CHECK-NEXT: movl (%rdi), %eax
1446 ; CHECK-NEXT: movl %eax, 20(%rdi)
1447 ; CHECK-NEXT: movq 4(%rdi), %rax
1448 ; CHECK-NEXT: movq %rax, 24(%rdi)
1449 ; CHECK-NEXT: movl 12(%rdi), %eax
1450 ; CHECK-NEXT: movl %eax, 32(%rdi)
1451 ; CHECK-NEXT: retq
1452 ;
1453 ; DISABLED-LABEL: test_noalias:
1454 ; DISABLED: # %bb.0: # %entry
1455 ; DISABLED-NEXT: movl %esi, (%rdi)
1456 ; DISABLED-NEXT: movups (%rdi), %xmm0
1457 ; DISABLED-NEXT: movups %xmm0, 20(%rdi)
1458 ; DISABLED-NEXT: retq
1459 ;
1460 ; CHECK-AVX2-LABEL: test_noalias:
1461 ; CHECK-AVX2: # %bb.0: # %entry
1462 ; CHECK-AVX2-NEXT: movl %esi, (%rdi)
1463 ; CHECK-AVX2-NEXT: movl (%rdi), %eax
1464 ; CHECK-AVX2-NEXT: movl %eax, 20(%rdi)
1465 ; CHECK-AVX2-NEXT: movq 4(%rdi), %rax
1466 ; CHECK-AVX2-NEXT: movq %rax, 24(%rdi)
1467 ; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
1468 ; CHECK-AVX2-NEXT: movl %eax, 32(%rdi)
1469 ; CHECK-AVX2-NEXT: retq
1470 ;
1471 ; CHECK-AVX512-LABEL: test_noalias:
1472 ; CHECK-AVX512: # %bb.0: # %entry
1473 ; CHECK-AVX512-NEXT: movl %esi, (%rdi)
1474 ; CHECK-AVX512-NEXT: movl (%rdi), %eax
1475 ; CHECK-AVX512-NEXT: movl %eax, 20(%rdi)
1476 ; CHECK-AVX512-NEXT: movq 4(%rdi), %rax
1477 ; CHECK-AVX512-NEXT: movq %rax, 24(%rdi)
1478 ; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
1479 ; CHECK-AVX512-NEXT: movl %eax, 32(%rdi)
1480 ; CHECK-AVX512-NEXT: retq
1481 entry:
1482 %a = bitcast i8* %A to i32*
1483 store i32 %x, i32* %a, align 4
1484 %add.ptr = getelementptr inbounds i8, i8* %A, i64 20
1485 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %add.ptr, i8* align 4 %A, i64 16, i32 4, i1 false)
1486 ret void
1487 }
1488