commit 6f6d98e (llvm.org GIT mirror, llvm/trunk)
Revert "r364412 [ExpandMemCmp][MergeICmps] Move passes out of CodeGen into opt pipeline."

Breaks sanitizers:
  libFuzzer :: cxxstring.test
  libFuzzer :: memcmp.test
  libFuzzer :: recommended-dictionary.test
  libFuzzer :: strcmp.test
  libFuzzer :: value-profile-mem.test
  libFuzzer :: value-profile-strcmp.test

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@364416 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Clement Courbet
41 changed files with 4181 additions and 2846 deletions.
434434 /// shuffles.
435435 FunctionPass *createExpandReductionsPass();
436436
437 // This pass expands memcmp() to load/stores.
438 FunctionPass *createExpandMemCmpPass();
439
437440 /// Creates Break False Dependencies pass. \see BreakFalseDeps.cpp
438441 FunctionPass *createBreakFalseDeps();
439442
194194 void addPGOInstrPasses(legacy::PassManagerBase &MPM, bool IsCS);
195195 void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM);
196196 void addInstructionCombiningPass(legacy::PassManagerBase &MPM) const;
197 void addMemcmpPasses(legacy::PassManagerBase &MPM) const;
198197
199198 public:
200199 /// populateFunctionPassManager - This fills in the function pass manager,
374374
375375 //===----------------------------------------------------------------------===//
376376 //
377 // ExpandMemCmp - This pass expands memcmp() to load/stores.
378 //
379 Pass *createExpandMemCmpPass();
380
381 //===----------------------------------------------------------------------===//
382 //
383377 // ValuePropagation - Propagate CFG-derived value information
384378 //
385379 Pass *createCorrelatedValuePropagationPass();
2020 EarlyIfConversion.cpp
2121 EdgeBundles.cpp
2222 ExecutionDomainFix.cpp
23 ExpandMemCmp.cpp
2324 ExpandPostRAPseudos.cpp
2425 ExpandReductions.cpp
2526 FaultMaps.cpp
2929 initializeEarlyIfConverterPass(Registry);
3030 initializeEarlyMachineLICMPass(Registry);
3131 initializeEarlyTailDuplicatePass(Registry);
32 initializeExpandMemCmpPassPass(Registry);
3233 initializeExpandPostRAPass(Registry);
3334 initializeFEntryInserterPass(Registry);
3435 initializeFinalizeISelPass(Registry);
0 //===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===//
1 //
2 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3 // See https://llvm.org/LICENSE.txt for license information.
4 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5 //
6 //===----------------------------------------------------------------------===//
7 //
8 // This pass tries to expand memcmp() calls into optimally-sized loads and
9 // compares for the target.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/ADT/Statistic.h"
14 #include "llvm/Analysis/ConstantFolding.h"
15 #include "llvm/Analysis/TargetLibraryInfo.h"
16 #include "llvm/Analysis/TargetTransformInfo.h"
17 #include "llvm/Analysis/ValueTracking.h"
18 #include "llvm/CodeGen/TargetLowering.h"
19 #include "llvm/CodeGen/TargetPassConfig.h"
20 #include "llvm/CodeGen/TargetSubtargetInfo.h"
21 #include "llvm/IR/IRBuilder.h"
22
23 using namespace llvm;
24
25 #define DEBUG_TYPE "expandmemcmp"
26
27 STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
28 STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
29 STATISTIC(NumMemCmpGreaterThanMax,
30 "Number of memcmp calls with size greater than max size");
31 STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
32
33 static cl::opt<unsigned> MemCmpEqZeroNumLoadsPerBlock(
34 "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
35 cl::desc("The number of loads per basic block for inline expansion of "
36 "memcmp that is only being compared against zero."));
37
38 static cl::opt<unsigned> MaxLoadsPerMemcmp(
39 "max-loads-per-memcmp", cl::Hidden,
40 cl::desc("Set maximum number of loads used in expanded memcmp"));
41
42 static cl::opt<unsigned> MaxLoadsPerMemcmpOptSize(
43 "max-loads-per-memcmp-opt-size", cl::Hidden,
44 cl::desc("Set maximum number of loads used in expanded memcmp for -Os/Oz"));
45
46 namespace {
47
48
49 // This class provides helper functions to expand a memcmp library call into an
50 // inline expansion.
51 class MemCmpExpansion {
52 struct ResultBlock {
53 BasicBlock *BB = nullptr;
54 PHINode *PhiSrc1 = nullptr;
55 PHINode *PhiSrc2 = nullptr;
56
57 ResultBlock() = default;
58 };
59
60 CallInst *const CI;
61 ResultBlock ResBlock;
62 const uint64_t Size;
63 unsigned MaxLoadSize;
64 uint64_t NumLoadsNonOneByte;
65 const uint64_t NumLoadsPerBlockForZeroCmp;
66 std::vector<BasicBlock *> LoadCmpBlocks;
67 BasicBlock *EndBlock;
68 PHINode *PhiRes;
69 const bool IsUsedForZeroCmp;
70 const DataLayout &DL;
71 IRBuilder<> Builder;
72 // Represents the decomposition in blocks of the expansion. For example,
73 // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
74 // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
75 struct LoadEntry {
76 LoadEntry(unsigned LoadSize, uint64_t Offset)
77 : LoadSize(LoadSize), Offset(Offset) {
78 }
79
80 // The size of the load for this block, in bytes.
81 unsigned LoadSize;
82 // The offset of this load from the base pointer, in bytes.
83 uint64_t Offset;
84 };
85 using LoadEntryVector = SmallVector<LoadEntry, 8>;
86 LoadEntryVector LoadSequence;
87
88 void createLoadCmpBlocks();
89 void createResultBlock();
90 void setupResultBlockPHINodes();
91 void setupEndBlockPHINodes();
92 Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
93 void emitLoadCompareBlock(unsigned BlockIndex);
94 void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
95 unsigned &LoadIndex);
96 void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned OffsetBytes);
97 void emitMemCmpResultBlock();
98 Value *getMemCmpExpansionZeroCase();
99 Value *getMemCmpEqZeroOneBlock();
100 Value *getMemCmpOneBlock();
101 Value *getPtrToElementAtOffset(Value *Source, Type *LoadSizeType,
102 uint64_t OffsetBytes);
103
104 static LoadEntryVector
105 computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
106 unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte);
107 static LoadEntryVector
108 computeOverlappingLoadSequence(uint64_t Size, unsigned MaxLoadSize,
109 unsigned MaxNumLoads,
110 unsigned &NumLoadsNonOneByte);
111
112 public:
113 MemCmpExpansion(CallInst *CI, uint64_t Size,
114 const TargetTransformInfo::MemCmpExpansionOptions &Options,
115 const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout);
116
117 unsigned getNumBlocks();
118 uint64_t getNumLoads() const { return LoadSequence.size(); }
119
120 Value *getMemCmpExpansion();
121 };
122
123 MemCmpExpansion::LoadEntryVector MemCmpExpansion::computeGreedyLoadSequence(
124 uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
125 const unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte) {
126 NumLoadsNonOneByte = 0;
127 LoadEntryVector LoadSequence;
128 uint64_t Offset = 0;
129 while (Size && !LoadSizes.empty()) {
130 const unsigned LoadSize = LoadSizes.front();
131 const uint64_t NumLoadsForThisSize = Size / LoadSize;
132 if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
133 // Do not expand if the total number of loads is larger than what the
134 // target allows. Note that it's important that we exit before completing
135 // the expansion to avoid using a ton of memory to store the expansion for
136 // large sizes.
137 return {};
138 }
139 if (NumLoadsForThisSize > 0) {
140 for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
141 LoadSequence.push_back({LoadSize, Offset});
142 Offset += LoadSize;
143 }
144 if (LoadSize > 1)
145 ++NumLoadsNonOneByte;
146 Size = Size % LoadSize;
147 }
148 LoadSizes = LoadSizes.drop_front();
149 }
150 return LoadSequence;
151 }
152
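The greedy decomposition above walks the target's allowed load sizes from largest to smallest and tiles the buffer left to right. A minimal standalone sketch of the same algorithm (a hypothetical driver, not part of the pass; greedySequence and its parameters are illustrative names):

#include <cstdint>
#include <cstdio>
#include <vector>

struct Entry { unsigned LoadSize; uint64_t Offset; };

// Mirrors computeGreedyLoadSequence: tile `Size` bytes with the given
// descending load sizes, giving up once MaxNumLoads would be exceeded.
static std::vector<Entry> greedySequence(uint64_t Size,
                                         const std::vector<unsigned> &LoadSizes,
                                         unsigned MaxNumLoads) {
  std::vector<Entry> Seq;
  uint64_t Offset = 0;
  for (unsigned LoadSize : LoadSizes) {
    if (Size == 0)
      break;
    const uint64_t N = Size / LoadSize;
    if (Seq.size() + N > MaxNumLoads)
      return {}; // Too many loads; the call stays a library call.
    for (uint64_t I = 0; I < N; ++I) {
      Seq.push_back({LoadSize, Offset});
      Offset += LoadSize;
    }
    Size %= LoadSize;
  }
  return Seq;
}

int main() {
  // Size = 15 with load sizes {8, 4, 2, 1}:
  for (const Entry &E : greedySequence(15, {8, 4, 2, 1}, 8))
    std::printf("{%u, %llu} ", E.LoadSize, (unsigned long long)E.Offset);
  std::printf("\n"); // prints: {8, 0} {4, 8} {2, 12} {1, 14}
}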
153 MemCmpExpansion::LoadEntryVector
154 MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size,
155 const unsigned MaxLoadSize,
156 const unsigned MaxNumLoads,
157 unsigned &NumLoadsNonOneByte) {
158 // These are already handled by the greedy approach.
159 if (Size < 2 || MaxLoadSize < 2)
160 return {};
161
162 // We try to do as many non-overlapping loads as possible starting from the
163 // beginning.
164 const uint64_t NumNonOverlappingLoads = Size / MaxLoadSize;
165 assert(NumNonOverlappingLoads && "there must be at least one load");
166 // There remain 0 to (MaxLoadSize - 1) bytes to load, this will be done with
167 // an overlapping load.
168 Size = Size - NumNonOverlappingLoads * MaxLoadSize;
169 // Bail if we do not need an overlapping load; this is already handled by
170 // the greedy approach.
171 if (Size == 0)
172 return {};
173 // Bail if the number of loads (non-overlapping + potential overlapping one)
174 // is larger than the max allowed.
175 if ((NumNonOverlappingLoads + 1) > MaxNumLoads)
176 return {};
177
178 // Add non-overlapping loads.
179 LoadEntryVector LoadSequence;
180 uint64_t Offset = 0;
181 for (uint64_t I = 0; I < NumNonOverlappingLoads; ++I) {
182 LoadSequence.push_back({MaxLoadSize, Offset});
183 Offset += MaxLoadSize;
184 }
185
186 // Add the last overlapping load.
187 assert(Size > 0 && Size < MaxLoadSize && "broken invariant");
188 LoadSequence.push_back({MaxLoadSize, Offset - (MaxLoadSize - Size)});
189 NumLoadsNonOneByte = 1;
190 return LoadSequence;
191 }
192
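Worked example for the overlapping variant: with Size = 15 and MaxLoadSize = 8 it returns [{8, 0}, {8, 7}], i.e. one full load from the start and one load shifted back so it ends exactly at byte 15. Byte 7 is read twice, which is harmless for the comparison, and two loads replace the four the greedy sequence would need. The tail offset computation, isolated (variable names are illustrative):

// The final load must end at byte `Size`, so it starts MaxLoadSize bytes
// before that. E.g. Size = 15, MaxLoadSize = 8 -> NumFull = 1, TailOffset = 7.
const uint64_t NumFull = Size / MaxLoadSize;           // non-overlapping loads
const uint64_t Remainder = Size - NumFull * MaxLoadSize;
const uint64_t TailOffset =
    NumFull * MaxLoadSize - (MaxLoadSize - Remainder); // == Size - MaxLoadSize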
193 // Initialize the basic block structure required for expansion of memcmp call
194 // with given maximum load size and memcmp size parameter.
195 // This structure includes:
196 // 1. A list of load compare blocks - LoadCmpBlocks.
197 // 2. An EndBlock, split from original instruction point, which is the block to
198 // return from.
199 // 3. ResultBlock, block to branch to for early exit when a
200 // LoadCmpBlock finds a difference.
201 MemCmpExpansion::MemCmpExpansion(
202 CallInst *const CI, uint64_t Size,
203 const TargetTransformInfo::MemCmpExpansionOptions &Options,
204 const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout)
205 : CI(CI), Size(Size), MaxLoadSize(0), NumLoadsNonOneByte(0),
206 NumLoadsPerBlockForZeroCmp(Options.NumLoadsPerBlock),
207 IsUsedForZeroCmp(IsUsedForZeroCmp), DL(TheDataLayout), Builder(CI) {
208 assert(Size > 0 && "zero blocks");
209 // Scale the max size down if the target can load more bytes than we need.
210 llvm::ArrayRef<unsigned> LoadSizes(Options.LoadSizes);
211 while (!LoadSizes.empty() && LoadSizes.front() > Size) {
212 LoadSizes = LoadSizes.drop_front();
213 }
214 assert(!LoadSizes.empty() && "cannot load Size bytes");
215 MaxLoadSize = LoadSizes.front();
216 // Compute the decomposition.
217 unsigned GreedyNumLoadsNonOneByte = 0;
218 LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, Options.MaxNumLoads,
219 GreedyNumLoadsNonOneByte);
220 NumLoadsNonOneByte = GreedyNumLoadsNonOneByte;
221 assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
222 // If we allow overlapping loads and the load sequence is not already optimal,
223 // use overlapping loads.
224 if (Options.AllowOverlappingLoads &&
225 (LoadSequence.empty() || LoadSequence.size() > 2)) {
226 unsigned OverlappingNumLoadsNonOneByte = 0;
227 auto OverlappingLoads = computeOverlappingLoadSequence(
228 Size, MaxLoadSize, Options.MaxNumLoads, OverlappingNumLoadsNonOneByte);
229 if (!OverlappingLoads.empty() &&
230 (LoadSequence.empty() ||
231 OverlappingLoads.size() < LoadSequence.size())) {
232 LoadSequence = OverlappingLoads;
233 NumLoadsNonOneByte = OverlappingNumLoadsNonOneByte;
234 }
235 }
236 assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
237 }
238
239 unsigned MemCmpExpansion::getNumBlocks() {
240 if (IsUsedForZeroCmp)
241 return getNumLoads() / NumLoadsPerBlockForZeroCmp +
242 (getNumLoads() % NumLoadsPerBlockForZeroCmp != 0 ? 1 : 0);
243 return getNumLoads();
244 }
245
246 void MemCmpExpansion::createLoadCmpBlocks() {
247 for (unsigned i = 0; i < getNumBlocks(); i++) {
248 BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
249 EndBlock->getParent(), EndBlock);
250 LoadCmpBlocks.push_back(BB);
251 }
252 }
253
254 void MemCmpExpansion::createResultBlock() {
255 ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
256 EndBlock->getParent(), EndBlock);
257 }
258
259 /// Return a pointer to an element of type `LoadSizeType` at offset
260 /// `OffsetBytes`.
261 Value *MemCmpExpansion::getPtrToElementAtOffset(Value *Source,
262 Type *LoadSizeType,
263 uint64_t OffsetBytes) {
264 if (OffsetBytes > 0) {
265 auto *ByteType = Type::getInt8Ty(CI->getContext());
266 Source = Builder.CreateGEP(
267 ByteType, Builder.CreateBitCast(Source, ByteType->getPointerTo()),
268 ConstantInt::get(ByteType, OffsetBytes));
269 }
270 return Builder.CreateBitCast(Source, LoadSizeType->getPointerTo());
271 }
272
273 // This function creates the IR instructions for loading and comparing 1 byte.
274 // It loads 1 byte from each source of the memcmp parameters at the given
275 // offset. It then subtracts the two loaded values and adds this result to the
276 // final phi node for selecting the memcmp result.
277 void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
278 unsigned OffsetBytes) {
279 Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
280 Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
281 Value *Source1 =
282 getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType, OffsetBytes);
283 Value *Source2 =
284 getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType, OffsetBytes);
285
286 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
287 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
288
289 LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
290 LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
291 Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
292
293 PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
294
295 if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
296 // Early exit branch to EndBlock if a difference was found. Otherwise,
297 // continue to the next LoadCmpBlock.
298 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
299 ConstantInt::get(Diff->getType(), 0));
300 BranchInst *CmpBr =
301 BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
302 Builder.Insert(CmpBr);
303 } else {
304 // The last block has an unconditional branch to EndBlock.
305 BranchInst *CmpBr = BranchInst::Create(EndBlock);
306 Builder.Insert(CmpBr);
307 }
308 }
309
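The byte case needs no explicit comparison: both bytes are zero-extended to i32 before the subtraction, so the difference is exact (it lies in [-255, 255]) and already has the sign memcmp must return. The same trick in plain C++ (a sketch):

// Exact for bytes: zero-extend, then subtract. No overflow is possible,
// and the sign of the result is the sign memcmp(&A, &B, 1) would have.
static int byteCompare(unsigned char A, unsigned char B) {
  return (int)A - (int)B;
}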
310 /// Generate an equality comparison for one or more pairs of loaded values.
311 /// This is used in the case where the memcmp() call is compared equal or not
312 /// equal to zero.
313 Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
314 unsigned &LoadIndex) {
315 assert(LoadIndex < getNumLoads() &&
316 "getCompareLoadPairs() called with no remaining loads");
317 std::vector<Value *> XorList, OrList;
318 Value *Diff = nullptr;
319
320 const unsigned NumLoads =
321 std::min(getNumLoads() - LoadIndex, NumLoadsPerBlockForZeroCmp);
322
323 // For a single-block expansion, start inserting before the memcmp call.
324 if (LoadCmpBlocks.empty())
325 Builder.SetInsertPoint(CI);
326 else
327 Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
328
329 Value *Cmp = nullptr;
330 // If we have multiple loads per block, we need to generate a composite
331 // comparison using xor+or. The type for the combinations is the largest load
332 // type.
333 IntegerType *const MaxLoadType =
334 NumLoads == 1 ? nullptr
335 : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
336 for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
337 const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
338
339 IntegerType *LoadSizeType =
340 IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
341
342 Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType,
343 CurLoadEntry.Offset);
344 Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType,
345 CurLoadEntry.Offset);
346
347 // Get a constant or load a value for each source address.
348 Value *LoadSrc1 = nullptr;
349 if (auto *Source1C = dyn_cast<Constant>(Source1))
350 LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
351 if (!LoadSrc1)
352 LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
353
354 Value *LoadSrc2 = nullptr;
355 if (auto *Source2C = dyn_cast<Constant>(Source2))
356 LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
357 if (!LoadSrc2)
358 LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
359
360 if (NumLoads != 1) {
361 if (LoadSizeType != MaxLoadType) {
362 LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
363 LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
364 }
365 // If we have multiple loads per block, we need to generate a composite
366 // comparison using xor+or.
367 Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
368 Diff = Builder.CreateZExt(Diff, MaxLoadType);
369 XorList.push_back(Diff);
370 } else {
371 // If there's only one load per block, we just compare the loaded values.
372 Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
373 }
374 }
375
376 auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
377 std::vector OutList;
378 for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
379 Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
380 OutList.push_back(Or);
381 }
382 if (InList.size() % 2 != 0)
383 OutList.push_back(InList.back());
384 return OutList;
385 };
386
387 if (!Cmp) {
388 // Pairwise OR the XOR results.
389 OrList = pairWiseOr(XorList);
390
391 // Pairwise OR the OR results until one result left.
392 while (OrList.size() != 1) {
393 OrList = pairWiseOr(OrList);
394 }
395
396 assert(Diff && "Failed to find comparison diff");
397 Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
398 }
399
400 return Cmp;
401 }
402
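pairWiseOr halves the list on every round, so N xor results collapse in ceil(log2 N) rounds: e.g. 4 xors become 2 ors, then 1, and a single icmp ne 0 decides the block. The balanced tree keeps the dependency chain short compared to folding left to right. The reduction shape on plain integers (a standalone sketch, not IR):

#include <cstdint>
#include <vector>

// Reduce xor results with a balanced OR tree, mirroring pairWiseOr:
// [a, b, c] -> [a|b, c] -> [(a|b)|c]. Non-zero iff any pair differed.
// Assumes at least one value, as the pass asserts for its load list.
static uint64_t orReduce(std::vector<uint64_t> Vals) {
  while (Vals.size() > 1) {
    std::vector<uint64_t> Next;
    for (size_t I = 0; I + 1 < Vals.size(); I += 2)
      Next.push_back(Vals[I] | Vals[I + 1]);
    if (Vals.size() % 2 != 0)
      Next.push_back(Vals.back()); // odd element carries over
    Vals = std::move(Next);
  }
  return Vals[0];
}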
403 void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
404 unsigned &LoadIndex) {
405 Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
406
407 BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
408 ? EndBlock
409 : LoadCmpBlocks[BlockIndex + 1];
410 // Early exit branch if difference found to ResultBlock. Otherwise,
411 // continue to next LoadCmpBlock or EndBlock.
412 BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
413 Builder.Insert(CmpBr);
414
415 // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
416 // since early exit to ResultBlock was not taken (no difference was found in
417 // any of the bytes).
418 if (BlockIndex == LoadCmpBlocks.size() - 1) {
419 Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
420 PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
421 }
422 }
423
424 // This function creates the IR instructions for loading and comparing using
425 // the given LoadSize. It loads the number of bytes specified by LoadSize from
426 // each source of the memcmp parameters. It then compares the two loaded values
427 // to see if they differ. If a difference is found, it branches with an early
428 // exit to the ResultBlock for calculating which source was larger. Otherwise,
429 // it falls through to either the next LoadCmpBlock or the EndBlock if this is
430 // the last LoadCmpBlock. Loading 1 byte is handled with a special case through
431 // emitLoadCompareByteBlock. The special handling simply subtracts the loaded
432 // values and adds the result to the result phi node.
433 void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
434 // There is one load per block in this case, BlockIndex == LoadIndex.
435 const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
436
437 if (CurLoadEntry.LoadSize == 1) {
438 MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, CurLoadEntry.Offset);
439 return;
440 }
441
442 Type *LoadSizeType =
443 IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
444 Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
445 assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
446
447 Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
448
449 Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType,
450 CurLoadEntry.Offset);
451 Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType,
452 CurLoadEntry.Offset);
453
454 // Load LoadSizeType from the base address.
455 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
456 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
457
458 if (DL.isLittleEndian()) {
459 Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
460 Intrinsic::bswap, LoadSizeType);
461 LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
462 LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
463 }
464
465 if (LoadSizeType != MaxLoadType) {
466 LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
467 LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
468 }
469
470 // Add the loaded values to the phi nodes for calculating memcmp result only
471 // if result is not used in a zero equality.
472 if (!IsUsedForZeroCmp) {
473 ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
474 ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
475 }
476
477 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
478 BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
479 ? EndBlock
480 : LoadCmpBlocks[BlockIndex + 1];
481 // Early exit branch if difference found to ResultBlock. Otherwise, continue
482 // to next LoadCmpBlock or EndBlock.
483 BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
484 Builder.Insert(CmpBr);
485
486 // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
487 // since early exit to ResultBlock was not taken (no difference was found in
488 // any of the bytes).
489 if (BlockIndex == LoadCmpBlocks.size() - 1) {
490 Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
491 PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
492 }
493 }
494
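The bswap calls exist because memcmp order is lexicographic, which is exactly big-endian unsigned integer order; on a little-endian target the raw loads compare the wrong way around. A small host-side demonstration (a sketch using the GCC/Clang builtin __builtin_bswap16; on a little-endian machine it shows the raw loads giving the inverted answer):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const unsigned char A[2] = {0x00, 0x01};
  const unsigned char B[2] = {0x01, 0x00};
  uint16_t WA, WB;
  std::memcpy(&WA, A, 2); // little-endian load: WA = 0x0100
  std::memcpy(&WB, B, 2); // little-endian load: WB = 0x0001
  std::printf("memcmp says A < B: %d\n", std::memcmp(A, B, 2) < 0); // 1
  std::printf("raw loads say A < B: %d\n", WA < WB);                // 0 (wrong)
  std::printf("bswapped loads say A < B: %d\n",
              __builtin_bswap16(WA) < __builtin_bswap16(WB));       // 1
}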
495 // This function populates the ResultBlock with a sequence to calculate the
496 // memcmp result. It compares the two loaded source values and returns -1 if
497 // src1 < src2 and 1 if src1 > src2.
498 void MemCmpExpansion::emitMemCmpResultBlock() {
499 // Special case: if memcmp result is used in a zero equality, result does not
500 // need to be calculated and can simply return 1.
501 if (IsUsedForZeroCmp) {
502 BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
503 Builder.SetInsertPoint(ResBlock.BB, InsertPt);
504 Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
505 PhiRes->addIncoming(Res, ResBlock.BB);
506 BranchInst *NewBr = BranchInst::Create(EndBlock);
507 Builder.Insert(NewBr);
508 return;
509 }
510 BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
511 Builder.SetInsertPoint(ResBlock.BB, InsertPt);
512
513 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
514 ResBlock.PhiSrc2);
515
516 Value *Res =
517 Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
518 ConstantInt::get(Builder.getInt32Ty(), 1));
519
520 BranchInst *NewBr = BranchInst::Create(EndBlock);
521 Builder.Insert(NewBr);
522 PhiRes->addIncoming(Res, ResBlock.BB);
523 }
524
525 void MemCmpExpansion::setupResultBlockPHINodes() {
526 Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
527 Builder.SetInsertPoint(ResBlock.BB);
528 // Note: this assumes one load per block.
529 ResBlock.PhiSrc1 =
530 Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1");
531 ResBlock.PhiSrc2 =
532 Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2");
533 }
534
535 void MemCmpExpansion::setupEndBlockPHINodes() {
536 Builder.SetInsertPoint(&EndBlock->front());
537 PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
538 }
539
540 Value *MemCmpExpansion::getMemCmpExpansionZeroCase() {
541 unsigned LoadIndex = 0;
542 // This loop populates each of the LoadCmpBlocks with the IR sequence to
543 // handle multiple loads per block.
544 for (unsigned I = 0; I < getNumBlocks(); ++I) {
545 emitLoadCompareBlockMultipleLoads(I, LoadIndex);
546 }
547
548 emitMemCmpResultBlock();
549 return PhiRes;
550 }
551
552 /// A memcmp expansion that compares equality with 0 and only has one block of
553 /// load and compare can bypass the compare, branch, and phi IR that is required
554 /// in the general case.
555 Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
556 unsigned LoadIndex = 0;
557 Value *Cmp = getCompareLoadPairs(0, LoadIndex);
558 assert(LoadIndex == getNumLoads() && "some entries were not consumed");
559 return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
560 }
561
562 /// A memcmp expansion that only has one block of load and compare can bypass
563 /// the compare, branch, and phi IR that is required in the general case.
564 Value *MemCmpExpansion::getMemCmpOneBlock() {
565 Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
566 Value *Source1 = CI->getArgOperand(0);
567 Value *Source2 = CI->getArgOperand(1);
568
569 // Cast source to LoadSizeType*.
570 if (Source1->getType() != LoadSizeType)
571 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
572 if (Source2->getType() != LoadSizeType)
573 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
574
575 // Load LoadSizeType from the base address.
576 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
577 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
578
579 if (DL.isLittleEndian() && Size != 1) {
580 Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
581 Intrinsic::bswap, LoadSizeType);
582 LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
583 LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
584 }
585
586 if (Size < 4) {
587 // The i8 and i16 cases don't need compares. We zext the loaded values and
588 // subtract them to get the suitable negative, zero, or positive i32 result.
589 LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
590 LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
591 return Builder.CreateSub(LoadSrc1, LoadSrc2);
592 }
593
594 // The result of memcmp is negative, zero, or positive, so produce that by
595 // subtracting 2 extended compare bits: sub (ugt, ult).
596 // If a target prefers to use selects to get -1/0/1, they should be able
597 // to transform this later. The inverse transform (going from selects to math)
598 // may not be possible in the DAG because the selects got converted into
599 // branches before we got there.
600 Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
601 Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
602 Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
603 Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
604 return Builder.CreateSub(ZextUGT, ZextULT);
605 }
606
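The closing sub of two zexted compare bits is the classic branchless three-way compare: (a > b) - (a < b) is 1, 0, or -1. In plain C++ (a sketch of the idiom, not the emitted IR):

#include <cstdint>

// Branchless sign of an unsigned comparison, mirroring zext(ugt) - zext(ult).
static int threeWayCompare(uint64_t A, uint64_t B) {
  return (A > B) - (A < B); // 1 if A > B, 0 if equal, -1 if A < B
}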
607 // This function expands the memcmp call into an inline expansion and returns
608 // the memcmp result.
609 Value *MemCmpExpansion::getMemCmpExpansion() {
610 // Create the basic block framework for a multi-block expansion.
611 if (getNumBlocks() != 1) {
612 BasicBlock *StartBlock = CI->getParent();
613 EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
614 setupEndBlockPHINodes();
615 createResultBlock();
616
617 // If return value of memcmp is not used in a zero equality, we need to
618 // calculate which source was larger. The calculation requires the
619 // two loaded source values of each load compare block.
620 // These will be saved in the phi nodes created by setupResultBlockPHINodes.
621 if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
622
623 // Create the number of required load compare basic blocks.
624 createLoadCmpBlocks();
625
626 // Update the terminator added by splitBasicBlock to branch to the first
627 // LoadCmpBlock.
628 StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
629 }
630
631 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
632
633 if (IsUsedForZeroCmp)
634 return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
635 : getMemCmpExpansionZeroCase();
636
637 if (getNumBlocks() == 1)
638 return getMemCmpOneBlock();
639
640 for (unsigned I = 0; I < getNumBlocks(); ++I) {
641 emitLoadCompareBlock(I);
642 }
643
644 emitMemCmpResultBlock();
645 return PhiRes;
646 }
647
648 // This function checks to see if an expansion of memcmp can be generated.
649 // It checks for constant compare size that is less than the max inline size.
650 // If an expansion cannot occur, returns false to leave as a library call.
651 // Otherwise, the library call is replaced with a new IR instruction sequence.
652 /// We want to transform:
653 /// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
654 /// To:
655 /// loadbb:
656 /// %0 = bitcast i32* %buffer2 to i8*
657 /// %1 = bitcast i32* %buffer1 to i8*
658 /// %2 = bitcast i8* %1 to i64*
659 /// %3 = bitcast i8* %0 to i64*
660 /// %4 = load i64, i64* %2
661 /// %5 = load i64, i64* %3
662 /// %6 = call i64 @llvm.bswap.i64(i64 %4)
663 /// %7 = call i64 @llvm.bswap.i64(i64 %5)
664 /// %8 = sub i64 %6, %7
665 /// %9 = icmp ne i64 %8, 0
666 /// br i1 %9, label %res_block, label %loadbb1
667 /// res_block: ; preds = %loadbb2,
668 /// %loadbb1, %loadbb
669 /// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
670 /// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
671 /// %10 = icmp ult i64 %phi.src1, %phi.src2
672 /// %11 = select i1 %10, i32 -1, i32 1
673 /// br label %endblock
674 /// loadbb1: ; preds = %loadbb
675 /// %12 = bitcast i32* %buffer2 to i8*
676 /// %13 = bitcast i32* %buffer1 to i8*
677 /// %14 = bitcast i8* %13 to i32*
678 /// %15 = bitcast i8* %12 to i32*
679 /// %16 = getelementptr i32, i32* %14, i32 2
680 /// %17 = getelementptr i32, i32* %15, i32 2
681 /// %18 = load i32, i32* %16
682 /// %19 = load i32, i32* %17
683 /// %20 = call i32 @llvm.bswap.i32(i32 %18)
684 /// %21 = call i32 @llvm.bswap.i32(i32 %19)
685 /// %22 = zext i32 %20 to i64
686 /// %23 = zext i32 %21 to i64
687 /// %24 = sub i64 %22, %23
688 /// %25 = icmp ne i64 %24, 0
689 /// br i1 %25, label %res_block, label %loadbb2
690 /// loadbb2: ; preds = %loadbb1
691 /// %26 = bitcast i32* %buffer2 to i8*
692 /// %27 = bitcast i32* %buffer1 to i8*
693 /// %28 = bitcast i8* %27 to i16*
694 /// %29 = bitcast i8* %26 to i16*
695 /// %30 = getelementptr i16, i16* %28, i16 6
696 /// %31 = getelementptr i16, i16* %29, i16 6
697 /// %32 = load i16, i16* %30
698 /// %33 = load i16, i16* %31
699 /// %34 = call i16 @llvm.bswap.i16(i16 %32)
700 /// %35 = call i16 @llvm.bswap.i16(i16 %33)
701 /// %36 = zext i16 %34 to i64
702 /// %37 = zext i16 %35 to i64
703 /// %38 = sub i64 %36, %37
704 /// %39 = icmp ne i64 %38, 0
705 /// br i1 %39, label %res_block, label %loadbb3
706 /// loadbb3: ; preds = %loadbb2
707 /// %40 = bitcast i32* %buffer2 to i8*
708 /// %41 = bitcast i32* %buffer1 to i8*
709 /// %42 = getelementptr i8, i8* %41, i8 14
710 /// %43 = getelementptr i8, i8* %40, i8 14
711 /// %44 = load i8, i8* %42
712 /// %45 = load i8, i8* %43
713 /// %46 = zext i8 %44 to i32
714 /// %47 = zext i8 %45 to i32
715 /// %48 = sub i32 %46, %47
716 /// br label %endblock
717 /// endblock: ; preds = %res_block,
718 /// %loadbb3
719 /// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
720 /// ret i32 %phi.res
721 static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
722 const TargetLowering *TLI, const DataLayout *DL) {
723 NumMemCmpCalls++;
724
725 // Early exit from expansion if -Oz.
726 if (CI->getFunction()->hasMinSize())
727 return false;
728
729 // Early exit from expansion if size is not a constant.
730 ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
731 if (!SizeCast) {
732 NumMemCmpNotConstant++;
733 return false;
734 }
735 const uint64_t SizeVal = SizeCast->getZExtValue();
736
737 if (SizeVal == 0) {
738 return false;
739 }
740 // TTI call to check if target would like to expand memcmp. Also, get the
741 // available load sizes.
742 const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
743 auto Options = TTI->enableMemCmpExpansion(CI->getFunction()->hasOptSize(),
744 IsUsedForZeroCmp);
745 if (!Options) return false;
746
747 if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
748 Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
749
750 if (CI->getFunction()->hasOptSize() &&
751 MaxLoadsPerMemcmpOptSize.getNumOccurrences())
752 Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
753
754 if (!CI->getFunction()->hasOptSize() && MaxLoadsPerMemcmp.getNumOccurrences())
755 Options.MaxNumLoads = MaxLoadsPerMemcmp;
756
757 MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL);
758
759 // Don't expand if this will require more loads than desired by the target.
760 if (Expansion.getNumLoads() == 0) {
761 NumMemCmpGreaterThanMax++;
762 return false;
763 }
764
765 NumMemCmpInlined++;
766
767 Value *Res = Expansion.getMemCmpExpansion();
768
769 // Replace call with result of expansion and erase call.
770 CI->replaceAllUsesWith(Res);
771 CI->eraseFromParent();
772
773 return true;
774 }
775
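Everything above is driven by the options the target returns from TTI::enableMemCmpExpansion(). A hedged sketch of what such a hook might look like: the field names (LoadSizes, MaxNumLoads, NumLoadsPerBlock, AllowOverlappingLoads) match the Options.* uses above, but MyTargetTTIImpl and the concrete values are illustrative, not any in-tree target:

// Illustrative target hook; the values are made up for the example.
TTI::MemCmpExpansionOptions
MyTargetTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  Options.MaxNumLoads = OptSize ? 4 : 8; // cap total loads in the expansion
  Options.LoadSizes = {8, 4, 2, 1};      // widest legal integer load first
  Options.NumLoadsPerBlock = 1;          // loads per eq-zero compare block
  Options.AllowOverlappingLoads = true;  // e.g. cheap unaligned loads
  return Options;
}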
776
777
778 class ExpandMemCmpPass : public FunctionPass {
779 public:
780 static char ID;
781
782 ExpandMemCmpPass() : FunctionPass(ID) {
783 initializeExpandMemCmpPassPass(*PassRegistry::getPassRegistry());
784 }
785
786 bool runOnFunction(Function &F) override {
787 if (skipFunction(F)) return false;
788
789 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
790 if (!TPC) {
791 return false;
792 }
793 const TargetLowering* TL =
794 TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
795
796 const TargetLibraryInfo *TLI =
797 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
798 const TargetTransformInfo *TTI =
799 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
800 auto PA = runImpl(F, TLI, TTI, TL);
801 return !PA.areAllPreserved();
802 }
803
804 private:
805 void getAnalysisUsage(AnalysisUsage &AU) const override {
806 AU.addRequired<TargetLibraryInfoWrapperPass>();
807 AU.addRequired<TargetTransformInfoWrapperPass>();
808 FunctionPass::getAnalysisUsage(AU);
809 }
810
811 PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
812 const TargetTransformInfo *TTI,
813 const TargetLowering* TL);
814 // Returns true if a change was made.
815 bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
816 const TargetTransformInfo *TTI, const TargetLowering* TL,
817 const DataLayout& DL);
818 };
819
820 bool ExpandMemCmpPass::runOnBlock(
821 BasicBlock &BB, const TargetLibraryInfo *TLI,
822 const TargetTransformInfo *TTI, const TargetLowering* TL,
823 const DataLayout& DL) {
824 for (Instruction& I : BB) {
825 CallInst *CI = dyn_cast<CallInst>(&I);
826 if (!CI) {
827 continue;
828 }
829 LibFunc Func;
830 if (TLI->getLibFunc(ImmutableCallSite(CI), Func) &&
831 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
832 expandMemCmp(CI, TTI, TL, &DL)) {
833 return true;
834 }
835 }
836 return false;
837 }
838
839
840 PreservedAnalyses ExpandMemCmpPass::runImpl(
841 Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
842 const TargetLowering* TL) {
843 const DataLayout& DL = F.getParent()->getDataLayout();
844 bool MadeChanges = false;
845 for (auto BBIt = F.begin(); BBIt != F.end();) {
846 if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) {
847 MadeChanges = true;
848 // If changes were made, restart the function from the beginning, since
849 // the structure of the function was changed.
850 BBIt = F.begin();
851 } else {
852 ++BBIt;
853 }
854 }
855 return MadeChanges ? PreservedAnalyses::none() : PreservedAnalyses::all();
856 }
857
858 } // namespace
859
860 char ExpandMemCmpPass::ID = 0;
861 INITIALIZE_PASS_BEGIN(ExpandMemCmpPass, "expandmemcmp",
862 "Expand memcmp() to load/stores", false, false)
863 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
864 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
865 INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp",
866 "Expand memcmp() to load/stores", false, false)
867
868 FunctionPass *llvm::createExpandMemCmpPass() {
869 return new ExpandMemCmpPass();
870 }
9898 "enable-implicit-null-checks",
9999 cl::desc("Fold null checks into faulting memory operations"),
100100 cl::init(false), cl::Hidden);
101 static cl::opt<bool> DisableMergeICmps("disable-mergeicmps",
102 cl::desc("Disable MergeICmps Pass"),
103 cl::init(false), cl::Hidden);
101104 static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
102105 cl::desc("Print LLVM IR produced by the loop-reduce pass"));
103106 static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
634637 addPass(createLoopStrengthReducePass());
635638 if (PrintLSR)
636639 addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
640 }
641
642 if (getOptLevel() != CodeGenOpt::None) {
643 // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
644 // loads and compares. ExpandMemCmpPass then tries to expand those calls
645 // into optimally-sized loads and compares. The transforms are enabled by a
646 // target lowering hook.
647 if (!DisableMergeICmps)
648 addPass(createMergeICmpsLegacyPass());
649 addPass(createExpandMemCmpPass());
637650 }
638651
639652 // Run GC lowering passes for builtin collectors
245245 PM.add(createInstructionCombiningPass(ExpensiveCombines));
246246 }
247247
248 void PassManagerBuilder::addMemcmpPasses(legacy::PassManagerBase &PM) const {
249 if (OptLevel > 0) {
250 // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
251 // loads and compares. ExpandMemCmpPass then tries to expand those calls
252 // into optimally-sized loads and compares. The transforms are enabled by a
253 // target transform info hook.
254 PM.add(createMergeICmpsLegacyPass());
255 PM.add(createExpandMemCmpPass());
256 PM.add(createEarlyCSEPass());
257 }
258 }
259
260248 void PassManagerBuilder::populateFunctionPassManager(
261249 legacy::FunctionPassManager &FPM) {
262250 addExtensionsToPM(EP_EarlyAsPossible, FPM);
402390 : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
403391 }
404392 MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset
405 addMemcmpPasses(MPM); // Merge/Expand comparisons.
406393 MPM.add(createSCCPPass()); // Constant prop with SCCP
407394
408395 // Delete dead bit computations (instcombine runs after to fold away the dead
922909 PM.add(NewGVN ? createNewGVNPass()
923910 : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
924911 PM.add(createMemCpyOptPass()); // Remove dead memcpys.
925 addMemcmpPasses(PM); // Merge/Expand comparisons.
926912
927913 // Nuke dead stores.
928914 PM.add(createDeadStoreEliminationPass());
99 DeadStoreElimination.cpp
1010 DivRemPairs.cpp
1111 EarlyCSE.cpp
12 ExpandMemCmp.cpp
1312 FlattenCFGPass.cpp
1413 Float2Int.cpp
1514 GuardWidening.cpp
lib/Transforms/Scalar/ExpandMemCmp.cpp (+0 additions, -895 deletions; file removed)
0 //===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===//
1 //
2 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3 // See https://llvm.org/LICENSE.txt for license information.
4 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5 //
6 //===----------------------------------------------------------------------===//
7 //
8 // This pass tries to expand memcmp() calls into optimally-sized loads and
9 // compares for the target.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/ADT/Statistic.h"
14 #include "llvm/Analysis/ConstantFolding.h"
15 #include "llvm/Analysis/DomTreeUpdater.h"
16 #include "llvm/Analysis/GlobalsModRef.h"
17 #include "llvm/Analysis/TargetLibraryInfo.h"
18 #include "llvm/Analysis/TargetTransformInfo.h"
19 #include "llvm/Analysis/ValueTracking.h"
20 #include "llvm/CodeGen/TargetSubtargetInfo.h"
21 #include "llvm/IR/Dominators.h"
22 #include "llvm/IR/IRBuilder.h"
23 #include "llvm/Transforms/Scalar.h"
24
25 using namespace llvm;
26
27 #define DEBUG_TYPE "expandmemcmp"
28
29 STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
30 STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
31 STATISTIC(NumMemCmpGreaterThanMax,
32 "Number of memcmp calls with size greater than max size");
33 STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
34
35 static cl::opt<unsigned> MemCmpEqZeroNumLoadsPerBlock(
36 "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
37 cl::desc("The number of loads per basic block for inline expansion of "
38 "memcmp that is only being compared against zero."));
39
40 static cl::opt<unsigned> MaxLoadsPerMemcmp(
41 "max-loads-per-memcmp", cl::Hidden,
42 cl::desc("Set maximum number of loads used in expanded memcmp"));
43
44 static cl::opt<unsigned> MaxLoadsPerMemcmpOptSize(
45 "max-loads-per-memcmp-opt-size", cl::Hidden,
46 cl::desc("Set maximum number of loads used in expanded memcmp for -Os/Oz"));
47
48 namespace {
49
50 // This class provides helper functions to expand a memcmp library call into an
51 // inline expansion.
52 class MemCmpExpansion {
53 struct ResultBlock {
54 BasicBlock *BB = nullptr;
55 PHINode *PhiSrc1 = nullptr;
56 PHINode *PhiSrc2 = nullptr;
57
58 ResultBlock() = default;
59 };
60
61 CallInst *const CI;
62 ResultBlock ResBlock;
63 const uint64_t Size;
64 unsigned MaxLoadSize;
65 uint64_t NumLoadsNonOneByte;
66 const uint64_t NumLoadsPerBlockForZeroCmp;
67 std::vector<BasicBlock *> LoadCmpBlocks;
68 BasicBlock *EndBlock = nullptr;
69 PHINode *PhiRes;
70 const bool IsUsedForZeroCmp;
71 const DataLayout &DL;
72 IRBuilder<> Builder;
73 DomTreeUpdater DTU;
74 // Represents the decomposition in blocks of the expansion. For example,
75 // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
76 // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
77 struct LoadEntry {
78 LoadEntry(unsigned LoadSize, uint64_t Offset)
79 : LoadSize(LoadSize), Offset(Offset) {}
80
81 // The size of the load for this block, in bytes.
82 unsigned LoadSize;
83 // The offset of this load from the base pointer, in bytes.
84 uint64_t Offset;
85 };
86 using LoadEntryVector = SmallVector<LoadEntry, 8>;
87 LoadEntryVector LoadSequence;
88
89 void createLoadCmpBlocks();
90 void createResultBlock();
91 void setupResultBlockPHINodes();
92 void setupEndBlockPHINodes();
93 Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
94 void emitLoadCompareBlock(unsigned BlockIndex);
95 void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
96 unsigned &LoadIndex);
97 void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned OffsetBytes);
98 void emitMemCmpResultBlock();
99 Value *getMemCmpExpansionZeroCase();
100 Value *getMemCmpEqZeroOneBlock();
101 Value *getMemCmpOneBlock();
102 Value *getPtrToElementAtOffset(Value *Source, Type *LoadSizeType,
103 uint64_t OffsetBytes);
104
105 static LoadEntryVector
106 computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
107 unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte);
108 static LoadEntryVector
109 computeOverlappingLoadSequence(uint64_t Size, unsigned MaxLoadSize,
110 unsigned MaxNumLoads,
111 unsigned &NumLoadsNonOneByte);
112
113 public:
114 MemCmpExpansion(CallInst *CI, uint64_t Size,
115 const TargetTransformInfo::MemCmpExpansionOptions &Options,
116 const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout,
117 DominatorTree *DT);
118
119 unsigned getNumBlocks();
120 uint64_t getNumLoads() const { return LoadSequence.size(); }
121
122 Value *getMemCmpExpansion();
123 };
124
125 MemCmpExpansion::LoadEntryVector MemCmpExpansion::computeGreedyLoadSequence(
126 uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
127 const unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte) {
128 NumLoadsNonOneByte = 0;
129 LoadEntryVector LoadSequence;
130 uint64_t Offset = 0;
131 while (Size && !LoadSizes.empty()) {
132 const unsigned LoadSize = LoadSizes.front();
133 const uint64_t NumLoadsForThisSize = Size / LoadSize;
134 if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
135 // Do not expand if the total number of loads is larger than what the
136 // target allows. Note that it's important that we exit before completing
137 // the expansion to avoid using a ton of memory to store the expansion for
138 // large sizes.
139 return {};
140 }
141 if (NumLoadsForThisSize > 0) {
142 for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
143 LoadSequence.push_back({LoadSize, Offset});
144 Offset += LoadSize;
145 }
146 if (LoadSize > 1)
147 ++NumLoadsNonOneByte;
148 Size = Size % LoadSize;
149 }
150 LoadSizes = LoadSizes.drop_front();
151 }
152 return LoadSequence;
153 }
154
155 MemCmpExpansion::LoadEntryVector
156 MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size,
157 const unsigned MaxLoadSize,
158 const unsigned MaxNumLoads,
159 unsigned &NumLoadsNonOneByte) {
160 // These are already handled by the greedy approach.
161 if (Size < 2 || MaxLoadSize < 2)
162 return {};
163
164 // We try to do as many non-overlapping loads as possible starting from the
165 // beginning.
166 const uint64_t NumNonOverlappingLoads = Size / MaxLoadSize;
167 assert(NumNonOverlappingLoads && "there must be at least one load");
168 // There remain 0 to (MaxLoadSize - 1) bytes to load, this will be done with
169 // an overlapping load.
170 Size = Size - NumNonOverlappingLoads * MaxLoadSize;
171 // Bail if we do not need an overlapping load; this is already handled by
172 // the greedy approach.
173 if (Size == 0)
174 return {};
175 // Bail if the number of loads (non-overlapping + potential overlapping one)
176 // is larger than the max allowed.
177 if ((NumNonOverlappingLoads + 1) > MaxNumLoads)
178 return {};
179
180 // Add non-overlapping loads.
181 LoadEntryVector LoadSequence;
182 uint64_t Offset = 0;
183 for (uint64_t I = 0; I < NumNonOverlappingLoads; ++I) {
184 LoadSequence.push_back({MaxLoadSize, Offset});
185 Offset += MaxLoadSize;
186 }
187
188 // Add the last overlapping load.
189 assert(Size > 0 && Size < MaxLoadSize && "broken invariant");
190 LoadSequence.push_back({MaxLoadSize, Offset - (MaxLoadSize - Size)});
191 NumLoadsNonOneByte = 1;
192 return LoadSequence;
193 }
194
195 // Initialize the basic block structure required for expansion of memcmp call
196 // with given maximum load size and memcmp size parameter.
197 // This structure includes:
198 // 1. A list of load compare blocks - LoadCmpBlocks.
199 // 2. An EndBlock, split from original instruction point, which is the block to
200 // return from.
201 // 3. ResultBlock, block to branch to for early exit when a
202 // LoadCmpBlock finds a difference.
203 MemCmpExpansion::MemCmpExpansion(
204 CallInst *const CI, uint64_t Size,
205 const TargetTransformInfo::MemCmpExpansionOptions &Options,
206 const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout,
207 DominatorTree *DT)
208 : CI(CI), Size(Size), MaxLoadSize(0), NumLoadsNonOneByte(0),
209 NumLoadsPerBlockForZeroCmp(Options.NumLoadsPerBlock),
210 IsUsedForZeroCmp(IsUsedForZeroCmp), DL(TheDataLayout), Builder(CI),
211 DTU(DT, /*PostDominator*/ nullptr,
212 DomTreeUpdater::UpdateStrategy::Eager) {
213 assert(Size > 0 && "zero blocks");
214 // Scale the max size down if the target can load more bytes than we need.
215 llvm::ArrayRef<unsigned> LoadSizes(Options.LoadSizes);
216 while (!LoadSizes.empty() && LoadSizes.front() > Size) {
217 LoadSizes = LoadSizes.drop_front();
218 }
219 assert(!LoadSizes.empty() && "cannot load Size bytes");
220 MaxLoadSize = LoadSizes.front();
221 // Compute the decomposition.
222 unsigned GreedyNumLoadsNonOneByte = 0;
223 LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, Options.MaxNumLoads,
224 GreedyNumLoadsNonOneByte);
225 NumLoadsNonOneByte = GreedyNumLoadsNonOneByte;
226 assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
227 // If we allow overlapping loads and the load sequence is not already optimal,
228 // use overlapping loads.
229 if (Options.AllowOverlappingLoads &&
230 (LoadSequence.empty() || LoadSequence.size() > 2)) {
231 unsigned OverlappingNumLoadsNonOneByte = 0;
232 auto OverlappingLoads = computeOverlappingLoadSequence(
233 Size, MaxLoadSize, Options.MaxNumLoads, OverlappingNumLoadsNonOneByte);
234 if (!OverlappingLoads.empty() &&
235 (LoadSequence.empty() ||
236 OverlappingLoads.size() < LoadSequence.size())) {
237 LoadSequence = OverlappingLoads;
238 NumLoadsNonOneByte = OverlappingNumLoadsNonOneByte;
239 }
240 }
241 assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
242 }
243
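Unlike the CodeGen copy earlier in this diff, this (now removed) opt-pipeline version threads a DomTreeUpdater through the expansion so DominatorTree can be preserved: every branch it creates registers its new CFG edges eagerly. The pattern in miniature (a sketch, assuming the caller owns a valid DominatorTree and that CurBB has no terminator yet):

#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Create a branch CurBB -> NewBB and tell the (eagerly updated) domtree.
static void branchAndUpdate(BasicBlock *CurBB, BasicBlock *NewBB,
                            DominatorTree *DT) {
  DomTreeUpdater DTU(DT, /*PostDominator*/ nullptr,
                     DomTreeUpdater::UpdateStrategy::Eager);
  IRBuilder<> Builder(CurBB);
  Builder.Insert(BranchInst::Create(NewBB));
  DTU.applyUpdates({{DominatorTree::Insert, CurBB, NewBB}});
}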
244 unsigned MemCmpExpansion::getNumBlocks() {
245 if (IsUsedForZeroCmp)
246 return getNumLoads() / NumLoadsPerBlockForZeroCmp +
247 (getNumLoads() % NumLoadsPerBlockForZeroCmp != 0 ? 1 : 0);
248 return getNumLoads();
249 }
250
251 void MemCmpExpansion::createLoadCmpBlocks() {
252 assert(ResBlock.BB && "ResBlock must be created before LoadCmpBlocks");
253 for (unsigned i = 0; i < getNumBlocks(); i++) {
254 BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
255 EndBlock->getParent(), EndBlock);
256 LoadCmpBlocks.push_back(BB);
257 }
258 }
259
260 void MemCmpExpansion::createResultBlock() {
261 assert(EndBlock && "EndBlock must be created before ResultBlock");
262 ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
263 EndBlock->getParent(), EndBlock);
264 }
265
266 /// Return a pointer to an element of type `LoadSizeType` at offset
267 /// `OffsetBytes`.
268 Value *MemCmpExpansion::getPtrToElementAtOffset(Value *Source,
269 Type *LoadSizeType,
270 uint64_t OffsetBytes) {
271 if (OffsetBytes > 0) {
272 auto *ByteType = Type::getInt8Ty(CI->getContext());
273 Source = Builder.CreateGEP(
274 ByteType, Builder.CreateBitCast(Source, ByteType->getPointerTo()),
275 ConstantInt::get(ByteType, OffsetBytes));
276 }
277 return Builder.CreateBitCast(Source, LoadSizeType->getPointerTo());
278 }
279
280 // This function creates the IR instructions for loading and comparing 1 byte.
281 // It loads 1 byte from each source of the memcmp parameters at the given
282 // offset. It then subtracts the two loaded values and adds this result to the
283 // final phi node for selecting the memcmp result.
284 void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
285 unsigned OffsetBytes) {
286 BasicBlock *const BB = LoadCmpBlocks[BlockIndex];
287 Builder.SetInsertPoint(BB);
288 Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
289 Value *Source1 =
290 getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType, OffsetBytes);
291 Value *Source2 =
292 getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType, OffsetBytes);
293
294 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
295 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
296
297 LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
298 LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
299 Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
300
301 PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
302
303 if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
304 // Early exit branch to EndBlock if a difference was found. Otherwise,
305 // continue to the next LoadCmpBlock.
306 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
307 ConstantInt::get(Diff->getType(), 0));
308 BasicBlock *const NextBB = LoadCmpBlocks[BlockIndex + 1];
309 BranchInst *CmpBr = BranchInst::Create(EndBlock, NextBB, Cmp);
310 Builder.Insert(CmpBr);
311 DTU.applyUpdates({{DominatorTree::Insert, BB, EndBlock},
312 {DominatorTree::Insert, BB, NextBB}});
313 } else {
314 // The last block has an unconditional branch to EndBlock.
315 BranchInst *CmpBr = BranchInst::Create(EndBlock);
316 Builder.Insert(CmpBr);
317 DTU.applyUpdates({{DominatorTree::Insert, BB, EndBlock}});
318 }
319 }
320
321 /// Generate an equality comparison for one or more pairs of loaded values.
322 /// This is used in the case where the memcmp() call is compared equal or not
323 /// equal to zero.
324 Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
325 unsigned &LoadIndex) {
326 assert(LoadIndex < getNumLoads() &&
327 "getCompareLoadPairs() called with no remaining loads");
328 std::vector<Value *> XorList, OrList;
329 Value *Diff = nullptr;
330
331 const unsigned NumLoads =
332 std::min(getNumLoads() - LoadIndex, NumLoadsPerBlockForZeroCmp);
333
334 // For a single-block expansion, start inserting before the memcmp call.
335 if (LoadCmpBlocks.empty())
336 Builder.SetInsertPoint(CI);
337 else
338 Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
339
340 Value *Cmp = nullptr;
341 // If we have multiple loads per block, we need to generate a composite
342 // comparison using xor+or. The type for the combinations is the largest load
343 // type.
344 IntegerType *const MaxLoadType =
345 NumLoads == 1 ? nullptr
346 : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
347 for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
348 const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
349
350 IntegerType *LoadSizeType =
351 IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
352
353 Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType,
354 CurLoadEntry.Offset);
355 Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType,
356 CurLoadEntry.Offset);
357
358 // Get a constant or load a value for each source address.
359 Value *LoadSrc1 = nullptr;
360 if (auto *Source1C = dyn_cast<Constant>(Source1))
361 LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
362 if (!LoadSrc1)
363 LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
364
365 Value *LoadSrc2 = nullptr;
366 if (auto *Source2C = dyn_cast<Constant>(Source2))
367 LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
368 if (!LoadSrc2)
369 LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
370
371 if (NumLoads != 1) {
372 if (LoadSizeType != MaxLoadType) {
373 LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
374 LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
375 }
376 // If we have multiple loads per block, we need to generate a composite
377 // comparison using xor+or.
378 Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
379 Diff = Builder.CreateZExt(Diff, MaxLoadType);
380 XorList.push_back(Diff);
381 } else {
382 // If there's only one load per block, we just compare the loaded values.
383 Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
384 }
385 }
386
387 auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
388 std::vector OutList;
389 for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
390 Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
391 OutList.push_back(Or);
392 }
393 if (InList.size() % 2 != 0)
394 OutList.push_back(InList.back());
395 return OutList;
396 };
397
398 if (!Cmp) {
399 // Pairwise OR the XOR results.
400 OrList = pairWiseOr(XorList);
401
402 // Pairwise OR the OR results until one result left.
403 while (OrList.size() != 1) {
404 OrList = pairWiseOr(OrList);
405 }
406
407 assert(Diff && "Failed to find comparison diff");
408 Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
409 }
410
411 return Cmp;
412 }
413
414 void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
415 unsigned &LoadIndex) {
416 Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
417
418 BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
419 ? EndBlock
420 : LoadCmpBlocks[BlockIndex + 1];
421 // Early exit branch if difference found to ResultBlock. Otherwise,
422 // continue to next LoadCmpBlock or EndBlock.
423 BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
424 Builder.Insert(CmpBr);
425 BasicBlock *const BB = LoadCmpBlocks[BlockIndex];
426
427 // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
428 // since early exit to ResultBlock was not taken (no difference was found in
429 // any of the bytes).
430 if (BlockIndex == LoadCmpBlocks.size() - 1) {
431 Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
432 PhiRes->addIncoming(Zero, BB);
433 }
434 DTU.applyUpdates({{DominatorTree::Insert, BB, ResBlock.BB},
435 {DominatorTree::Insert, BB, NextBB}});
436 }
437
438 // This function creates the IR instructions for loading and comparing using
439 // the given LoadSize. It loads the number of bytes specified by LoadSize from
440 // each source of the memcmp parameters. It then compares the two loaded values
441 // to see if they differ. If a difference is found, it branches with an early
442 // exit to the ResultBlock for calculating which source was larger. Otherwise,
443 // it falls through to either the next LoadCmpBlock or the EndBlock if this is
444 // the last LoadCmpBlock. Loading 1 byte is handled with a special case through
445 // emitLoadCompareByteBlock. The special handling simply subtracts the loaded
446 // values and adds the result to the result phi node.
447 void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
448 // There is one load per block in this case, BlockIndex == LoadIndex.
449 const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
450
451 if (CurLoadEntry.LoadSize == 1) {
452 MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, CurLoadEntry.Offset);
453 return;
454 }
455
456 Type *LoadSizeType =
457 IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
458 Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
459 assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
460
461 BasicBlock *const BB = LoadCmpBlocks[BlockIndex];
462 Builder.SetInsertPoint(BB);
463
464 Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType,
465 CurLoadEntry.Offset);
466 Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType,
467 CurLoadEntry.Offset);
468
469 // Load LoadSizeType from the base address.
470 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
471 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
472
473 if (DL.isLittleEndian()) {
474 Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
475 Intrinsic::bswap, LoadSizeType);
476 LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
477 LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
478 }
479
480 if (LoadSizeType != MaxLoadType) {
481 LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
482 LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
483 }
484
485 // Add the loaded values to the phi nodes for calculating the memcmp result
486 // only if the result is not used in a zero-equality comparison.
487 if (!IsUsedForZeroCmp) {
488 ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
489 ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
490 }
491
492 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
493 BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
494 ? EndBlock
495 : LoadCmpBlocks[BlockIndex + 1];
496 // If a difference was found, branch early to ResultBlock. Otherwise, continue
497 // to the next LoadCmpBlock or to EndBlock.
498 BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
499 Builder.Insert(CmpBr);
500
501 // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0,
502 // since the early exit to ResultBlock was not taken (no difference was found
503 // in any of the bytes).
504 if (BlockIndex == LoadCmpBlocks.size() - 1) {
505 Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
506 PhiRes->addIncoming(Zero, BB);
507 }
508 DTU.applyUpdates({{DominatorTree::Insert, BB, ResBlock.BB},
509 {DominatorTree::Insert, BB, NextBB}});
510 }
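// Illustrative sketch of one such block (invented value names, little-endian
// target, 4-byte load):
//   loadbb1:
//     %l1 = load i32, i32* %src1.off
//     %l2 = load i32, i32* %src2.off
//     %b1 = call i32 @llvm.bswap.i32(i32 %l1)  ; restore memory byte order
//     %b2 = call i32 @llvm.bswap.i32(i32 %l2)  ; for the unsigned compare
//     %eq = icmp eq i32 %b1, %b2
//     br i1 %eq, label %loadbb2, label %res_block
// When the full -1/0/1 result is needed, the (possibly zero-extended) loaded
// values also feed the phi nodes in res_block.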
511
512 // This function populates the ResultBlock with a sequence to calculate the
513 // memcmp result. It compares the two loaded source values and returns -1 if
514 // src1 < src2 and 1 if src1 > src2.
515 void MemCmpExpansion::emitMemCmpResultBlock() {
516 // Special case: if the memcmp result is only used in a zero-equality
517 // comparison, the exact value need not be calculated; returning 1 suffices.
518 if (IsUsedForZeroCmp) {
519 BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
520 Builder.SetInsertPoint(ResBlock.BB, InsertPt);
521 Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
522 PhiRes->addIncoming(Res, ResBlock.BB);
523 BranchInst *NewBr = BranchInst::Create(EndBlock);
524 Builder.Insert(NewBr);
525 DTU.applyUpdates({{DominatorTree::Insert, ResBlock.BB, EndBlock}});
526 return;
527 }
528 BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
529 Builder.SetInsertPoint(ResBlock.BB, InsertPt);
530
531 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
532 ResBlock.PhiSrc2);
533
534 Value *Res =
535 Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
536 ConstantInt::get(Builder.getInt32Ty(), 1));
537
538 BranchInst *NewBr = BranchInst::Create(EndBlock);
539 Builder.Insert(NewBr);
540 PhiRes->addIncoming(Res, ResBlock.BB);
541 DTU.applyUpdates({{DominatorTree::Insert, ResBlock.BB, EndBlock}});
542 }
543
544 void MemCmpExpansion::setupResultBlockPHINodes() {
545 Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
546 Builder.SetInsertPoint(ResBlock.BB);
547 // Note: this assumes one load per block.
548 ResBlock.PhiSrc1 =
549 Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1");
550 ResBlock.PhiSrc2 =
551 Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2");
552 }
553
554 void MemCmpExpansion::setupEndBlockPHINodes() {
555 Builder.SetInsertPoint(&EndBlock->front());
556 PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
557 }
558
559 Value *MemCmpExpansion::getMemCmpExpansionZeroCase() {
560 unsigned LoadIndex = 0;
561 // This loop populates each of the LoadCmpBlocks with the IR sequence to
562 // handle multiple loads per block.
563 for (unsigned I = 0; I < getNumBlocks(); ++I) {
564 emitLoadCompareBlockMultipleLoads(I, LoadIndex);
565 }
566
567 emitMemCmpResultBlock();
568 return PhiRes;
569 }
570
571 /// A memcmp expansion whose result is only compared for equality with 0 and
572 /// that has a single block of loads and compares can bypass the compare,
573 /// branch, and phi IR required in the general case.
574 Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
575 unsigned LoadIndex = 0;
576 Value *Cmp = getCompareLoadPairs(0, LoadIndex);
577 assert(LoadIndex == getNumLoads() && "some entries were not consumed");
578 return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
579 }
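// Illustrative result (invented names): for `memcmp(p, q, 8) == 0` on a
// 64-bit target, the whole expansion collapses to:
//   %l1 = load i64, i64* %p64
//   %l2 = load i64, i64* %q64
//   %ne = icmp ne i64 %l1, %l2
//   %res = zext i1 %ne to i32        ; nonzero iff the buffers differ
// No bswap is needed here: equality is independent of byte order.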
580
581 /// A memcmp expansion that has only one block of loads and compares can
582 /// bypass the compare, branch, and phi IR required in the general case.
583 Value *MemCmpExpansion::getMemCmpOneBlock() {
584 Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
585 Value *Source1 = CI->getArgOperand(0);
586 Value *Source2 = CI->getArgOperand(1);
587
588 // Cast source to LoadSizeType*.
589 if (Source1->getType() != LoadSizeType)
590 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
591 if (Source2->getType() != LoadSizeType)
592 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
593
594 // Load LoadSizeType from the base address.
595 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
596 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
597
598 if (DL.isLittleEndian() && Size != 1) {
599 Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
600 Intrinsic::bswap, LoadSizeType);
601 LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
602 LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
603 }
604
605 if (Size < 4) {
606 // The i8 and i16 cases don't need compares. We zext the loaded values and
607 // subtract them to get the suitable negative, zero, or positive i32 result.
608 LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
609 LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
610 return Builder.CreateSub(LoadSrc1, LoadSrc2);
611 }
612
613 // The result of memcmp is negative, zero, or positive, so produce that by
614 // subtracting 2 extended compare bits: sub (ugt, ult).
615 // If a target prefers to use selects to get -1/0/1, it should be able
616 // to transform this later. The inverse transform (going from selects to math)
617 // may not be possible in the DAG because the selects got converted into
618 // branches before we got there.
619 Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
620 Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
621 Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
622 Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
623 return Builder.CreateSub(ZextUGT, ZextULT);
624 }
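// Illustrative sketch (invented names): for `memcmp(p, q, 4)` on a
// little-endian target, the code above produces roughly:
//   %b1 = call i32 @llvm.bswap.i32(i32 %l1)
//   %b2 = call i32 @llvm.bswap.i32(i32 %l2)
//   %gt = icmp ugt i32 %b1, %b2
//   %lt = icmp ult i32 %b1, %b2
//   %zgt = zext i1 %gt to i32
//   %zlt = zext i1 %lt to i32
//   %res = sub i32 %zgt, %zlt        ; -1, 0, or 1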
625
626 // This function expands the memcmp call into an inline expansion and returns
627 // the memcmp result.
628 Value *MemCmpExpansion::getMemCmpExpansion() {
629 // Create the basic block framework for a multi-block expansion.
630 if (getNumBlocks() != 1) {
631 BasicBlock *StartBlock = CI->getParent();
632 EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
633 DTU.applyUpdates({{DominatorTree::Insert, StartBlock, EndBlock}});
634 setupEndBlockPHINodes();
635 createResultBlock();
636
637 // If the return value of memcmp is not used in a zero-equality comparison,
638 // we need to calculate which source was larger. The calculation requires the
639 // two loaded source values of each load compare block.
640 // These will be saved in the phi nodes created by setupResultBlockPHINodes.
641 if (!IsUsedForZeroCmp)
642 setupResultBlockPHINodes();
643
644 // Create the number of required load compare basic blocks.
645 createLoadCmpBlocks();
646
647 // Update the terminator added by splitBasicBlock to branch to the first
648 // LoadCmpBlock.
649 BasicBlock *const FirstLoadBB = LoadCmpBlocks[0];
650 StartBlock->getTerminator()->setSuccessor(0, FirstLoadBB);
651 DTU.applyUpdates({{DominatorTree::Delete, StartBlock, EndBlock},
652 {DominatorTree::Insert, StartBlock, FirstLoadBB}});
653 }
654
655 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
656
657 if (IsUsedForZeroCmp)
658 return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
659 : getMemCmpExpansionZeroCase();
660
661 if (getNumBlocks() == 1)
662 return getMemCmpOneBlock();
663
664 for (unsigned I = 0; I < getNumBlocks(); ++I) {
665 emitLoadCompareBlock(I);
666 }
667
668 emitMemCmpResultBlock();
669 return PhiRes;
670 }
671
672 // This function checks whether an expansion of memcmp can be generated.
673 // It requires a constant compare size that is less than the maximum inline
674 // size. If an expansion cannot occur, it returns false to leave memcmp as a
675 // library call. Otherwise, the call is replaced with a new IR instruction sequence.
676 /// We want to transform:
677 /// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
678 /// To:
679 /// loadbb:
680 /// %0 = bitcast i32* %buffer2 to i8*
681 /// %1 = bitcast i32* %buffer1 to i8*
682 /// %2 = bitcast i8* %1 to i64*
683 /// %3 = bitcast i8* %0 to i64*
684 /// %4 = load i64, i64* %2
685 /// %5 = load i64, i64* %3
686 /// %6 = call i64 @llvm.bswap.i64(i64 %4)
687 /// %7 = call i64 @llvm.bswap.i64(i64 %5)
688 /// %8 = sub i64 %6, %7
689 /// %9 = icmp ne i64 %8, 0
690 /// br i1 %9, label %res_block, label %loadbb1
691 /// res_block: ; preds = %loadbb2,
692 /// %loadbb1, %loadbb
693 /// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
694 /// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
695 /// %10 = icmp ult i64 %phi.src1, %phi.src2
696 /// %11 = select i1 %10, i32 -1, i32 1
697 /// br label %endblock
698 /// loadbb1: ; preds = %loadbb
699 /// %12 = bitcast i32* %buffer2 to i8*
700 /// %13 = bitcast i32* %buffer1 to i8*
701 /// %14 = bitcast i8* %13 to i32*
702 /// %15 = bitcast i8* %12 to i32*
703 /// %16 = getelementptr i32, i32* %14, i32 2
704 /// %17 = getelementptr i32, i32* %15, i32 2
705 /// %18 = load i32, i32* %16
706 /// %19 = load i32, i32* %17
707 /// %20 = call i32 @llvm.bswap.i32(i32 %18)
708 /// %21 = call i32 @llvm.bswap.i32(i32 %19)
709 /// %22 = zext i32 %20 to i64
710 /// %23 = zext i32 %21 to i64
711 /// %24 = sub i64 %22, %23
712 /// %25 = icmp ne i64 %24, 0
713 /// br i1 %25, label %res_block, label %loadbb2
714 /// loadbb2: ; preds = %loadbb1
715 /// %26 = bitcast i32* %buffer2 to i8*
716 /// %27 = bitcast i32* %buffer1 to i8*
717 /// %28 = bitcast i8* %27 to i16*
718 /// %29 = bitcast i8* %26 to i16*
719 /// %30 = getelementptr i16, i16* %28, i16 6
720 /// %31 = getelementptr i16, i16* %29, i16 6
721 /// %32 = load i16, i16* %30
722 /// %33 = load i16, i16* %31
723 /// %34 = call i16 @llvm.bswap.i16(i16 %32)
724 /// %35 = call i16 @llvm.bswap.i16(i16 %33)
725 /// %36 = zext i16 %34 to i64
726 /// %37 = zext i16 %35 to i64
727 /// %38 = sub i64 %36, %37
728 /// %39 = icmp ne i64 %38, 0
729 /// br i1 %39, label %res_block, label %loadbb3
730 /// loadbb3: ; preds = %loadbb2
731 /// %40 = bitcast i32* %buffer2 to i8*
732 /// %41 = bitcast i32* %buffer1 to i8*
733 /// %42 = getelementptr i8, i8* %41, i8 14
734 /// %43 = getelementptr i8, i8* %40, i8 14
735 /// %44 = load i8, i8* %42
736 /// %45 = load i8, i8* %43
737 /// %46 = zext i8 %44 to i32
738 /// %47 = zext i8 %45 to i32
739 /// %48 = sub i32 %46, %47
740 /// br label %endblock
741 /// endblock: ; preds = %res_block,
742 /// %loadbb3
743 /// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
744 /// ret i32 %phi.res
745 static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
746 const DataLayout *DL, DominatorTree *DT) {
747 NumMemCmpCalls++;
748
749 // Early exit from expansion if -Oz.
750 if (CI->getFunction()->hasMinSize())
751 return false;
752
753 // Early exit from expansion if size is not a constant.
754 ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
755 if (!SizeCast) {
756 NumMemCmpNotConstant++;
757 return false;
758 }
759 const uint64_t SizeVal = SizeCast->getZExtValue();
760
761 if (SizeVal == 0) {
762 return false;
763 }
764 // TTI call to check if target would like to expand memcmp. Also, get the
765 // available load sizes.
766 const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
767 auto Options = TTI->enableMemCmpExpansion(CI->getFunction()->hasOptSize(),
768 IsUsedForZeroCmp);
769 if (!Options)
770 return false;
771
772 if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
773 Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
774
775 if (CI->getFunction()->hasOptSize() &&
776 MaxLoadsPerMemcmpOptSize.getNumOccurrences())
777 Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
778
779 if (!CI->getFunction()->hasOptSize() && MaxLoadsPerMemcmp.getNumOccurrences())
780 Options.MaxNumLoads = MaxLoadsPerMemcmp;
781
782 MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL, DT);
783
784 // Don't expand if this will require more loads than desired by the target.
785 if (Expansion.getNumLoads() == 0) {
786 NumMemCmpGreaterThanMax++;
787 return false;
788 }
789
790 NumMemCmpInlined++;
791
792 Value *Res = Expansion.getMemCmpExpansion();
793
794 // Replace call with result of expansion and erase call.
795 CI->replaceAllUsesWith(Res);
796 CI->eraseFromParent();
797
798 return true;
799 }
800
801 class ExpandMemCmpPass : public FunctionPass {
802 public:
803 static char ID;
804
805 ExpandMemCmpPass() : FunctionPass(ID) {
806 initializeExpandMemCmpPassPass(*PassRegistry::getPassRegistry());
807 }
808
809 bool runOnFunction(Function &F) override {
810 if (skipFunction(F))
811 return false;
812
813 const TargetLibraryInfo *TLI =
814 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
815 const TargetTransformInfo *TTI =
816 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
817 // ExpandMemCmp does not need the DominatorTree, but we update it if it's
818 // already available.
819 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
820 auto PA = runImpl(F, TLI, TTI, DTWP ? &DTWP->getDomTree() : nullptr);
821 return !PA.areAllPreserved();
822 }
823
824 private:
825 void getAnalysisUsage(AnalysisUsage &AU) const override {
826 AU.addRequired<TargetLibraryInfoWrapperPass>();
827 AU.addRequired<TargetTransformInfoWrapperPass>();
828 AU.addUsedIfAvailable<DominatorTreeWrapperPass>();
829 AU.addPreserved<GlobalsAAWrapperPass>();
830 AU.addPreserved<DominatorTreeWrapperPass>();
831 FunctionPass::getAnalysisUsage(AU);
832 }
833
834 PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
835 const TargetTransformInfo *TTI, DominatorTree *DT);
836 // Returns true if a change was made.
837 bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
838 const TargetTransformInfo *TTI, const DataLayout &DL,
839 DominatorTree *DT);
840 };
841
842 bool ExpandMemCmpPass::runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
843 const TargetTransformInfo *TTI,
844 const DataLayout &DL, DominatorTree *DT) {
845 for (Instruction &I : BB) {
846 CallInst *CI = dyn_cast<CallInst>(&I);
847 if (!CI) {
848 continue;
849 }
850 LibFunc Func;
851 if (TLI->getLibFunc(ImmutableCallSite(CI), Func) &&
852 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
853 expandMemCmp(CI, TTI, &DL, DT)) {
854 return true;
855 }
856 }
857 return false;
858 }
859
860 PreservedAnalyses ExpandMemCmpPass::runImpl(Function &F,
861 const TargetLibraryInfo *TLI,
862 const TargetTransformInfo *TTI,
863 DominatorTree *DT) {
864 const DataLayout &DL = F.getParent()->getDataLayout();
865 bool MadeChanges = false;
866 for (auto BBIt = F.begin(); BBIt != F.end();) {
867 if (runOnBlock(*BBIt, TLI, TTI, DL, DT)) {
868 MadeChanges = true;
869 // If changes were made, restart the function from the beginning, since
870 // the structure of the function was changed.
871 BBIt = F.begin();
872 } else {
873 ++BBIt;
874 }
875 }
876 if (!MadeChanges)
877 return PreservedAnalyses::all();
878 PreservedAnalyses PA;
879 PA.preserve<GlobalsAA>();
880 PA.preserve<DominatorTreeAnalysis>();
881 return PA;
882 }
883
884 } // namespace
885
886 char ExpandMemCmpPass::ID = 0;
887 INITIALIZE_PASS_BEGIN(ExpandMemCmpPass, "expandmemcmp",
888 "Expand memcmp() to load/stores", false, false)
889 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
890 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
891 INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp",
892 "Expand memcmp() to load/stores", false, false)
893
894 Pass *llvm::createExpandMemCmpPass() { return new ExpandMemCmpPass(); }
865865
866866 // We only try merging comparisons if the target wants to expand memcmp later.
867867 // The rationale is to avoid turning small chains into memcmp calls.
868 if (!TTI.enableMemCmpExpansion(F.hasOptSize(), /*IsZeroCmp*/ true))
868 if (!TTI.enableMemCmpExpansion(F.hasOptSize(), true))
869869 return false;
870870
871871 ; If we don't have memcmp available, we can't emit calls to it.
8383 initializeLowerWidenableConditionLegacyPassPass(Registry);
8484 initializeMemCpyOptLegacyPassPass(Registry);
8585 initializeMergeICmpsLegacyPassPass(Registry);
86 initializeExpandMemCmpPassPass(Registry);
8786 initializeMergedLoadStoreMotionLegacyPassPass(Registry);
8887 initializeNaryReassociateLegacyPassPass(Registry);
8988 initializePartiallyInlineLibCallsLegacyPassPass(Registry);
3131 ; CHECK-NEXT: Loop Pass Manager
3232 ; CHECK-NEXT: Induction Variable Users
3333 ; CHECK-NEXT: Loop Strength Reduction
34 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
35 ; CHECK-NEXT: Function Alias Analysis Results
36 ; CHECK-NEXT: Merge contiguous icmps into a memcmp
37 ; CHECK-NEXT: Expand memcmp() to load/stores
3438 ; CHECK-NEXT: Lower Garbage Collection Instructions
3539 ; CHECK-NEXT: Shadow Stack GC Lowering
3640 ; CHECK-NEXT: Remove unreachable blocks from the CFG
1515 ; CHECK-NEXT: Loop Pass Manager
1616 ; CHECK-NEXT: Induction Variable Users
1717 ; CHECK-NEXT: Loop Strength Reduction
18 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
19 ; CHECK-NEXT: Function Alias Analysis Results
20 ; CHECK-NEXT: Merge contiguous icmps into a memcmp
21 ; CHECK-NEXT: Expand memcmp() to load/stores
1822 ; CHECK-NEXT: Lower Garbage Collection Instructions
1923 ; CHECK-NEXT: Shadow Stack GC Lowering
2024 ; CHECK-NEXT: Remove unreachable blocks from the CFG
1212 ; STOP-BEFORE-NOT: Loop Strength Reduction
1313
1414 ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER
15 ; START-AFTER: -gc-lowering
15 ; START-AFTER: -aa -mergeicmps
1616 ; START-AFTER: FunctionPass Manager
17 ; START-AFTER-NEXT: Lower Garbage Collection Instructions
17 ; START-AFTER-NEXT: Dominator Tree Construction
1818
1919 ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE
2020 ; START-BEFORE: -machine-branch-prob -domtree
2121 ; START-BEFORE: FunctionPass Manager
2222 ; START-BEFORE: Loop Strength Reduction
23 ; START-BEFORE-NEXT: Lower Garbage Collection Instructions
23 ; START-BEFORE-NEXT: Basic Alias Analysis (stateless AA impl)
2424
2525 ; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE
2626 ; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -ppc-gpr-icmps=all -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s
2 target datalayout = "e-m:e-i64:64-n32:64"
3 target triple = "powerpc64le-unknown-linux-gnu"
4
5 @zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4
6 @zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4
7 @zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4
8 @zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4
9 @zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4
10 @zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4
11 @zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4
12 @zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 13], align 4
13
14 declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1
15
16 ; Check 4 bytes - requires 1 load for each param.
17 define signext i32 @zeroEqualityTest02(i8* %x, i8* %y) {
18 ; CHECK-LABEL: zeroEqualityTest02:
19 ; CHECK: # %bb.0:
20 ; CHECK-NEXT: lwz 3, 0(3)
21 ; CHECK-NEXT: lwz 4, 0(4)
22 ; CHECK-NEXT: xor 3, 3, 4
23 ; CHECK-NEXT: cntlzw 3, 3
24 ; CHECK-NEXT: srwi 3, 3, 5
25 ; CHECK-NEXT: xori 3, 3, 1
26 ; CHECK-NEXT: blr
27 %call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 4)
28 %not.cmp = icmp ne i32 %call, 0
29 %. = zext i1 %not.cmp to i32
30 ret i32 %.
31 }
32
33 ; Check 16 bytes - requires 2 loads for each param (or use vectors?).
34 define signext i32 @zeroEqualityTest01(i8* %x, i8* %y) {
35 ; CHECK-LABEL: zeroEqualityTest01:
36 ; CHECK: # %bb.0:
37 ; CHECK-NEXT: ld 5, 0(3)
38 ; CHECK-NEXT: ld 6, 0(4)
39 ; CHECK-NEXT: cmpld 5, 6
40 ; CHECK-NEXT: bne 0, .LBB1_2
41 ; CHECK-NEXT: # %bb.1: # %loadbb1
42 ; CHECK-NEXT: ld 3, 8(3)
43 ; CHECK-NEXT: ld 4, 8(4)
44 ; CHECK-NEXT: cmpld 3, 4
45 ; CHECK-NEXT: li 3, 0
46 ; CHECK-NEXT: beq 0, .LBB1_3
47 ; CHECK-NEXT: .LBB1_2: # %res_block
48 ; CHECK-NEXT: li 3, 1
49 ; CHECK-NEXT: .LBB1_3: # %endblock
50 ; CHECK-NEXT: clrldi 3, 3, 32
51 ; CHECK-NEXT: blr
52 %call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 16)
53 %not.tobool = icmp ne i32 %call, 0
54 %. = zext i1 %not.tobool to i32
55 ret i32 %.
56 }
57
58 ; Check 7 bytes - requires 3 loads for each param.
59 define signext i32 @zeroEqualityTest03(i8* %x, i8* %y) {
60 ; CHECK-LABEL: zeroEqualityTest03:
61 ; CHECK: # %bb.0:
62 ; CHECK-NEXT: lwz 5, 0(3)
63 ; CHECK-NEXT: lwz 6, 0(4)
64 ; CHECK-NEXT: cmplw 5, 6
65 ; CHECK-NEXT: bne 0, .LBB2_3
66 ; CHECK-NEXT: # %bb.1: # %loadbb1
67 ; CHECK-NEXT: lhz 5, 4(3)
68 ; CHECK-NEXT: lhz 6, 4(4)
69 ; CHECK-NEXT: cmplw 5, 6
70 ; CHECK-NEXT: bne 0, .LBB2_3
71 ; CHECK-NEXT: # %bb.2: # %loadbb2
72 ; CHECK-NEXT: lbz 3, 6(3)
73 ; CHECK-NEXT: lbz 4, 6(4)
74 ; CHECK-NEXT: cmplw 3, 4
75 ; CHECK-NEXT: li 3, 0
76 ; CHECK-NEXT: beq 0, .LBB2_4
77 ; CHECK-NEXT: .LBB2_3: # %res_block
78 ; CHECK-NEXT: li 3, 1
79 ; CHECK-NEXT: .LBB2_4: # %endblock
80 ; CHECK-NEXT: clrldi 3, 3, 32
81 ; CHECK-NEXT: blr
82 %call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 7)
83 %not.lnot = icmp ne i32 %call, 0
84 %cond = zext i1 %not.lnot to i32
85 ret i32 %cond
86 }
87
88 ; Validate with > 0
89 define signext i32 @zeroEqualityTest04() {
90 ; CHECK-LABEL: zeroEqualityTest04:
91 ; CHECK: # %bb.0:
92 ; CHECK-NEXT: addis 3, 2, .LzeroEqualityTest02.buffer1@toc@ha
93 ; CHECK-NEXT: addis 4, 2, .LzeroEqualityTest02.buffer2@toc@ha
94 ; CHECK-NEXT: addi 6, 3, .LzeroEqualityTest02.buffer1@toc@l
95 ; CHECK-NEXT: addi 5, 4, .LzeroEqualityTest02.buffer2@toc@l
96 ; CHECK-NEXT: ldbrx 3, 0, 6
97 ; CHECK-NEXT: ldbrx 4, 0, 5
98 ; CHECK-NEXT: cmpld 3, 4
99 ; CHECK-NEXT: bne 0, .LBB3_2
100 ; CHECK-NEXT: # %bb.1: # %loadbb1
101 ; CHECK-NEXT: li 4, 8
102 ; CHECK-NEXT: ldbrx 3, 6, 4
103 ; CHECK-NEXT: ldbrx 4, 5, 4
104 ; CHECK-NEXT: li 5, 0
105 ; CHECK-NEXT: cmpld 3, 4
106 ; CHECK-NEXT: beq 0, .LBB3_3
107 ; CHECK-NEXT: .LBB3_2: # %res_block
108 ; CHECK-NEXT: cmpld 3, 4
109 ; CHECK-NEXT: li 3, 1
110 ; CHECK-NEXT: li 4, -1
111 ; CHECK-NEXT: isel 5, 4, 3, 0
112 ; CHECK-NEXT: .LBB3_3: # %endblock
113 ; CHECK-NEXT: extsw 3, 5
114 ; CHECK-NEXT: neg 3, 3
115 ; CHECK-NEXT: rldicl 3, 3, 1, 63
116 ; CHECK-NEXT: xori 3, 3, 1
117 ; CHECK-NEXT: blr
118 %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16)
119 %not.cmp = icmp slt i32 %call, 1
120 %. = zext i1 %not.cmp to i32
121 ret i32 %.
122 }
123
124 ; Validate with < 0
125 define signext i32 @zeroEqualityTest05() {
126 ; CHECK-LABEL: zeroEqualityTest05:
127 ; CHECK: # %bb.0:
128 ; CHECK-NEXT: addis 3, 2, .LzeroEqualityTest03.buffer1@toc@ha
129 ; CHECK-NEXT: addis 4, 2, .LzeroEqualityTest03.buffer2@toc@ha
130 ; CHECK-NEXT: addi 6, 3, .LzeroEqualityTest03.buffer1@toc@l
131 ; CHECK-NEXT: addi 5, 4, .LzeroEqualityTest03.buffer2@toc@l
132 ; CHECK-NEXT: ldbrx 3, 0, 6
133 ; CHECK-NEXT: ldbrx 4, 0, 5
134 ; CHECK-NEXT: cmpld 3, 4
135 ; CHECK-NEXT: bne 0, .LBB4_2
136 ; CHECK-NEXT: # %bb.1: # %loadbb1
137 ; CHECK-NEXT: li 4, 8
138 ; CHECK-NEXT: ldbrx 3, 6, 4
139 ; CHECK-NEXT: ldbrx 4, 5, 4
140 ; CHECK-NEXT: li 5, 0
141 ; CHECK-NEXT: cmpld 3, 4
142 ; CHECK-NEXT: beq 0, .LBB4_3
143 ; CHECK-NEXT: .LBB4_2: # %res_block
144 ; CHECK-NEXT: cmpld 3, 4
145 ; CHECK-NEXT: li 3, 1
146 ; CHECK-NEXT: li 4, -1
147 ; CHECK-NEXT: isel 5, 4, 3, 0
148 ; CHECK-NEXT: .LBB4_3: # %endblock
149 ; CHECK-NEXT: nor 3, 5, 5
150 ; CHECK-NEXT: rlwinm 3, 3, 1, 31, 31
151 ; CHECK-NEXT: blr
152 %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16)
153 %call.lobit = lshr i32 %call, 31
154 %call.lobit.not = xor i32 %call.lobit, 1
155 ret i32 %call.lobit.not
156 }
157
158 ; Validate with memcmp()?:
159 define signext i32 @equalityFoldTwoConstants() {
160 ; CHECK-LABEL: equalityFoldTwoConstants:
161 ; CHECK: # %bb.0: # %loadbb
162 ; CHECK-NEXT: li 3, 1
163 ; CHECK-NEXT: blr
164 %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16)
165 %not.tobool = icmp eq i32 %call, 0
166 %cond = zext i1 %not.tobool to i32
167 ret i32 %cond
168 }
169
170 define signext i32 @equalityFoldOneConstant(i8* %X) {
171 ; CHECK-LABEL: equalityFoldOneConstant:
172 ; CHECK: # %bb.0:
173 ; CHECK-NEXT: ld 4, 0(3)
174 ; CHECK-NEXT: li 5, 1
175 ; CHECK-NEXT: sldi 5, 5, 32
176 ; CHECK-NEXT: cmpld 4, 5
177 ; CHECK-NEXT: bne 0, .LBB6_2
178 ; CHECK-NEXT: # %bb.1: # %loadbb1
179 ; CHECK-NEXT: li 4, 3
180 ; CHECK-NEXT: ld 3, 8(3)
181 ; CHECK-NEXT: sldi 4, 4, 32
182 ; CHECK-NEXT: ori 4, 4, 2
183 ; CHECK-NEXT: cmpld 3, 4
184 ; CHECK-NEXT: li 3, 0
185 ; CHECK-NEXT: beq 0, .LBB6_3
186 ; CHECK-NEXT: .LBB6_2: # %res_block
187 ; CHECK-NEXT: li 3, 1
188 ; CHECK-NEXT: .LBB6_3: # %endblock
189 ; CHECK-NEXT: cntlzw 3, 3
190 ; CHECK-NEXT: srwi 3, 3, 5
191 ; CHECK-NEXT: blr
192 %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* %X, i64 16)
193 %not.tobool = icmp eq i32 %call, 0
194 %cond = zext i1 %not.tobool to i32
195 ret i32 %cond
196 }
197
198 define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) {
199 ; CHECK-LABEL: length2_eq_nobuiltin_attr:
200 ; CHECK: # %bb.0:
201 ; CHECK-NEXT: mflr 0
202 ; CHECK-NEXT: std 0, 16(1)
203 ; CHECK-NEXT: stdu 1, -32(1)
204 ; CHECK-NEXT: .cfi_def_cfa_offset 32
205 ; CHECK-NEXT: .cfi_offset lr, 16
206 ; CHECK-NEXT: li 5, 2
207 ; CHECK-NEXT: bl memcmp
208 ; CHECK-NEXT: nop
209 ; CHECK-NEXT: cntlzw 3, 3
210 ; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31
211 ; CHECK-NEXT: addi 1, 1, 32
212 ; CHECK-NEXT: ld 0, 16(1)
213 ; CHECK-NEXT: mtlr 0
214 ; CHECK-NEXT: blr
215 %m = tail call signext i32 @memcmp(i8* %X, i8* %Y, i64 2) nobuiltin
216 %c = icmp eq i32 %m, 0
217 ret i1 %c
218 }
219
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s -check-prefix=PPC64LE
2
3 ; This tests interaction between MergeICmp and ExpandMemCmp.
4
5 %"struct.std::pair" = type { i32, i32 }
6
7 define zeroext i1 @opeq1(
8 ; PPC64LE-LABEL: opeq1:
9 ; PPC64LE: # %bb.0: # %"entry+land.rhs.i"
10 ; PPC64LE-NEXT: ld 3, 0(3)
11 ; PPC64LE-NEXT: ld 4, 0(4)
12 ; PPC64LE-NEXT: xor 3, 3, 4
13 ; PPC64LE-NEXT: cntlzd 3, 3
14 ; PPC64LE-NEXT: rldicl 3, 3, 58, 63
15 ; PPC64LE-NEXT: blr
16 %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
17 %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
18 entry:
19 %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
20 %0 = load i32, i32* %first.i, align 4
21 %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
22 %1 = load i32, i32* %first1.i, align 4
23 %cmp.i = icmp eq i32 %0, %1
24 br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
25
26 land.rhs.i:
27 %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
28 %2 = load i32, i32* %second.i, align 4
29 %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
30 %3 = load i32, i32* %second2.i, align 4
31 %cmp3.i = icmp eq i32 %2, %3
32 br label %opeq1.exit
33
34 opeq1.exit:
35 %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
36 ret i1 %4
37 }
38
39
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s -check-prefix=CHECK
2
3 define signext i32 @memcmp8(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
4 ; CHECK-LABEL: memcmp8:
5 ; CHECK: # %bb.0:
6 ; CHECK-NEXT: ldbrx 3, 0, 3
7 ; CHECK-NEXT: ldbrx 4, 0, 4
8 ; CHECK-NEXT: subfc 5, 3, 4
9 ; CHECK-NEXT: subfe 5, 4, 4
10 ; CHECK-NEXT: subfc 4, 4, 3
11 ; CHECK-NEXT: subfe 3, 3, 3
12 ; CHECK-NEXT: neg 4, 5
13 ; CHECK-NEXT: neg 3, 3
14 ; CHECK-NEXT: subf 3, 3, 4
15 ; CHECK-NEXT: extsw 3, 3
16 ; CHECK-NEXT: blr
17 %t0 = bitcast i32* %buffer1 to i8*
18 %t1 = bitcast i32* %buffer2 to i8*
19 %call = tail call signext i32 @memcmp(i8* %t0, i8* %t1, i64 8)
20 ret i32 %call
21 }
22
23 define signext i32 @memcmp4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
24 ; CHECK-LABEL: memcmp4:
25 ; CHECK: # %bb.0:
26 ; CHECK-NEXT: lwbrx 3, 0, 3
27 ; CHECK-NEXT: lwbrx 4, 0, 4
28 ; CHECK-NEXT: sub 5, 4, 3
29 ; CHECK-NEXT: sub 3, 3, 4
30 ; CHECK-NEXT: rldicl 4, 5, 1, 63
31 ; CHECK-NEXT: rldicl 3, 3, 1, 63
32 ; CHECK-NEXT: subf 3, 3, 4
33 ; CHECK-NEXT: extsw 3, 3
34 ; CHECK-NEXT: blr
35 %t0 = bitcast i32* %buffer1 to i8*
36 %t1 = bitcast i32* %buffer2 to i8*
37 %call = tail call signext i32 @memcmp(i8* %t0, i8* %t1, i64 4)
38 ret i32 %call
39 }
40
41 define signext i32 @memcmp2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
42 ; CHECK-LABEL: memcmp2:
43 ; CHECK: # %bb.0:
44 ; CHECK-NEXT: lhbrx 3, 0, 3
45 ; CHECK-NEXT: lhbrx 4, 0, 4
46 ; CHECK-NEXT: subf 3, 4, 3
47 ; CHECK-NEXT: extsw 3, 3
48 ; CHECK-NEXT: blr
49 %t0 = bitcast i32* %buffer1 to i8*
50 %t1 = bitcast i32* %buffer2 to i8*
51 %call = tail call signext i32 @memcmp(i8* %t0, i8* %t1, i64 2)
52 ret i32 %call
53 }
54
55 define signext i32 @memcmp1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
56 ; CHECK-LABEL: memcmp1:
57 ; CHECK: # %bb.0:
58 ; CHECK-NEXT: lbz 3, 0(3)
59 ; CHECK-NEXT: lbz 4, 0(4)
60 ; CHECK-NEXT: subf 3, 4, 3
61 ; CHECK-NEXT: extsw 3, 3
62 ; CHECK-NEXT: blr
63 %t0 = bitcast i32* %buffer1 to i8*
64 %t1 = bitcast i32* %buffer2 to i8*
65 %call = tail call signext i32 @memcmp(i8* %t0, i8* %t1, i64 1) #2
66 ret i32 %call
67 }
68
69 declare signext i32 @memcmp(i8*, i8*, i64)
0 ; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s
1 ; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE
2
3 define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
4 entry:
5 ; CHECK-LABEL: @test1(
6 ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64*
7 ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
8 ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
9 ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
10 ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[BSWAP1]], [[BSWAP2]]
11 ; CHECK-NEXT: br i1 [[ICMP]], label %loadbb1, label %res_block
12
13 ; CHECK-LABEL: res_block:{{.*}}
14 ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
15 ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
16 ; CHECK-NEXT: br label %endblock
17
18 ; CHECK-LABEL: loadbb1:{{.*}}
19 ; CHECK: [[BCC1:%[0-9]+]] = bitcast i32* {{.*}} to i8*
20 ; CHECK-NEXT: [[BCC2:%[0-9]+]] = bitcast i32* {{.*}} to i8*
21 ; CHECK-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, i8* [[BCC2]], i8 8
22 ; CHECK-NEXT: [[BCL1:%[0-9]+]] = bitcast i8* [[GEP1]] to i64*
23 ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, i8* [[BCC1]], i8 8
24 ; CHECK-NEXT: [[BCL2:%[0-9]+]] = bitcast i8* [[GEP2]] to i64*
25 ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[BCL1]]
26 ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[BCL2]]
27 ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
28 ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
29 ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[BSWAP1]], [[BSWAP2]]
30 ; CHECK-NEXT: br i1 [[ICMP]], label %endblock, label %res_block
31
32 ; CHECK-BE-LABEL: @test1(
33 ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64*
34 ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
35 ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
36 ; CHECK-BE-NEXT: br i1 [[ICMP]], label %loadbb1, label %res_block
37
38 ; CHECK-BE-LABEL: res_block:{{.*}}
39 ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
40 ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
41 ; CHECK-BE-NEXT: br label %endblock
42
43 ; CHECK-BE-LABEL: loadbb1:{{.*}}
44 ; CHECK-BE: [[BCC1:%[0-9]+]] = bitcast i32* {{.*}} to i8*
45 ; CHECK-BE-NEXT: [[BCC2:%[0-9]+]] = bitcast i32* {{.*}} to i8*
46 ; CHECK-BE-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, i8* [[BCC2]], i8 8
47 ; CHECK-BE-NEXT: [[BCL1:%[0-9]+]] = bitcast i8* [[GEP1]] to i64*
48 ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, i8* [[BCC1]], i8 8
49 ; CHECK-BE-NEXT: [[BCL2:%[0-9]+]] = bitcast i8* [[GEP2]] to i64*
50 ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[BCL1]]
51 ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[BCL2]]
52 ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
53 ; CHECK-BE-NEXT: br i1 [[ICMP]], label %endblock, label %res_block
54
55 %0 = bitcast i32* %buffer1 to i8*
56 %1 = bitcast i32* %buffer2 to i8*
57 %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16)
58 ret i32 %call
59 }
60
61 declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1
62
63 define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
64 ; CHECK-LABEL: @test2(
65 ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32*
66 ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
67 ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
68 ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
69 ; CHECK-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[BSWAP1]], [[BSWAP2]]
70 ; CHECK-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[BSWAP1]], [[BSWAP2]]
71 ; CHECK-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
72 ; CHECK-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
73 ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
74 ; CHECK-NEXT: ret i32 [[SUB]]
75
76 ; CHECK-BE-LABEL: @test2(
77 ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32*
78 ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
79 ; CHECK-BE-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[LOAD1]], [[LOAD2]]
80 ; CHECK-BE-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[LOAD1]], [[LOAD2]]
81 ; CHECK-BE-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
82 ; CHECK-BE-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
83 ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
84 ; CHECK-BE-NEXT: ret i32 [[SUB]]
85
86 entry:
87 %0 = bitcast i32* %buffer1 to i8*
88 %1 = bitcast i32* %buffer2 to i8*
89 %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4)
90 ret i32 %call
91 }
92
93 define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
94 ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64*
95 ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
96 ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
97 ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
98 ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[BSWAP1]], [[BSWAP2]]
99 ; CHECK-NEXT: br i1 [[ICMP]], label %loadbb1, label %res_block
100
101 ; CHECK-LABEL: res_block:{{.*}}
102 ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
103 ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
104 ; CHECK-NEXT: br label %endblock
105
106 ; CHECK-LABEL: loadbb1:{{.*}}
107 ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32*
108 ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
109 ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
110 ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
111 ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64
112 ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64
113 ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
114 ; CHECK-NEXT: br i1 [[ICMP]], label %loadbb2, label %res_block
115
116 ; CHECK-LABEL: loadbb2:{{.*}}
117 ; CHECK: [[LOAD1:%[0-9]+]] = load i16, i16*
118 ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16*
119 ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD1]])
120 ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD2]])
121 ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[BSWAP1]] to i64
122 ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[BSWAP2]] to i64
123 ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
124 ; CHECK-NEXT: br i1 [[ICMP]], label %loadbb3, label %res_block
125
126 ; CHECK-LABEL: loadbb3:{{.*}}
127 ; CHECK: [[LOAD1:%[0-9]+]] = load i8, i8*
128 ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8*
129 ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
130 ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
131 ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
132 ; CHECK-NEXT: br label %endblock
133
134 ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64*
135 ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
136 ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
137 ; CHECK-BE-NEXT: br i1 [[ICMP]], label %loadbb1, label %res_block
138
139 ; CHECK-BE-LABEL: res_block:{{.*}}
140 ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
141 ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
142 ; CHECK-BE-NEXT: br label %endblock
143
144 ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32*
145 ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
146 ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
147 ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
148 ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
149 ; CHECK-BE-NEXT: br i1 [[ICMP]], label %loadbb2, label %res_block
150
151 ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, i16*
152 ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16*
153 ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64
154 ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64
155 ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
156 ; CHECK-BE-NEXT: br i1 [[ICMP]], label %loadbb3, label %res_block
157
158 ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, i8*
159 ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8*
160 ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
161 ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
162 ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
163 ; CHECK-BE-NEXT: br label %endblock
164
165 entry:
166 %0 = bitcast i32* %buffer1 to i8*
167 %1 = bitcast i32* %buffer2 to i8*
168 %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
169 ret i32 %call
170 }
171 ; CHECK: call = tail call signext i32 @memcmp
172 ; CHECK-BE: call = tail call signext i32 @memcmp
173 define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
174
175 entry:
176 %0 = bitcast i32* %buffer1 to i8*
177 %1 = bitcast i32* %buffer2 to i8*
178 %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65)
179 ret i32 %call
180 }
181
182 define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) {
183 ; CHECK: call = tail call signext i32 @memcmp
184 ; CHECK-BE: call = tail call signext i32 @memcmp
185 entry:
186 %0 = bitcast i32* %buffer1 to i8*
187 %1 = bitcast i32* %buffer2 to i8*
188 %conv = sext i32 %SIZE to i64
189 %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv)
190 ret i32 %call
191 }
2828 ; CHECK-NEXT: Loop Pass Manager
2929 ; CHECK-NEXT: Induction Variable Users
3030 ; CHECK-NEXT: Loop Strength Reduction
31 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
32 ; CHECK-NEXT: Function Alias Analysis Results
33 ; CHECK-NEXT: Merge contiguous icmps into a memcmp
34 ; CHECK-NEXT: Expand memcmp() to load/stores
3135 ; CHECK-NEXT: Lower Garbage Collection Instructions
3236 ; CHECK-NEXT: Shadow Stack GC Lowering
3337 ; CHECK-NEXT: Remove unreachable blocks from the CFG
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
3
4 ; This tests interaction between MergeICmp and ExpandMemCmp.
5
6 %"struct.std::pair" = type { i32, i32 }
7
8 define zeroext i1 @opeq1(
9 ; X86-LABEL: opeq1:
10 ; X86: # %bb.0: # %"entry+land.rhs.i"
11 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
12 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
13 ; X86-NEXT: movl (%ecx), %edx
14 ; X86-NEXT: movl 4(%ecx), %ecx
15 ; X86-NEXT: xorl (%eax), %edx
16 ; X86-NEXT: xorl 4(%eax), %ecx
17 ; X86-NEXT: orl %edx, %ecx
18 ; X86-NEXT: sete %al
19 ; X86-NEXT: retl
20 ;
21 ; X64-LABEL: opeq1:
22 ; X64: # %bb.0: # %"entry+land.rhs.i"
23 ; X64-NEXT: movq (%rdi), %rax
24 ; X64-NEXT: cmpq (%rsi), %rax
25 ; X64-NEXT: sete %al
26 ; X64-NEXT: retq
27 %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
28 %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
29 entry:
30 %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
31 %0 = load i32, i32* %first.i, align 4
32 %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
33 %1 = load i32, i32* %first1.i, align 4
34 %cmp.i = icmp eq i32 %0, %1
35 br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
36
37 land.rhs.i:
38 %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
39 %2 = load i32, i32* %second.i, align 4
40 %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
41 %3 = load i32, i32* %second2.i, align 4
42 %cmp3.i = icmp eq i32 %2, %3
43 br label %opeq1.exit
44
45 opeq1.exit:
46 %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
47 ret i1 %4
48 }
49
50
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2
5
6 ; This tests codegen time inlining/optimization of memcmp
7 ; rdar://6480398
8
9 @.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
10
11 declare i32 @memcmp(i8*, i8*, i64)
12 declare i32 @bcmp(i8*, i8*, i64)
13
14 define i32 @length2(i8* %X, i8* %Y) nounwind optsize {
15 ; X86-LABEL: length2:
16 ; X86: # %bb.0:
17 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
18 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
19 ; X86-NEXT: movzwl (%ecx), %ecx
20 ; X86-NEXT: movzwl (%eax), %edx
21 ; X86-NEXT: rolw $8, %cx
22 ; X86-NEXT: rolw $8, %dx
23 ; X86-NEXT: movzwl %cx, %eax
24 ; X86-NEXT: movzwl %dx, %ecx
25 ; X86-NEXT: subl %ecx, %eax
26 ; X86-NEXT: retl
27 ;
28 ; X64-LABEL: length2:
29 ; X64: # %bb.0:
30 ; X64-NEXT: movzwl (%rdi), %eax
31 ; X64-NEXT: movzwl (%rsi), %ecx
32 ; X64-NEXT: rolw $8, %ax
33 ; X64-NEXT: rolw $8, %cx
34 ; X64-NEXT: movzwl %ax, %eax
35 ; X64-NEXT: movzwl %cx, %ecx
36 ; X64-NEXT: subl %ecx, %eax
37 ; X64-NEXT: retq
38 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
39 ret i32 %m
40 }
41
42 define i1 @length2_eq(i8* %X, i8* %Y) nounwind optsize {
43 ; X86-LABEL: length2_eq:
44 ; X86: # %bb.0:
45 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
46 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
47 ; X86-NEXT: movzwl (%ecx), %ecx
48 ; X86-NEXT: cmpw (%eax), %cx
49 ; X86-NEXT: sete %al
50 ; X86-NEXT: retl
51 ;
52 ; X64-LABEL: length2_eq:
53 ; X64: # %bb.0:
54 ; X64-NEXT: movzwl (%rdi), %eax
55 ; X64-NEXT: cmpw (%rsi), %ax
56 ; X64-NEXT: sete %al
57 ; X64-NEXT: retq
58 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
59 %c = icmp eq i32 %m, 0
60 ret i1 %c
61 }
62
63 define i1 @length2_eq_const(i8* %X) nounwind optsize {
64 ; X86-LABEL: length2_eq_const:
65 ; X86: # %bb.0:
66 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
67 ; X86-NEXT: movzwl (%eax), %eax
68 ; X86-NEXT: cmpl $12849, %eax # imm = 0x3231
69 ; X86-NEXT: setne %al
70 ; X86-NEXT: retl
71 ;
72 ; X64-LABEL: length2_eq_const:
73 ; X64: # %bb.0:
74 ; X64-NEXT: movzwl (%rdi), %eax
75 ; X64-NEXT: cmpl $12849, %eax # imm = 0x3231
76 ; X64-NEXT: setne %al
77 ; X64-NEXT: retq
78 %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind
79 %c = icmp ne i32 %m, 0
80 ret i1 %c
81 }
82
83 define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind optsize {
84 ; X86-LABEL: length2_eq_nobuiltin_attr:
85 ; X86: # %bb.0:
86 ; X86-NEXT: pushl $0
87 ; X86-NEXT: pushl $2
88 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
89 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
90 ; X86-NEXT: calll memcmp
91 ; X86-NEXT: addl $16, %esp
92 ; X86-NEXT: testl %eax, %eax
93 ; X86-NEXT: sete %al
94 ; X86-NEXT: retl
95 ;
96 ; X64-LABEL: length2_eq_nobuiltin_attr:
97 ; X64: # %bb.0:
98 ; X64-NEXT: pushq %rax
99 ; X64-NEXT: movl $2, %edx
100 ; X64-NEXT: callq memcmp
101 ; X64-NEXT: testl %eax, %eax
102 ; X64-NEXT: sete %al
103 ; X64-NEXT: popq %rcx
104 ; X64-NEXT: retq
105 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin
106 %c = icmp eq i32 %m, 0
107 ret i1 %c
108 }
109
110 define i32 @length3(i8* %X, i8* %Y) nounwind optsize {
111 ; X86-LABEL: length3:
112 ; X86: # %bb.0: # %loadbb
113 ; X86-NEXT: pushl %esi
114 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
115 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
116 ; X86-NEXT: movzwl (%eax), %edx
117 ; X86-NEXT: movzwl (%ecx), %esi
118 ; X86-NEXT: rolw $8, %dx
119 ; X86-NEXT: rolw $8, %si
120 ; X86-NEXT: cmpw %si, %dx
121 ; X86-NEXT: jne .LBB4_1
122 ; X86-NEXT: # %bb.2: # %loadbb1
123 ; X86-NEXT: movzbl 2(%eax), %eax
124 ; X86-NEXT: movzbl 2(%ecx), %ecx
125 ; X86-NEXT: subl %ecx, %eax
126 ; X86-NEXT: jmp .LBB4_3
127 ; X86-NEXT: .LBB4_1: # %res_block
128 ; X86-NEXT: setae %al
129 ; X86-NEXT: movzbl %al, %eax
130 ; X86-NEXT: leal -1(%eax,%eax), %eax
131 ; X86-NEXT: .LBB4_3: # %endblock
132 ; X86-NEXT: popl %esi
133 ; X86-NEXT: retl
134 ;
135 ; X64-LABEL: length3:
136 ; X64: # %bb.0: # %loadbb
137 ; X64-NEXT: movzwl (%rdi), %eax
138 ; X64-NEXT: movzwl (%rsi), %ecx
139 ; X64-NEXT: rolw $8, %ax
140 ; X64-NEXT: rolw $8, %cx
141 ; X64-NEXT: cmpw %cx, %ax
142 ; X64-NEXT: jne .LBB4_1
143 ; X64-NEXT: # %bb.2: # %loadbb1
144 ; X64-NEXT: movzbl 2(%rdi), %eax
145 ; X64-NEXT: movzbl 2(%rsi), %ecx
146 ; X64-NEXT: subl %ecx, %eax
147 ; X64-NEXT: retq
148 ; X64-NEXT: .LBB4_1: # %res_block
149 ; X64-NEXT: setae %al
150 ; X64-NEXT: movzbl %al, %eax
151 ; X64-NEXT: leal -1(%rax,%rax), %eax
152 ; X64-NEXT: retq
153 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
154 ret i32 %m
155 }
156
157 define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize {
158 ; X86-LABEL: length3_eq:
159 ; X86: # %bb.0:
160 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
161 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
162 ; X86-NEXT: movzwl (%ecx), %edx
163 ; X86-NEXT: xorw (%eax), %dx
164 ; X86-NEXT: movb 2(%ecx), %cl
165 ; X86-NEXT: xorb 2(%eax), %cl
166 ; X86-NEXT: movzbl %cl, %eax
167 ; X86-NEXT: orw %dx, %ax
168 ; X86-NEXT: setne %al
169 ; X86-NEXT: retl
170 ;
171 ; X64-LABEL: length3_eq:
172 ; X64: # %bb.0:
173 ; X64-NEXT: movzwl (%rdi), %eax
174 ; X64-NEXT: xorw (%rsi), %ax
175 ; X64-NEXT: movb 2(%rdi), %cl
176 ; X64-NEXT: xorb 2(%rsi), %cl
177 ; X64-NEXT: movzbl %cl, %ecx
178 ; X64-NEXT: orw %ax, %cx
179 ; X64-NEXT: setne %al
180 ; X64-NEXT: retq
181 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
182 %c = icmp ne i32 %m, 0
183 ret i1 %c
184 }
185
186 define i32 @length4(i8* %X, i8* %Y) nounwind optsize {
187 ; X86-LABEL: length4:
188 ; X86: # %bb.0:
189 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
190 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
191 ; X86-NEXT: movl (%ecx), %ecx
192 ; X86-NEXT: movl (%eax), %edx
193 ; X86-NEXT: bswapl %ecx
194 ; X86-NEXT: bswapl %edx
195 ; X86-NEXT: xorl %eax, %eax
196 ; X86-NEXT: cmpl %edx, %ecx
197 ; X86-NEXT: seta %al
198 ; X86-NEXT: sbbl $0, %eax
199 ; X86-NEXT: retl
200 ;
201 ; X64-LABEL: length4:
202 ; X64: # %bb.0:
203 ; X64-NEXT: movl (%rdi), %ecx
204 ; X64-NEXT: movl (%rsi), %edx
205 ; X64-NEXT: bswapl %ecx
206 ; X64-NEXT: bswapl %edx
207 ; X64-NEXT: xorl %eax, %eax
208 ; X64-NEXT: cmpl %edx, %ecx
209 ; X64-NEXT: seta %al
210 ; X64-NEXT: sbbl $0, %eax
211 ; X64-NEXT: retq
212 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
213 ret i32 %m
214 }
215
216 define i1 @length4_eq(i8* %X, i8* %Y) nounwind optsize {
217 ; X86-LABEL: length4_eq:
218 ; X86: # %bb.0:
219 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
220 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
221 ; X86-NEXT: movl (%ecx), %ecx
222 ; X86-NEXT: cmpl (%eax), %ecx
223 ; X86-NEXT: setne %al
224 ; X86-NEXT: retl
225 ;
226 ; X64-LABEL: length4_eq:
227 ; X64: # %bb.0:
228 ; X64-NEXT: movl (%rdi), %eax
229 ; X64-NEXT: cmpl (%rsi), %eax
230 ; X64-NEXT: setne %al
231 ; X64-NEXT: retq
232 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
233 %c = icmp ne i32 %m, 0
234 ret i1 %c
235 }
236
237 define i1 @length4_eq_const(i8* %X) nounwind optsize {
238 ; X86-LABEL: length4_eq_const:
239 ; X86: # %bb.0:
240 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
241 ; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231
242 ; X86-NEXT: sete %al
243 ; X86-NEXT: retl
244 ;
245 ; X64-LABEL: length4_eq_const:
246 ; X64: # %bb.0:
247 ; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231
248 ; X64-NEXT: sete %al
249 ; X64-NEXT: retq
250 %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind
251 %c = icmp eq i32 %m, 0
252 ret i1 %c
253 }
254
255 define i32 @length5(i8* %X, i8* %Y) nounwind optsize {
256 ; X86-LABEL: length5:
257 ; X86: # %bb.0: # %loadbb
258 ; X86-NEXT: pushl %esi
259 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
260 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
261 ; X86-NEXT: movl (%eax), %edx
262 ; X86-NEXT: movl (%ecx), %esi
263 ; X86-NEXT: bswapl %edx
264 ; X86-NEXT: bswapl %esi
265 ; X86-NEXT: cmpl %esi, %edx
266 ; X86-NEXT: jne .LBB9_1
267 ; X86-NEXT: # %bb.2: # %loadbb1
268 ; X86-NEXT: movzbl 4(%eax), %eax
269 ; X86-NEXT: movzbl 4(%ecx), %ecx
270 ; X86-NEXT: subl %ecx, %eax
271 ; X86-NEXT: jmp .LBB9_3
272 ; X86-NEXT: .LBB9_1: # %res_block
273 ; X86-NEXT: setae %al
274 ; X86-NEXT: movzbl %al, %eax
275 ; X86-NEXT: leal -1(%eax,%eax), %eax
276 ; X86-NEXT: .LBB9_3: # %endblock
277 ; X86-NEXT: popl %esi
278 ; X86-NEXT: retl
279 ;
280 ; X64-LABEL: length5:
281 ; X64: # %bb.0: # %loadbb
282 ; X64-NEXT: movl (%rdi), %eax
283 ; X64-NEXT: movl (%rsi), %ecx
284 ; X64-NEXT: bswapl %eax
285 ; X64-NEXT: bswapl %ecx
286 ; X64-NEXT: cmpl %ecx, %eax
287 ; X64-NEXT: jne .LBB9_1
288 ; X64-NEXT: # %bb.2: # %loadbb1
289 ; X64-NEXT: movzbl 4(%rdi), %eax
290 ; X64-NEXT: movzbl 4(%rsi), %ecx
291 ; X64-NEXT: subl %ecx, %eax
292 ; X64-NEXT: retq
293 ; X64-NEXT: .LBB9_1: # %res_block
294 ; X64-NEXT: setae %al
295 ; X64-NEXT: movzbl %al, %eax
296 ; X64-NEXT: leal -1(%rax,%rax), %eax
297 ; X64-NEXT: retq
298 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
299 ret i32 %m
300 }
301
302 define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize {
303 ; X86-LABEL: length5_eq:
304 ; X86: # %bb.0:
305 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
306 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
307 ; X86-NEXT: movl (%ecx), %edx
308 ; X86-NEXT: xorl (%eax), %edx
309 ; X86-NEXT: movb 4(%ecx), %cl
310 ; X86-NEXT: xorb 4(%eax), %cl
311 ; X86-NEXT: movzbl %cl, %eax
312 ; X86-NEXT: orl %edx, %eax
313 ; X86-NEXT: setne %al
314 ; X86-NEXT: retl
315 ;
316 ; X64-LABEL: length5_eq:
317 ; X64: # %bb.0:
318 ; X64-NEXT: movl (%rdi), %eax
319 ; X64-NEXT: xorl (%rsi), %eax
320 ; X64-NEXT: movb 4(%rdi), %cl
321 ; X64-NEXT: xorb 4(%rsi), %cl
322 ; X64-NEXT: movzbl %cl, %ecx
323 ; X64-NEXT: orl %eax, %ecx
324 ; X64-NEXT: setne %al
325 ; X64-NEXT: retq
326 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
327 %c = icmp ne i32 %m, 0
328 ret i1 %c
329 }
330
331 define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
332 ; X86-LABEL: length8:
333 ; X86: # %bb.0:
334 ; X86-NEXT: pushl %esi
335 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
336 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
337 ; X86-NEXT: movl (%esi), %ecx
338 ; X86-NEXT: movl (%eax), %edx
339 ; X86-NEXT: bswapl %ecx
340 ; X86-NEXT: bswapl %edx
341 ; X86-NEXT: cmpl %edx, %ecx
342 ; X86-NEXT: jne .LBB11_2
343 ; X86-NEXT: # %bb.1: # %loadbb1
344 ; X86-NEXT: movl 4(%esi), %ecx
345 ; X86-NEXT: movl 4(%eax), %edx
346 ; X86-NEXT: bswapl %ecx
347 ; X86-NEXT: bswapl %edx
348 ; X86-NEXT: xorl %eax, %eax
349 ; X86-NEXT: cmpl %edx, %ecx
350 ; X86-NEXT: je .LBB11_3
351 ; X86-NEXT: .LBB11_2: # %res_block
352 ; X86-NEXT: xorl %eax, %eax
353 ; X86-NEXT: cmpl %edx, %ecx
354 ; X86-NEXT: setae %al
355 ; X86-NEXT: leal -1(%eax,%eax), %eax
356 ; X86-NEXT: .LBB11_3: # %endblock
357 ; X86-NEXT: popl %esi
358 ; X86-NEXT: retl
359 ;
360 ; X64-LABEL: length8:
361 ; X64: # %bb.0:
362 ; X64-NEXT: movq (%rdi), %rcx
363 ; X64-NEXT: movq (%rsi), %rdx
364 ; X64-NEXT: bswapq %rcx
365 ; X64-NEXT: bswapq %rdx
366 ; X64-NEXT: xorl %eax, %eax
367 ; X64-NEXT: cmpq %rdx, %rcx
368 ; X64-NEXT: seta %al
369 ; X64-NEXT: sbbl $0, %eax
370 ; X64-NEXT: retq
371 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
372 ret i32 %m
373 }
374
375 define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize {
376 ; X86-LABEL: length8_eq:
377 ; X86: # %bb.0:
378 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
379 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
380 ; X86-NEXT: movl (%ecx), %edx
381 ; X86-NEXT: movl 4(%ecx), %ecx
382 ; X86-NEXT: xorl (%eax), %edx
383 ; X86-NEXT: xorl 4(%eax), %ecx
384 ; X86-NEXT: orl %edx, %ecx
385 ; X86-NEXT: sete %al
386 ; X86-NEXT: retl
387 ;
388 ; X64-LABEL: length8_eq:
389 ; X64: # %bb.0:
390 ; X64-NEXT: movq (%rdi), %rax
391 ; X64-NEXT: cmpq (%rsi), %rax
392 ; X64-NEXT: sete %al
393 ; X64-NEXT: retq
394 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
395 %c = icmp eq i32 %m, 0
396 ret i1 %c
397 }
398
399 define i1 @length8_eq_const(i8* %X) nounwind optsize {
400 ; X86-LABEL: length8_eq_const:
401 ; X86: # %bb.0:
402 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
403 ; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130
404 ; X86-NEXT: xorl (%eax), %ecx
405 ; X86-NEXT: movl $926299444, %edx # imm = 0x37363534
406 ; X86-NEXT: xorl 4(%eax), %edx
407 ; X86-NEXT: orl %ecx, %edx
408 ; X86-NEXT: setne %al
409 ; X86-NEXT: retl
410 ;
411 ; X64-LABEL: length8_eq_const:
412 ; X64: # %bb.0:
413 ; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
414 ; X64-NEXT: cmpq %rax, (%rdi)
415 ; X64-NEXT: setne %al
416 ; X64-NEXT: retq
417 %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind
418 %c = icmp ne i32 %m, 0
419 ret i1 %c
420 }
421
422 define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize {
423 ; X86-LABEL: length12_eq:
424 ; X86: # %bb.0:
425 ; X86-NEXT: pushl $0
426 ; X86-NEXT: pushl $12
427 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
428 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
429 ; X86-NEXT: calll memcmp
430 ; X86-NEXT: addl $16, %esp
431 ; X86-NEXT: testl %eax, %eax
432 ; X86-NEXT: setne %al
433 ; X86-NEXT: retl
434 ;
435 ; X64-LABEL: length12_eq:
436 ; X64: # %bb.0:
437 ; X64-NEXT: movq (%rdi), %rax
438 ; X64-NEXT: xorq (%rsi), %rax
439 ; X64-NEXT: movl 8(%rdi), %ecx
440 ; X64-NEXT: xorl 8(%rsi), %ecx
441 ; X64-NEXT: orq %rax, %rcx
442 ; X64-NEXT: setne %al
443 ; X64-NEXT: retq
444 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
445 %c = icmp ne i32 %m, 0
446 ret i1 %c
447 }
448
449 define i32 @length12(i8* %X, i8* %Y) nounwind optsize {
450 ; X86-LABEL: length12:
451 ; X86: # %bb.0:
452 ; X86-NEXT: pushl $0
453 ; X86-NEXT: pushl $12
454 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
455 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
456 ; X86-NEXT: calll memcmp
457 ; X86-NEXT: addl $16, %esp
458 ; X86-NEXT: retl
459 ;
460 ; X64-LABEL: length12:
461 ; X64: # %bb.0:
462 ; X64-NEXT: movq (%rdi), %rcx
463 ; X64-NEXT: movq (%rsi), %rdx
464 ; X64-NEXT: bswapq %rcx
465 ; X64-NEXT: bswapq %rdx
466 ; X64-NEXT: cmpq %rdx, %rcx
467 ; X64-NEXT: jne .LBB15_2
468 ; X64-NEXT: # %bb.1: # %loadbb1
469 ; X64-NEXT: movl 8(%rdi), %ecx
470 ; X64-NEXT: movl 8(%rsi), %edx
471 ; X64-NEXT: bswapl %ecx
472 ; X64-NEXT: bswapl %edx
473 ; X64-NEXT: xorl %eax, %eax
474 ; X64-NEXT: cmpq %rdx, %rcx
475 ; X64-NEXT: je .LBB15_3
476 ; X64-NEXT: .LBB15_2: # %res_block
477 ; X64-NEXT: xorl %eax, %eax
478 ; X64-NEXT: cmpq %rdx, %rcx
479 ; X64-NEXT: setae %al
480 ; X64-NEXT: leal -1(%rax,%rax), %eax
481 ; X64-NEXT: .LBB15_3: # %endblock
482 ; X64-NEXT: retq
483 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
484 ret i32 %m
485 }
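; 12 bytes are split into an 8-byte chunk and a trailing 4-byte chunk; the
; trailing chunk is loaded with movl, which already zero-extends the registers
; for the 64-bit compare, and a mismatch in either chunk branches to the
; shared %res_block.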
486
487 ; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
488
489 define i32 @length16(i8* %X, i8* %Y) nounwind optsize {
490 ; X86-LABEL: length16:
491 ; X86: # %bb.0:
492 ; X86-NEXT: pushl $0
493 ; X86-NEXT: pushl $16
494 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
495 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
496 ; X86-NEXT: calll memcmp
497 ; X86-NEXT: addl $16, %esp
498 ; X86-NEXT: retl
499 ;
500 ; X64-LABEL: length16:
501 ; X64: # %bb.0:
502 ; X64-NEXT: movq (%rdi), %rcx
503 ; X64-NEXT: movq (%rsi), %rdx
504 ; X64-NEXT: bswapq %rcx
505 ; X64-NEXT: bswapq %rdx
506 ; X64-NEXT: cmpq %rdx, %rcx
507 ; X64-NEXT: jne .LBB16_2
508 ; X64-NEXT: # %bb.1: # %loadbb1
509 ; X64-NEXT: movq 8(%rdi), %rcx
510 ; X64-NEXT: movq 8(%rsi), %rdx
511 ; X64-NEXT: bswapq %rcx
512 ; X64-NEXT: bswapq %rdx
513 ; X64-NEXT: xorl %eax, %eax
514 ; X64-NEXT: cmpq %rdx, %rcx
515 ; X64-NEXT: je .LBB16_3
516 ; X64-NEXT: .LBB16_2: # %res_block
517 ; X64-NEXT: xorl %eax, %eax
518 ; X64-NEXT: cmpq %rdx, %rcx
519 ; X64-NEXT: setae %al
520 ; X64-NEXT: leal -1(%rax,%rax), %eax
521 ; X64-NEXT: .LBB16_3: # %endblock
522 ; X64-NEXT: retq
523 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
524 ret i32 %m
525 }
526
527 define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize {
528 ; X86-NOSSE-LABEL: length16_eq:
529 ; X86-NOSSE: # %bb.0:
530 ; X86-NOSSE-NEXT: pushl $0
531 ; X86-NOSSE-NEXT: pushl $16
532 ; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
533 ; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
534 ; X86-NOSSE-NEXT: calll memcmp
535 ; X86-NOSSE-NEXT: addl $16, %esp
536 ; X86-NOSSE-NEXT: testl %eax, %eax
537 ; X86-NOSSE-NEXT: setne %al
538 ; X86-NOSSE-NEXT: retl
539 ;
540 ; X86-SSE2-LABEL: length16_eq:
541 ; X86-SSE2: # %bb.0:
542 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
543 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
544 ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
545 ; X86-SSE2-NEXT: movdqu (%eax), %xmm1
546 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
547 ; X86-SSE2-NEXT: pmovmskb %xmm1, %eax
548 ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
549 ; X86-SSE2-NEXT: setne %al
550 ; X86-SSE2-NEXT: retl
551 ;
552 ; X64-SSE2-LABEL: length16_eq:
553 ; X64-SSE2: # %bb.0:
554 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
555 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
556 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
557 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
558 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
559 ; X64-SSE2-NEXT: setne %al
560 ; X64-SSE2-NEXT: retq
561 ;
562 ; X64-AVX2-LABEL: length16_eq:
563 ; X64-AVX2: # %bb.0:
564 ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
565 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
566 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
567 ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
568 ; X64-AVX2-NEXT: setne %al
569 ; X64-AVX2-NEXT: retq
570 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
571 %cmp = icmp ne i32 %call, 0
572 ret i1 %cmp
573 }
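; With SSE2, 16-byte equality becomes two unaligned vector loads compared with
; pcmpeqb; pmovmskb gathers one bit per byte, and the mask equals 0xFFFF only
; if all 16 bytes match. AVX2 folds the second load into vpcmpeqb.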
574
575 define i1 @length16_eq_const(i8* %X) nounwind optsize {
576 ; X86-NOSSE-LABEL: length16_eq_const:
577 ; X86-NOSSE: # %bb.0:
578 ; X86-NOSSE-NEXT: pushl $0
579 ; X86-NOSSE-NEXT: pushl $16
580 ; X86-NOSSE-NEXT: pushl $.L.str
581 ; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
582 ; X86-NOSSE-NEXT: calll memcmp
583 ; X86-NOSSE-NEXT: addl $16, %esp
584 ; X86-NOSSE-NEXT: testl %eax, %eax
585 ; X86-NOSSE-NEXT: sete %al
586 ; X86-NOSSE-NEXT: retl
587 ;
588 ; X86-SSE2-LABEL: length16_eq_const:
589 ; X86-SSE2: # %bb.0:
590 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
591 ; X86-SSE2-NEXT: movdqu (%eax), %xmm0
592 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
593 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
594 ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
595 ; X86-SSE2-NEXT: sete %al
596 ; X86-SSE2-NEXT: retl
597 ;
598 ; X64-SSE2-LABEL: length16_eq_const:
599 ; X64-SSE2: # %bb.0:
600 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
601 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
602 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
603 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
604 ; X64-SSE2-NEXT: sete %al
605 ; X64-SSE2-NEXT: retq
606 ;
607 ; X64-AVX2-LABEL: length16_eq_const:
608 ; X64-AVX2: # %bb.0:
609 ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
610 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
611 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
612 ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
613 ; X64-AVX2-NEXT: sete %al
614 ; X64-AVX2-NEXT: retq
615 %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
616 %c = icmp eq i32 %m, 0
617 ret i1 %c
618 }
619
620 ; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
621
622 define i32 @length24(i8* %X, i8* %Y) nounwind optsize {
623 ; X86-LABEL: length24:
624 ; X86: # %bb.0:
625 ; X86-NEXT: pushl $0
626 ; X86-NEXT: pushl $24
627 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
628 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
629 ; X86-NEXT: calll memcmp
630 ; X86-NEXT: addl $16, %esp
631 ; X86-NEXT: retl
632 ;
633 ; X64-LABEL: length24:
634 ; X64: # %bb.0:
635 ; X64-NEXT: movl $24, %edx
636 ; X64-NEXT: jmp memcmp # TAILCALL
637 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 24) nounwind
638 ret i32 %m
639 }
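; At 24 bytes the three-way compare is no longer expanded under optsize and is
; emitted as a library call instead; on x86-64 the pointer arguments are
; already in place, so only the length is materialized before a tail jump to
; memcmp.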
640
641 define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize {
642 ; X86-NOSSE-LABEL: length24_eq:
643 ; X86-NOSSE: # %bb.0:
644 ; X86-NOSSE-NEXT: pushl $0
645 ; X86-NOSSE-NEXT: pushl $24
646 ; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
647 ; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
648 ; X86-NOSSE-NEXT: calll memcmp
649 ; X86-NOSSE-NEXT: addl $16, %esp
650 ; X86-NOSSE-NEXT: testl %eax, %eax
651 ; X86-NOSSE-NEXT: sete %al
652 ; X86-NOSSE-NEXT: retl
653 ;
654 ; X86-SSE2-LABEL: length24_eq:
655 ; X86-SSE2: # %bb.0:
656 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
657 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
658 ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
659 ; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1
660 ; X86-SSE2-NEXT: movdqu (%eax), %xmm2
661 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
662 ; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0
663 ; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
664 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
665 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
666 ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
667 ; X86-SSE2-NEXT: sete %al
668 ; X86-SSE2-NEXT: retl
669 ;
670 ; X64-SSE2-LABEL: length24_eq:
671 ; X64-SSE2: # %bb.0:
672 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
673 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
674 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
675 ; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
676 ; X64-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
677 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
678 ; X64-SSE2-NEXT: pand %xmm1, %xmm2
679 ; X64-SSE2-NEXT: pmovmskb %xmm2, %eax
680 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
681 ; X64-SSE2-NEXT: sete %al
682 ; X64-SSE2-NEXT: retq
683 ;
684 ; X64-AVX2-LABEL: length24_eq:
685 ; X64-AVX2: # %bb.0:
686 ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
687 ; X64-AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
688 ; X64-AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
689 ; X64-AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
690 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
691 ; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
692 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
693 ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
694 ; X64-AVX2-NEXT: sete %al
695 ; X64-AVX2-NEXT: retq
696 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
697 %cmp = icmp eq i32 %call, 0
698 ret i1 %cmp
699 }
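; The 24-byte equality test, by contrast, is still expanded: a 16-byte xmm
; compare plus an 8-byte movq compare, combined with pand before the usual
; all-ones mask check.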
700
701 define i1 @length24_eq_const(i8* %X) nounwind optsize {
702 ; X86-NOSSE-LABEL: length24_eq_const:
703 ; X86-NOSSE: # %bb.0:
704 ; X86-NOSSE-NEXT: pushl $0
705 ; X86-NOSSE-NEXT: pushl $24
706 ; X86-NOSSE-NEXT: pushl $.L.str
707 ; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
708 ; X86-NOSSE-NEXT: calll memcmp
709 ; X86-NOSSE-NEXT: addl $16, %esp
710 ; X86-NOSSE-NEXT: testl %eax, %eax
711 ; X86-NOSSE-NEXT: setne %al
712 ; X86-NOSSE-NEXT: retl
713 ;
714 ; X86-SSE2-LABEL: length24_eq_const:
715 ; X86-SSE2: # %bb.0:
716 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
717 ; X86-SSE2-NEXT: movdqu (%eax), %xmm0
718 ; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1
719 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1
720 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
721 ; X86-SSE2-NEXT: pand %xmm1, %xmm0
722 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
723 ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
724 ; X86-SSE2-NEXT: setne %al
725 ; X86-SSE2-NEXT: retl
726 ;
727 ; X64-SSE2-LABEL: length24_eq_const:
728 ; X64-SSE2: # %bb.0:
729 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
730 ; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
731 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1
732 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
733 ; X64-SSE2-NEXT: pand %xmm1, %xmm0
734 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
735 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
736 ; X64-SSE2-NEXT: setne %al
737 ; X64-SSE2-NEXT: retq
738 ;
739 ; X64-AVX2-LABEL: length24_eq_const:
740 ; X64-AVX2: # %bb.0:
741 ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
742 ; X64-AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
743 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm1, %xmm1
744 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
745 ; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
746 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
747 ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
748 ; X64-AVX2-NEXT: setne %al
749 ; X64-AVX2-NEXT: retq
750 %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
751 %c = icmp ne i32 %m, 0
752 ret i1 %c
753 }
754
755 define i32 @length32(i8* %X, i8* %Y) nounwind optsize {
756 ; X86-LABEL: length32:
757 ; X86: # %bb.0:
758 ; X86-NEXT: pushl $0
759 ; X86-NEXT: pushl $32
760 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
761 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
762 ; X86-NEXT: calll memcmp
763 ; X86-NEXT: addl $16, %esp
764 ; X86-NEXT: retl
765 ;
766 ; X64-LABEL: length32:
767 ; X64: # %bb.0:
768 ; X64-NEXT: movl $32, %edx
769 ; X64-NEXT: jmp memcmp # TAILCALL
770 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind
771 ret i32 %m
772 }
773
774 ; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
775
776 define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
777 ; X86-NOSSE-LABEL: length32_eq:
778 ; X86-NOSSE: # %bb.0:
779 ; X86-NOSSE-NEXT: pushl $0
780 ; X86-NOSSE-NEXT: pushl $32
781 ; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
782 ; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
783 ; X86-NOSSE-NEXT: calll memcmp
784 ; X86-NOSSE-NEXT: addl $16, %esp
785 ; X86-NOSSE-NEXT: testl %eax, %eax
786 ; X86-NOSSE-NEXT: sete %al
787 ; X86-NOSSE-NEXT: retl
788 ;
789 ; X86-SSE2-LABEL: length32_eq:
790 ; X86-SSE2: # %bb.0:
791 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
792 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
793 ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
794 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
795 ; X86-SSE2-NEXT: movdqu (%eax), %xmm2
796 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
797 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
798 ; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
799 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
800 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
801 ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
802 ; X86-SSE2-NEXT: sete %al
803 ; X86-SSE2-NEXT: retl
804 ;
805 ; X64-SSE2-LABEL: length32_eq:
806 ; X64-SSE2: # %bb.0:
807 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
808 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
809 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm2
810 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
811 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0
812 ; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
813 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
814 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
815 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
816 ; X64-SSE2-NEXT: sete %al
817 ; X64-SSE2-NEXT: retq
818 ;
819 ; X64-AVX2-LABEL: length32_eq:
820 ; X64-AVX2: # %bb.0:
821 ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
822 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
823 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
824 ; X64-AVX2-NEXT: cmpl $-1, %eax
825 ; X64-AVX2-NEXT: sete %al
826 ; X64-AVX2-NEXT: vzeroupper
827 ; X64-AVX2-NEXT: retq
828 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
829 %cmp = icmp eq i32 %call, 0
830 ret i1 %cmp
831 }
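; 32-byte equality uses two 16-byte compares ANDed together under SSE2; with
; AVX2 a single 32-byte ymm compare suffices, the 32-bit pmovmskb mask is
; tested against -1 (all ones), and vzeroupper is emitted before returning.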
832
833 define i1 @length32_eq_const(i8* %X) nounwind optsize {
834 ; X86-NOSSE-LABEL: length32_eq_const:
835 ; X86-NOSSE: # %bb.0:
836 ; X86-NOSSE-NEXT: pushl $0
837 ; X86-NOSSE-NEXT: pushl $32
838 ; X86-NOSSE-NEXT: pushl $.L.str
839 ; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
840 ; X86-NOSSE-NEXT: calll memcmp
841 ; X86-NOSSE-NEXT: addl $16, %esp
842 ; X86-NOSSE-NEXT: testl %eax, %eax
843 ; X86-NOSSE-NEXT: setne %al
844 ; X86-NOSSE-NEXT: retl
845 ;
846 ; X86-SSE2-LABEL: length32_eq_const:
847 ; X86-SSE2: # %bb.0:
848 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
849 ; X86-SSE2-NEXT: movdqu (%eax), %xmm0
850 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
851 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1
852 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
853 ; X86-SSE2-NEXT: pand %xmm1, %xmm0
854 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
855 ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
856 ; X86-SSE2-NEXT: setne %al
857 ; X86-SSE2-NEXT: retl
858 ;
859 ; X64-SSE2-LABEL: length32_eq_const:
860 ; X64-SSE2: # %bb.0:
861 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
862 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
863 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1
864 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
865 ; X64-SSE2-NEXT: pand %xmm1, %xmm0
866 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
867 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
868 ; X64-SSE2-NEXT: setne %al
869 ; X64-SSE2-NEXT: retq
870 ;
871 ; X64-AVX2-LABEL: length32_eq_const:
872 ; X64-AVX2: # %bb.0:
873 ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
874 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
875 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
876 ; X64-AVX2-NEXT: cmpl $-1, %eax
877 ; X64-AVX2-NEXT: setne %al
878 ; X64-AVX2-NEXT: vzeroupper
879 ; X64-AVX2-NEXT: retq
880 %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind
881 %c = icmp ne i32 %m, 0
882 ret i1 %c
883 }
884
885 define i32 @length64(i8* %X, i8* %Y) nounwind optsize {
886 ; X86-LABEL: length64:
887 ; X86: # %bb.0:
888 ; X86-NEXT: pushl $0
889 ; X86-NEXT: pushl $64
890 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
891 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
892 ; X86-NEXT: calll memcmp
893 ; X86-NEXT: addl $16, %esp
894 ; X86-NEXT: retl
895 ;
896 ; X64-LABEL: length64:
897 ; X64: # %bb.0:
898 ; X64-NEXT: movl $64, %edx
899 ; X64-NEXT: jmp memcmp # TAILCALL
900 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind
901 ret i32 %m
902 }
903
904 define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize {
905 ; X86-LABEL: length64_eq:
906 ; X86: # %bb.0:
907 ; X86-NEXT: pushl $0
908 ; X86-NEXT: pushl $64
909 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
910 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
911 ; X86-NEXT: calll memcmp
912 ; X86-NEXT: addl $16, %esp
913 ; X86-NEXT: testl %eax, %eax
914 ; X86-NEXT: setne %al
915 ; X86-NEXT: retl
916 ;
917 ; X64-SSE2-LABEL: length64_eq:
918 ; X64-SSE2: # %bb.0:
919 ; X64-SSE2-NEXT: pushq %rax
920 ; X64-SSE2-NEXT: movl $64, %edx
921 ; X64-SSE2-NEXT: callq memcmp
922 ; X64-SSE2-NEXT: testl %eax, %eax
923 ; X64-SSE2-NEXT: setne %al
924 ; X64-SSE2-NEXT: popq %rcx
925 ; X64-SSE2-NEXT: retq
926 ;
927 ; X64-AVX2-LABEL: length64_eq:
928 ; X64-AVX2: # %bb.0:
929 ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
930 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
931 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1
932 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
933 ; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
934 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
935 ; X64-AVX2-NEXT: cmpl $-1, %eax
936 ; X64-AVX2-NEXT: setne %al
937 ; X64-AVX2-NEXT: vzeroupper
938 ; X64-AVX2-NEXT: retq
939 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
940 %cmp = icmp ne i32 %call, 0
941 ret i1 %cmp
942 }
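; At 64 bytes only the AVX2 configuration still expands the equality test (two
; ymm compares ANDed together); SSE2 appears to hit its per-call load limit
; here and falls back to calling memcmp.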
943
944 define i1 @length64_eq_const(i8* %X) nounwind optsize {
945 ; X86-LABEL: length64_eq_const:
946 ; X86: # %bb.0:
947 ; X86-NEXT: pushl $0
948 ; X86-NEXT: pushl $64
949 ; X86-NEXT: pushl $.L.str
950 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
951 ; X86-NEXT: calll memcmp
952 ; X86-NEXT: addl $16, %esp
953 ; X86-NEXT: testl %eax, %eax
954 ; X86-NEXT: sete %al
955 ; X86-NEXT: retl
956 ;
957