llvm.org GIT mirror llvm / c022286
Revert "[ExpandMemCmp] Split ExpandMemCmp from CodeGen into its own pass." undefined reference to `llvm::TargetPassConfig::ID' on clang-ppc64le-linux-multistage This reverts commit eea333c33fa73ad225ef28607795984829f65688. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317213 91177308-0d34-0410-b5e6-96231b3b80d8 Clement Courbet 1 year, 11 months ago
14 changed file(s) with 1722 addition(s) and 1874 deletion(s).
127127 void initializeEfficiencySanitizerPass(PassRegistry&);
128128 void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&);
129129 void initializeExpandISelPseudosPass(PassRegistry&);
130 void initializeExpandMemCmpPassPass(PassRegistry&);
131130 void initializeExpandPostRAPass(PassRegistry&);
132131 void initializeExpandReductionsPass(PassRegistry&);
133132 void initializeExternalAAWrapperPassPass(PassRegistry&);
179179 (void) llvm::createReversePostOrderFunctionAttrsPass();
180180 (void) llvm::createMergeFunctionsPass();
181181 (void) llvm::createMergeICmpsPass();
182 (void) llvm::createExpandMemCmpPass();
183182 std::string buf;
184183 llvm::raw_string_ostream os(buf);
185184 (void) llvm::createPrintModulePass(os);
421421
422422 //===----------------------------------------------------------------------===//
423423 //
424 // MergeICmps - Merge integer comparison chains into a memcmp
424 // MergeICmps - Merge integer comparison chains
425425 //
426426 Pass *createMergeICmpsPass();
427
428 //===----------------------------------------------------------------------===//
429 //
430 // ExpandMemCmp - Expand memcmp() to load/stores.
431 //
432 Pass *createExpandMemCmpPass();
433427
434428 //===----------------------------------------------------------------------===//
435429 //
122122 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
123123 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
124124
125 STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
126 STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
127 STATISTIC(NumMemCmpGreaterThanMax,
128 "Number of memcmp calls with size greater than max size");
129 STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
130
125131 static cl::opt<bool> DisableBranchOpts(
126132 "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
127133 cl::desc("Disable branch optimizations in CodeGenPrepare"));
181187 EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
182188 cl::desc("Enable merging of redundant sexts when one is dominating"
183189 " the other."), cl::init(true));
190
191 static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
192 "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
193 cl::desc("The number of loads per basic block for inline expansion of "
194 "memcmp that is only being compared against zero."));
184195
185196 namespace {
186197
16851696 return true;
16861697 }
16871698
1699 namespace {
1700
1701 // This class provides helper functions to expand a memcmp library call into an
1702 // inline expansion.
1703 class MemCmpExpansion {
1704 struct ResultBlock {
1705 BasicBlock *BB = nullptr;
1706 PHINode *PhiSrc1 = nullptr;
1707 PHINode *PhiSrc2 = nullptr;
1708
1709 ResultBlock() = default;
1710 };
1711
1712 CallInst *const CI;
1713 ResultBlock ResBlock;
1714 const uint64_t Size;
1715 unsigned MaxLoadSize;
1716 uint64_t NumLoadsNonOneByte;
1717 const uint64_t NumLoadsPerBlock;
1718 std::vector<BasicBlock *> LoadCmpBlocks;
1719 BasicBlock *EndBlock;
1720 PHINode *PhiRes;
1721 const bool IsUsedForZeroCmp;
1722 const DataLayout &DL;
1723 IRBuilder<> Builder;
1724 // Represents the decomposition in blocks of the expansion. For example,
1725 // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
1726 // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
1727 // TODO(courbet): Involve the target more in this computation. On X86, 7
1728 // bytes can be done more efficiently with two overlapping 4-byte loads than
1729 // covering the interval with [{4, 0}, {2, 4}, {1, 6}].
1730 struct LoadEntry {
1731 LoadEntry(unsigned LoadSize, uint64_t Offset)
1732 : LoadSize(LoadSize), Offset(Offset) {
1733 assert(Offset % LoadSize == 0 && "invalid load entry");
1734 }
1735
1736 uint64_t getGEPIndex() const { return Offset / LoadSize; }
1737
1738 // The size of the load for this block, in bytes.
1739 const unsigned LoadSize;
1740 // The offset of this load WRT the base pointer, in bytes.
1741 const uint64_t Offset;
1742 };
1743 SmallVector<LoadEntry, 8> LoadSequence;
1744
1745 void createLoadCmpBlocks();
1746 void createResultBlock();
1747 void setupResultBlockPHINodes();
1748 void setupEndBlockPHINodes();
1749 Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
1750 void emitLoadCompareBlock(unsigned BlockIndex);
1751 void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
1752 unsigned &LoadIndex);
1753 void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);
1754 void emitMemCmpResultBlock();
1755 Value *getMemCmpExpansionZeroCase();
1756 Value *getMemCmpEqZeroOneBlock();
1757 Value *getMemCmpOneBlock();
1758
1759 public:
1760 MemCmpExpansion(CallInst *CI, uint64_t Size,
1761 const TargetTransformInfo::MemCmpExpansionOptions &Options,
1762 unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
1763 unsigned NumLoadsPerBlock, const DataLayout &DL);
1764
1765 unsigned getNumBlocks();
1766 uint64_t getNumLoads() const { return LoadSequence.size(); }
1767
1768 Value *getMemCmpExpansion();
1769 };
1770
1771 } // end anonymous namespace
1772
1773 // Initialize the basic block structure required for expansion of memcmp call
1774 // with given maximum load size and memcmp size parameter.
1775 // This structure includes:
1776 // 1. A list of load compare blocks - LoadCmpBlocks.
1777 // 2. An EndBlock, split from original instruction point, which is the block to
1778 // return from.
1779 // 3. ResultBlock, block to branch to for early exit when a
1780 // LoadCmpBlock finds a difference.
1781 MemCmpExpansion::MemCmpExpansion(
1782 CallInst *const CI, uint64_t Size,
1783 const TargetTransformInfo::MemCmpExpansionOptions &Options,
1784 const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
1785 const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
1786 : CI(CI),
1787 Size(Size),
1788 MaxLoadSize(0),
1789 NumLoadsNonOneByte(0),
1790 NumLoadsPerBlock(NumLoadsPerBlock),
1791 IsUsedForZeroCmp(IsUsedForZeroCmp),
1792 DL(TheDataLayout),
1793 Builder(CI) {
1794 assert(Size > 0 && "zero blocks");
1795 // Scale the max size down if the target can load more bytes than we need.
1796 size_t LoadSizeIndex = 0;
1797 while (LoadSizeIndex < Options.LoadSizes.size() &&
1798 Options.LoadSizes[LoadSizeIndex] > Size) {
1799 ++LoadSizeIndex;
1800 }
1801 this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
1802 // Compute the decomposition.
1803 uint64_t CurSize = Size;
1804 uint64_t Offset = 0;
1805 while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
1806 const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
1807 assert(LoadSize > 0 && "zero load size");
1808 const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
1809 if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
1810 // Do not expand if the total number of loads is larger than what the
1811 // target allows. Note that it's important that we exit before completing
1812 // the expansion to avoid using a ton of memory to store the expansion for
1813 // large sizes.
1814 LoadSequence.clear();
1815 return;
1816 }
1817 if (NumLoadsForThisSize > 0) {
1818 for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
1819 LoadSequence.push_back({LoadSize, Offset});
1820 Offset += LoadSize;
1821 }
1822 if (LoadSize > 1) {
1823 ++NumLoadsNonOneByte;
1824 }
1825 CurSize = CurSize % LoadSize;
1826 }
1827 ++LoadSizeIndex;
1828 }
1829 assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
1830 }
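To make the greedy decomposition above concrete: for Size = 15 with target load sizes {8, 4, 2, 1}, the loop produces [{8, 0}, {4, 8}, {2, 12}, {1, 14}]. A minimal standalone sketch of the same algorithm, in plain C++ with illustrative names (no LLVM types; not part of this patch):

  #include <cstdint>
  #include <cstdio>
  #include <utility>
  #include <vector>

  int main() {
    // Target's allowed load sizes, largest first (an assumed example set).
    const std::vector<unsigned> LoadSizes = {8, 4, 2, 1};
    uint64_t CurSize = 15, Offset = 0;
    std::vector<std::pair<unsigned, uint64_t>> Seq; // {LoadSize, Offset}
    for (unsigned LoadSize : LoadSizes) {
      // Emit as many loads of this size as fit, then fall to the next size.
      for (uint64_t I = 0, N = CurSize / LoadSize; I < N; ++I) {
        Seq.push_back({LoadSize, Offset});
        Offset += LoadSize;
      }
      CurSize %= LoadSize;
    }
    // Prints: {8,0} {4,8} {2,12} {1,14}
    for (const auto &E : Seq)
      std::printf("{%u,%llu} ", E.first, (unsigned long long)E.second);
  }

(The real constructor additionally clamps the first load size down to Size and bails out, clearing LoadSequence, once MaxNumLoads would be exceeded.)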
1831
1832 unsigned MemCmpExpansion::getNumBlocks() {
1833 if (IsUsedForZeroCmp)
1834 return getNumLoads() / NumLoadsPerBlock +
1835 (getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0);
1836 return getNumLoads();
1837 }
1838
1839 void MemCmpExpansion::createLoadCmpBlocks() {
1840 for (unsigned i = 0; i < getNumBlocks(); i++) {
1841 BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
1842 EndBlock->getParent(), EndBlock);
1843 LoadCmpBlocks.push_back(BB);
1844 }
1845 }
1846
1847 void MemCmpExpansion::createResultBlock() {
1848 ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
1849 EndBlock->getParent(), EndBlock);
1850 }
1851
1852 // This function creates the IR instructions for loading and comparing 1 byte.
1853 // It loads 1 byte from each source of the memcmp parameters with the given
1854 // GEPIndex. It then subtracts the two loaded values and adds this result to the
1855 // final phi node for selecting the memcmp result.
1856 void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
1857 unsigned GEPIndex) {
1858 Value *Source1 = CI->getArgOperand(0);
1859 Value *Source2 = CI->getArgOperand(1);
1860
1861 Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
1862 Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
1863 // Cast source to LoadSizeType*.
1864 if (Source1->getType() != LoadSizeType)
1865 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
1866 if (Source2->getType() != LoadSizeType)
1867 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
1868
1869 // Get the base address using the GEPIndex.
1870 if (GEPIndex != 0) {
1871 Source1 = Builder.CreateGEP(LoadSizeType, Source1,
1872 ConstantInt::get(LoadSizeType, GEPIndex));
1873 Source2 = Builder.CreateGEP(LoadSizeType, Source2,
1874 ConstantInt::get(LoadSizeType, GEPIndex));
1875 }
1876
1877 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
1878 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
1879
1880 LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
1881 LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
1882 Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
1883
1884 PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
1885
1886 if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
1887 // Early exit branch to EndBlock if a difference is found. Otherwise, continue
1888 // to the next LoadCmpBlock.
1889 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
1890 ConstantInt::get(Diff->getType(), 0));
1891 BranchInst *CmpBr =
1892 BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
1893 Builder.Insert(CmpBr);
1894 } else {
1895 // The last block has an unconditional branch to EndBlock.
1896 BranchInst *CmpBr = BranchInst::Create(EndBlock);
1897 Builder.Insert(CmpBr);
1898 }
1899 }
1900
1901 /// Generate an equality comparison for one or more pairs of loaded values.
1902 /// This is used in the case where the memcmp() call is compared equal or not
1903 /// equal to zero.
1904 Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
1905 unsigned &LoadIndex) {
1906 assert(LoadIndex < getNumLoads() &&
1907 "getCompareLoadPairs() called with no remaining loads");
1908 std::vector<Value *> XorList, OrList;
1909 Value *Diff;
1910
1911 const unsigned NumLoads =
1912 std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock);
1913
1914 // For a single-block expansion, start inserting before the memcmp call.
1915 if (LoadCmpBlocks.empty())
1916 Builder.SetInsertPoint(CI);
1917 else
1918 Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
1919
1920 Value *Cmp = nullptr;
1921 // If we have multiple loads per block, we need to generate a composite
1922 // comparison using xor+or. The type for the combinations is the largest load
1923 // type.
1924 IntegerType *const MaxLoadType =
1925 NumLoads == 1 ? nullptr
1926 : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
1927 for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
1928 const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
1929
1930 IntegerType *LoadSizeType =
1931 IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
1932
1933 Value *Source1 = CI->getArgOperand(0);
1934 Value *Source2 = CI->getArgOperand(1);
1935
1936 // Cast source to LoadSizeType*.
1937 if (Source1->getType() != LoadSizeType)
1938 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
1939 if (Source2->getType() != LoadSizeType)
1940 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
1941
1942 // Get the base address using a GEP.
1943 if (CurLoadEntry.Offset != 0) {
1944 Source1 = Builder.CreateGEP(
1945 LoadSizeType, Source1,
1946 ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
1947 Source2 = Builder.CreateGEP(
1948 LoadSizeType, Source2,
1949 ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
1950 }
1951
1952 // Get a constant or load a value for each source address.
1953 Value *LoadSrc1 = nullptr;
1954 if (auto *Source1C = dyn_cast<Constant>(Source1))
1955 LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
1956 if (!LoadSrc1)
1957 LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
1958
1959 Value *LoadSrc2 = nullptr;
1960 if (auto *Source2C = dyn_cast<Constant>(Source2))
1961 LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
1962 if (!LoadSrc2)
1963 LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
1964
1965 if (NumLoads != 1) {
1966 if (LoadSizeType != MaxLoadType) {
1967 LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
1968 LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
1969 }
1970 // If we have multiple loads per block, we need to generate a composite
1971 // comparison using xor+or.
1972 Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
1973 Diff = Builder.CreateZExt(Diff, MaxLoadType);
1974 XorList.push_back(Diff);
1975 } else {
1976 // If there's only one load per block, we just compare the loaded values.
1977 Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
1978 }
1979 }
1980
1981 auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
1982 std::vector OutList;
1983 for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
1984 Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
1985 OutList.push_back(Or);
1986 }
1987 if (InList.size() % 2 != 0)
1988 OutList.push_back(InList.back());
1989 return OutList;
1990 };
1991
1992 if (!Cmp) {
1993 // Pairwise OR the XOR results.
1994 OrList = pairWiseOr(XorList);
1995
1996 // Pairwise OR the OR results until one result left.
1997 while (OrList.size() != 1) {
1998 OrList = pairWiseOr(OrList);
1999 }
2000 Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
2001 }
2002
2003 return Cmp;
2004 }
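For intuition, the xor+or tree built above is nonzero iff any load pair differs, which is all the zero-equality case needs. A standalone sketch of the same reduction over plain integers (illustrative only; not part of this patch):

  #include <cassert>
  #include <cstdint>
  #include <utility>
  #include <vector>

  // Mirrors getCompareLoadPairs: xor each pair, then pairwise-or the
  // results until one value remains; nonzero means "some pair differed".
  bool anyPairDiffers(const std::vector<std::pair<uint64_t, uint64_t>> &Pairs) {
    assert(!Pairs.empty() && "no remaining loads");
    std::vector<uint64_t> Acc;
    for (const auto &P : Pairs)
      Acc.push_back(P.first ^ P.second); // Builder.CreateXor
    while (Acc.size() > 1) {
      std::vector<uint64_t> Next;
      for (size_t I = 0; I + 1 < Acc.size(); I += 2)
        Next.push_back(Acc[I] | Acc[I + 1]); // Builder.CreateOr
      if (Acc.size() % 2 != 0)
        Next.push_back(Acc.back()); // odd element carries over, as above
      Acc = std::move(Next);
    }
    return Acc[0] != 0; // Builder.CreateICmpNE(..., 0)
  }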
2005
2006 void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
2007 unsigned &LoadIndex) {
2008 Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
2009
2010 BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
2011 ? EndBlock
2012 : LoadCmpBlocks[BlockIndex + 1];
2013 // Early exit branch if difference found to ResultBlock. Otherwise,
2014 // continue to next LoadCmpBlock or EndBlock.
2015 BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
2016 Builder.Insert(CmpBr);
2017
2018 // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
2019 // since early exit to ResultBlock was not taken (no difference was found in
2020 // any of the bytes).
2021 if (BlockIndex == LoadCmpBlocks.size() - 1) {
2022 Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
2023 PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
2024 }
2025 }
2026
2027 // This function creates the IR instructions for loading and comparing using the
2028 // given LoadSize. It loads the number of bytes specified by LoadSize from each
2029 // source of the memcmp parameters. It then subtracts the loaded values to see
2030 // if there was a difference. If a difference is found, it branches with an
2031 // early exit to the ResultBlock for calculating which source was larger.
2032 // Otherwise, it falls through to either the next LoadCmpBlock, or to the
2033 // EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with a
2034 // special case through emitLoadCompareByteBlock, which simply subtracts the
2035 // loaded values and adds the result to the result phi node.
2036 void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
2037 // There is one load per block in this case, BlockIndex == LoadIndex.
2038 const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
2039
2040 if (CurLoadEntry.LoadSize == 1) {
2041 MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex,
2042 CurLoadEntry.getGEPIndex());
2043 return;
2044 }
2045
2046 Type *LoadSizeType =
2047 IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
2048 Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
2049 assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
2050
2051 Value *Source1 = CI->getArgOperand(0);
2052 Value *Source2 = CI->getArgOperand(1);
2053
2054 Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
2055 // Cast source to LoadSizeType*.
2056 if (Source1->getType() != LoadSizeType)
2057 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
2058 if (Source2->getType() != LoadSizeType)
2059 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
2060
2061 // Get the base address using a GEP.
2062 if (CurLoadEntry.Offset != 0) {
2063 Source1 = Builder.CreateGEP(
2064 LoadSizeType, Source1,
2065 ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
2066 Source2 = Builder.CreateGEP(
2067 LoadSizeType, Source2,
2068 ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
2069 }
2070
2071 // Load LoadSizeType from the base address.
2072 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
2073 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
2074
2075 if (DL.isLittleEndian()) {
2076 Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
2077 Intrinsic::bswap, LoadSizeType);
2078 LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
2079 LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
2080 }
2081
2082 if (LoadSizeType != MaxLoadType) {
2083 LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
2084 LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
2085 }
2086
2087 // Add the loaded values to the phi nodes for calculating memcmp result only
2088 // if result is not used in a zero equality.
2089 if (!IsUsedForZeroCmp) {
2090 ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
2091 ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
2092 }
2093
2094 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
2095 BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
2096 ? EndBlock
2097 : LoadCmpBlocks[BlockIndex + 1];
2098 // Early exit branch if difference found to ResultBlock. Otherwise, continue
2099 // to next LoadCmpBlock or EndBlock.
2100 BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
2101 Builder.Insert(CmpBr);
2102
2103 // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
2104 // since early exit to ResultBlock was not taken (no difference was found in
2105 // any of the bytes).
2106 if (BlockIndex == LoadCmpBlocks.size() - 1) {
2107 Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
2108 PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
2109 }
2110 }
2111
2112 // This function populates the ResultBlock with a sequence to calculate the
2113 // memcmp result. It compares the two loaded source values and returns -1 if
2114 // src1 < src2 and 1 if src1 > src2.
2115 void MemCmpExpansion::emitMemCmpResultBlock() {
2116 // Special case: if memcmp result is used in a zero equality, result does not
2117 // need to be calculated and can simply return 1.
2118 if (IsUsedForZeroCmp) {
2119 BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
2120 Builder.SetInsertPoint(ResBlock.BB, InsertPt);
2121 Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
2122 PhiRes->addIncoming(Res, ResBlock.BB);
2123 BranchInst *NewBr = BranchInst::Create(EndBlock);
2124 Builder.Insert(NewBr);
2125 return;
2126 }
2127 BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
2128 Builder.SetInsertPoint(ResBlock.BB, InsertPt);
2129
2130 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
2131 ResBlock.PhiSrc2);
2132
2133 Value *Res =
2134 Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
2135 ConstantInt::get(Builder.getInt32Ty(), 1));
2136
2137 BranchInst *NewBr = BranchInst::Create(EndBlock);
2138 Builder.Insert(NewBr);
2139 PhiRes->addIncoming(Res, ResBlock.BB);
2140 }
2141
2142 void MemCmpExpansion::setupResultBlockPHINodes() {
2143 Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
2144 Builder.SetInsertPoint(ResBlock.BB);
2145 // Note: this assumes one load per block.
2146 ResBlock.PhiSrc1 =
2147 Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1");
2148 ResBlock.PhiSrc2 =
2149 Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2");
2150 }
2151
2152 void MemCmpExpansion::setupEndBlockPHINodes() {
2153 Builder.SetInsertPoint(&EndBlock->front());
2154 PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
2155 }
2156
2157 Value *MemCmpExpansion::getMemCmpExpansionZeroCase() {
2158 unsigned LoadIndex = 0;
2159 // This loop populates each of the LoadCmpBlocks with the IR sequence to
2160 // handle multiple loads per block.
2161 for (unsigned I = 0; I < getNumBlocks(); ++I) {
2162 emitLoadCompareBlockMultipleLoads(I, LoadIndex);
2163 }
2164
2165 emitMemCmpResultBlock();
2166 return PhiRes;
2167 }
2168
2169 /// A memcmp expansion that compares equality with 0 and only has one block of
2170 /// load and compare can bypass the compare, branch, and phi IR that is required
2171 /// in the general case.
2172 Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
2173 unsigned LoadIndex = 0;
2174 Value *Cmp = getCompareLoadPairs(0, LoadIndex);
2175 assert(LoadIndex == getNumLoads() && "some entries were not consumed");
2176 return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
2177 }
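Concretely, for a call like memcmp(a, b, 8) == 0 on a 64-bit target, this one-block zero-equality path boils down to roughly the following IR (a hedged sketch with the pointer casts elided; not taken from this patch):

/// %lhs = load i64, i64* %a.cast
/// %rhs = load i64, i64* %b.cast
/// %ne = icmp ne i64 %lhs, %rhs
/// %res = zext i1 %ne to i32
///
/// No bswap is emitted here because only equality is tested, and byte order
/// does not affect whether two values compare equal.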
2178
2179 /// A memcmp expansion that only has one block of load and compare can bypass
2180 /// the compare, branch, and phi IR that is required in the general case.
2181 Value *MemCmpExpansion::getMemCmpOneBlock() {
2182 assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block");
2183
2184 Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
2185 Value *Source1 = CI->getArgOperand(0);
2186 Value *Source2 = CI->getArgOperand(1);
2187
2188 // Cast source to LoadSizeType*.
2189 if (Source1->getType() != LoadSizeType)
2190 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
2191 if (Source2->getType() != LoadSizeType)
2192 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
2193
2194 // Load LoadSizeType from the base address.
2195 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
2196 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
2197
2198 if (DL.isLittleEndian() && Size != 1) {
2199 Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
2200 Intrinsic::bswap, LoadSizeType);
2201 LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
2202 LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
2203 }
2204
2205 if (Size < 4) {
2206 // The i8 and i16 cases don't need compares. We zext the loaded values and
2207 // subtract them to get the suitable negative, zero, or positive i32 result.
2208 LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
2209 LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
2210 return Builder.CreateSub(LoadSrc1, LoadSrc2);
2211 }
2212
2213 // The result of memcmp is negative, zero, or positive, so produce that by
2214 // subtracting 2 extended compare bits: sub (ugt, ult).
2215 // If a target prefers to use selects to get -1/0/1, they should be able
2216 // to transform this later. The inverse transform (going from selects to math)
2217 // may not be possible in the DAG because the selects got converted into
2218 // branches before we got there.
2219 Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
2220 Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
2221 Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
2222 Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
2223 return Builder.CreateSub(ZextUGT, ZextULT);
2224 }
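The closing sub(ugt, ult) trick is easy to sanity-check with ordinary integers; a small illustrative sketch (plain C++; names are not from this patch):

  #include <cstdint>
  #include <cstdio>

  // (A > B) - (A < B) yields 1, 0, or -1, matching memcmp's sign contract.
  int32_t cmpSign(uint64_t A, uint64_t B) {
    int32_t ZextUGT = A > B; // zext of the i1 ugt compare
    int32_t ZextULT = A < B; // zext of the i1 ult compare
    return ZextUGT - ZextULT;
  }

  int main() {
    std::printf("%d %d %d\n", cmpSign(2, 1), cmpSign(1, 1), cmpSign(1, 2)); // 1 0 -1
  }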
2225
2226 // This function expands the memcmp call into an inline expansion and returns
2227 // the memcmp result.
2228 Value *MemCmpExpansion::getMemCmpExpansion() {
2229 // A memcmp with zero-comparison with only one block of load and compare does
2230 // not need to set up any extra blocks. This case could be handled in the DAG,
2231 // but since we have all of the machinery to flexibly expand any memcmp here,
2232 // we choose to handle this case too to avoid fragmented lowering.
2233 if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) {
2234 BasicBlock *StartBlock = CI->getParent();
2235 EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
2236 setupEndBlockPHINodes();
2237 createResultBlock();
2238
2239 // If return value of memcmp is not used in a zero equality, we need to
2240 // calculate which source was larger. The calculation requires the
2241 // two loaded source values of each load compare block.
2242 // These will be saved in the phi nodes created by setupResultBlockPHINodes.
2243 if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
2244
2245 // Create the number of required load compare basic blocks.
2246 createLoadCmpBlocks();
2247
2248 // Update the terminator added by splitBasicBlock to branch to the first
2249 // LoadCmpBlock.
2250 StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
2251 }
2252
2253 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
2254
2255 if (IsUsedForZeroCmp)
2256 return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
2257 : getMemCmpExpansionZeroCase();
2258
2259 // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
2260 if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock();
2261
2262 for (unsigned I = 0; I < getNumBlocks(); ++I) {
2263 emitLoadCompareBlock(I);
2264 }
2265
2266 emitMemCmpResultBlock();
2267 return PhiRes;
2268 }
2269
2270 // This function checks to see if an expansion of memcmp can be generated.
2271 // It checks for constant compare size that is less than the max inline size.
2272 // If an expansion cannot occur, returns false to leave as a library call.
2273 // Otherwise, the library call is replaced with a new IR instruction sequence.
2274 /// We want to transform:
2275 /// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
2276 /// To:
2277 /// loadbb:
2278 /// %0 = bitcast i32* %buffer2 to i8*
2279 /// %1 = bitcast i32* %buffer1 to i8*
2280 /// %2 = bitcast i8* %1 to i64*
2281 /// %3 = bitcast i8* %0 to i64*
2282 /// %4 = load i64, i64* %2
2283 /// %5 = load i64, i64* %3
2284 /// %6 = call i64 @llvm.bswap.i64(i64 %4)
2285 /// %7 = call i64 @llvm.bswap.i64(i64 %5)
2286 /// %8 = sub i64 %6, %7
2287 /// %9 = icmp ne i64 %8, 0
2288 /// br i1 %9, label %res_block, label %loadbb1
2289 /// res_block: ; preds = %loadbb2,
2290 /// %loadbb1, %loadbb
2291 /// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
2292 /// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
2293 /// %10 = icmp ult i64 %phi.src1, %phi.src2
2294 /// %11 = select i1 %10, i32 -1, i32 1
2295 /// br label %endblock
2296 /// loadbb1: ; preds = %loadbb
2297 /// %12 = bitcast i32* %buffer2 to i8*
2298 /// %13 = bitcast i32* %buffer1 to i8*
2299 /// %14 = bitcast i8* %13 to i32*
2300 /// %15 = bitcast i8* %12 to i32*
2301 /// %16 = getelementptr i32, i32* %14, i32 2
2302 /// %17 = getelementptr i32, i32* %15, i32 2
2303 /// %18 = load i32, i32* %16
2304 /// %19 = load i32, i32* %17
2305 /// %20 = call i32 @llvm.bswap.i32(i32 %18)
2306 /// %21 = call i32 @llvm.bswap.i32(i32 %19)
2307 /// %22 = zext i32 %20 to i64
2308 /// %23 = zext i32 %21 to i64
2309 /// %24 = sub i64 %22, %23
2310 /// %25 = icmp ne i64 %24, 0
2311 /// br i1 %25, label %res_block, label %loadbb2
2312 /// loadbb2: ; preds = %loadbb1
2313 /// %26 = bitcast i32* %buffer2 to i8*
2314 /// %27 = bitcast i32* %buffer1 to i8*
2315 /// %28 = bitcast i8* %27 to i16*
2316 /// %29 = bitcast i8* %26 to i16*
2317 /// %30 = getelementptr i16, i16* %28, i16 6
2318 /// %31 = getelementptr i16, i16* %29, i16 6
2319 /// %32 = load i16, i16* %30
2320 /// %33 = load i16, i16* %31
2321 /// %34 = call i16 @llvm.bswap.i16(i16 %32)
2322 /// %35 = call i16 @llvm.bswap.i16(i16 %33)
2323 /// %36 = zext i16 %34 to i64
2324 /// %37 = zext i16 %35 to i64
2325 /// %38 = sub i64 %36, %37
2326 /// %39 = icmp ne i64 %38, 0
2327 /// br i1 %39, label %res_block, label %loadbb3
2328 /// loadbb3: ; preds = %loadbb2
2329 /// %40 = bitcast i32* %buffer2 to i8*
2330 /// %41 = bitcast i32* %buffer1 to i8*
2331 /// %42 = getelementptr i8, i8* %41, i8 14
2332 /// %43 = getelementptr i8, i8* %40, i8 14
2333 /// %44 = load i8, i8* %42
2334 /// %45 = load i8, i8* %43
2335 /// %46 = zext i8 %44 to i32
2336 /// %47 = zext i8 %45 to i32
2337 /// %48 = sub i32 %46, %47
2338 /// br label %endblock
2339 /// endblock: ; preds = %res_block,
2340 /// %loadbb3
2341 /// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
2342 /// ret i32 %phi.res
2343 static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
2344 const TargetLowering *TLI, const DataLayout *DL) {
2345 NumMemCmpCalls++;
2346
2347 // Early exit from expansion if -Oz.
2348 if (CI->getFunction()->optForMinSize())
2349 return false;
2350
2351 // Early exit from expansion if size is not a constant.
2352 ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
2353 if (!SizeCast) {
2354 NumMemCmpNotConstant++;
2355 return false;
2356 }
2357 const uint64_t SizeVal = SizeCast->getZExtValue();
2358
2359 if (SizeVal == 0) {
2360 return false;
2361 }
2362
2363 // TTI call to check if target would like to expand memcmp. Also, get the
2364 // available load sizes.
2365 const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
2366 const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
2367 if (!Options) return false;
2368
2369 const unsigned MaxNumLoads =
2370 TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
2371
2372 MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
2373 IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
2374
2375 // Don't expand if this will require more loads than desired by the target.
2376 if (Expansion.getNumLoads() == 0) {
2377 NumMemCmpGreaterThanMax++;
2378 return false;
2379 }
2380
2381 NumMemCmpInlined++;
2382
2383 Value *Res = Expansion.getMemCmpExpansion();
2384
2385 // Replace call with result of expansion and erase call.
2386 CI->replaceAllUsesWith(Res);
2387 CI->eraseFromParent();
2388
2389 return true;
2390 }
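End to end, a call site that this function rewrites looks like the following (a hedged C example, assuming a target that enables the TTI hook; the function name is illustrative):

  #include <string.h>

  // The size argument is the constant 8, so expandMemCmp replaces the
  // libcall with one pair of 8-byte loads. Because the result is only
  // compared against zero, the cheap zero-equality expansion is used.
  int sameBlock(const void *a, const void *b) {
    return memcmp(a, b, 8) == 0;
  }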
2391
16882392 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
16892393 BasicBlock *BB = CI->getParent();
16902394
18372541 return true;
18382542 }
18392543
2544 LibFunc Func;
2545 if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) &&
2546 Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) {
2547 ModifiedDT = true;
2548 return true;
2549 }
18402550 return false;
18412551 }
18422552
599599 addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
600600 }
601601
602 if (getOptLevel() != CodeGenOpt::None) {
603 // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
604 // loads and compares. ExpandMemCmpPass then tries to expand those calls
605 // into optimally-sized loads and compares. The transforms are enabled by a
606 // target lowering hook.
607 if (EnableMergeICmps)
608 addPass(createMergeICmpsPass());
609 addPass(createExpandMemCmpPass());
602 if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) {
603 addPass(createMergeICmpsPass());
610604 }
611605
612606 // Run GC lowering passes for builtin collectors
88 DeadStoreElimination.cpp
99 DivRemPairs.cpp
1010 EarlyCSE.cpp
11 ExpandMemCmp.cpp
1211 FlattenCFGPass.cpp
1312 Float2Int.cpp
1413 GuardWidening.cpp
+0
-828
lib/Transforms/Scalar/ExpandMemCmp.cpp
//===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to expand memcmp() calls with constant size into an inline
10 // sequence of optimally-sized loads and compares, when the target enables the
11 // transform through a TargetTransformInfo hook.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "llvm/ADT/Statistic.h"
16 #include "llvm/Analysis/ConstantFolding.h"
17 #include "llvm/Analysis/TargetLibraryInfo.h"
18 #include "llvm/Analysis/TargetTransformInfo.h"
19 #include "llvm/Analysis/ValueTracking.h"
20 #include "llvm/CodeGen/TargetPassConfig.h"
21 #include "llvm/IR/IRBuilder.h"
22 #include "llvm/Target/TargetLowering.h"
23 #include "llvm/Target/TargetSubtargetInfo.h"
24 #include "llvm/Transforms/Scalar.h"
25 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
26
27 using namespace llvm;
28
29 #define DEBUG_TYPE "expandmemcmp"
30
31 STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
32 STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
33 STATISTIC(NumMemCmpGreaterThanMax,
34 "Number of memcmp calls with size greater than max size");
35 STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
36
37 static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
38 "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
39 cl::desc("The number of loads per basic block for inline expansion of "
40 "memcmp that is only being compared against zero."));
41
42 namespace {
43
44
45 // This class provides helper functions to expand a memcmp library call into an
46 // inline expansion.
47 class MemCmpExpansion {
48 struct ResultBlock {
49 BasicBlock *BB = nullptr;
50 PHINode *PhiSrc1 = nullptr;
51 PHINode *PhiSrc2 = nullptr;
52
53 ResultBlock() = default;
54 };
55
56 CallInst *const CI;
57 ResultBlock ResBlock;
58 const uint64_t Size;
59 unsigned MaxLoadSize;
60 uint64_t NumLoadsNonOneByte;
61 const uint64_t NumLoadsPerBlock;
62 std::vector<BasicBlock *> LoadCmpBlocks;
63 BasicBlock *EndBlock;
64 PHINode *PhiRes;
65 const bool IsUsedForZeroCmp;
66 const DataLayout &DL;
67 IRBuilder<> Builder;
68 // Represents the decomposition in blocks of the expansion. For example,
69 // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
70 // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
71 // TODO(courbet): Involve the target more in this computation. On X86, 7
72 // bytes can be done more efficiently with two overlapping 4-byte loads than
73 // covering the interval with [{4, 0}, {2, 4}, {1, 6}].
74 struct LoadEntry {
75 LoadEntry(unsigned LoadSize, uint64_t Offset)
76 : LoadSize(LoadSize), Offset(Offset) {
77 assert(Offset % LoadSize == 0 && "invalid load entry");
78 }
79
80 uint64_t getGEPIndex() const { return Offset / LoadSize; }
81
82 // The size of the load for this block, in bytes.
83 const unsigned LoadSize;
84 // The offset of this load WRT the base pointer, in bytes.
85 const uint64_t Offset;
86 };
87 SmallVector<LoadEntry, 8> LoadSequence;
88
89 void createLoadCmpBlocks();
90 void createResultBlock();
91 void setupResultBlockPHINodes();
92 void setupEndBlockPHINodes();
93 Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
94 void emitLoadCompareBlock(unsigned BlockIndex);
95 void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
96 unsigned &LoadIndex);
97 void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);
98 void emitMemCmpResultBlock();
99 Value *getMemCmpExpansionZeroCase();
100 Value *getMemCmpEqZeroOneBlock();
101 Value *getMemCmpOneBlock();
102
103 public:
104 MemCmpExpansion(CallInst *CI, uint64_t Size,
105 const TargetTransformInfo::MemCmpExpansionOptions &Options,
106 unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
107 unsigned NumLoadsPerBlock, const DataLayout &DL);
108
109 unsigned getNumBlocks();
110 uint64_t getNumLoads() const { return LoadSequence.size(); }
111
112 Value *getMemCmpExpansion();
113 };
114
115 // Initialize the basic block structure required for expansion of memcmp call
116 // with given maximum load size and memcmp size parameter.
117 // This structure includes:
118 // 1. A list of load compare blocks - LoadCmpBlocks.
119 // 2. An EndBlock, split from original instruction point, which is the block to
120 // return from.
121 // 3. ResultBlock, block to branch to for early exit when a
122 // LoadCmpBlock finds a difference.
123 MemCmpExpansion::MemCmpExpansion(
124 CallInst *const CI, uint64_t Size,
125 const TargetTransformInfo::MemCmpExpansionOptions &Options,
126 const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
127 const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
128 : CI(CI),
129 Size(Size),
130 MaxLoadSize(0),
131 NumLoadsNonOneByte(0),
132 NumLoadsPerBlock(NumLoadsPerBlock),
133 IsUsedForZeroCmp(IsUsedForZeroCmp),
134 DL(TheDataLayout),
135 Builder(CI) {
136 assert(Size > 0 && "zero blocks");
137 // Scale the max size down if the target can load more bytes than we need.
138 size_t LoadSizeIndex = 0;
139 while (LoadSizeIndex < Options.LoadSizes.size() &&
140 Options.LoadSizes[LoadSizeIndex] > Size) {
141 ++LoadSizeIndex;
142 }
143 this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
144 // Compute the decomposition.
145 uint64_t CurSize = Size;
146 uint64_t Offset = 0;
147 while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
148 const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
149 assert(LoadSize > 0 && "zero load size");
150 const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
151 if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
152 // Do not expand if the total number of loads is larger than what the
153 // target allows. Note that it's important that we exit before completing
154 // the expansion to avoid using a ton of memory to store the expansion for
155 // large sizes.
156 LoadSequence.clear();
157 return;
158 }
159 if (NumLoadsForThisSize > 0) {
160 for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
161 LoadSequence.push_back({LoadSize, Offset});
162 Offset += LoadSize;
163 }
164 if (LoadSize > 1) {
165 ++NumLoadsNonOneByte;
166 }
167 CurSize = CurSize % LoadSize;
168 }
169 ++LoadSizeIndex;
170 }
171 assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
172 }
173
174 unsigned MemCmpExpansion::getNumBlocks() {
175 if (IsUsedForZeroCmp)
176 return getNumLoads() / NumLoadsPerBlock +
177 (getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0);
178 return getNumLoads();
179 }
180
181 void MemCmpExpansion::createLoadCmpBlocks() {
182 for (unsigned i = 0; i < getNumBlocks(); i++) {
183 BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
184 EndBlock->getParent(), EndBlock);
185 LoadCmpBlocks.push_back(BB);
186 }
187 }
188
189 void MemCmpExpansion::createResultBlock() {
190 ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
191 EndBlock->getParent(), EndBlock);
192 }
193
194 // This function creates the IR instructions for loading and comparing 1 byte.
195 // It loads 1 byte from each source of the memcmp parameters with the given
196 // GEPIndex. It then subtracts the two loaded values and adds this result to the
197 // final phi node for selecting the memcmp result.
198 void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
199 unsigned GEPIndex) {
200 Value *Source1 = CI->getArgOperand(0);
201 Value *Source2 = CI->getArgOperand(1);
202
203 Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
204 Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
205 // Cast source to LoadSizeType*.
206 if (Source1->getType() != LoadSizeType)
207 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
208 if (Source2->getType() != LoadSizeType)
209 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
210
211 // Get the base address using the GEPIndex.
212 if (GEPIndex != 0) {
213 Source1 = Builder.CreateGEP(LoadSizeType, Source1,
214 ConstantInt::get(LoadSizeType, GEPIndex));
215 Source2 = Builder.CreateGEP(LoadSizeType, Source2,
216 ConstantInt::get(LoadSizeType, GEPIndex));
217 }
218
219 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
220 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
221
222 LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
223 LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
224 Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
225
226 PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
227
228 if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
229 // Early exit branch to EndBlock if a difference is found. Otherwise, continue
230 // to the next LoadCmpBlock.
231 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
232 ConstantInt::get(Diff->getType(), 0));
233 BranchInst *CmpBr =
234 BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
235 Builder.Insert(CmpBr);
236 } else {
237 // The last block has an unconditional branch to EndBlock.
238 BranchInst *CmpBr = BranchInst::Create(EndBlock);
239 Builder.Insert(CmpBr);
240 }
241 }
242
243 /// Generate an equality comparison for one or more pairs of loaded values.
244 /// This is used in the case where the memcmp() call is compared equal or not
245 /// equal to zero.
246 Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
247 unsigned &LoadIndex) {
248 assert(LoadIndex < getNumLoads() &&
249 "getCompareLoadPairs() called with no remaining loads");
250 std::vector<Value *> XorList, OrList;
251 Value *Diff;
252
253 const unsigned NumLoads =
254 std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock);
255
256 // For a single-block expansion, start inserting before the memcmp call.
257 if (LoadCmpBlocks.empty())
258 Builder.SetInsertPoint(CI);
259 else
260 Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
261
262 Value *Cmp = nullptr;
263 // If we have multiple loads per block, we need to generate a composite
264 // comparison using xor+or. The type for the combinations is the largest load
265 // type.
266 IntegerType *const MaxLoadType =
267 NumLoads == 1 ? nullptr
268 : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
269 for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
270 const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
271
272 IntegerType *LoadSizeType =
273 IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
274
275 Value *Source1 = CI->getArgOperand(0);
276 Value *Source2 = CI->getArgOperand(1);
277
278 // Cast source to LoadSizeType*.
279 if (Source1->getType() != LoadSizeType)
280 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
281 if (Source2->getType() != LoadSizeType)
282 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
283
284 // Get the base address using a GEP.
285 if (CurLoadEntry.Offset != 0) {
286 Source1 = Builder.CreateGEP(
287 LoadSizeType, Source1,
288 ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
289 Source2 = Builder.CreateGEP(
290 LoadSizeType, Source2,
291 ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
292 }
293
294 // Get a constant or load a value for each source address.
295 Value *LoadSrc1 = nullptr;
296 if (auto *Source1C = dyn_cast<Constant>(Source1))
297 LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
298 if (!LoadSrc1)
299 LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
300
301 Value *LoadSrc2 = nullptr;
302 if (auto *Source2C = dyn_cast<Constant>(Source2))
303 LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
304 if (!LoadSrc2)
305 LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
306
307 if (NumLoads != 1) {
308 if (LoadSizeType != MaxLoadType) {
309 LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
310 LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
311 }
312 // If we have multiple loads per block, we need to generate a composite
313 // comparison using xor+or.
314 Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
315 Diff = Builder.CreateZExt(Diff, MaxLoadType);
316 XorList.push_back(Diff);
317 } else {
318 // If there's only one load per block, we just compare the loaded values.
319 Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
320 }
321 }
322
323 auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
324 std::vector OutList;
325 for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
326 Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
327 OutList.push_back(Or);
328 }
329 if (InList.size() % 2 != 0)
330 OutList.push_back(InList.back());
331 return OutList;
332 };
333
334 if (!Cmp) {
335 // Pairwise OR the XOR results.
336 OrList = pairWiseOr(XorList);
337
338 // Pairwise OR the OR results until one result left.
339 while (OrList.size() != 1) {
340 OrList = pairWiseOr(OrList);
341 }
342 Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
343 }
344
345 return Cmp;
346 }
347
348 void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
349 unsigned &LoadIndex) {
350 Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
351
352 BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
353 ? EndBlock
354 : LoadCmpBlocks[BlockIndex + 1];
355 // Early exit branch if difference found to ResultBlock. Otherwise,
356 // continue to next LoadCmpBlock or EndBlock.
357 BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
358 Builder.Insert(CmpBr);
359
360 // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
361 // since early exit to ResultBlock was not taken (no difference was found in
362 // any of the bytes).
363 if (BlockIndex == LoadCmpBlocks.size() - 1) {
364 Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
365 PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
366 }
367 }
368
369 // This function creates the IR instructions for loading and comparing using the
370 // given LoadSize. It loads the number of bytes specified by LoadSize from each
371 // source of the memcmp parameters. It then subtracts the loaded values to see
372 // if there was a difference. If a difference is found, it branches with an
373 // early exit to the ResultBlock for calculating which source was larger.
374 // Otherwise, it falls through to either the next LoadCmpBlock, or to the
375 // EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with a
376 // special case through emitLoadCompareByteBlock, which simply subtracts the
377 // loaded values and adds the result to the result phi node.
378 void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
379 // There is one load per block in this case, BlockIndex == LoadIndex.
380 const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
381
382 if (CurLoadEntry.LoadSize == 1) {
383 MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex,
384 CurLoadEntry.getGEPIndex());
385 return;
386 }
387
388 Type *LoadSizeType =
389 IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
390 Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
391 assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
392
393 Value *Source1 = CI->getArgOperand(0);
394 Value *Source2 = CI->getArgOperand(1);
395
396 Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
397 // Cast source to LoadSizeType*.
398 if (Source1->getType() != LoadSizeType)
399 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
400 if (Source2->getType() != LoadSizeType)
401 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
402
403 // Get the base address using a GEP.
404 if (CurLoadEntry.Offset != 0) {
405 Source1 = Builder.CreateGEP(
406 LoadSizeType, Source1,
407 ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
408 Source2 = Builder.CreateGEP(
409 LoadSizeType, Source2,
410 ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
411 }
412
413 // Load LoadSizeType from the base address.
414 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
415 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
416
417 if (DL.isLittleEndian()) {
418 Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
419 Intrinsic::bswap, LoadSizeType);
420 LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
421 LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
422 }
423
424 if (LoadSizeType != MaxLoadType) {
425 LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
426 LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
427 }
428
429 // Add the loaded values to the phi nodes for calculating memcmp result only
430 // if result is not used in a zero equality.
431 if (!IsUsedForZeroCmp) {
432 ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
433 ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
434 }
435
436 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
437 BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
438 ? EndBlock
439 : LoadCmpBlocks[BlockIndex + 1];
440 // Early exit branch if difference found to ResultBlock. Otherwise, continue
441 // to next LoadCmpBlock or EndBlock.
442 BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
443 Builder.Insert(CmpBr);
444
445 // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
446 // since early exit to ResultBlock was not taken (no difference was found in
447 // any of the bytes).
448 if (BlockIndex == LoadCmpBlocks.size() - 1) {
449 Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
450 PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
451 }
452 }
453
454 // This function populates the ResultBlock with a sequence to calculate the
455 // memcmp result. It compares the two loaded source values and returns -1 if
456 // src1 < src2 and 1 if src1 > src2.
457 void MemCmpExpansion::emitMemCmpResultBlock() {
458 // Special case: if memcmp result is used in a zero equality, result does not
459 // need to be calculated and can simply return 1.
460 if (IsUsedForZeroCmp) {
461 BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
462 Builder.SetInsertPoint(ResBlock.BB, InsertPt);
463 Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
464 PhiRes->addIncoming(Res, ResBlock.BB);
465 BranchInst *NewBr = BranchInst::Create(EndBlock);
466 Builder.Insert(NewBr);
467 return;
468 }
469 BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
470 Builder.SetInsertPoint(ResBlock.BB, InsertPt);
471
472 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
473 ResBlock.PhiSrc2);
474
475 Value *Res =
476 Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
477 ConstantInt::get(Builder.getInt32Ty(), 1));
478
479 BranchInst *NewBr = BranchInst::Create(EndBlock);
480 Builder.Insert(NewBr);
481 PhiRes->addIncoming(Res, ResBlock.BB);
482 }
483
484 void MemCmpExpansion::setupResultBlockPHINodes() {
485 Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
486 Builder.SetInsertPoint(ResBlock.BB);
487 // Note: this assumes one load per block.
488 ResBlock.PhiSrc1 =
489 Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1");
490 ResBlock.PhiSrc2 =
491 Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2");
492 }
493
494 void MemCmpExpansion::setupEndBlockPHINodes() {
495 Builder.SetInsertPoint(&EndBlock->front());
496 PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
497 }
498
499 Value *MemCmpExpansion::getMemCmpExpansionZeroCase() {
500 unsigned LoadIndex = 0;
501 // This loop populates each of the LoadCmpBlocks with the IR sequence to
502 // handle multiple loads per block.
503 for (unsigned I = 0; I < getNumBlocks(); ++I) {
504 emitLoadCompareBlockMultipleLoads(I, LoadIndex);
505 }
506
507 emitMemCmpResultBlock();
508 return PhiRes;
509 }
510
511 /// A memcmp expansion that compares equality with 0 and only has one block of
512 /// load and compare can bypass the compare, branch, and phi IR that is required
513 /// in the general case.
514 Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
515 unsigned LoadIndex = 0;
516 Value *Cmp = getCompareLoadPairs(0, LoadIndex);
517 assert(LoadIndex == getNumLoads() && "some entries were not consumed");
518 return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
519 }
520
521 /// A memcmp expansion that only has one block of load and compare can bypass
522 /// the compare, branch, and phi IR that is required in the general case.
523 Value *MemCmpExpansion::getMemCmpOneBlock() {
524 assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block");
525
526 Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
527 Value *Source1 = CI->getArgOperand(0);
528 Value *Source2 = CI->getArgOperand(1);
529
530 // Cast source to LoadSizeType*.
531 if (Source1->getType() != LoadSizeType)
532 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
533 if (Source2->getType() != LoadSizeType)
534 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
535
536 // Load LoadSizeType from the base address.
537 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
538 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
539
540 if (DL.isLittleEndian() && Size != 1) {
541 Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
542 Intrinsic::bswap, LoadSizeType);
543 LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
544 LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
545 }
546
547 if (Size < 4) {
548 // The i8 and i16 cases don't need compares. We zext the loaded values and
549 // subtract them to get the suitable negative, zero, or positive i32 result.
550 LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
551 LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
552 return Builder.CreateSub(LoadSrc1, LoadSrc2);
553 }
554
555 // The result of memcmp is negative, zero, or positive, so produce that by
556 // subtracting 2 extended compare bits: sub (ugt, ult).
557 // If a target prefers to use selects to get -1/0/1, they should be able
558 // to transform this later. The inverse transform (going from selects to math)
559 // may not be possible in the DAG because the selects got converted into
560 // branches before we got there.
561 Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
562 Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
563 Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
564 Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
565 return Builder.CreateSub(ZextUGT, ZextULT);
566 }
567
568 // This function expands the memcmp call into an inline expansion and returns
569 // the memcmp result.
570 Value *MemCmpExpansion::getMemCmpExpansion() {
571 // A memcmp with zero-comparison with only one block of load and compare does
572 // not need to set up any extra blocks. This case could be handled in the DAG,
573 // but since we have all of the machinery to flexibly expand any memcmp here,
574 // we choose to handle this case too to avoid fragmented lowering.
575 if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) {
576 BasicBlock *StartBlock = CI->getParent();
577 EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
578 setupEndBlockPHINodes();
579 createResultBlock();
580
581 // If return value of memcmp is not used in a zero equality, we need to
582 // calculate which source was larger. The calculation requires the
583 // two loaded source values of each load compare block.
584 // These will be saved in the phi nodes created by setupResultBlockPHINodes.
585 if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
586
587 // Create the number of required load compare basic blocks.
588 createLoadCmpBlocks();
589
590 // Update the terminator added by splitBasicBlock to branch to the first
591 // LoadCmpBlock.
592 StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
593 }
594
595 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
596
597 if (IsUsedForZeroCmp)
598 return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
599 : getMemCmpExpansionZeroCase();
600
601 // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
602 if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock();
603
604 for (unsigned I = 0; I < getNumBlocks(); ++I) {
605 emitLoadCompareBlock(I);
606 }
607
608 emitMemCmpResultBlock();
609 return PhiRes;
610 }
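The dispatch above, restated compactly (the same conditions as the code, no new
behavior):

    // zero-equality use, one block   -> getMemCmpEqZeroOneBlock()
    // zero-equality use, many blocks -> getMemCmpExpansionZeroCase()
    // ordered result, one block with one load per block -> getMemCmpOneBlock()
    // anything else -> emitLoadCompareBlock(0..N-1) feeding emitMemCmpResultBlock()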
611
612 // This function checks to see if an expansion of memcmp can be generated.
613 // It requires the compare size to be a constant less than the max inline size.
614 // If an expansion cannot occur, it returns false so the memcmp stays a library
615 // call. Otherwise, the library call is replaced with a new IR instruction sequence.
616 /// We want to transform:
617 /// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
618 /// To:
619 /// loadbb:
620 /// %0 = bitcast i32* %buffer2 to i8*
621 /// %1 = bitcast i32* %buffer1 to i8*
622 /// %2 = bitcast i8* %1 to i64*
623 /// %3 = bitcast i8* %0 to i64*
624 /// %4 = load i64, i64* %2
625 /// %5 = load i64, i64* %3
626 /// %6 = call i64 @llvm.bswap.i64(i64 %4)
627 /// %7 = call i64 @llvm.bswap.i64(i64 %5)
628 /// %8 = sub i64 %6, %7
629 /// %9 = icmp ne i64 %8, 0
630 /// br i1 %9, label %res_block, label %loadbb1
631 /// res_block: ; preds = %loadbb2,
632 /// %loadbb1, %loadbb
633 /// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
634 /// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
635 /// %10 = icmp ult i64 %phi.src1, %phi.src2
636 /// %11 = select i1 %10, i32 -1, i32 1
637 /// br label %endblock
638 /// loadbb1: ; preds = %loadbb
639 /// %12 = bitcast i32* %buffer2 to i8*
640 /// %13 = bitcast i32* %buffer1 to i8*
641 /// %14 = bitcast i8* %13 to i32*
642 /// %15 = bitcast i8* %12 to i32*
643 /// %16 = getelementptr i32, i32* %14, i32 2
644 /// %17 = getelementptr i32, i32* %15, i32 2
645 /// %18 = load i32, i32* %16
646 /// %19 = load i32, i32* %17
647 /// %20 = call i32 @llvm.bswap.i32(i32 %18)
648 /// %21 = call i32 @llvm.bswap.i32(i32 %19)
649 /// %22 = zext i32 %20 to i64
650 /// %23 = zext i32 %21 to i64
651 /// %24 = sub i64 %22, %23
652 /// %25 = icmp ne i64 %24, 0
653 /// br i1 %25, label %res_block, label %loadbb2
654 /// loadbb2: ; preds = %loadbb1
655 /// %26 = bitcast i32* %buffer2 to i8*
656 /// %27 = bitcast i32* %buffer1 to i8*
657 /// %28 = bitcast i8* %27 to i16*
658 /// %29 = bitcast i8* %26 to i16*
659 /// %30 = getelementptr i16, i16* %28, i16 6
660 /// %31 = getelementptr i16, i16* %29, i16 6
661 /// %32 = load i16, i16* %30
662 /// %33 = load i16, i16* %31
663 /// %34 = call i16 @llvm.bswap.i16(i16 %32)
664 /// %35 = call i16 @llvm.bswap.i16(i16 %33)
665 /// %36 = zext i16 %34 to i64
666 /// %37 = zext i16 %35 to i64
667 /// %38 = sub i64 %36, %37
668 /// %39 = icmp ne i64 %38, 0
669 /// br i1 %39, label %res_block, label %loadbb3
670 /// loadbb3: ; preds = %loadbb2
671 /// %40 = bitcast i32* %buffer2 to i8*
672 /// %41 = bitcast i32* %buffer1 to i8*
673 /// %42 = getelementptr i8, i8* %41, i8 14
674 /// %43 = getelementptr i8, i8* %40, i8 14
675 /// %44 = load i8, i8* %42
676 /// %45 = load i8, i8* %43
677 /// %46 = zext i8 %44 to i32
678 /// %47 = zext i8 %45 to i32
679 /// %48 = sub i32 %46, %47
680 /// br label %endblock
681 /// endblock: ; preds = %res_block,
682 /// %loadbb3
683 /// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
684 /// ret i32 %phi.res
685 static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
686 const TargetLowering *TLI, const DataLayout *DL) {
687 NumMemCmpCalls++;
688
689 // Early exit from expansion if -Oz.
690 if (CI->getFunction()->optForMinSize())
691 return false;
692
693 // Early exit from expansion if size is not a constant.
694 ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
695 if (!SizeCast) {
696 NumMemCmpNotConstant++;
697 return false;
698 }
699 const uint64_t SizeVal = SizeCast->getZExtValue();
700
701 if (SizeVal == 0) {
702 return false;
703 }
704
705 // TTI call to check if target would like to expand memcmp. Also, get the
706 // available load sizes.
707 const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
708 const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
709 if (!Options) return false;
710
711 const unsigned MaxNumLoads =
712 TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
713
714 MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
715 IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
716
717 // Don't expand if this will require more loads than desired by the target.
718 if (Expansion.getNumLoads() == 0) {
719 NumMemCmpGreaterThanMax++;
720 return false;
721 }
722
723 NumMemCmpInlined++;
724
725 Value *Res = Expansion.getMemCmpExpansion();
726
727 // Replace call with result of expansion and erase call.
728 CI->replaceAllUsesWith(Res);
729 CI->eraseFromParent();
730
731 return true;
732 }
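Illustrative outcomes of the gating above, assuming a target whose TTI enables
memcmp expansion:

    // memcmp(p, q, 16)  -> expanded: constant size within the load budget.
    // memcmp(p, q, n)   -> stays a library call: size is not a constant.
    // memcmp(p, q, 0)   -> untouched: nothing to compare.
    // any memcmp at -Oz -> stays a library call: expansion grows code size.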
733
734
735
736 class ExpandMemCmpPass : public FunctionPass {
737 public:
738 static char ID;
739
740 ExpandMemCmpPass() : FunctionPass(ID) {
741 initializeExpandMemCmpPassPass(*PassRegistry::getPassRegistry());
742 }
743
744 bool runOnFunction(Function &F) override {
745 if (skipFunction(F)) return false;
746
747 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
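// TargetPassConfig is registered only when running inside a codegen (llc)
// pipeline; under a plain opt invocation it is absent, so the pass bails out
// below rather than guessing a target lowering.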
748 if (!TPC) {
749 return false;
750 }
751 const TargetLowering* TL =
752 TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
753
754 const TargetLibraryInfo *TLI =
755 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
756 const TargetTransformInfo *TTI =
757 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
758 auto PA = runImpl(F, TLI, TTI, TL);
759 return !PA.areAllPreserved();
760 }
761
762 private:
763 void getAnalysisUsage(AnalysisUsage &AU) const override {
764 AU.addRequired<TargetLibraryInfoWrapperPass>();
765 AU.addRequired<TargetTransformInfoWrapperPass>();
766 FunctionPass::getAnalysisUsage(AU);
767 }
768
769 PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
770 const TargetTransformInfo *TTI,
771 const TargetLowering* TL);
772 // Returns true if a change was made.
773 bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
774 const TargetTransformInfo *TTI, const TargetLowering* TL,
775 const DataLayout& DL);
776 };
777
778 bool ExpandMemCmpPass::runOnBlock(
779 BasicBlock &BB, const TargetLibraryInfo *TLI,
780 const TargetTransformInfo *TTI, const TargetLowering* TL,
781 const DataLayout& DL) {
782 for (Instruction& I : BB) {
783 CallInst *CI = dyn_cast<CallInst>(&I);
784 if (!CI) {
785 continue;
786 }
787 LibFunc Func;
788 if (TLI->getLibFunc(ImmutableCallSite(CI), Func) &&
789 Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TL, &DL)) {
790 return true;
791 }
792 }
793 return false;
794 }
795
796
797 PreservedAnalyses ExpandMemCmpPass::runImpl(
798 Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
799 const TargetLowering* TL) {
800 const DataLayout& DL = F.getParent()->getDataLayout();
801 bool MadeChanges = false;
802 for (auto BBIt = F.begin(); BBIt != F.end();) {
803 if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) {
804 MadeChanges = true;
805 // If changes were made, restart the function from the beginning, since
806 // the structure of the function was changed.
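// (The expansion splits the current block and inserts new ones, so rather
// than reason about which block iterator is still safe to advance, the scan
// simply restarts from the first block.)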
807 BBIt = F.begin();
808 } else {
809 ++BBIt;
810 }
811 }
812 return MadeChanges ? PreservedAnalyses::none() : PreservedAnalyses::all();
813 }
814
815 } // namespace
816
817 char ExpandMemCmpPass::ID = 0;
818 INITIALIZE_PASS_BEGIN(ExpandMemCmpPass, "expandmemcmp",
819 "Expand memcmp() to load/stores", false, false)
820 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
821 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
822 INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp",
823 "Expand memcmp() to load/stores", false, false)
824
825 Pass *llvm::createExpandMemCmpPass() {
826 return new ExpandMemCmpPass();
827 }
4747 initializeNewGVNLegacyPassPass(Registry);
4848 initializeEarlyCSELegacyPassPass(Registry);
4949 initializeEarlyCSEMemSSALegacyPassPass(Registry);
50 initializeExpandMemCmpPassPass(Registry);
5150 initializeGVNHoistLegacyPassPass(Registry);
5251 initializeGVNSinkLegacyPassPass(Registry);
5352 initializeFlattenCFGPassPass(Registry);
1212 ; STOP-BEFORE-NOT: Loop Strength Reduction
1313
1414 ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER
15 ; START-AFTER: -machine-branch-prob -expandmemcmp
15 ; START-AFTER: -machine-branch-prob -gc-lowering
1616 ; START-AFTER: FunctionPass Manager
17 ; START-AFTER-NEXT: Expand memcmp() to load/stores
17 ; START-AFTER-NEXT: Lower Garbage Collection Instructions
1818
1919 ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE
2020 ; START-BEFORE: -machine-branch-prob -domtree
2121 ; START-BEFORE: FunctionPass Manager
2222 ; START-BEFORE: Loop Strength Reduction
23 ; START-BEFORE-NEXT: Expand memcmp() to load/stores
23 ; START-BEFORE-NEXT: Lower Garbage Collection Instructions
2424
2525 ; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE
2626 ; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE
155155
156156 define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize {
157157 ; X86-LABEL: length3_eq:
158 ; X86: # BB#0:
159 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
158 ; X86: # BB#0: # %loadbb
160159 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
161 ; X86-NEXT: movzwl (%ecx), %edx
162 ; X86-NEXT: cmpw (%eax), %dx
163 ; X86-NEXT: jne .LBB5_2
164 ; X86-NEXT: # BB#1: # %loadbb1
165 ; X86-NEXT: movb 2(%ecx), %dl
166 ; X86-NEXT: xorl %ecx, %ecx
167 ; X86-NEXT: cmpb 2(%eax), %dl
160 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
161 ; X86-NEXT: movzwl (%eax), %edx
162 ; X86-NEXT: cmpw (%ecx), %dx
163 ; X86-NEXT: jne .LBB5_1
164 ; X86-NEXT: # BB#2: # %loadbb1
165 ; X86-NEXT: movb 2(%eax), %dl
166 ; X86-NEXT: xorl %eax, %eax
167 ; X86-NEXT: cmpb 2(%ecx), %dl
168168 ; X86-NEXT: je .LBB5_3
169 ; X86-NEXT: .LBB5_2: # %res_block
170 ; X86-NEXT: xorl %ecx, %ecx
171 ; X86-NEXT: incl %ecx
169 ; X86-NEXT: .LBB5_1: # %res_block
170 ; X86-NEXT: xorl %eax, %eax
171 ; X86-NEXT: incl %eax
172172 ; X86-NEXT: .LBB5_3: # %endblock
173 ; X86-NEXT: testl %ecx, %ecx
173 ; X86-NEXT: testl %eax, %eax
174174 ; X86-NEXT: setne %al
175175 ; X86-NEXT: retl
176176 ;
177177 ; X64-LABEL: length3_eq:
178 ; X64: # BB#0:
178 ; X64: # BB#0: # %loadbb
179179 ; X64-NEXT: movzwl (%rdi), %eax
180180 ; X64-NEXT: cmpw (%rsi), %ax
181 ; X64-NEXT: jne .LBB5_2
182 ; X64-NEXT: # BB#1: # %loadbb1
181 ; X64-NEXT: jne .LBB5_1
182 ; X64-NEXT: # BB#2: # %loadbb1
183183 ; X64-NEXT: movb 2(%rdi), %cl
184184 ; X64-NEXT: xorl %eax, %eax
185185 ; X64-NEXT: cmpb 2(%rsi), %cl
186186 ; X64-NEXT: je .LBB5_3
187 ; X64-NEXT: .LBB5_2: # %res_block
187 ; X64-NEXT: .LBB5_1: # %res_block
188188 ; X64-NEXT: movl $1, %eax
189189 ; X64-NEXT: .LBB5_3: # %endblock
190190 ; X64-NEXT: testl %eax, %eax
313313
314314 define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize {
315315 ; X86-LABEL: length5_eq:
316 ; X86: # BB#0:
317 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
316 ; X86: # BB#0: # %loadbb
318317 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
319 ; X86-NEXT: movl (%ecx), %edx
320 ; X86-NEXT: cmpl (%eax), %edx
321 ; X86-NEXT: jne .LBB10_2
322 ; X86-NEXT: # BB#1: # %loadbb1
323 ; X86-NEXT: movb 4(%ecx), %dl
324 ; X86-NEXT: xorl %ecx, %ecx
325 ; X86-NEXT: cmpb 4(%eax), %dl
318 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
319 ; X86-NEXT: movl (%eax), %edx
320 ; X86-NEXT: cmpl (%ecx), %edx
321 ; X86-NEXT: jne .LBB10_1
322 ; X86-NEXT: # BB#2: # %loadbb1
323 ; X86-NEXT: movb 4(%eax), %dl
324 ; X86-NEXT: xorl %eax, %eax
325 ; X86-NEXT: cmpb 4(%ecx), %dl
326326 ; X86-NEXT: je .LBB10_3
327 ; X86-NEXT: .LBB10_2: # %res_block
328 ; X86-NEXT: xorl %ecx, %ecx
329 ; X86-NEXT: incl %ecx
327 ; X86-NEXT: .LBB10_1: # %res_block
328 ; X86-NEXT: xorl %eax, %eax
329 ; X86-NEXT: incl %eax
330330 ; X86-NEXT: .LBB10_3: # %endblock
331 ; X86-NEXT: testl %ecx, %ecx
331 ; X86-NEXT: testl %eax, %eax
332332 ; X86-NEXT: setne %al
333333 ; X86-NEXT: retl
334334 ;
335335 ; X64-LABEL: length5_eq:
336 ; X64: # BB#0:
336 ; X64: # BB#0: # %loadbb
337337 ; X64-NEXT: movl (%rdi), %eax
338338 ; X64-NEXT: cmpl (%rsi), %eax
339 ; X64-NEXT: jne .LBB10_2
340 ; X64-NEXT: # BB#1: # %loadbb1
339 ; X64-NEXT: jne .LBB10_1
340 ; X64-NEXT: # BB#2: # %loadbb1
341341 ; X64-NEXT: movb 4(%rdi), %cl
342342 ; X64-NEXT: xorl %eax, %eax
343343 ; X64-NEXT: cmpb 4(%rsi), %cl
344344 ; X64-NEXT: je .LBB10_3
345 ; X64-NEXT: .LBB10_2: # %res_block
345 ; X64-NEXT: .LBB10_1: # %res_block
346346 ; X64-NEXT: movl $1, %eax
347347 ; X64-NEXT: .LBB10_3: # %endblock
348348 ; X64-NEXT: testl %eax, %eax
355355
356356 define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
357357 ; X86-LABEL: length8:
358 ; X86: # BB#0:
358 ; X86: # BB#0: # %loadbb
359359 ; X86-NEXT: pushl %esi
360360 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
361361 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
364364 ; X86-NEXT: bswapl %ecx
365365 ; X86-NEXT: bswapl %edx
366366 ; X86-NEXT: cmpl %edx, %ecx
367 ; X86-NEXT: jne .LBB11_2
368 ; X86-NEXT: # BB#1: # %loadbb1
367 ; X86-NEXT: jne .LBB11_1
368 ; X86-NEXT: # BB#2: # %loadbb1
369369 ; X86-NEXT: movl 4(%esi), %ecx
370370 ; X86-NEXT: movl 4(%eax), %edx
371371 ; X86-NEXT: bswapl %ecx
373373 ; X86-NEXT: xorl %eax, %eax
374374 ; X86-NEXT: cmpl %edx, %ecx
375375 ; X86-NEXT: je .LBB11_3
376 ; X86-NEXT: .LBB11_2: # %res_block
376 ; X86-NEXT: .LBB11_1: # %res_block
377377 ; X86-NEXT: xorl %eax, %eax
378378 ; X86-NEXT: cmpl %edx, %ecx
379379 ; X86-NEXT: setae %al
399399
400400 define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize {
401401 ; X86-LABEL: length8_eq:
402 ; X86: # BB#0:
403 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
402 ; X86: # BB#0: # %loadbb
404403 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
405 ; X86-NEXT: movl (%ecx), %edx
406 ; X86-NEXT: cmpl (%eax), %edx
407 ; X86-NEXT: jne .LBB12_2
408 ; X86-NEXT: # BB#1: # %loadbb1
409 ; X86-NEXT: movl 4(%ecx), %edx
410 ; X86-NEXT: xorl %ecx, %ecx
411 ; X86-NEXT: cmpl 4(%eax), %edx
404 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
405 ; X86-NEXT: movl (%eax), %edx
406 ; X86-NEXT: cmpl (%ecx), %edx
407 ; X86-NEXT: jne .LBB12_1
408 ; X86-NEXT: # BB#2: # %loadbb1
409 ; X86-NEXT: movl 4(%eax), %edx
410 ; X86-NEXT: xorl %eax, %eax
411 ; X86-NEXT: cmpl 4(%ecx), %edx
412412 ; X86-NEXT: je .LBB12_3
413 ; X86-NEXT: .LBB12_2: # %res_block
414 ; X86-NEXT: xorl %ecx, %ecx
415 ; X86-NEXT: incl %ecx
413 ; X86-NEXT: .LBB12_1: # %res_block
414 ; X86-NEXT: xorl %eax, %eax
415 ; X86-NEXT: incl %eax
416416 ; X86-NEXT: .LBB12_3: # %endblock
417 ; X86-NEXT: testl %ecx, %ecx
417 ; X86-NEXT: testl %eax, %eax
418418 ; X86-NEXT: sete %al
419419 ; X86-NEXT: retl
420420 ;
431431
432432 define i1 @length8_eq_const(i8* %X) nounwind optsize {
433433 ; X86-LABEL: length8_eq_const:
434 ; X86: # BB#0:
434 ; X86: # BB#0: # %loadbb
435435 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
436436 ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130
437 ; X86-NEXT: jne .LBB13_2
438 ; X86-NEXT: # BB#1: # %loadbb1
437 ; X86-NEXT: jne .LBB13_1
438 ; X86-NEXT: # BB#2: # %loadbb1
439439 ; X86-NEXT: xorl %eax, %eax
440440 ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534
441441 ; X86-NEXT: je .LBB13_3
442 ; X86-NEXT: .LBB13_2: # %res_block
442 ; X86-NEXT: .LBB13_1: # %res_block
443443 ; X86-NEXT: xorl %eax, %eax
444444 ; X86-NEXT: incl %eax
445445 ; X86-NEXT: .LBB13_3: # %endblock
472472 ; X86-NEXT: retl
473473 ;
474474 ; X64-LABEL: length12_eq:
475 ; X64: # BB#0:
475 ; X64: # BB#0: # %loadbb
476476 ; X64-NEXT: movq (%rdi), %rax
477477 ; X64-NEXT: cmpq (%rsi), %rax
478 ; X64-NEXT: jne .LBB14_2
479 ; X64-NEXT: # BB#1: # %loadbb1
478 ; X64-NEXT: jne .LBB14_1
479 ; X64-NEXT: # BB#2: # %loadbb1
480480 ; X64-NEXT: movl 8(%rdi), %ecx
481481 ; X64-NEXT: xorl %eax, %eax
482482 ; X64-NEXT: cmpl 8(%rsi), %ecx
483483 ; X64-NEXT: je .LBB14_3
484 ; X64-NEXT: .LBB14_2: # %res_block
484 ; X64-NEXT: .LBB14_1: # %res_block
485485 ; X64-NEXT: movl $1, %eax
486486 ; X64-NEXT: .LBB14_3: # %endblock
487487 ; X64-NEXT: testl %eax, %eax
504504 ; X86-NEXT: retl
505505 ;
506506 ; X64-LABEL: length12:
507 ; X64: # BB#0:
507 ; X64: # BB#0: # %loadbb
508508 ; X64-NEXT: movq (%rdi), %rcx
509509 ; X64-NEXT: movq (%rsi), %rdx
510510 ; X64-NEXT: bswapq %rcx
511511 ; X64-NEXT: bswapq %rdx
512512 ; X64-NEXT: cmpq %rdx, %rcx
513 ; X64-NEXT: jne .LBB15_2
514 ; X64-NEXT: # BB#1: # %loadbb1
513 ; X64-NEXT: jne .LBB15_1
514 ; X64-NEXT: # BB#2: # %loadbb1
515515 ; X64-NEXT: movl 8(%rdi), %ecx
516516 ; X64-NEXT: movl 8(%rsi), %edx
517517 ; X64-NEXT: bswapl %ecx
518518 ; X64-NEXT: bswapl %edx
519519 ; X64-NEXT: xorl %eax, %eax
520520 ; X64-NEXT: cmpq %rdx, %rcx
521 ; X64-NEXT: je .LBB15_3
522 ; X64-NEXT: .LBB15_2: # %res_block
521 ; X64-NEXT: jne .LBB15_1
522 ; X64-NEXT: # BB#3: # %endblock
523 ; X64-NEXT: retq
524 ; X64-NEXT: .LBB15_1: # %res_block
523525 ; X64-NEXT: xorl %eax, %eax
524526 ; X64-NEXT: cmpq %rdx, %rcx
525527 ; X64-NEXT: setae %al
526528 ; X64-NEXT: leal -1(%rax,%rax), %eax
527 ; X64-NEXT: .LBB15_3: # %endblock
528529 ; X64-NEXT: retq
529530 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
530531 ret i32 %m
544545 ; X86-NEXT: retl
545546 ;
546547 ; X64-LABEL: length16:
547 ; X64: # BB#0:
548 ; X64: # BB#0: # %loadbb
548549 ; X64-NEXT: movq (%rdi), %rcx
549550 ; X64-NEXT: movq (%rsi), %rdx
550551 ; X64-NEXT: bswapq %rcx
551552 ; X64-NEXT: bswapq %rdx
552553 ; X64-NEXT: cmpq %rdx, %rcx
553 ; X64-NEXT: jne .LBB16_2
554 ; X64-NEXT: # BB#1: # %loadbb1
554 ; X64-NEXT: jne .LBB16_1
555 ; X64-NEXT: # BB#2: # %loadbb1
555556 ; X64-NEXT: movq 8(%rdi), %rcx
556557 ; X64-NEXT: movq 8(%rsi), %rdx
557558 ; X64-NEXT: bswapq %rcx
558559 ; X64-NEXT: bswapq %rdx
559560 ; X64-NEXT: xorl %eax, %eax
560561 ; X64-NEXT: cmpq %rdx, %rcx
561 ; X64-NEXT: je .LBB16_3
562 ; X64-NEXT: .LBB16_2: # %res_block
562 ; X64-NEXT: jne .LBB16_1
563 ; X64-NEXT: # BB#3: # %endblock
564 ; X64-NEXT: retq
565 ; X64-NEXT: .LBB16_1: # %res_block
563566 ; X64-NEXT: xorl %eax, %eax
564567 ; X64-NEXT: cmpq %rdx, %rcx
565568 ; X64-NEXT: setae %al
566569 ; X64-NEXT: leal -1(%rax,%rax), %eax
567 ; X64-NEXT: .LBB16_3: # %endblock
568570 ; X64-NEXT: retq
569571 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
570572 ret i32 %m
698700 ; X86-NEXT: retl
699701 ;
700702 ; X64-SSE2-LABEL: length24_eq:
701 ; X64-SSE2: # BB#0:
703 ; X64-SSE2: # BB#0: # %loadbb
702704 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
703705 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
704706 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
705707 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
706708 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
707 ; X64-SSE2-NEXT: jne .LBB20_2
708 ; X64-SSE2-NEXT: # BB#1: # %loadbb1
709 ; X64-SSE2-NEXT: jne .LBB20_1
710 ; X64-SSE2-NEXT: # BB#2: # %loadbb1
709711 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx
710712 ; X64-SSE2-NEXT: xorl %eax, %eax
711713 ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
712714 ; X64-SSE2-NEXT: je .LBB20_3
713 ; X64-SSE2-NEXT: .LBB20_2: # %res_block
715 ; X64-SSE2-NEXT: .LBB20_1: # %res_block
714716 ; X64-SSE2-NEXT: movl $1, %eax
715717 ; X64-SSE2-NEXT: .LBB20_3: # %endblock
716718 ; X64-SSE2-NEXT: testl %eax, %eax
718720 ; X64-SSE2-NEXT: retq
719721 ;
720722 ; X64-AVX2-LABEL: length24_eq:
721 ; X64-AVX2: # BB#0:
723 ; X64-AVX2: # BB#0: # %loadbb
722724 ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
723725 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
724726 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
725727 ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
726 ; X64-AVX2-NEXT: jne .LBB20_2
727 ; X64-AVX2-NEXT: # BB#1: # %loadbb1
728 ; X64-AVX2-NEXT: jne .LBB20_1
729 ; X64-AVX2-NEXT: # BB#2: # %loadbb1
728730 ; X64-AVX2-NEXT: movq 16(%rdi), %rcx
729731 ; X64-AVX2-NEXT: xorl %eax, %eax
730732 ; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx
731733 ; X64-AVX2-NEXT: je .LBB20_3
732 ; X64-AVX2-NEXT: .LBB20_2: # %res_block
734 ; X64-AVX2-NEXT: .LBB20_1: # %res_block
733735 ; X64-AVX2-NEXT: movl $1, %eax
734736 ; X64-AVX2-NEXT: .LBB20_3: # %endblock
735737 ; X64-AVX2-NEXT: testl %eax, %eax
754756 ; X86-NEXT: retl
755757 ;
756758 ; X64-SSE2-LABEL: length24_eq_const:
757 ; X64-SSE2: # BB#0:
759 ; X64-SSE2: # BB#0: # %loadbb
758760 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
759761 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
760762 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
761763 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
762 ; X64-SSE2-NEXT: jne .LBB21_2
763 ; X64-SSE2-NEXT: # BB#1: # %loadbb1
764 ; X64-SSE2-NEXT: jne .LBB21_1
765 ; X64-SSE2-NEXT: # BB#2: # %loadbb1
764766 ; X64-SSE2-NEXT: xorl %eax, %eax
765767 ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
766768 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
767769 ; X64-SSE2-NEXT: je .LBB21_3
768 ; X64-SSE2-NEXT: .LBB21_2: # %res_block
770 ; X64-SSE2-NEXT: .LBB21_1: # %res_block
769771 ; X64-SSE2-NEXT: movl $1, %eax
770772 ; X64-SSE2-NEXT: .LBB21_3: # %endblock
771773 ; X64-SSE2-NEXT: testl %eax, %eax
773775 ; X64-SSE2-NEXT: retq
774776 ;
775777 ; X64-AVX2-LABEL: length24_eq_const:
776 ; X64-AVX2: # BB#0:
778 ; X64-AVX2: # BB#0: # %loadbb
777779 ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
778780 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
779781 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
780782 ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
781 ; X64-AVX2-NEXT: jne .LBB21_2
782 ; X64-AVX2-NEXT: # BB#1: # %loadbb1
783 ; X64-AVX2-NEXT: jne .LBB21_1
784 ; X64-AVX2-NEXT: # BB#2: # %loadbb1
783785 ; X64-AVX2-NEXT: xorl %eax, %eax
784786 ; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
785787 ; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi)
786788 ; X64-AVX2-NEXT: je .LBB21_3
787 ; X64-AVX2-NEXT: .LBB21_2: # %res_block
789 ; X64-AVX2-NEXT: .LBB21_1: # %res_block
788790 ; X64-AVX2-NEXT: movl $1, %eax
789791 ; X64-AVX2-NEXT: .LBB21_3: # %endblock
790792 ; X64-AVX2-NEXT: testl %eax, %eax
830832 ; X86-NOSSE-NEXT: retl
831833 ;
832834 ; X86-SSE2-LABEL: length32_eq:
833 ; X86-SSE2: # BB#0:
835 ; X86-SSE2: # BB#0: # %loadbb
834836 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
835837 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
836838 ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
838840 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
839841 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
840842 ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
841 ; X86-SSE2-NEXT: jne .LBB23_2
842 ; X86-SSE2-NEXT: # BB#1: # %loadbb1
843 ; X86-SSE2-NEXT: jne .LBB23_1
844 ; X86-SSE2-NEXT: # BB#2: # %loadbb1
843845 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
844846 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
845847 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
847849 ; X86-SSE2-NEXT: xorl %eax, %eax
848850 ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
849851 ; X86-SSE2-NEXT: je .LBB23_3
850 ; X86-SSE2-NEXT: .LBB23_2: # %res_block
852 ; X86-SSE2-NEXT: .LBB23_1: # %res_block
851853 ; X86-SSE2-NEXT: xorl %eax, %eax
852854 ; X86-SSE2-NEXT: incl %eax
853855 ; X86-SSE2-NEXT: .LBB23_3: # %endblock
856858 ; X86-SSE2-NEXT: retl
857859 ;
858860 ; X64-SSE2-LABEL: length32_eq:
859 ; X64-SSE2: # BB#0:
861 ; X64-SSE2: # BB#0: # %loadbb
860862 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
861863 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
862864 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
863865 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
864866 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
865 ; X64-SSE2-NEXT: jne .LBB23_2
866 ; X64-SSE2-NEXT: # BB#1: # %loadbb1
867 ; X64-SSE2-NEXT: jne .LBB23_1
868 ; X64-SSE2-NEXT: # BB#2: # %loadbb1
867869 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
868870 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
869871 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
871873 ; X64-SSE2-NEXT: xorl %eax, %eax
872874 ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
873875 ; X64-SSE2-NEXT: je .LBB23_3
874 ; X64-SSE2-NEXT: .LBB23_2: # %res_block
876 ; X64-SSE2-NEXT: .LBB23_1: # %res_block
875877 ; X64-SSE2-NEXT: movl $1, %eax
876878 ; X64-SSE2-NEXT: .LBB23_3: # %endblock
877879 ; X64-SSE2-NEXT: testl %eax, %eax
906908 ; X86-NOSSE-NEXT: retl
907909 ;
908910 ; X86-SSE2-LABEL: length32_eq_const:
909 ; X86-SSE2: # BB#0:
911 ; X86-SSE2: # BB#0: # %loadbb
910912 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
911913 ; X86-SSE2-NEXT: movdqu (%eax), %xmm0
912914 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
913915 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
914916 ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
915 ; X86-SSE2-NEXT: jne .LBB24_2
916 ; X86-SSE2-NEXT: # BB#1: # %loadbb1
917 ; X86-SSE2-NEXT: jne .LBB24_1
918 ; X86-SSE2-NEXT: # BB#2: # %loadbb1
917919 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
918920 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
919921 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
920922 ; X86-SSE2-NEXT: xorl %eax, %eax
921923 ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
922924 ; X86-SSE2-NEXT: je .LBB24_3
923 ; X86-SSE2-NEXT: .LBB24_2: # %res_block
925 ; X86-SSE2-NEXT: .LBB24_1: # %res_block
924926 ; X86-SSE2-NEXT: xorl %eax, %eax
925927 ; X86-SSE2-NEXT: incl %eax
926928 ; X86-SSE2-NEXT: .LBB24_3: # %endblock
929931 ; X86-SSE2-NEXT: retl
930932 ;
931933 ; X64-SSE2-LABEL: length32_eq_const:
932 ; X64-SSE2: # BB#0:
934 ; X64-SSE2: # BB#0: # %loadbb
933935 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
934936 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
935937 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
936938 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
937 ; X64-SSE2-NEXT: jne .LBB24_2
938 ; X64-SSE2-NEXT: # BB#1: # %loadbb1
939 ; X64-SSE2-NEXT: jne .LBB24_1
940 ; X64-SSE2-NEXT: # BB#2: # %loadbb1
939941 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
940942 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
941943 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
942944 ; X64-SSE2-NEXT: xorl %eax, %eax
943945 ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
944946 ; X64-SSE2-NEXT: je .LBB24_3
945 ; X64-SSE2-NEXT: .LBB24_2: # %res_block
947 ; X64-SSE2-NEXT: .LBB24_1: # %res_block
946948 ; X64-SSE2-NEXT: movl $1, %eax
947949 ; X64-SSE2-NEXT: .LBB24_3: # %endblock
948950 ; X64-SSE2-NEXT: testl %eax, %eax
10061008 ; X64-SSE2-NEXT: retq
10071009 ;
10081010 ; X64-AVX2-LABEL: length64_eq:
1009 ; X64-AVX2: # BB#0:
1011 ; X64-AVX2: # BB#0: # %loadbb
10101012 ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
10111013 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
10121014 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
10131015 ; X64-AVX2-NEXT: cmpl $-1, %eax
1014 ; X64-AVX2-NEXT: jne .LBB26_2
1015 ; X64-AVX2-NEXT: # BB#1: # %loadbb1
1016 ; X64-AVX2-NEXT: jne .LBB26_1
1017 ; X64-AVX2-NEXT: # BB#2: # %loadbb1
10161018 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
10171019 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
10181020 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
10191021 ; X64-AVX2-NEXT: xorl %eax, %eax
10201022 ; X64-AVX2-NEXT: cmpl $-1, %ecx
10211023 ; X64-AVX2-NEXT: je .LBB26_3
1022 ; X64-AVX2-NEXT: .LBB26_2: # %res_block
1024 ; X64-AVX2-NEXT: .LBB26_1: # %res_block
10231025 ; X64-AVX2-NEXT: movl $1, %eax
10241026 ; X64-AVX2-NEXT: .LBB26_3: # %endblock
10251027 ; X64-AVX2-NEXT: testl %eax, %eax
10561058 ; X64-SSE2-NEXT: retq
10571059 ;
10581060 ; X64-AVX2-LABEL: length64_eq_const:
1059 ; X64-AVX2: # BB#0:
1061 ; X64-AVX2: # BB#0: # %loadbb
10601062 ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
10611063 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
10621064 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
10631065 ; X64-AVX2-NEXT: cmpl $-1, %eax
1064 ; X64-AVX2-NEXT: jne .LBB27_2
1065 ; X64-AVX2-NEXT: # BB#1: # %loadbb1
1066 ; X64-AVX2-NEXT: jne .LBB27_1
1067 ; X64-AVX2-NEXT: # BB#2: # %loadbb1
10661068 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
10671069 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
10681070 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
10691071 ; X64-AVX2-NEXT: xorl %eax, %eax
10701072 ; X64-AVX2-NEXT: cmpl $-1, %ecx
10711073 ; X64-AVX2-NEXT: je .LBB27_3
1072 ; X64-AVX2-NEXT: .LBB27_2: # %res_block
1074 ; X64-AVX2-NEXT: .LBB27_1: # %res_block
10731075 ; X64-AVX2-NEXT: movl $1, %eax
10741076 ; X64-AVX2-NEXT: .LBB27_3: # %endblock
10751077 ; X64-AVX2-NEXT: testl %eax, %eax
186186
187187 define i1 @length3_eq(i8* %X, i8* %Y) nounwind {
188188 ; X86-LABEL: length3_eq:
189 ; X86: # BB#0:
189 ; X86: # BB#0: # %loadbb
190 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
190191 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
191 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
192 ; X86-NEXT: movzwl (%ecx), %edx
193 ; X86-NEXT: cmpw (%eax), %dx
194 ; X86-NEXT: jne .LBB7_2
195 ; X86-NEXT: # BB#1: # %loadbb1
196 ; X86-NEXT: movb 2(%ecx), %dl
197 ; X86-NEXT: xorl %ecx, %ecx
198 ; X86-NEXT: cmpb 2(%eax), %dl
192 ; X86-NEXT: movzwl (%eax), %edx
193 ; X86-NEXT: cmpw (%ecx), %dx
194 ; X86-NEXT: jne .LBB7_1
195 ; X86-NEXT: # BB#2: # %loadbb1
196 ; X86-NEXT: movb 2(%eax), %dl
197 ; X86-NEXT: xorl %eax, %eax
198 ; X86-NEXT: cmpb 2(%ecx), %dl
199199 ; X86-NEXT: je .LBB7_3
200 ; X86-NEXT: .LBB7_2: # %res_block
201 ; X86-NEXT: movl $1, %ecx
200 ; X86-NEXT: .LBB7_1: # %res_block
201 ; X86-NEXT: movl $1, %eax
202202 ; X86-NEXT: .LBB7_3: # %endblock
203 ; X86-NEXT: testl %ecx, %ecx
203 ; X86-NEXT: testl %eax, %eax
204204 ; X86-NEXT: setne %al
205205 ; X86-NEXT: retl
206206 ;
207207 ; X64-LABEL: length3_eq:
208 ; X64: # BB#0:
208 ; X64: # BB#0: # %loadbb
209209 ; X64-NEXT: movzwl (%rdi), %eax
210210 ; X64-NEXT: cmpw (%rsi), %ax
211 ; X64-NEXT: jne .LBB7_2
212 ; X64-NEXT: # BB#1: # %loadbb1
211 ; X64-NEXT: jne .LBB7_1
212 ; X64-NEXT: # BB#2: # %loadbb1
213213 ; X64-NEXT: movb 2(%rdi), %cl
214214 ; X64-NEXT: xorl %eax, %eax
215215 ; X64-NEXT: cmpb 2(%rsi), %cl
216216 ; X64-NEXT: je .LBB7_3
217 ; X64-NEXT: .LBB7_2: # %res_block
217 ; X64-NEXT: .LBB7_1: # %res_block
218218 ; X64-NEXT: movl $1, %eax
219219 ; X64-NEXT: .LBB7_3: # %endblock
220220 ; X64-NEXT: testl %eax, %eax
343343
344344 define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
345345 ; X86-LABEL: length5_eq:
346 ; X86: # BB#0:
346 ; X86: # BB#0: # %loadbb
347 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
347348 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
348 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
349 ; X86-NEXT: movl (%ecx), %edx
350 ; X86-NEXT: cmpl (%eax), %edx
351 ; X86-NEXT: jne .LBB12_2
352 ; X86-NEXT: # BB#1: # %loadbb1
353 ; X86-NEXT: movb 4(%ecx), %dl
354 ; X86-NEXT: xorl %ecx, %ecx
355 ; X86-NEXT: cmpb 4(%eax), %dl
349 ; X86-NEXT: movl (%eax), %edx
350 ; X86-NEXT: cmpl (%ecx), %edx
351 ; X86-NEXT: jne .LBB12_1
352 ; X86-NEXT: # BB#2: # %loadbb1
353 ; X86-NEXT: movb 4(%eax), %dl
354 ; X86-NEXT: xorl %eax, %eax
355 ; X86-NEXT: cmpb 4(%ecx), %dl
356356 ; X86-NEXT: je .LBB12_3
357 ; X86-NEXT: .LBB12_2: # %res_block
358 ; X86-NEXT: movl $1, %ecx
357 ; X86-NEXT: .LBB12_1: # %res_block
358 ; X86-NEXT: movl $1, %eax
359359 ; X86-NEXT: .LBB12_3: # %endblock
360 ; X86-NEXT: testl %ecx, %ecx
360 ; X86-NEXT: testl %eax, %eax
361361 ; X86-NEXT: setne %al
362362 ; X86-NEXT: retl
363363 ;
364364 ; X64-LABEL: length5_eq:
365 ; X64: # BB#0:
365 ; X64: # BB#0: # %loadbb
366366 ; X64-NEXT: movl (%rdi), %eax
367367 ; X64-NEXT: cmpl (%rsi), %eax
368 ; X64-NEXT: jne .LBB12_2
369 ; X64-NEXT: # BB#1: # %loadbb1
368 ; X64-NEXT: jne .LBB12_1
369 ; X64-NEXT: # BB#2: # %loadbb1
370370 ; X64-NEXT: movb 4(%rdi), %cl
371371 ; X64-NEXT: xorl %eax, %eax
372372 ; X64-NEXT: cmpb 4(%rsi), %cl
373373 ; X64-NEXT: je .LBB12_3
374 ; X64-NEXT: .LBB12_2: # %res_block
374 ; X64-NEXT: .LBB12_1: # %res_block
375375 ; X64-NEXT: movl $1, %eax
376376 ; X64-NEXT: .LBB12_3: # %endblock
377377 ; X64-NEXT: testl %eax, %eax
384384
385385 define i32 @length8(i8* %X, i8* %Y) nounwind {
386386 ; X86-LABEL: length8:
387 ; X86: # BB#0:
387 ; X86: # BB#0: # %loadbb
388388 ; X86-NEXT: pushl %esi
389389 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
390390 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
393393 ; X86-NEXT: bswapl %ecx
394394 ; X86-NEXT: bswapl %edx
395395 ; X86-NEXT: cmpl %edx, %ecx
396 ; X86-NEXT: jne .LBB13_2
397 ; X86-NEXT: # BB#1: # %loadbb1
396 ; X86-NEXT: jne .LBB13_1
397 ; X86-NEXT: # BB#2: # %loadbb1
398398 ; X86-NEXT: movl 4(%esi), %ecx
399399 ; X86-NEXT: movl 4(%eax), %edx
400400 ; X86-NEXT: bswapl %ecx
401401 ; X86-NEXT: bswapl %edx
402402 ; X86-NEXT: xorl %eax, %eax
403403 ; X86-NEXT: cmpl %edx, %ecx
404 ; X86-NEXT: je .LBB13_3
405 ; X86-NEXT: .LBB13_2: # %res_block
404 ; X86-NEXT: jne .LBB13_1
405 ; X86-NEXT: # BB#3: # %endblock
406 ; X86-NEXT: popl %esi
407 ; X86-NEXT: retl
408 ; X86-NEXT: .LBB13_1: # %res_block
406409 ; X86-NEXT: xorl %eax, %eax
407410 ; X86-NEXT: cmpl %edx, %ecx
408411 ; X86-NEXT: setae %al
409412 ; X86-NEXT: leal -1(%eax,%eax), %eax
410 ; X86-NEXT: .LBB13_3: # %endblock
411413 ; X86-NEXT: popl %esi
412414 ; X86-NEXT: retl
413415 ;
428430
429431 define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
430432 ; X86-LABEL: length8_eq:
431 ; X86: # BB#0:
433 ; X86: # BB#0: # %loadbb
434 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
432435 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
433 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
434 ; X86-NEXT: movl (%ecx), %edx
435 ; X86-NEXT: cmpl (%eax), %edx
436 ; X86-NEXT: jne .LBB14_2
437 ; X86-NEXT: # BB#1: # %loadbb1
438 ; X86-NEXT: movl 4(%ecx), %edx
439 ; X86-NEXT: xorl %ecx, %ecx
440 ; X86-NEXT: cmpl 4(%eax), %edx
436 ; X86-NEXT: movl (%eax), %edx
437 ; X86-NEXT: cmpl (%ecx), %edx
438 ; X86-NEXT: jne .LBB14_1
439 ; X86-NEXT: # BB#2: # %loadbb1
440 ; X86-NEXT: movl 4(%eax), %edx
441 ; X86-NEXT: xorl %eax, %eax
442 ; X86-NEXT: cmpl 4(%ecx), %edx
441443 ; X86-NEXT: je .LBB14_3
442 ; X86-NEXT: .LBB14_2: # %res_block
443 ; X86-NEXT: movl $1, %ecx
444 ; X86-NEXT: .LBB14_1: # %res_block
445 ; X86-NEXT: movl $1, %eax
444446 ; X86-NEXT: .LBB14_3: # %endblock
445 ; X86-NEXT: testl %ecx, %ecx
447 ; X86-NEXT: testl %eax, %eax
446448 ; X86-NEXT: sete %al
447449 ; X86-NEXT: retl
448450 ;
459461
460462 define i1 @length8_eq_const(i8* %X) nounwind {
461463 ; X86-LABEL: length8_eq_const:
462 ; X86: # BB#0:
464 ; X86: # BB#0: # %loadbb
463465 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
464466 ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130
465 ; X86-NEXT: jne .LBB15_2
466 ; X86-NEXT: # BB#1: # %loadbb1
467 ; X86-NEXT: jne .LBB15_1
468 ; X86-NEXT: # BB#2: # %loadbb1
467469 ; X86-NEXT: xorl %eax, %eax
468470 ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534
469471 ; X86-NEXT: je .LBB15_3
470 ; X86-NEXT: .LBB15_2: # %res_block
472 ; X86-NEXT: .LBB15_1: # %res_block
471473 ; X86-NEXT: movl $1, %eax
472474 ; X86-NEXT: .LBB15_3: # %endblock
473475 ; X86-NEXT: testl %eax, %eax
499501 ; X86-NEXT: retl
500502 ;
501503 ; X64-LABEL: length12_eq:
502 ; X64: # BB#0:
504 ; X64: # BB#0: # %loadbb
503505 ; X64-NEXT: movq (%rdi), %rax
504506 ; X64-NEXT: cmpq (%rsi), %rax
505 ; X64-NEXT: jne .LBB16_2
506 ; X64-NEXT: # BB#1: # %loadbb1
507 ; X64-NEXT: jne .LBB16_1
508 ; X64-NEXT: # BB#2: # %loadbb1
507509 ; X64-NEXT: movl 8(%rdi), %ecx
508510 ; X64-NEXT: xorl %eax, %eax
509511 ; X64-NEXT: cmpl 8(%rsi), %ecx
510512 ; X64-NEXT: je .LBB16_3
511 ; X64-NEXT: .LBB16_2: # %res_block
513 ; X64-NEXT: .LBB16_1: # %res_block
512514 ; X64-NEXT: movl $1, %eax
513515 ; X64-NEXT: .LBB16_3: # %endblock
514516 ; X64-NEXT: testl %eax, %eax
531533 ; X86-NEXT: retl
532534 ;
533535 ; X64-LABEL: length12:
534 ; X64: # BB#0:
536 ; X64: # BB#0: # %loadbb
535537 ; X64-NEXT: movq (%rdi), %rcx
536538 ; X64-NEXT: movq (%rsi), %rdx
537539 ; X64-NEXT: bswapq %rcx
538540 ; X64-NEXT: bswapq %rdx
539541 ; X64-NEXT: cmpq %rdx, %rcx
540 ; X64-NEXT: jne .LBB17_2
541 ; X64-NEXT: # BB#1: # %loadbb1
542 ; X64-NEXT: jne .LBB17_1
543 ; X64-NEXT: # BB#2: # %loadbb1
542544 ; X64-NEXT: movl 8(%rdi), %ecx
543545 ; X64-NEXT: movl 8(%rsi), %edx
544546 ; X64-NEXT: bswapl %ecx
545547 ; X64-NEXT: bswapl %edx
546548 ; X64-NEXT: xorl %eax, %eax
547549 ; X64-NEXT: cmpq %rdx, %rcx
548 ; X64-NEXT: je .LBB17_3
549 ; X64-NEXT: .LBB17_2: # %res_block
550 ; X64-NEXT: jne .LBB17_1
551 ; X64-NEXT: # BB#3: # %endblock
552 ; X64-NEXT: retq
553 ; X64-NEXT: .LBB17_1: # %res_block
550554 ; X64-NEXT: xorl %eax, %eax
551555 ; X64-NEXT: cmpq %rdx, %rcx
552556 ; X64-NEXT: setae %al
553557 ; X64-NEXT: leal -1(%rax,%rax), %eax
554 ; X64-NEXT: .LBB17_3: # %endblock
555558 ; X64-NEXT: retq
556559 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
557560 ret i32 %m
571574 ; X86-NEXT: retl
572575 ;
573576 ; X64-LABEL: length16:
574 ; X64: # BB#0:
577 ; X64: # BB#0: # %loadbb
575578 ; X64-NEXT: movq (%rdi), %rcx
576579 ; X64-NEXT: movq (%rsi), %rdx
577580 ; X64-NEXT: bswapq %rcx
578581 ; X64-NEXT: bswapq %rdx
579582 ; X64-NEXT: cmpq %rdx, %rcx
580 ; X64-NEXT: jne .LBB18_2
581 ; X64-NEXT: # BB#1: # %loadbb1
583 ; X64-NEXT: jne .LBB18_1
584 ; X64-NEXT: # BB#2: # %loadbb1
582585 ; X64-NEXT: movq 8(%rdi), %rcx
583586 ; X64-NEXT: movq 8(%rsi), %rdx
584587 ; X64-NEXT: bswapq %rcx
585588 ; X64-NEXT: bswapq %rdx
586589 ; X64-NEXT: xorl %eax, %eax
587590 ; X64-NEXT: cmpq %rdx, %rcx
588 ; X64-NEXT: je .LBB18_3
589 ; X64-NEXT: .LBB18_2: # %res_block
591 ; X64-NEXT: jne .LBB18_1
592 ; X64-NEXT: # BB#3: # %endblock
593 ; X64-NEXT: retq
594 ; X64-NEXT: .LBB18_1: # %res_block
590595 ; X64-NEXT: xorl %eax, %eax
591596 ; X64-NEXT: cmpq %rdx, %rcx
592597 ; X64-NEXT: setae %al
593598 ; X64-NEXT: leal -1(%rax,%rax), %eax
594 ; X64-NEXT: .LBB18_3: # %endblock
595599 ; X64-NEXT: retq
596600 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
597601 ret i32 %m
749753 ; X86-NEXT: retl
750754 ;
751755 ; X64-SSE2-LABEL: length24_eq:
752 ; X64-SSE2: # BB#0:
756 ; X64-SSE2: # BB#0: # %loadbb
753757 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
754758 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
755759 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
756760 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
757761 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
758 ; X64-SSE2-NEXT: jne .LBB22_2
759 ; X64-SSE2-NEXT: # BB#1: # %loadbb1
762 ; X64-SSE2-NEXT: jne .LBB22_1
763 ; X64-SSE2-NEXT: # BB#2: # %loadbb1
760764 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx
761765 ; X64-SSE2-NEXT: xorl %eax, %eax
762766 ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
763767 ; X64-SSE2-NEXT: je .LBB22_3
764 ; X64-SSE2-NEXT: .LBB22_2: # %res_block
768 ; X64-SSE2-NEXT: .LBB22_1: # %res_block
765769 ; X64-SSE2-NEXT: movl $1, %eax
766770 ; X64-SSE2-NEXT: .LBB22_3: # %endblock
767771 ; X64-SSE2-NEXT: testl %eax, %eax
769773 ; X64-SSE2-NEXT: retq
770774 ;
771775 ; X64-AVX-LABEL: length24_eq:
772 ; X64-AVX: # BB#0:
776 ; X64-AVX: # BB#0: # %loadbb
773777 ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
774778 ; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
775779 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
776780 ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
777 ; X64-AVX-NEXT: jne .LBB22_2
778 ; X64-AVX-NEXT: # BB#1: # %loadbb1
781 ; X64-AVX-NEXT: jne .LBB22_1
782 ; X64-AVX-NEXT: # BB#2: # %loadbb1
779783 ; X64-AVX-NEXT: movq 16(%rdi), %rcx
780784 ; X64-AVX-NEXT: xorl %eax, %eax
781785 ; X64-AVX-NEXT: cmpq 16(%rsi), %rcx
782786 ; X64-AVX-NEXT: je .LBB22_3
783 ; X64-AVX-NEXT: .LBB22_2: # %res_block
787 ; X64-AVX-NEXT: .LBB22_1: # %res_block
784788 ; X64-AVX-NEXT: movl $1, %eax
785789 ; X64-AVX-NEXT: .LBB22_3: # %endblock
786790 ; X64-AVX-NEXT: testl %eax, %eax
805809 ; X86-NEXT: retl
806810 ;
807811 ; X64-SSE2-LABEL: length24_eq_const:
808 ; X64-SSE2: # BB#0:
812 ; X64-SSE2: # BB#0: # %loadbb
809813 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
810814 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
811815 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
812816 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
813 ; X64-SSE2-NEXT: jne .LBB23_2
814 ; X64-SSE2-NEXT: # BB#1: # %loadbb1
817 ; X64-SSE2-NEXT: jne .LBB23_1
818 ; X64-SSE2-NEXT: # BB#2: # %loadbb1
815819 ; X64-SSE2-NEXT: xorl %eax, %eax
816820 ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
817821 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
818822 ; X64-SSE2-NEXT: je .LBB23_3
819 ; X64-SSE2-NEXT: .LBB23_2: # %res_block
823 ; X64-SSE2-NEXT: .LBB23_1: # %res_block
820824 ; X64-SSE2-NEXT: movl $1, %eax
821825 ; X64-SSE2-NEXT: .LBB23_3: # %endblock
822826 ; X64-SSE2-NEXT: testl %eax, %eax
824828 ; X64-SSE2-NEXT: retq
825829 ;
826830 ; X64-AVX-LABEL: length24_eq_const:
827 ; X64-AVX: # BB#0:
831 ; X64-AVX: # BB#0: # %loadbb
828832 ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
829833 ; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
830834 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
831835 ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
832 ; X64-AVX-NEXT: jne .LBB23_2
833 ; X64-AVX-NEXT: # BB#1: # %loadbb1
836 ; X64-AVX-NEXT: jne .LBB23_1
837 ; X64-AVX-NEXT: # BB#2: # %loadbb1
834838 ; X64-AVX-NEXT: xorl %eax, %eax
835839 ; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
836840 ; X64-AVX-NEXT: cmpq %rcx, 16(%rdi)
837841 ; X64-AVX-NEXT: je .LBB23_3
838 ; X64-AVX-NEXT: .LBB23_2: # %res_block
842 ; X64-AVX-NEXT: .LBB23_1: # %res_block
839843 ; X64-AVX-NEXT: movl $1, %eax
840844 ; X64-AVX-NEXT: .LBB23_3: # %endblock
841845 ; X64-AVX-NEXT: testl %eax, %eax
893897 ; X86-SSE1-NEXT: retl
894898 ;
895899 ; X86-SSE2-LABEL: length32_eq:
896 ; X86-SSE2: # BB#0:
900 ; X86-SSE2: # BB#0: # %loadbb
897901 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
898902 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
899903 ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
901905 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
902906 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
903907 ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
904 ; X86-SSE2-NEXT: jne .LBB25_2
905 ; X86-SSE2-NEXT: # BB#1: # %loadbb1
908 ; X86-SSE2-NEXT: jne .LBB25_1
909 ; X86-SSE2-NEXT: # BB#2: # %loadbb1
906910 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
907911 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
908912 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
910914 ; X86-SSE2-NEXT: xorl %eax, %eax
911915 ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
912916 ; X86-SSE2-NEXT: je .LBB25_3
913 ; X86-SSE2-NEXT: .LBB25_2: # %res_block
917 ; X86-SSE2-NEXT: .LBB25_1: # %res_block
914918 ; X86-SSE2-NEXT: movl $1, %eax
915919 ; X86-SSE2-NEXT: .LBB25_3: # %endblock
916920 ; X86-SSE2-NEXT: testl %eax, %eax
918922 ; X86-SSE2-NEXT: retl
919923 ;
920924 ; X64-SSE2-LABEL: length32_eq:
921 ; X64-SSE2: # BB#0:
925 ; X64-SSE2: # BB#0: # %loadbb
922926 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
923927 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
924928 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
925929 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
926930 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
927 ; X64-SSE2-NEXT: jne .LBB25_2
928 ; X64-SSE2-NEXT: # BB#1: # %loadbb1
931 ; X64-SSE2-NEXT: jne .LBB25_1
932 ; X64-SSE2-NEXT: # BB#2: # %loadbb1
929933 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
930934 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
931935 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
933937 ; X64-SSE2-NEXT: xorl %eax, %eax
934938 ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
935939 ; X64-SSE2-NEXT: je .LBB25_3
936 ; X64-SSE2-NEXT: .LBB25_2: # %res_block
940 ; X64-SSE2-NEXT: .LBB25_1: # %res_block
937941 ; X64-SSE2-NEXT: movl $1, %eax
938942 ; X64-SSE2-NEXT: .LBB25_3: # %endblock
939943 ; X64-SSE2-NEXT: testl %eax, %eax
941945 ; X64-SSE2-NEXT: retq
942946 ;
943947 ; X64-AVX1-LABEL: length32_eq:
944 ; X64-AVX1: # BB#0:
948 ; X64-AVX1: # BB#0: # %loadbb
945949 ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
946950 ; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
947951 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
948952 ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
949 ; X64-AVX1-NEXT: jne .LBB25_2
950 ; X64-AVX1-NEXT: # BB#1: # %loadbb1
953 ; X64-AVX1-NEXT: jne .LBB25_1
954 ; X64-AVX1-NEXT: # BB#2: # %loadbb1
951955 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
952956 ; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0
953957 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
954958 ; X64-AVX1-NEXT: xorl %eax, %eax
955959 ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
956960 ; X64-AVX1-NEXT: je .LBB25_3
957 ; X64-AVX1-NEXT: .LBB25_2: # %res_block
961 ; X64-AVX1-NEXT: .LBB25_1: # %res_block
958962 ; X64-AVX1-NEXT: movl $1, %eax
959963 ; X64-AVX1-NEXT: .LBB25_3: # %endblock
960964 ; X64-AVX1-NEXT: testl %eax, %eax
10011005 ; X86-SSE1-NEXT: retl
10021006 ;
10031007 ; X86-SSE2-LABEL: length32_eq_const:
1004 ; X86-SSE2: # BB#0:
1008 ; X86-SSE2: # BB#0: # %loadbb
10051009 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
10061010 ; X86-SSE2-NEXT: movdqu (%eax), %xmm0
10071011 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
10081012 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
10091013 ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
1010 ; X86-SSE2-NEXT: jne .LBB26_2
1011 ; X86-SSE2-NEXT: # BB#1: # %loadbb1
1014 ; X86-SSE2-NEXT: jne .LBB26_1
1015 ; X86-SSE2-NEXT: # BB#2: # %loadbb1
10121016 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
10131017 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
10141018 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
10151019 ; X86-SSE2-NEXT: xorl %eax, %eax
10161020 ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
10171021 ; X86-SSE2-NEXT: je .LBB26_3
1018 ; X86-SSE2-NEXT: .LBB26_2: # %res_block
1022 ; X86-SSE2-NEXT: .LBB26_1: # %res_block
10191023 ; X86-SSE2-NEXT: movl $1, %eax
10201024 ; X86-SSE2-NEXT: .LBB26_3: # %endblock
10211025 ; X86-SSE2-NEXT: testl %eax, %eax
10231027 ; X86-SSE2-NEXT: retl
10241028 ;
10251029 ; X64-SSE2-LABEL: length32_eq_const:
1026 ; X64-SSE2: # BB#0:
1030 ; X64-SSE2: # BB#0: # %loadbb
10271031 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
10281032 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
10291033 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
10301034 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
1031 ; X64-SSE2-NEXT: jne .LBB26_2
1032 ; X64-SSE2-NEXT: # BB#1: # %loadbb1
1035 ; X64-SSE2-NEXT: jne .LBB26_1
1036 ; X64-SSE2-NEXT: # BB#2: # %loadbb1
10331037 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
10341038 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
10351039 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
10361040 ; X64-SSE2-NEXT: xorl %eax, %eax
10371041 ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
10381042 ; X64-SSE2-NEXT: je .LBB26_3
1039 ; X64-SSE2-NEXT: .LBB26_2: # %res_block
1043 ; X64-SSE2-NEXT: .LBB26_1: # %res_block
10401044 ; X64-SSE2-NEXT: movl $1, %eax
10411045 ; X64-SSE2-NEXT: .LBB26_3: # %endblock
10421046 ; X64-SSE2-NEXT: testl %eax, %eax
10441048 ; X64-SSE2-NEXT: retq
10451049 ;
10461050 ; X64-AVX1-LABEL: length32_eq_const:
1047 ; X64-AVX1: # BB#0:
1051 ; X64-AVX1: # BB#0: # %loadbb
10481052 ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
10491053 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
10501054 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
10511055 ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
1052 ; X64-AVX1-NEXT: jne .LBB26_2
1053 ; X64-AVX1-NEXT: # BB#1: # %loadbb1
1056 ; X64-AVX1-NEXT: jne .LBB26_1
1057 ; X64-AVX1-NEXT: # BB#2: # %loadbb1
10541058 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
10551059 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
10561060 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
10571061 ; X64-AVX1-NEXT: xorl %eax, %eax
10581062 ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
10591063 ; X64-AVX1-NEXT: je .LBB26_3
1060 ; X64-AVX1-NEXT: .LBB26_2: # %res_block
1064 ; X64-AVX1-NEXT: .LBB26_1: # %res_block
10611065 ; X64-AVX1-NEXT: movl $1, %eax
10621066 ; X64-AVX1-NEXT: .LBB26_3: # %endblock
10631067 ; X64-AVX1-NEXT: testl %eax, %eax
11311135 ; X64-AVX1-NEXT: retq
11321136 ;
11331137 ; X64-AVX2-LABEL: length64_eq:
1134 ; X64-AVX2: # BB#0:
1138 ; X64-AVX2: # BB#0: # %loadbb
11351139 ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
11361140 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
11371141 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
11381142 ; X64-AVX2-NEXT: cmpl $-1, %eax
1139 ; X64-AVX2-NEXT: jne .LBB28_2
1140 ; X64-AVX2-NEXT: # BB#1: # %loadbb1
1143 ; X64-AVX2-NEXT: jne .LBB28_1
1144 ; X64-AVX2-NEXT: # BB#2: # %loadbb1
11411145 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
11421146 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
11431147 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
11441148 ; X64-AVX2-NEXT: xorl %eax, %eax
11451149 ; X64-AVX2-NEXT: cmpl $-1, %ecx
11461150 ; X64-AVX2-NEXT: je .LBB28_3
1147 ; X64-AVX2-NEXT: .LBB28_2: # %res_block
1151 ; X64-AVX2-NEXT: .LBB28_1: # %res_block
11481152 ; X64-AVX2-NEXT: movl $1, %eax
11491153 ; X64-AVX2-NEXT: .LBB28_3: # %endblock
11501154 ; X64-AVX2-NEXT: testl %eax, %eax
11921196 ; X64-AVX1-NEXT: retq
11931197 ;
11941198 ; X64-AVX2-LABEL: length64_eq_const:
1195 ; X64-AVX2: # BB#0:
1199 ; X64-AVX2: # BB#0: # %loadbb
11961200 ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
11971201 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
11981202 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
11991203 ; X64-AVX2-NEXT: cmpl $-1, %eax
1200 ; X64-AVX2-NEXT: jne .LBB29_2
1201 ; X64-AVX2-NEXT: # BB#1: # %loadbb1
1204 ; X64-AVX2-NEXT: jne .LBB29_1
1205 ; X64-AVX2-NEXT: # BB#2: # %loadbb1
12021206 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
12031207 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
12041208 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
12051209 ; X64-AVX2-NEXT: xorl %eax, %eax
12061210 ; X64-AVX2-NEXT: cmpl $-1, %ecx
12071211 ; X64-AVX2-NEXT: je .LBB29_3
1208 ; X64-AVX2-NEXT: .LBB29_2: # %res_block
1212 ; X64-AVX2-NEXT: .LBB29_1: # %res_block
12091213 ; X64-AVX2-NEXT: movl $1, %eax
12101214 ; X64-AVX2-NEXT: .LBB29_3: # %endblock
12111215 ; X64-AVX2-NEXT: testl %eax, %eax
0 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
1 ; RUN: opt -S -codegenprepare -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32
2 ; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64
3
4 declare i32 @memcmp(i8* nocapture, i8* nocapture, i64)
5
6 define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
7 ; ALL-LABEL: @cmp2(
8 ; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16*
9 ; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16*
10 ; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
11 ; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]]
12 ; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
13 ; ALL-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
14 ; ALL-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
15 ; ALL-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
16 ; ALL-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
17 ; ALL-NEXT: ret i32 [[TMP9]]
18 ;
19 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
20 ret i32 %call
21 }
22
23 define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
24 ; ALL-LABEL: @cmp3(
25 ; ALL-NEXT: loadbb:
26 ; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16*
27 ; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16*
28 ; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]]
29 ; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
30 ; ALL-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
31 ; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
32 ; ALL-NEXT: [[TMP6:%.*]] = icmp eq i16 [[TMP4]], [[TMP5]]
33 ; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
34 ; ALL: res_block:
35 ; ALL-NEXT: [[TMP7:%.*]] = icmp ult i16 [[TMP4]], [[TMP5]]
36 ; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
37 ; ALL-NEXT: br label [[ENDBLOCK:%.*]]
38 ; ALL: loadbb1:
39 ; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 2
40 ; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 2
41 ; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]]
42 ; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
43 ; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
44 ; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
45 ; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
46 ; ALL-NEXT: br label [[ENDBLOCK]]
47 ; ALL: endblock:
48 ; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
49 ; ALL-NEXT: ret i32 [[PHI_RES]]
50 ;
51 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
52 ret i32 %call
53 }
54
55 define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
56 ; ALL-LABEL: @cmp4(
57 ; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
58 ; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
59 ; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
60 ; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
61 ; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
62 ; ALL-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
63 ; ALL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
64 ; ALL-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
65 ; ALL-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
66 ; ALL-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
67 ; ALL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
68 ; ALL-NEXT: ret i32 [[TMP11]]
69 ;
70 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
71 ret i32 %call
72 }
73
74 define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
75 ; ALL-LABEL: @cmp5(
76 ; ALL-NEXT: loadbb:
77 ; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
78 ; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
79 ; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
80 ; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
81 ; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
82 ; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
83 ; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
84 ; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
85 ; ALL: res_block:
86 ; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]]
87 ; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
88 ; ALL-NEXT: br label [[ENDBLOCK:%.*]]
89 ; ALL: loadbb1:
90 ; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4
91 ; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4
92 ; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]]
93 ; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
94 ; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
95 ; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
96 ; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
97 ; ALL-NEXT: br label [[ENDBLOCK]]
98 ; ALL: endblock:
99 ; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
100 ; ALL-NEXT: ret i32 [[PHI_RES]]
101 ;
102 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
103 ret i32 %call
104 }
105
106 define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
107 ; ALL-LABEL: @cmp6(
108 ; ALL-NEXT: loadbb:
109 ; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
110 ; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
111 ; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
112 ; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
113 ; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
114 ; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
115 ; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
116 ; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
117 ; ALL: res_block:
118 ; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
119 ; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
120 ; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
121 ; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
122 ; ALL-NEXT: br label [[ENDBLOCK:%.*]]
123 ; ALL: loadbb1:
124 ; ALL-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16*
125 ; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16*
126 ; ALL-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2
127 ; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2
128 ; ALL-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]]
129 ; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
130 ; ALL-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]])
131 ; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
132 ; ALL-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32
133 ; ALL-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32
134 ; ALL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]]
135 ; ALL-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
136 ; ALL: endblock:
137 ; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
138 ; ALL-NEXT: ret i32 [[PHI_RES]]
139 ;
140 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
141 ret i32 %call
142 }
143
144 define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
145 ; ALL-LABEL: @cmp7(
146 ; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7)
147 ; ALL-NEXT: ret i32 [[CALL]]
148 ;
149 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
150 ret i32 %call
151 }
152
define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp8(
; X32-NEXT: loadbb:
; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
; X32: res_block:
; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ]
; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
; X32-NEXT: br label [[ENDBLOCK:%.*]]
; X32: loadbb1:
; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32*
; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32*
; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1
; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1
; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]]
; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]]
; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]])
; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]]
; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; X32: endblock:
; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
; X32-NEXT: ret i32 [[PHI_RES]]
;
; X64-LABEL: @cmp8(
; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
; X64-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
; X64-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
; X64-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
; X64-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
; X64-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
; X64-NEXT: ret i32 [[TMP11]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
ret i32 %call
}

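; cmp9 expands on X64 as an i64 pair plus a trailing byte pair; the byte difference is
; a zext/sub, so res_block is reached only from loadbb and needs no phis. On X32 the
; size would take three load pairs, so the call is kept.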
define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp9(
; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9)
; X32-NEXT: ret i32 [[CALL]]
;
; X64-LABEL: @cmp9(
; X64-NEXT: loadbb:
; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
; X64: res_block:
; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]]
; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
; X64-NEXT: br label [[ENDBLOCK:%.*]]
; X64: loadbb1:
; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8
; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8
; X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]]
; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
; X64-NEXT: br label [[ENDBLOCK]]
; X64: endblock:
; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
; X64-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
ret i32 %call
}

define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp10(
; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10)
; X32-NEXT: ret i32 [[CALL]]
;
; X64-LABEL: @cmp10(
; X64-NEXT: loadbb:
; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
; X64: res_block:
; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
; X64-NEXT: br label [[ENDBLOCK:%.*]]
; X64: loadbb1:
; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16*
; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16*
; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4
; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4
; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]]
; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]])
; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64
; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64
; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]]
; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; X64: endblock:
; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
; X64-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
ret i32 %call
}

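; Sizes 11, 13, 14 and 15 decompose into three or four load pairs, so they stay
; library calls on both targets; 12 = 8+4 still fits in two pairs on X64.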
define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp11(
; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11)
ret i32 %call
}

define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp12(
; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12)
; X32-NEXT: ret i32 [[CALL]]
;
; X64-LABEL: @cmp12(
; X64-NEXT: loadbb:
; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
; X64: res_block:
; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
; X64-NEXT: br label [[ENDBLOCK:%.*]]
; X64: loadbb1:
; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32*
; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32*
; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2
; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2
; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]]
; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]]
; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]])
; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64
; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64
; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]]
; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; X64: endblock:
; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
; X64-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
ret i32 %call
}

define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp13(
; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
ret i32 %call
}

define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp14(
; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
ret i32 %call
}

define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp15(
; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
ret i32 %call
}

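; cmp16 fits in two i64 load pairs on X64; X32 would need four i32 pairs and keeps
; the call.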
define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp16(
; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16)
; X32-NEXT: ret i32 [[CALL]]
;
; X64-LABEL: @cmp16(
; X64-NEXT: loadbb:
; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
; X64: res_block:
; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ]
; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
; X64-NEXT: br label [[ENDBLOCK:%.*]]
; X64: loadbb1:
; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64*
; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64*
; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1
; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1
; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]]
; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]]
; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]])
; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]]
; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; X64: endblock:
; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
; X64-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
ret i32 %call
}

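; The cmp_eqN tests only use the memcmp result in an equality test against zero, so
; the expansion can compare chunks with plain icmp ne and skip the byte swaps.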
define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq2(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16*
; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16*
; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]]
; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
%cmp = icmp eq i32 %call, 0
%conv = zext i1 %cmp to i32
ret i32 %conv
}

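; With more than one load pair, the equality-only expansion still branches, but
; res_block merely feeds the constant 1 ("not equal") into the final phi.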
define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq3(
; ALL-NEXT: loadbb:
; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16*
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16*
; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]]
; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]]
; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; ALL: res_block:
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
; ALL: loadbb1:
; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2
; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2
; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]]
; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]]
; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; ALL: endblock:
; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
%cmp = icmp eq i32 %call, 0
%conv = zext i1 %cmp to i32
ret i32 %conv
}

define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq4(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
%cmp = icmp eq i32 %call, 0
%conv = zext i1 %cmp to i32
ret i32 %conv
}

define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq5(
; ALL-NEXT: loadbb:
; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; ALL: res_block:
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
; ALL: loadbb1:
; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4
; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4
; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]]
; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]]
; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; ALL: endblock:
; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
%cmp = icmp eq i32 %call, 0
%conv = zext i1 %cmp to i32
ret i32 %conv
}

define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq6(
; ALL-NEXT: loadbb:
; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; ALL: res_block:
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
; ALL: loadbb1:
; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16*
; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16*
; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2
; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2
; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]]
; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]]
; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; ALL: endblock:
; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
%cmp = icmp eq i32 %call, 0
%conv = zext i1 %cmp to i32
ret i32 %conv
}

define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq7(
; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
%cmp = icmp eq i32 %call, 0
%conv = zext i1 %cmp to i32
ret i32 %conv
}

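; cmp_eq8 becomes a single branchless i64 comparison on X64; X32 splits it into two
; i32 load pairs with the block structure above.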
define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp_eq8(
; X32-NEXT: loadbb:
; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X32: res_block:
; X32-NEXT: br label [[ENDBLOCK:%.*]]
; X32: loadbb1:
; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32*
; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32*
; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1
; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1
; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]]
; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]]
; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X32: endblock:
; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X32-NEXT: ret i32 [[CONV]]
;
; X64-LABEL: @cmp_eq8(
; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X64-NEXT: ret i32 [[CONV]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
%cmp = icmp eq i32 %call, 0
%conv = zext i1 %cmp to i32
ret i32 %conv
}

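; cmp_eq9 expands as 8+1 on X64; X32 would presumably need three load pairs and so
; keeps the call.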
define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp_eq9(
; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9)
; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X32-NEXT: ret i32 [[CONV]]
;
; X64-LABEL: @cmp_eq9(
; X64-NEXT: loadbb:
; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64: res_block:
; X64-NEXT: br label [[ENDBLOCK:%.*]]
; X64: loadbb1:
; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8
; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8
; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]]
; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]]
; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X64: endblock:
; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X64-NEXT: ret i32 [[CONV]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
%cmp = icmp eq i32 %call, 0
%conv = zext i1 %cmp to i32
ret i32 %conv
}

define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp_eq10(
; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10)
; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X32-NEXT: ret i32 [[CONV]]
;
; X64-LABEL: @cmp_eq10(
; X64-NEXT: loadbb:
; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64: res_block:
; X64-NEXT: br label [[ENDBLOCK:%.*]]