llvm.org GIT mirror llvm / eadb58f
[X86] Relocate the replacement of subtarget-unsupported masked memory intrinsics so that it also runs at -O0. Currently, when masked load, store, gather or scatter intrinsics are used, the CodeGenPrepare pass checks whether the subtarget supports these intrinsics and, if not, replaces them with scalar code - this is a functional transformation, not an optimization (it is not optional). CodeGenPrepare does not run when the optimization level is set to CodeGenOpt::None (-O0). A functional transformation should run at all optimization levels, so this patch introduces a new pass which runs at all optimization levels and performs only this transformation. Differential Revision: https://reviews.llvm.org/D32487 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303050 91177308-0d34-0410-b5e6-96231b3b80d8 Ayman Musa 3 years ago
13 changed files with 714 additions and 547 deletions.
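For context, a minimal sketch of exercising this transformation standalone (the pass flag and the CPU/triple values are taken from the tests updated in this commit; input.ll is a placeholder file containing masked intrinsic calls). After this change the expansion also happens at -O0, where CodeGenPrepare does not run:

  opt -scalarize-masked-mem-intrin -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -S input.ll
  llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx input.ll -o /dev/null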
6666 /// createCodeGenPreparePass - Transform the code to expose more pattern
6767 /// matching during instruction selection.
6868 FunctionPass *createCodeGenPreparePass(const TargetMachine *TM = nullptr);
69
70 /// createScalarizeMaskedMemIntrinPass - Replace masked load, store, gather
71 /// and scatter intrinsics with scalar code when target doesn't support them.
72 FunctionPass *createScalarizeMaskedMemIntrinPass();
6973
7074 /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg
7175 /// load-linked/store-conditional loops.
325325 void initializeSanitizerCoverageModulePass(PassRegistry&);
326326 void initializeScalarEvolutionWrapperPassPass(PassRegistry&);
327327 void initializeScalarizerPass(PassRegistry&);
328 void initializeScalarizeMaskedMemIntrinPass(PassRegistry&);
328329 void initializeScopedNoAliasAAWrapperPassPass(PassRegistry&);
329330 void initializeSeparateConstOffsetFromGEPPass(PassRegistry&);
330331 void initializeShadowStackGCLoweringPass(PassRegistry&);
205205 (void) llvm::createMemDerefPrinter();
206206 (void) llvm::createFloat2IntPass();
207207 (void) llvm::createEliminateAvailableExternallyPass();
208 (void) llvm::createScalarizeMaskedMemIntrinPass();
208209
209210 (void)new llvm::IntervalPartition();
210211 (void)new llvm::ScalarEvolutionWrapperPass();
119119 SafeStack.cpp
120120 SafeStackColoring.cpp
121121 SafeStackLayout.cpp
122 ScalarizeMaskedMemIntrin.cpp
122123 ScheduleDAG.cpp
123124 ScheduleDAGInstrs.cpp
124125 ScheduleDAGPrinter.cpp
8080 initializeRegisterCoalescerPass(Registry);
8181 initializeRenameIndependentSubregsPass(Registry);
8282 initializeSafeStackLegacyPassPass(Registry);
83 initializeScalarizeMaskedMemIntrinPass(Registry);
8384 initializeShrinkWrapPass(Registry);
8485 initializeSlotIndexesPass(Registry);
8586 initializeStackColoringPass(Registry);
15481548 return MadeChange;
15491549 }
15501550
1551 // Translate a masked load intrinsic like
1552 // <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
1553 // <16 x i1> %mask, <16 x i32> %passthru)
1554 // to a chain of basic blocks, with loading element one-by-one if
1555 // the appropriate mask bit is set
1556 //
1557 // %1 = bitcast i8* %addr to i32*
1558 // %2 = extractelement <16 x i1> %mask, i32 0
1559 // %3 = icmp eq i1 %2, true
1560 // br i1 %3, label %cond.load, label %else
1561 //
1562 //cond.load: ; preds = %0
1563 // %4 = getelementptr i32* %1, i32 0
1564 // %5 = load i32* %4
1565 // %6 = insertelement <16 x i32> undef, i32 %5, i32 0
1566 // br label %else
1567 //
1568 //else: ; preds = %0, %cond.load
1569 // %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
1570 // %7 = extractelement <16 x i1> %mask, i32 1
1571 // %8 = icmp eq i1 %7, true
1572 // br i1 %8, label %cond.load1, label %else2
1573 //
1574 //cond.load1: ; preds = %else
1575 // %9 = getelementptr i32* %1, i32 1
1576 // %10 = load i32* %9
1577 // %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
1578 // br label %else2
1579 //
1580 //else2: ; preds = %else, %cond.load1
1581 // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
1582 // %12 = extractelement <16 x i1> %mask, i32 2
1583 // %13 = icmp eq i1 %12, true
1584 // br i1 %13, label %cond.load4, label %else5
1585 //
1586 static void scalarizeMaskedLoad(CallInst *CI) {
1587 Value *Ptr = CI->getArgOperand(0);
1588 Value *Alignment = CI->getArgOperand(1);
1589 Value *Mask = CI->getArgOperand(2);
1590 Value *Src0 = CI->getArgOperand(3);
1591
1592 unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
1593 VectorType *VecType = dyn_cast<VectorType>(CI->getType());
1594 assert(VecType && "Unexpected return type of masked load intrinsic");
1595
1596 Type *EltTy = CI->getType()->getVectorElementType();
1597
1598 IRBuilder<> Builder(CI->getContext());
1599 Instruction *InsertPt = CI;
1600 BasicBlock *IfBlock = CI->getParent();
1601 BasicBlock *CondBlock = nullptr;
1602 BasicBlock *PrevIfBlock = CI->getParent();
1603
1604 Builder.SetInsertPoint(InsertPt);
1605 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
1606
1607 // Short-cut if the mask is all-true.
1608 bool IsAllOnesMask = isa<Constant>(Mask) &&
1609 cast<Constant>(Mask)->isAllOnesValue();
1610
1611 if (IsAllOnesMask) {
1612 Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
1613 CI->replaceAllUsesWith(NewI);
1614 CI->eraseFromParent();
1615 return;
1616 }
1617
1618 // Adjust alignment for the scalar instruction.
1619 AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8);
1620 // Bitcast %addr from i8* to EltTy*
1621 Type *NewPtrType =
1622 EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
1623 Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
1624 unsigned VectorWidth = VecType->getNumElements();
1625
1626 Value *UndefVal = UndefValue::get(VecType);
1627
1628 // The result vector
1629 Value *VResult = UndefVal;
1630
1631 if (isa<ConstantVector>(Mask)) {
1632 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1633 if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
1634 continue;
1635 Value *Gep =
1636 Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
1637 LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal);
1638 VResult = Builder.CreateInsertElement(VResult, Load,
1639 Builder.getInt32(Idx));
1640 }
1641 Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
1642 CI->replaceAllUsesWith(NewI);
1643 CI->eraseFromParent();
1644 return;
1645 }
1646
1647 PHINode *Phi = nullptr;
1648 Value *PrevPhi = UndefVal;
1649
1650 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1651
1652 // Fill the "else" block, created in the previous iteration
1653 //
1654 // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
1655 // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
1656 // %to_load = icmp eq i1 %mask_1, true
1657 // br i1 %to_load, label %cond.load, label %else
1658 //
1659 if (Idx > 0) {
1660 Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
1661 Phi->addIncoming(VResult, CondBlock);
1662 Phi->addIncoming(PrevPhi, PrevIfBlock);
1663 PrevPhi = Phi;
1664 VResult = Phi;
1665 }
1666
1667 Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
1668 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
1669 ConstantInt::get(Predicate->getType(), 1));
1670
1671 // Create "cond" block
1672 //
1673 // %EltAddr = getelementptr i32* %1, i32 0
1674 // %Elt = load i32* %EltAddr
1675 // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
1676 //
1677 CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
1678 Builder.SetInsertPoint(InsertPt);
1679
1680 Value *Gep =
1681 Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
1682 LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
1683 VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
1684
1685 // Create "else" block, fill it in the next iteration
1686 BasicBlock *NewIfBlock =
1687 CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
1688 Builder.SetInsertPoint(InsertPt);
1689 Instruction *OldBr = IfBlock->getTerminator();
1690 BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
1691 OldBr->eraseFromParent();
1692 PrevIfBlock = IfBlock;
1693 IfBlock = NewIfBlock;
1694 }
1695
1696 Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
1697 Phi->addIncoming(VResult, CondBlock);
1698 Phi->addIncoming(PrevPhi, PrevIfBlock);
1699 Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
1700 CI->replaceAllUsesWith(NewI);
1701 CI->eraseFromParent();
1702 }
1703
1704 // Translate a masked store intrinsic, like
1705 // void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
1706 // <16 x i1> %mask)
1707 // to a chain of basic blocks, that stores element one-by-one if
1708 // the appropriate mask bit is set
1709 //
1710 // %1 = bitcast i8* %addr to i32*
1711 // %2 = extractelement <16 x i1> %mask, i32 0
1712 // %3 = icmp eq i1 %2, true
1713 // br i1 %3, label %cond.store, label %else
1714 //
1715 // cond.store: ; preds = %0
1716 // %4 = extractelement <16 x i32> %val, i32 0
1717 // %5 = getelementptr i32* %1, i32 0
1718 // store i32 %4, i32* %5
1719 // br label %else
1720 //
1721 // else: ; preds = %0, %cond.store
1722 // %6 = extractelement <16 x i1> %mask, i32 1
1723 // %7 = icmp eq i1 %6, true
1724 // br i1 %7, label %cond.store1, label %else2
1725 //
1726 // cond.store1: ; preds = %else
1727 // %8 = extractelement <16 x i32> %val, i32 1
1728 // %9 = getelementptr i32* %1, i32 1
1729 // store i32 %8, i32* %9
1730 // br label %else2
1731 // . . .
1732 static void scalarizeMaskedStore(CallInst *CI) {
1733 Value *Src = CI->getArgOperand(0);
1734 Value *Ptr = CI->getArgOperand(1);
1735 Value *Alignment = CI->getArgOperand(2);
1736 Value *Mask = CI->getArgOperand(3);
1737
1738 unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
1739 VectorType *VecType = dyn_cast<VectorType>(Src->getType());
1740 assert(VecType && "Unexpected data type in masked store intrinsic");
1741
1742 Type *EltTy = VecType->getElementType();
1743
1744 IRBuilder<> Builder(CI->getContext());
1745 Instruction *InsertPt = CI;
1746 BasicBlock *IfBlock = CI->getParent();
1747 Builder.SetInsertPoint(InsertPt);
1748 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
1749
1750 // Short-cut if the mask is all-true.
1751 bool IsAllOnesMask = isa<Constant>(Mask) &&
1752 cast<Constant>(Mask)->isAllOnesValue();
1753
1754 if (IsAllOnesMask) {
1755 Builder.CreateAlignedStore(Src, Ptr, AlignVal);
1756 CI->eraseFromParent();
1757 return;
1758 }
1759
1760 // Adjust alignment for the scalar instruction.
1761 AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8);
1762 // Bitcast %addr from i8* to EltTy*
1763 Type *NewPtrType =
1764 EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
1765 Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
1766 unsigned VectorWidth = VecType->getNumElements();
1767
1768 if (isa<ConstantVector>(Mask)) {
1769 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1770 if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
1771 continue;
1772 Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
1773 Value *Gep =
1774 Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
1775 Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
1776 }
1777 CI->eraseFromParent();
1778 return;
1779 }
1780
1781 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1782
1783 // Fill the "else" block, created in the previous iteration
1784 //
1785 // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
1786 // %to_store = icmp eq i1 %mask_1, true
1787 // br i1 %to_store, label %cond.store, label %else
1788 //
1789 Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
1790 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
1791 ConstantInt::get(Predicate->getType(), 1));
1792
1793 // Create "cond" block
1794 //
1795 // %OneElt = extractelement <16 x i32> %Src, i32 Idx
1796 // %EltAddr = getelementptr i32* %1, i32 0
1797 // %store i32 %OneElt, i32* %EltAddr
1798 //
1799 BasicBlock *CondBlock =
1800 IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
1801 Builder.SetInsertPoint(InsertPt);
1802
1803 Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
1804 Value *Gep =
1805 Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
1806 Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
1807
1808 // Create "else" block, fill it in the next iteration
1809 BasicBlock *NewIfBlock =
1810 CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
1811 Builder.SetInsertPoint(InsertPt);
1812 Instruction *OldBr = IfBlock->getTerminator();
1813 BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
1814 OldBr->eraseFromParent();
1815 IfBlock = NewIfBlock;
1816 }
1817 CI->eraseFromParent();
1818 }
1819
1820 // Translate a masked gather intrinsic like
1821 // <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
1822 // <16 x i1> %Mask, <16 x i32> %Src)
1823 // to a chain of basic blocks, with loading element one-by-one if
1824 // the appropriate mask bit is set
1825 //
1826 // % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
1827 // % Mask0 = extractelement <16 x i1> %Mask, i32 0
1828 // % ToLoad0 = icmp eq i1 % Mask0, true
1829 // br i1 % ToLoad0, label %cond.load, label %else
1830 //
1831 // cond.load:
1832 // % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
1833 // % Load0 = load i32, i32* % Ptr0, align 4
1834 // % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0
1835 // br label %else
1836 //
1837 // else:
1838 // %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0]
1839 // % Mask1 = extractelement <16 x i1> %Mask, i32 1
1840 // % ToLoad1 = icmp eq i1 % Mask1, true
1841 // br i1 % ToLoad1, label %cond.load1, label %else2
1842 //
1843 // cond.load1:
1844 // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
1845 // % Load1 = load i32, i32* % Ptr1, align 4
1846 // % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1
1847 // br label %else2
1848 // . . .
1849 // % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
1850 // ret <16 x i32> %Result
1851 static void scalarizeMaskedGather(CallInst *CI) {
1852 Value *Ptrs = CI->getArgOperand(0);
1853 Value *Alignment = CI->getArgOperand(1);
1854 Value *Mask = CI->getArgOperand(2);
1855 Value *Src0 = CI->getArgOperand(3);
1856
1857 VectorType *VecType = dyn_cast<VectorType>(CI->getType());
1858
1859 assert(VecType && "Unexpected return type of masked load intrinsic");
1860
1861 IRBuilder<> Builder(CI->getContext());
1862 Instruction *InsertPt = CI;
1863 BasicBlock *IfBlock = CI->getParent();
1864 BasicBlock *CondBlock = nullptr;
1865 BasicBlock *PrevIfBlock = CI->getParent();
1866 Builder.SetInsertPoint(InsertPt);
1867 unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
1868
1869 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
1870
1871 Value *UndefVal = UndefValue::get(VecType);
1872
1873 // The result vector
1874 Value *VResult = UndefVal;
1875 unsigned VectorWidth = VecType->getNumElements();
1876
1877 // Shorten the way if the mask is a vector of constants.
1878 bool IsConstMask = isa<ConstantVector>(Mask);
1879
1880 if (IsConstMask) {
1881 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1882 if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
1883 continue;
1884 Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
1885 "Ptr" + Twine(Idx));
1886 LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
1887 "Load" + Twine(Idx));
1888 VResult = Builder.CreateInsertElement(VResult, Load,
1889 Builder.getInt32(Idx),
1890 "Res" + Twine(Idx));
1891 }
1892 Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
1893 CI->replaceAllUsesWith(NewI);
1894 CI->eraseFromParent();
1895 return;
1896 }
1897
1898 PHINode *Phi = nullptr;
1899 Value *PrevPhi = UndefVal;
1900
1901 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1902
1903 // Fill the "else" block, created in the previous iteration
1904 //
1905 // %Mask1 = extractelement <16 x i1> %Mask, i32 1
1906 // %ToLoad1 = icmp eq i1 %Mask1, true
1907 // br i1 %ToLoad1, label %cond.load, label %else
1908 //
1909 if (Idx > 0) {
1910 Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
1911 Phi->addIncoming(VResult, CondBlock);
1912 Phi->addIncoming(PrevPhi, PrevIfBlock);
1913 PrevPhi = Phi;
1914 VResult = Phi;
1915 }
1916
1917 Value *Predicate = Builder.CreateExtractElement(Mask,
1918 Builder.getInt32(Idx),
1919 "Mask" + Twine(Idx));
1920 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
1921 ConstantInt::get(Predicate->getType(), 1),
1922 "ToLoad" + Twine(Idx));
1923
1924 // Create "cond" block
1925 //
1926 // %EltAddr = getelementptr i32* %1, i32 0
1927 // %Elt = load i32* %EltAddr
1928 // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
1929 //
1930 CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
1931 Builder.SetInsertPoint(InsertPt);
1932
1933 Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
1934 "Ptr" + Twine(Idx));
1935 LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
1936 "Load" + Twine(Idx));
1937 VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
1938 "Res" + Twine(Idx));
1939
1940 // Create "else" block, fill it in the next iteration
1941 BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
1942 Builder.SetInsertPoint(InsertPt);
1943 Instruction *OldBr = IfBlock->getTerminator();
1944 BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
1945 OldBr->eraseFromParent();
1946 PrevIfBlock = IfBlock;
1947 IfBlock = NewIfBlock;
1948 }
1949
1950 Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
1951 Phi->addIncoming(VResult, CondBlock);
1952 Phi->addIncoming(PrevPhi, PrevIfBlock);
1953 Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
1954 CI->replaceAllUsesWith(NewI);
1955 CI->eraseFromParent();
1956 }
1957
1958 // Translate a masked scatter intrinsic, like
1959 // void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4,
1960 // <16 x i1> %Mask)
1961 // to a chain of basic blocks, that stores element one-by-one if
1962 // the appropriate mask bit is set.
1963 //
1964 // % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
1965 // % Mask0 = extractelement <16 x i1> % Mask, i32 0
1966 // % ToStore0 = icmp eq i1 % Mask0, true
1967 // br i1 %ToStore0, label %cond.store, label %else
1968 //
1969 // cond.store:
1970 // % Elt0 = extractelement <16 x i32> %Src, i32 0
1971 // % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
1972 // store i32 %Elt0, i32* % Ptr0, align 4
1973 // br label %else
1974 //
1975 // else:
1976 // % Mask1 = extractelement <16 x i1> % Mask, i32 1
1977 // % ToStore1 = icmp eq i1 % Mask1, true
1978 // br i1 % ToStore1, label %cond.store1, label %else2
1979 //
1980 // cond.store1:
1981 // % Elt1 = extractelement <16 x i32> %Src, i32 1
1982 // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
1983 // store i32 % Elt1, i32* % Ptr1, align 4
1984 // br label %else2
1985 // . . .
1986 static void scalarizeMaskedScatter(CallInst *CI) {
1987 Value *Src = CI->getArgOperand(0);
1988 Value *Ptrs = CI->getArgOperand(1);
1989 Value *Alignment = CI->getArgOperand(2);
1990 Value *Mask = CI->getArgOperand(3);
1991
1992 assert(isa<VectorType>(Src->getType()) &&
1993 "Unexpected data type in masked scatter intrinsic");
1994 assert(isa<VectorType>(Ptrs->getType()) &&
1995 isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
1996 "Vector of pointers is expected in masked scatter intrinsic");
1997
1998 IRBuilder<> Builder(CI->getContext());
1999 Instruction *InsertPt = CI;
2000 BasicBlock *IfBlock = CI->getParent();
2001 Builder.SetInsertPoint(InsertPt);
2002 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
2003
2004 unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
2005 unsigned VectorWidth = Src->getType()->getVectorNumElements();
2006
2007 // Shorten the way if the mask is a vector of constants.
2008 bool IsConstMask = isa<ConstantVector>(Mask);
2009
2010 if (IsConstMask) {
2011 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
2012 if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
2013 continue;
2014 Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
2015 "Elt" + Twine(Idx));
2016 Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
2017 "Ptr" + Twine(Idx));
2018 Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
2019 }
2020 CI->eraseFromParent();
2021 return;
2022 }
2023 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
2024 // Fill the "else" block, created in the previous iteration
2025 //
2026 // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx
2027 // % ToStore = icmp eq i1 % Mask1, true
2028 // br i1 % ToStore, label %cond.store, label %else
2029 //
2030 Value *Predicate = Builder.CreateExtractElement(Mask,
2031 Builder.getInt32(Idx),
2032 "Mask" + Twine(Idx));
2033 Value *Cmp =
2034 Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
2035 ConstantInt::get(Predicate->getType(), 1),
2036 "ToStore" + Twine(Idx));
2037
2038 // Create "cond" block
2039 //
2040 // % Elt1 = extractelement <16 x i32> %Src, i32 1
2041 // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
2042 // %store i32 % Elt1, i32* % Ptr1
2043 //
2044 BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
2045 Builder.SetInsertPoint(InsertPt);
2046
2047 Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
2048 "Elt" + Twine(Idx));
2049 Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
2050 "Ptr" + Twine(Idx));
2051 Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
2052
2053 // Create "else" block, fill it in the next iteration
2054 BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
2055 Builder.SetInsertPoint(InsertPt);
2056 Instruction *OldBr = IfBlock->getTerminator();
2057 BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
2058 OldBr->eraseFromParent();
2059 IfBlock = NewIfBlock;
2060 }
2061 CI->eraseFromParent();
2062 }
2063
20641551 /// If counting leading or trailing zeros is an expensive operation and a zero
20651552 /// input is defined, add a check for zero to avoid calling the intrinsic.
20661553 ///
22401727 SunkAddrs.clear();
22411728 }
22421729 return true;
2243 }
2244 case Intrinsic::masked_load: {
2245 // Scalarize unsupported vector masked load
2246 if (!TTI->isLegalMaskedLoad(CI->getType())) {
2247 scalarizeMaskedLoad(CI);
2248 ModifiedDT = true;
2249 return true;
2250 }
2251 return false;
2252 }
2253 case Intrinsic::masked_store: {
2254 if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
2255 scalarizeMaskedStore(CI);
2256 ModifiedDT = true;
2257 return true;
2258 }
2259 return false;
2260 }
2261 case Intrinsic::masked_gather: {
2262 if (!TTI->isLegalMaskedGather(CI->getType())) {
2263 scalarizeMaskedGather(CI);
2264 ModifiedDT = true;
2265 return true;
2266 }
2267 return false;
2268 }
2269 case Intrinsic::masked_scatter: {
2270 if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
2271 scalarizeMaskedScatter(CI);
2272 ModifiedDT = true;
2273 return true;
2274 }
2275 return false;
22761730 }
22771731 case Intrinsic::aarch64_stlxr:
22781732 case Intrinsic::aarch64_stxr: {
0 //=== ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ===//
1 //=== intrinsics ===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass replaces masked memory intrinsics - when unsupported by the target
11 // - with a chain of basic blocks, that deal with the elements one-by-one if the
12 // appropriate mask bit is set.
13 //
14 //===----------------------------------------------------------------------===//
15
16 #include "llvm/Analysis/TargetTransformInfo.h"
17 #include "llvm/IR/IRBuilder.h"
18 #include "llvm/Target/TargetSubtargetInfo.h"
19
20 using namespace llvm;
21
22 #define DEBUG_TYPE "scalarize-masked-mem-intrin"
23
24 namespace {
25
26 class ScalarizeMaskedMemIntrin : public FunctionPass {
27 const TargetTransformInfo *TTI;
28
29 public:
30 static char ID; // Pass identification, replacement for typeid
31 explicit ScalarizeMaskedMemIntrin() : FunctionPass(ID), TTI(nullptr) {
32 initializeScalarizeMaskedMemIntrinPass(*PassRegistry::getPassRegistry());
33 }
34 bool runOnFunction(Function &F) override;
35
36 StringRef getPassName() const override {
37 return "Scalarize Masked Memory Intrinsics";
38 }
39
40 void getAnalysisUsage(AnalysisUsage &AU) const override {
41 AU.addRequired<TargetTransformInfoWrapperPass>();
42 }
43
44 private:
45 bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
46 bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
47 };
48 } // namespace
49
50 char ScalarizeMaskedMemIntrin::ID = 0;
51 INITIALIZE_PASS_BEGIN(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin",
52 "Scalarize unsupported masked memory intrinsics", false,
53 false)
54 INITIALIZE_PASS_END(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin",
55 "Scalarize unsupported masked memory intrinsics", false,
56 false)
57
58 FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() {
59 return new ScalarizeMaskedMemIntrin();
60 }
61
62 // Translate a masked load intrinsic like
63 // <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
64 // <16 x i1> %mask, <16 x i32> %passthru)
65 // to a chain of basic blocks, with loading element one-by-one if
66 // the appropriate mask bit is set
67 //
68 // %1 = bitcast i8* %addr to i32*
69 // %2 = extractelement <16 x i1> %mask, i32 0
70 // %3 = icmp eq i1 %2, true
71 // br i1 %3, label %cond.load, label %else
72 //
73 // cond.load: ; preds = %0
74 // %4 = getelementptr i32* %1, i32 0
75 // %5 = load i32* %4
76 // %6 = insertelement <16 x i32> undef, i32 %5, i32 0
77 // br label %else
78 //
79 // else: ; preds = %0, %cond.load
80 // %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
81 // %7 = extractelement <16 x i1> %mask, i32 1
82 // %8 = icmp eq i1 %7, true
83 // br i1 %8, label %cond.load1, label %else2
84 //
85 // cond.load1: ; preds = %else
86 // %9 = getelementptr i32* %1, i32 1
87 // %10 = load i32* %9
88 // %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
89 // br label %else2
90 //
91 // else2: ; preds = %else, %cond.load1
92 // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
93 // %12 = extractelement <16 x i1> %mask, i32 2
94 // %13 = icmp eq i1 %12, true
95 // br i1 %13, label %cond.load4, label %else5
96 //
97 static void scalarizeMaskedLoad(CallInst *CI) {
98 Value *Ptr = CI->getArgOperand(0);
99 Value *Alignment = CI->getArgOperand(1);
100 Value *Mask = CI->getArgOperand(2);
101 Value *Src0 = CI->getArgOperand(3);
102
103 unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
104 VectorType *VecType = dyn_cast<VectorType>(CI->getType());
105 assert(VecType && "Unexpected return type of masked load intrinsic");
106
107 Type *EltTy = CI->getType()->getVectorElementType();
108
109 IRBuilder<> Builder(CI->getContext());
110 Instruction *InsertPt = CI;
111 BasicBlock *IfBlock = CI->getParent();
112 BasicBlock *CondBlock = nullptr;
113 BasicBlock *PrevIfBlock = CI->getParent();
114
115 Builder.SetInsertPoint(InsertPt);
116 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
117
118 // Short-cut if the mask is all-true.
119 bool IsAllOnesMask =
120 isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
121
122 if (IsAllOnesMask) {
123 Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
124 CI->replaceAllUsesWith(NewI);
125 CI->eraseFromParent();
126 return;
127 }
128
129 // Adjust alignment for the scalar instruction.
130 AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits() / 8);
131 // Bitcast %addr from i8* to EltTy*
132 Type *NewPtrType =
133 EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
134 Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
135 unsigned VectorWidth = VecType->getNumElements();
136
137 Value *UndefVal = UndefValue::get(VecType);
138
139 // The result vector
140 Value *VResult = UndefVal;
141
142 if (isa<ConstantVector>(Mask)) {
143 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
144 if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
145 continue;
146 Value *Gep =
147 Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
148 LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
149 VResult =
150 Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
151 }
152 Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
153 CI->replaceAllUsesWith(NewI);
154 CI->eraseFromParent();
155 return;
156 }
157
158 PHINode *Phi = nullptr;
159 Value *PrevPhi = UndefVal;
160
161 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
162
163 // Fill the "else" block, created in the previous iteration
164 //
165 // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
166 // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
167 // %to_load = icmp eq i1 %mask_1, true
168 // br i1 %to_load, label %cond.load, label %else
169 //
170 if (Idx > 0) {
171 Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
172 Phi->addIncoming(VResult, CondBlock);
173 Phi->addIncoming(PrevPhi, PrevIfBlock);
174 PrevPhi = Phi;
175 VResult = Phi;
176 }
177
178 Value *Predicate =
179 Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
180 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
181 ConstantInt::get(Predicate->getType(), 1));
182
183 // Create "cond" block
184 //
185 // %EltAddr = getelementptr i32* %1, i32 0
186 // %Elt = load i32* %EltAddr
187 // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
188 //
189 CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
190 Builder.SetInsertPoint(InsertPt);
191
192 Value *Gep =
193 Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
194 LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
195 VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
196
197 // Create "else" block, fill it in the next iteration
198 BasicBlock *NewIfBlock =
199 CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
200 Builder.SetInsertPoint(InsertPt);
201 Instruction *OldBr = IfBlock->getTerminator();
202 BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
203 OldBr->eraseFromParent();
204 PrevIfBlock = IfBlock;
205 IfBlock = NewIfBlock;
206 }
207
208 Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
209 Phi->addIncoming(VResult, CondBlock);
210 Phi->addIncoming(PrevPhi, PrevIfBlock);
211 Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
212 CI->replaceAllUsesWith(NewI);
213 CI->eraseFromParent();
214 }
215
216 // Translate a masked store intrinsic, like
217 // void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
218 // <16 x i1> %mask)
219 // to a chain of basic blocks, that stores element one-by-one if
220 // the appropriate mask bit is set
221 //
222 // %1 = bitcast i8* %addr to i32*
223 // %2 = extractelement <16 x i1> %mask, i32 0
224 // %3 = icmp eq i1 %2, true
225 // br i1 %3, label %cond.store, label %else
226 //
227 // cond.store: ; preds = %0
228 // %4 = extractelement <16 x i32> %val, i32 0
229 // %5 = getelementptr i32* %1, i32 0
230 // store i32 %4, i32* %5
231 // br label %else
232 //
233 // else: ; preds = %0, %cond.store
234 // %6 = extractelement <16 x i1> %mask, i32 1
235 // %7 = icmp eq i1 %6, true
236 // br i1 %7, label %cond.store1, label %else2
237 //
238 // cond.store1: ; preds = %else
239 // %8 = extractelement <16 x i32> %val, i32 1
240 // %9 = getelementptr i32* %1, i32 1
241 // store i32 %8, i32* %9
242 // br label %else2
243 // . . .
244 static void scalarizeMaskedStore(CallInst *CI) {
245 Value *Src = CI->getArgOperand(0);
246 Value *Ptr = CI->getArgOperand(1);
247 Value *Alignment = CI->getArgOperand(2);
248 Value *Mask = CI->getArgOperand(3);
249
250 unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
251 VectorType *VecType = dyn_cast<VectorType>(Src->getType());
252 assert(VecType && "Unexpected data type in masked store intrinsic");
253
254 Type *EltTy = VecType->getElementType();
255
256 IRBuilder<> Builder(CI->getContext());
257 Instruction *InsertPt = CI;
258 BasicBlock *IfBlock = CI->getParent();
259 Builder.SetInsertPoint(InsertPt);
260 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
261
262 // Short-cut if the mask is all-true.
263 bool IsAllOnesMask =
264 isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
265
266 if (IsAllOnesMask) {
267 Builder.CreateAlignedStore(Src, Ptr, AlignVal);
268 CI->eraseFromParent();
269 return;
270 }
271
272 // Adjust alignment for the scalar instruction.
273 AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits() / 8);
274 // Bitcast %addr from i8* to EltTy*
275 Type *NewPtrType =
276 EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
277 Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
278 unsigned VectorWidth = VecType->getNumElements();
279
280 if (isa<ConstantVector>(Mask)) {
281 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
282 if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
283 continue;
284 Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
285 Value *Gep =
286 Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
287 Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
288 }
289 CI->eraseFromParent();
290 return;
291 }
292
293 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
294
295 // Fill the "else" block, created in the previous iteration
296 //
297 // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
298 // %to_store = icmp eq i1 %mask_1, true
299 // br i1 %to_store, label %cond.store, label %else
300 //
301 Value *Predicate =
302 Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
303 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
304 ConstantInt::get(Predicate->getType(), 1));
305
306 // Create "cond" block
307 //
308 // %OneElt = extractelement <16 x i32> %Src, i32 Idx
309 // %EltAddr = getelementptr i32* %1, i32 0
310 // %store i32 %OneElt, i32* %EltAddr
311 //
312 BasicBlock *CondBlock =
313 IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
314 Builder.SetInsertPoint(InsertPt);
315
316 Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
317 Value *Gep =
318 Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
319 Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
320
321 // Create "else" block, fill it in the next iteration
322 BasicBlock *NewIfBlock =
323 CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
324 Builder.SetInsertPoint(InsertPt);
325 Instruction *OldBr = IfBlock->getTerminator();
326 BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
327 OldBr->eraseFromParent();
328 IfBlock = NewIfBlock;
329 }
330 CI->eraseFromParent();
331 }
332
333 // Translate a masked gather intrinsic like
334 // <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
335 // <16 x i1> %Mask, <16 x i32> %Src)
336 // to a chain of basic blocks, with loading element one-by-one if
337 // the appropriate mask bit is set
338 //
339 // % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
340 // % Mask0 = extractelement <16 x i1> %Mask, i32 0
341 // % ToLoad0 = icmp eq i1 % Mask0, true
342 // br i1 % ToLoad0, label %cond.load, label %else
343 //
344 // cond.load:
345 // % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
346 // % Load0 = load i32, i32* % Ptr0, align 4
347 // % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0
348 // br label %else
349 //
350 // else:
351 // %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0]
352 // % Mask1 = extractelement <16 x i1> %Mask, i32 1
353 // % ToLoad1 = icmp eq i1 % Mask1, true
354 // br i1 % ToLoad1, label %cond.load1, label %else2
355 //
356 // cond.load1:
357 // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
358 // % Load1 = load i32, i32* % Ptr1, align 4
359 // % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1
360 // br label %else2
361 // . . .
362 // % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
363 // ret <16 x i32> %Result
364 static void scalarizeMaskedGather(CallInst *CI) {
365 Value *Ptrs = CI->getArgOperand(0);
366 Value *Alignment = CI->getArgOperand(1);
367 Value *Mask = CI->getArgOperand(2);
368 Value *Src0 = CI->getArgOperand(3);
369
370 VectorType *VecType = dyn_cast<VectorType>(CI->getType());
371
372 assert(VecType && "Unexpected return type of masked load intrinsic");
373
374 IRBuilder<> Builder(CI->getContext());
375 Instruction *InsertPt = CI;
376 BasicBlock *IfBlock = CI->getParent();
377 BasicBlock *CondBlock = nullptr;
378 BasicBlock *PrevIfBlock = CI->getParent();
379 Builder.SetInsertPoint(InsertPt);
380 unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
381
382 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
383
384 Value *UndefVal = UndefValue::get(VecType);
385
386 // The result vector
387 Value *VResult = UndefVal;
388 unsigned VectorWidth = VecType->getNumElements();
389
390 // Shorten the way if the mask is a vector of constants.
391 bool IsConstMask = isa<ConstantVector>(Mask);
392
393 if (IsConstMask) {
394 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
395 if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
396 continue;
397 Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
398 "Ptr" + Twine(Idx));
399 LoadInst *Load =
400 Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
401 VResult = Builder.CreateInsertElement(
402 VResult, Load, Builder.getInt32(Idx), "Res" + Twine(Idx));
403 }
404 Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
405 CI->replaceAllUsesWith(NewI);
406 CI->eraseFromParent();
407 return;
408 }
409
410 PHINode *Phi = nullptr;
411 Value *PrevPhi = UndefVal;
412
413 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
414
415 // Fill the "else" block, created in the previous iteration
416 //
417 // %Mask1 = extractelement <16 x i1> %Mask, i32 1
418 // %ToLoad1 = icmp eq i1 %Mask1, true
419 // br i1 %ToLoad1, label %cond.load, label %else
420 //
421 if (Idx > 0) {
422 Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
423 Phi->addIncoming(VResult, CondBlock);
424 Phi->addIncoming(PrevPhi, PrevIfBlock);
425 PrevPhi = Phi;
426 VResult = Phi;
427 }
428
429 Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
430 "Mask" + Twine(Idx));
431 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
432 ConstantInt::get(Predicate->getType(), 1),
433 "ToLoad" + Twine(Idx));
434
435 // Create "cond" block
436 //
437 // %EltAddr = getelementptr i32* %1, i32 0
438 // %Elt = load i32* %EltAddr
439 // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
440 //
441 CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
442 Builder.SetInsertPoint(InsertPt);
443
444 Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
445 "Ptr" + Twine(Idx));
446 LoadInst *Load =
447 Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
448 VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
449 "Res" + Twine(Idx));
450
451 // Create "else" block, fill it in the next iteration
452 BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
453 Builder.SetInsertPoint(InsertPt);
454 Instruction *OldBr = IfBlock->getTerminator();
455 BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
456 OldBr->eraseFromParent();
457 PrevIfBlock = IfBlock;
458 IfBlock = NewIfBlock;
459 }
460
461 Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
462 Phi->addIncoming(VResult, CondBlock);
463 Phi->addIncoming(PrevPhi, PrevIfBlock);
464 Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
465 CI->replaceAllUsesWith(NewI);
466 CI->eraseFromParent();
467 }
468
469 // Translate a masked scatter intrinsic, like
470 // void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4,
471 // <16 x i1> %Mask)
472 // to a chain of basic blocks, that stores element one-by-one if
473 // the appropriate mask bit is set.
474 //
475 // % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
476 // % Mask0 = extractelement <16 x i1> % Mask, i32 0
477 // % ToStore0 = icmp eq i1 % Mask0, true
478 // br i1 %ToStore0, label %cond.store, label %else
479 //
480 // cond.store:
481 // % Elt0 = extractelement <16 x i32> %Src, i32 0
482 // % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
483 // store i32 %Elt0, i32* % Ptr0, align 4
484 // br label %else
485 //
486 // else:
487 // % Mask1 = extractelement <16 x i1> % Mask, i32 1
488 // % ToStore1 = icmp eq i1 % Mask1, true
489 // br i1 % ToStore1, label %cond.store1, label %else2
490 //
491 // cond.store1:
492 // % Elt1 = extractelement <16 x i32> %Src, i32 1
493 // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
494 // store i32 % Elt1, i32* % Ptr1, align 4
495 // br label %else2
496 // . . .
497 static void scalarizeMaskedScatter(CallInst *CI) {
498 Value *Src = CI->getArgOperand(0);
499 Value *Ptrs = CI->getArgOperand(1);
500 Value *Alignment = CI->getArgOperand(2);
501 Value *Mask = CI->getArgOperand(3);
502
503 assert(isa<VectorType>(Src->getType()) &&
504 "Unexpected data type in masked scatter intrinsic");
505 assert(isa<VectorType>(Ptrs->getType()) &&
506 isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
507 "Vector of pointers is expected in masked scatter intrinsic");
508
509 IRBuilder<> Builder(CI->getContext());
510 Instruction *InsertPt = CI;
511 BasicBlock *IfBlock = CI->getParent();
512 Builder.SetInsertPoint(InsertPt);
513 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
514
515 unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
516 unsigned VectorWidth = Src->getType()->getVectorNumElements();
517
518 // Shorten the way if the mask is a vector of constants.
519 bool IsConstMask = isa<ConstantVector>(Mask);
520
521 if (IsConstMask) {
522 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
523 if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
524 continue;
525 Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
526 "Elt" + Twine(Idx));
527 Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
528 "Ptr" + Twine(Idx));
529 Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
530 }
531 CI->eraseFromParent();
532 return;
533 }
534 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
535 // Fill the "else" block, created in the previous iteration
536 //
537 // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx
538 // % ToStore = icmp eq i1 % Mask1, true
539 // br i1 % ToStore, label %cond.store, label %else
540 //
541 Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
542 "Mask" + Twine(Idx));
543 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
544 ConstantInt::get(Predicate->getType(), 1),
545 "ToStore" + Twine(Idx));
546
547 // Create "cond" block
548 //
549 // % Elt1 = extractelement <16 x i32> %Src, i32 1
550 // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
551 // %store i32 % Elt1, i32* % Ptr1
552 //
553 BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
554 Builder.SetInsertPoint(InsertPt);
555
556 Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
557 "Elt" + Twine(Idx));
558 Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
559 "Ptr" + Twine(Idx));
560 Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
561
562 // Create "else" block, fill it in the next iteration
563 BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
564 Builder.SetInsertPoint(InsertPt);
565 Instruction *OldBr = IfBlock->getTerminator();
566 BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
567 OldBr->eraseFromParent();
568 IfBlock = NewIfBlock;
569 }
570 CI->eraseFromParent();
571 }
572
573 bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) {
574 if (skipFunction(F))
575 return false;
576
577 bool EverMadeChange = false;
578
579 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
580
581 bool MadeChange = true;
582 while (MadeChange) {
583 MadeChange = false;
584 for (Function::iterator I = F.begin(); I != F.end();) {
585 BasicBlock *BB = &*I++;
586 bool ModifiedDTOnIteration = false;
587 MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
588
589 // Restart BB iteration if the dominator tree of the Function was changed
590 if (ModifiedDTOnIteration)
591 break;
592 }
593
594 EverMadeChange |= MadeChange;
595 }
596
597 return EverMadeChange;
598 }
599
600 bool ScalarizeMaskedMemIntrin::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
601 bool MadeChange = false;
602
603 BasicBlock::iterator CurInstIterator = BB.begin();
604 while (CurInstIterator != BB.end()) {
605 if (CallInst *CI = dyn_cast<CallInst>(&*CurInstIterator++))
606 MadeChange |= optimizeCallInst(CI, ModifiedDT);
607 if (ModifiedDT)
608 return true;
609 }
610
611 return MadeChange;
612 }
613
614 bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI,
615 bool &ModifiedDT) {
616
617 IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
618 if (II) {
619 switch (II->getIntrinsicID()) {
620 default:
621 break;
622 case Intrinsic::masked_load: {
623 // Scalarize unsupported vector masked load
624 if (!TTI->isLegalMaskedLoad(CI->getType())) {
625 scalarizeMaskedLoad(CI);
626 ModifiedDT = true;
627 return true;
628 }
629 return false;
630 }
631 case Intrinsic::masked_store: {
632 if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
633 scalarizeMaskedStore(CI);
634 ModifiedDT = true;
635 return true;
636 }
637 return false;
638 }
639 case Intrinsic::masked_gather: {
640 if (!TTI->isLegalMaskedGather(CI->getType())) {
641 scalarizeMaskedGather(CI);
642 ModifiedDT = true;
643 return true;
644 }
645 return false;
646 }
647 case Intrinsic::masked_scatter: {
648 if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
649 scalarizeMaskedScatter(CI);
650 ModifiedDT = true;
651 return true;
652 }
653 return false;
654 }
655 }
656 }
657
658 return false;
659 }
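To illustrate the constant-mask fast path of scalarizeMaskedLoad in the new pass above, here is a rough, hand-written sketch (value names are illustrative, not the names the pass actually emits) of how a <4 x i32> masked load with the constant mask <1,0,1,0> is expanded without introducing any control flow:

  %r = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %p, i32 4,
           <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %passthru)

becomes roughly:

  %base = bitcast <4 x i32>* %p to i32*
  %g0 = getelementptr inbounds i32, i32* %base, i32 0
  %l0 = load i32, i32* %g0, align 4
  %v0 = insertelement <4 x i32> undef, i32 %l0, i32 0
  %g2 = getelementptr inbounds i32, i32* %base, i32 2
  %l2 = load i32, i32* %g2, align 4
  %v2 = insertelement <4 x i32> %v0, i32 %l2, i32 2
  %r = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v2, <4 x i32> %passthru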
487487 // Insert calls to mcount-like functions.
488488 addPass(createCountingFunctionInserterPass());
489489
490 // Add scalarization of the target's unsupported masked memory intrinsics pass.
491 // The unsupported intrinsics will be replaced with a chain of basic blocks
492 // that store/load the elements one-by-one if the appropriate mask bit is set.
493 addPass(createScalarizeMaskedMemIntrinPass());
494
490495 // Expand reduction intrinsics into shuffle sequences if the target wants to.
491496 addPass(createExpandReductionsPass());
492497 }
2222 ; CHECK-NEXT: Shadow Stack GC Lowering
2323 ; CHECK-NEXT: Remove unreachable blocks from the CFG
2424 ; CHECK-NEXT: Inserts calls to mcount-like functions
25 ; CHECK-NEXT: Scalarize Masked Memory Intrinsics
2526 ; CHECK-NEXT: Expand reduction intrinsics
2627 ; CHECK-NEXT: Rewrite Symbols
2728 ; CHECK-NEXT: FunctionPass Manager
22 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
33 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX
44 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
5 ; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
5 ; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
66 ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
77
88 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
0 ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,+sse2 < %s -o /dev/null
1 ; pr33001 - Check that llc doesn't crash when running with O0 option.
2
3 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
4 target triple = "x86_64-unknown-linux-gnu"
5
6 define <4 x i32> @test_masked_load(<4 x i32>* %base, <4 x i1> %mask) {
7 %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %base, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
8 ret <4 x i32> %res
9 }
10
11 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
12
13
14 define void @test_masked_store(<4 x i32>* %base, <4 x i32> %value, <4 x i1> %mask) {
15 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %base, i32 4, <4 x i1> %mask)
16 ret void
17 }
18
19 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
20
21
22 define <4 x i32> @llvm_masked_gather(<4 x i32*> %ptrs, <4 x i1> %mask) {
23 %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> undef)
24 ret <4 x i32> %res
25 }
26
27 declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
28
29
30 define void @llvm_masked_scatter(<4 x i32*> %ptrs, <4 x i32> %value, <4 x i1> %mask) {
31 call void @llvm.masked.scatter.v4i32(<4 x i32> %value, <4 x i32*> %ptrs, i32 4, <4 x i1> %mask)
32 ret void
33 }
34
35 declare void @llvm.masked.scatter.v4i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
36
300300 initializeConstantHoistingLegacyPassPass(*Registry);
301301 initializeScalarOpts(*Registry);
302302 initializeVectorization(*Registry);
303 initializeScalarizeMaskedMemIntrinPass(*Registry);
303304 initializeExpandReductionsPass(*Registry);
304305
305306 // Register the target printer for --version.
384384 initializeTarget(Registry);
385385 // For codegen passes, only passes that do IR to IR transformation are
386386 // supported.
387 initializeScalarizeMaskedMemIntrinPass(Registry);
387388 initializeCodeGenPreparePass(Registry);
388389 initializeAtomicExpandPass(Registry);
389390 initializeRewriteSymbolsLegacyPassPass(Registry);