llvm.org GIT mirror: llvm / commit cee0476

Revert rL317618

The implemented pass fails and is breaking a large number of unit tests. Example:
http://lab.llvm.org:8011/builders/clang-with-lto-ubuntu/builds/5777/steps/build-stage3-compiler/logs/stdio

This reverts commit rL317618.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317641 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Mitch Phillips

3 changed files with 288 additions and 455 deletions.
332332 case Instruction::Sub:
333333 return Instruction::Add;
334334 default:
335 return Op;
335 return 0;
336336 }
337337 }
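
A minimal standalone sketch of the behaviour this hunk restores: Add/Sub and FAdd/FSub are alternates of each other, and every other opcode now maps to 0 ("no alternate"), which sameOpcodeOrAlt() below can never match. The *Sketch names are illustrative and not part of the patch.

#include "llvm/IR/Instruction.h"
#include <cassert>
using namespace llvm;

// Mirror of the restored getAltOpcode() shown above (sketch only).
static unsigned getAltOpcodeSketch(unsigned Op) {
  switch (Op) {
  case Instruction::FAdd: return Instruction::FSub;
  case Instruction::FSub: return Instruction::FAdd;
  case Instruction::Add:  return Instruction::Sub;
  case Instruction::Sub:  return Instruction::Add;
  default:                return 0; // no alternate opcode
  }
}

static void altOpcodeSketchDemo() {
  assert(getAltOpcodeSketch(Instruction::Add) == Instruction::Sub);
  // After the revert, Mul has no alternate instead of being its own alternate.
  assert(getAltOpcodeSketch(Instruction::Mul) == 0);
}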
338338
343343 static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
344344 unsigned CheckedOpcode) {
345345 return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
346 }
347
348 /// Checks if the \p Opcode can be considered as an operand of a (possibly)
349 /// binary operation \p I.
350 /// \returns The code of the binary operation of instruction \p I if the
351 /// instruction with \p Opcode can be considered as an operand of \p I with the
352 /// default value.
353 static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
354 assert(!sameOpcodeOrAlt(Opcode, getAltOpcode(Opcode), I->getOpcode())
355 && "Invalid Opcode");
356 if (Opcode != Instruction::PHI && isa(I) &&
357 (I->getType()->isIntegerTy() || cast(I)->isFast()))
358 return I->getOpcode();
359 return 0;
360346 }
361347
362348 /// Chooses the correct key for scheduling data. If \p Op has the same (or
380366 struct RawInstructionsData {
381367 /// Main Opcode of the instructions going to be vectorized.
382368 unsigned Opcode = 0;
383 /// Position of the first instruction with the \a Opcode.
384 unsigned OpcodePos = 0;
385 /// Need an additional analysis (if at least one of the instruction is not
386 /// same instruction kind as an instruction at OpcodePos position in the
387 /// list).
388 bool NeedAnalysis = false;
369
389370 /// The list of instructions have some instructions with alternate opcodes.
390371 bool HasAltOpcodes = false;
391372 };
400381 return {};
401382 RawInstructionsData Res;
402383 unsigned Opcode = I0->getOpcode();
403 unsigned AltOpcode = getAltOpcode(Opcode);
404 unsigned NewOpcodePos = 0;
405384 // Walk through the list of the vectorized instructions
406385 // in order to check its structure described by RawInstructionsData.
407386 for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
408387 auto *I = dyn_cast<Instruction>(VL[Cnt]);
409388 if (!I)
410389 return {};
411 if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
412 if (Opcode != I->getOpcode()) {
413 Res.HasAltOpcodes = true;
414 if (Res.NeedAnalysis && isOdd(NewOpcodePos))
415 std::swap(Opcode, AltOpcode);
416 }
417 continue;
418 }
419 if (unsigned NewOpcode = tryToRepresentAsInstArg(Opcode, I)) {
420 if (!Instruction::isBinaryOp(Opcode) ||
421 !Instruction::isCommutative(Opcode)) {
422 NewOpcodePos = Cnt;
423 Opcode = NewOpcode;
424 AltOpcode = getAltOpcode(Opcode);
425 Res.NeedAnalysis = true;
426 }
427 } else if (tryToRepresentAsInstArg(I->getOpcode(),
428 cast<Instruction>(VL[NewOpcodePos])))
429 Res.NeedAnalysis = true;
430 else
431 return {};
390 if (Opcode != I->getOpcode())
391 Res.HasAltOpcodes = true;
432392 }
433393 Res.Opcode = Opcode;
434 Res.OpcodePos = NewOpcodePos;
435394 return Res;
436395 }
437396
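
In the restored code, getMainOpcode() above records only the leading opcode and a HasAltOpcodes flag; getSameOpcode() below then enforces a strict even/odd alternation between that opcode and its alternate. A small sketch of which bundles survive that check, written against a flat list of lane opcodes; the helper name and shape are invented for illustration.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Sketch of the restored acceptance rule:
//   {add, add, add, add} -> accepted as a plain Add bundle
//   {add, sub, add, sub} -> accepted as an alternating (ShuffleVector) bundle
//   {add, mul, add, sub} -> rejected: Mul is neither Add nor its alternate Sub
static bool isAcceptedBundleSketch(ArrayRef<unsigned> LaneOpcodes) {
  unsigned Opcode = LaneOpcodes.front();
  unsigned AltOpcode = 0;
  switch (Opcode) {
  case Instruction::Add:  AltOpcode = Instruction::Sub;  break;
  case Instruction::Sub:  AltOpcode = Instruction::Add;  break;
  case Instruction::FAdd: AltOpcode = Instruction::FSub; break;
  case Instruction::FSub: AltOpcode = Instruction::FAdd; break;
  }
  bool HasAltOpcodes = false;
  for (unsigned Op : LaneOpcodes)
    HasAltOpcodes |= (Op != Opcode);
  for (size_t Lane = 0, E = LaneOpcodes.size(); Lane != E; ++Lane) {
    unsigned Expected = (HasAltOpcodes && (Lane & 1)) ? AltOpcode : Opcode;
    if (LaneOpcodes[Lane] != Expected)
      return false;
  }
  return true;
}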
461420 static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
462421 auto Res = getMainOpcode(VL);
463422 unsigned Opcode = Res.Opcode;
464 if (!Res.NeedAnalysis && !Res.HasAltOpcodes)
465 return InstructionsState(VL[Res.OpcodePos], Opcode, false);
466 auto *OpInst = cast<Instruction>(VL[Res.OpcodePos]);
423 if (!Res.HasAltOpcodes)
424 return InstructionsState(VL[0], Opcode, false);
425 auto *OpInst = cast<Instruction>(VL[0]);
467426 unsigned AltOpcode = getAltOpcode(Opcode);
468427 // Examine each element in the list instructions VL to determine
469428 // if some operations there could be considered as an alternative
470 // (for example as subtraction relates to addition operation) or
471 // operation could be an operand of a (possibly) binary operation.
429 // (for example as subtraction relates to addition operation).
472430 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
473431 auto *I = cast<Instruction>(VL[Cnt]);
474432 unsigned InstOpcode = I->getOpcode();
475 if (Res.NeedAnalysis && !sameOpcodeOrAlt(Opcode, AltOpcode, InstOpcode))
476 if (tryToRepresentAsInstArg(InstOpcode, OpInst))
477 InstOpcode = (Res.HasAltOpcodes && isOdd(Cnt)) ? AltOpcode : Opcode;
478433 if ((Res.HasAltOpcodes &&
479434 InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
480435 (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
627582 void deleteTree() {
628583 VectorizableTree.clear();
629584 ScalarToTreeEntry.clear();
630 ExtraScalarToTreeEntry.clear();
631585 MustGather.clear();
632586 ExternalUses.clear();
633587 NumLoadsWantToKeepOrder = 0;
767721 /// The TreeEntry index containing the user of this entry. We can actually
768722 /// have multiple users so the data structure is not truly a tree.
769723 SmallVector<int, 1> UserTreeIndices;
770
771 /// Info about instruction in this tree entry.
772 InstructionsState State;
773724 };
774725
775726 /// Create a new VectorizableTree entry.
776727 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
777 int &UserTreeIdx, const InstructionsState &S) {
778 assert((!Vectorized || S.Opcode != 0) &&
779 "Vectorized TreeEntry without opcode");
728 int &UserTreeIdx) {
780729 VectorizableTree.emplace_back(VectorizableTree);
781730 int idx = VectorizableTree.size() - 1;
782731 TreeEntry *Last = &VectorizableTree[idx];
783732 Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
784733 Last->NeedToGather = !Vectorized;
785734 if (Vectorized) {
786 Last->State = S;
787 unsigned AltOpcode = getAltOpcode(S.Opcode);
788735 for (int i = 0, e = VL.size(); i != e; ++i) {
789 unsigned RealOpcode =
790 (S.IsAltShuffle && isOdd(i)) ? AltOpcode : S.Opcode;
791 Value *Key = (cast<Instruction>(VL[i])->getOpcode() == RealOpcode)
792 ? VL[i]
793 : S.OpValue;
794 assert(!getTreeEntry(VL[i], Key) && "Scalar already in tree!");
795 if (VL[i] == Key)
796 ScalarToTreeEntry[Key] = idx;
797 else
798 ExtraScalarToTreeEntry[VL[i]][Key] = idx;
736 assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
737 ScalarToTreeEntry[VL[i]] = idx;
799738 }
800739 } else {
801 Last->State.Opcode = 0;
802 Last->State.OpValue = VL[0];
803 Last->State.IsAltShuffle = false;
804740 MustGather.insert(VL.begin(), VL.end());
805741 }
806742
828764 return nullptr;
829765 }
830766
831 TreeEntry *getTreeEntry(Value *V, Value *OpValue) {
832 if (V == OpValue)
833 return getTreeEntry(V);
834 auto I = ExtraScalarToTreeEntry.find(V);
835 if (I != ExtraScalarToTreeEntry.end()) {
836 auto &STT = I->second;
837 auto STTI = STT.find(OpValue);
838 if (STTI != STT.end())
839 return &VectorizableTree[STTI->second];
840 }
841 return nullptr;
842 }
843
844767 /// Maps a specific scalar to its tree entry.
845 SmallDenseMap<Value *, int> ScalarToTreeEntry;
846
847 /// Maps a specific scalar to its tree entry(s) with leading scalar.
848 SmallDenseMap<Value *, SmallDenseMap<Value *, int>> ExtraScalarToTreeEntry;
768 SmallDenseMap<Value *, int> ScalarToTreeEntry;
849769
850770 /// A list of scalars that we found that we need to keep as scalars.
851771 ValueSet MustGather;
14171337 continue;
14181338
14191339 // For each lane:
1420 const unsigned Opcode = Entry->State.Opcode;
1421 const unsigned AltOpcode = getAltOpcode(Opcode);
14221340 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
14231341 Value *Scalar = Entry->Scalars[Lane];
1424
1425 if (!sameOpcodeOrAlt(Opcode, AltOpcode,
1426 cast<Instruction>(Scalar)->getOpcode()))
1427 continue;
14281342
14291343 // Check if the scalar is externally used as an extra arg.
14301344 auto ExtI = ExternallyUsedValues.find(Scalar);
14681382 }
14691383 }
14701384
1471 static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) {
1472 switch(Opcode) {
1473 case Instruction::Add:
1474 case Instruction::Sub:
1475 case Instruction::Or:
1476 case Instruction::Xor:
1477 return ConstantInt::getNullValue(Ty);
1478 case Instruction::Mul:
1479 case Instruction::UDiv:
1480 case Instruction::SDiv:
1481 case Instruction::URem:
1482 case Instruction::SRem:
1483 return ConstantInt::get(Ty, /*V=*/1);
1484 case Instruction::FAdd:
1485 case Instruction::FSub:
1486 return ConstantFP::get(Ty, /*V=*/0.0);
1487 case Instruction::FMul:
1488 case Instruction::FDiv:
1489 case Instruction::FRem:
1490 return ConstantFP::get(Ty, /*V=*/1.0);
1491 case Instruction::And:
1492 return ConstantInt::getAllOnesValue(Ty);
1493 case Instruction::Shl:
1494 case Instruction::LShr:
1495 case Instruction::AShr:
1496 return ConstantInt::getNullValue(Type::getInt32Ty(Ty->getContext()));
1497 default:
1498 break;
1499 }
1500 llvm_unreachable("unknown binop for default constant value");
1501 }
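
The helper deleted above pairs each binary opcode with a fill-in constant (zero for add-like operations, one for mul/div-like ones, all-ones for 'and'), so that a lane lacking the main opcode could be modelled as "X op C" without changing its value in the add/mul/and cases. A minimal sketch of those three constant shapes using the standard Constant API; the demo function is illustrative only.

#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Sketch: the three integer constant shapes the deleted helper handed back.
static void defaultConstantSketch(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  Constant *Zero = Constant::getNullValue(I32);        // Add, Sub, Or, Xor, shifts
  Constant *One = ConstantInt::get(I32, 1);            // Mul, UDiv, SDiv, URem, SRem
  Constant *AllOnes = Constant::getAllOnesValue(I32);  // And
  (void)Zero;
  (void)One;
  (void)AllOnes;
}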
1502
15031385 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
15041386 int UserTreeIdx) {
15051387 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
15071389 InstructionsState S = getSameOpcode(VL);
15081390 if (Depth == RecursionMaxDepth) {
15091391 DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
1510 newTreeEntry(VL, false, UserTreeIdx, S);
1392 newTreeEntry(VL, false, UserTreeIdx);
15111393 return;
15121394 }
15131395
15141396 // Don't handle vectors.
15151397 if (S.OpValue->getType()->isVectorTy()) {
15161398 DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
1517 newTreeEntry(VL, false, UserTreeIdx, S);
1399 newTreeEntry(VL, false, UserTreeIdx);
15181400 return;
15191401 }
15201402
15211403 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
15221404 if (SI->getValueOperand()->getType()->isVectorTy()) {
15231405 DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
1524 newTreeEntry(VL, false, UserTreeIdx, S);
1406 newTreeEntry(VL, false, UserTreeIdx);
15251407 return;
15261408 }
15271409
15281410 // If all of the operands are identical or constant we have a simple solution.
15291411 if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
15301412 DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
1531 newTreeEntry(VL, false, UserTreeIdx, S);
1413 newTreeEntry(VL, false, UserTreeIdx);
15321414 return;
1533 }
1534
1535 // Avoid any vectors that are wider than two elements and
1536 // with real operations less than or equal to half of vector
1537 // to others members are operands to that operations.
1538 unsigned AltOpcode = getAltOpcode(S.Opcode);
1539 unsigned SameOrAlt = 0;
1540 if (VL.size() > 2) {
1541 for (Value *V : VL) {
1542 auto *Instr = cast<Instruction>(V);
1543 if (sameOpcodeOrAlt(S.Opcode, AltOpcode, Instr->getOpcode()))
1544 SameOrAlt++;
1545 }
1546 if (SameOrAlt <= (VL.size() / 2))
1547 return;
15481415 }
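
The block removed above was the reverted patch's extra bail-out: for bundles wider than two lanes it gave up unless more than half of the lanes actually carry the main or alternate opcode, since the remaining lanes would have to be modelled as operands. A standalone sketch of that counting rule with invented names and no LLVM dependencies.

#include <cstddef>
#include <vector>

// Sketch: true if a bundle has enough "real" lanes under the removed rule,
// i.e. strictly more than half of the lanes use the main or alternate opcode.
static bool enoughRealLanesSketch(const std::vector<unsigned> &LaneOpcodes,
                                  unsigned Opcode, unsigned AltOpcode) {
  if (LaneOpcodes.size() <= 2)
    return true; // the check only applied to bundles wider than two lanes
  std::size_t SameOrAlt = 0;
  for (unsigned Op : LaneOpcodes)
    if (Op == Opcode || Op == AltOpcode)
      ++SameOrAlt;
  return SameOrAlt > LaneOpcodes.size() / 2;
}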
15491416
15501417 // We now know that this is a vector of instructions of the same type from
15551422 if (EphValues.count(VL[i])) {
15561423 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
15571424 ") is ephemeral.\n");
1558 newTreeEntry(VL, false, UserTreeIdx, S);
1425 newTreeEntry(VL, false, UserTreeIdx);
15591426 return;
15601427 }
15611428 }
15661433 DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
15671434 if (E->Scalars[i] != VL[i]) {
15681435 DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
1569 newTreeEntry(VL, false, UserTreeIdx, S);
1436 newTreeEntry(VL, false, UserTreeIdx);
15701437 return;
15711438 }
15721439 }
15851452 if (getTreeEntry(I)) {
15861453 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
15871454 ") is already in tree.\n");
1588 newTreeEntry(VL, false, UserTreeIdx, S);
1455 newTreeEntry(VL, false, UserTreeIdx);
15891456 return;
15901457 }
15911458 }
15951462 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
15961463 if (MustGather.count(VL[i])) {
15971464 DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
1598 newTreeEntry(VL, false, UserTreeIdx, S);
1465 newTreeEntry(VL, false, UserTreeIdx);
15991466 return;
16001467 }
16011468 }
16091476 // Don't go into unreachable blocks. They may contain instructions with
16101477 // dependency cycles which confuse the final scheduling.
16111478 DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1612 newTreeEntry(VL, false, UserTreeIdx, S);
1479 newTreeEntry(VL, false, UserTreeIdx);
16131480 return;
16141481 }
16151482
16181485 for (unsigned j = i + 1; j < e; ++j)
16191486 if (VL[i] == VL[j]) {
16201487 DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1621 newTreeEntry(VL, false, UserTreeIdx, S);
1488 newTreeEntry(VL, false, UserTreeIdx);
16221489 return;
16231490 }
16241491
16331500 assert((!BS.getScheduleData(VL0) ||
16341501 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
16351502 "tryScheduleBundle should cancelScheduling on failure");
1636 newTreeEntry(VL, false, UserTreeIdx, S);
1503 newTreeEntry(VL, false, UserTreeIdx);
16371504 return;
16381505 }
16391506 DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
16521519 if (Term) {
16531520 DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
16541521 BS.cancelScheduling(VL, VL0);
1655 newTreeEntry(VL, false, UserTreeIdx, S);
1522 newTreeEntry(VL, false, UserTreeIdx);
16561523 return;
16571524 }
16581525 }
16591526
1660 newTreeEntry(VL, true, UserTreeIdx, S);
1527 newTreeEntry(VL, true, UserTreeIdx);
16611528 DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
16621529
16631530 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
16791546 } else {
16801547 BS.cancelScheduling(VL, VL0);
16811548 }
1682 newTreeEntry(VL, Reuse, UserTreeIdx, S);
1549 newTreeEntry(VL, Reuse, UserTreeIdx);
16831550 return;
16841551 }
16851552 case Instruction::Load: {
16941561 if (DL->getTypeSizeInBits(ScalarTy) !=
16951562 DL->getTypeAllocSizeInBits(ScalarTy)) {
16961563 BS.cancelScheduling(VL, VL0);
1697 newTreeEntry(VL, false, UserTreeIdx, S);
1564 newTreeEntry(VL, false, UserTreeIdx);
16981565 DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
16991566 return;
17001567 }
17051572 LoadInst *L = cast<LoadInst>(VL[i]);
17061573 if (!L->isSimple()) {
17071574 BS.cancelScheduling(VL, VL0);
1708 newTreeEntry(VL, false, UserTreeIdx, S);
1575 newTreeEntry(VL, false, UserTreeIdx);
17091576 DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
17101577 return;
17111578 }
17271594
17281595 if (Consecutive) {
17291596 ++NumLoadsWantToKeepOrder;
1730 newTreeEntry(VL, true, UserTreeIdx, S);
1597 newTreeEntry(VL, true, UserTreeIdx);
17311598 DEBUG(dbgs() << "SLP: added a vector of loads.\n");
17321599 return;
17331600 }
17421609 }
17431610
17441611 BS.cancelScheduling(VL, VL0);
1745 newTreeEntry(VL, false, UserTreeIdx, S);
1612 newTreeEntry(VL, false, UserTreeIdx);
17461613
17471614 if (ReverseConsecutive) {
17481615 ++NumLoadsWantToChangeOrder;
17691636 Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
17701637 if (Ty != SrcTy || !isValidElementType(Ty)) {
17711638 BS.cancelScheduling(VL, VL0);
1772 newTreeEntry(VL, false, UserTreeIdx, S);
1639 newTreeEntry(VL, false, UserTreeIdx);
17731640 DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
17741641 return;
17751642 }
17761643 }
1777 newTreeEntry(VL, true, UserTreeIdx, S);
1644 newTreeEntry(VL, true, UserTreeIdx);
17781645 DEBUG(dbgs() << "SLP: added a vector of casts.\n");
17791646
17801647 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
17971664 if (Cmp->getPredicate() != P0 ||
17981665 Cmp->getOperand(0)->getType() != ComparedTy) {
17991666 BS.cancelScheduling(VL, VL0);
1800 newTreeEntry(VL, false, UserTreeIdx, S);
1667 newTreeEntry(VL, false, UserTreeIdx);
18011668 DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
18021669 return;
18031670 }
18041671 }
18051672
1806 newTreeEntry(VL, true, UserTreeIdx, S);
1673 newTreeEntry(VL, true, UserTreeIdx);
18071674 DEBUG(dbgs() << "SLP: added a vector of compares.\n");
18081675
18091676 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
18351702 case Instruction::And:
18361703 case Instruction::Or:
18371704 case Instruction::Xor:
1838 newTreeEntry(VL, true, UserTreeIdx, S);
1705 newTreeEntry(VL, true, UserTreeIdx);
18391706 DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
18401707
18411708 // Sort operands of the instructions so that each side is more likely to
18511718 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
18521719 ValueList Operands;
18531720 // Prepare the operand vector.
1854 for (Value *VecOp : VL) {
1855 auto *I = cast<Instruction>(VecOp);
1856 if (I->getOpcode() == S.Opcode) {
1857 Operands.push_back(I->getOperand(i));
1858 continue;
1859 }
1860 assert(Instruction::isBinaryOp(S.Opcode) &&
1861 "Expected a binary operation.");
1862 Value *Operand = isOdd(i)
1863 ? getDefaultConstantForOpcode(S.Opcode, I->getType())
1864 : VecOp;
1865 Operands.push_back(Operand);
1866 }
1867 if (allSameType(Operands))
1868 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1721 for (Value *j : VL)
1722 Operands.push_back(cast<Instruction>(j)->getOperand(i));
1723
1724 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
18691725 }
18701726 return;
18711727
18751731 if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
18761732 DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
18771733 BS.cancelScheduling(VL, VL0);
1878 newTreeEntry(VL, false, UserTreeIdx, S);
1734 newTreeEntry(VL, false, UserTreeIdx);
18791735 return;
18801736 }
18811737 }
18881744 if (Ty0 != CurTy) {
18891745 DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
18901746 BS.cancelScheduling(VL, VL0);
1891 newTreeEntry(VL, false, UserTreeIdx, S);
1747 newTreeEntry(VL, false, UserTreeIdx);
18921748 return;
18931749 }
18941750 }
19001756 DEBUG(
19011757 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
19021758 BS.cancelScheduling(VL, VL0);
1903 newTreeEntry(VL, false, UserTreeIdx, S);
1759 newTreeEntry(VL, false, UserTreeIdx);
19041760 return;
19051761 }
19061762 }
19071763
1908 newTreeEntry(VL, true, UserTreeIdx, S);
1764 newTreeEntry(VL, true, UserTreeIdx);
19091765 DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
19101766 for (unsigned i = 0, e = 2; i < e; ++i) {
19111767 ValueList Operands;
19221778 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
19231779 if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
19241780 BS.cancelScheduling(VL, VL0);
1925 newTreeEntry(VL, false, UserTreeIdx, S);
1781 newTreeEntry(VL, false, UserTreeIdx);
19261782 DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
19271783 return;
19281784 }
19291785
1930 newTreeEntry(VL, true, UserTreeIdx, S);
1786 newTreeEntry(VL, true, UserTreeIdx);
19311787 DEBUG(dbgs() << "SLP: added a vector of stores.\n");
19321788
19331789 ValueList Operands;
19451801 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
19461802 if (!isTriviallyVectorizable(ID)) {
19471803 BS.cancelScheduling(VL, VL0);
1948 newTreeEntry(VL, false, UserTreeIdx, S);
1804 newTreeEntry(VL, false, UserTreeIdx);
19491805 DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
19501806 return;
19511807 }
19591815 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
19601816 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
19611817 BS.cancelScheduling(VL, VL0);
1962 newTreeEntry(VL, false, UserTreeIdx, S);
1818 newTreeEntry(VL, false, UserTreeIdx);
19631819 DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
19641820 << "\n");
19651821 return;
19701826 Value *A1J = CI2->getArgOperand(1);
19711827 if (A1I != A1J) {
19721828 BS.cancelScheduling(VL, VL0);
1973 newTreeEntry(VL, false, UserTreeIdx, S);
1829 newTreeEntry(VL, false, UserTreeIdx);
19741830 DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
19751831 << " argument "<< A1I<<"!=" << A1J
19761832 << "\n");
19831839 CI->op_begin() + CI->getBundleOperandsEndIndex(),
19841840 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
19851841 BS.cancelScheduling(VL, VL0);
1986 newTreeEntry(VL, false, UserTreeIdx, S);
1842 newTreeEntry(VL, false, UserTreeIdx);
19871843 DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
19881844 << *VL[i] << '\n');
19891845 return;
19901846 }
19911847 }
19921848
1993 newTreeEntry(VL, true, UserTreeIdx, S);
1849 newTreeEntry(VL, true, UserTreeIdx);
19941850 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
19951851 ValueList Operands;
19961852 // Prepare the operand vector.
20071863 // then do not vectorize this instruction.
20081864 if (!S.IsAltShuffle) {
20091865 BS.cancelScheduling(VL, VL0);
2010 newTreeEntry(VL, false, UserTreeIdx, S);
1866 newTreeEntry(VL, false, UserTreeIdx);
20111867 DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
20121868 return;
20131869 }
2014 newTreeEntry(VL, true, UserTreeIdx, S);
1870 newTreeEntry(VL, true, UserTreeIdx);
20151871 DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
20161872
20171873 // Reorder operands if reordering would enable vectorization.
20261882 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
20271883 ValueList Operands;
20281884 // Prepare the operand vector.
2029 for (Value *VecOp : VL) {
2030 auto *I = cast<Instruction>(VecOp);
2031 if (sameOpcodeOrAlt(S.Opcode, AltOpcode, I->getOpcode())) {
2032 Operands.push_back(I->getOperand(i));
2033 continue;
2034 }
2035 assert(Instruction::isBinaryOp(S.Opcode) &&
2036 "Expected a binary operation.");
2037 Value *Operand = isOdd(i)
2038 ? getDefaultConstantForOpcode(S.Opcode, I->getType())
2039 : VecOp;
2040 Operands.push_back(Operand);
2041 }
1885 for (Value *j : VL)
1886 Operands.push_back(cast<Instruction>(j)->getOperand(i));
20421887
20431888 buildTree_rec(Operands, Depth + 1, UserTreeIdx);
20441889 }
20461891
20471892 default:
20481893 BS.cancelScheduling(VL, VL0);
2049 newTreeEntry(VL, false, UserTreeIdx, S);
1894 newTreeEntry(VL, false, UserTreeIdx);
20501895 DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
20511896 return;
20521897 }
21672012 }
21682013 return getGatherCost(E->Scalars);
21692014 }
2170 assert(E->State.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
2171 auto *VL0 = cast<Instruction>(E->State.OpValue);
2172 unsigned ShuffleOrOp = E->State.IsAltShuffle ?
2173 (unsigned) Instruction::ShuffleVector : E->State.Opcode;
2015 InstructionsState S = getSameOpcode(VL);
2016 assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
2017 Instruction *VL0 = cast<Instruction>(S.OpValue);
2018 unsigned ShuffleOrOp = S.IsAltShuffle ?
2019 (unsigned) Instruction::ShuffleVector : S.Opcode;
21742020 switch (ShuffleOrOp) {
21752021 case Instruction::PHI:
21762022 return 0;
21772023
21782024 case Instruction::ExtractValue:
21792025 case Instruction::ExtractElement:
2180 if (canReuseExtract(VL, E->State.OpValue)) {
2026 if (canReuseExtract(VL, S.OpValue)) {
21812027 int DeadCost = 0;
21822028 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
21832029 Instruction *E = cast<Instruction>(VL[i]);
22212067 // Calculate the cost of this instruction.
22222068 VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
22232069 int ScalarCost = VecTy->getNumElements() *
2224 TTI->getCmpSelInstrCost(ShuffleOrOp, ScalarTy, Builder.getInt1Ty(), VL0);
2225 int VecCost = TTI->getCmpSelInstrCost(ShuffleOrOp, VecTy, MaskTy, VL0);
2070 TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
2071 int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
22262072 return VecCost - ScalarCost;
22272073 }
22282074 case Instruction::Add:
22482094 TargetTransformInfo::OperandValueKind Op1VK =
22492095 TargetTransformInfo::OK_AnyValue;
22502096 TargetTransformInfo::OperandValueKind Op2VK =
2251 TargetTransformInfo::OK_AnyValue;
2097 TargetTransformInfo::OK_UniformConstantValue;
22522098 TargetTransformInfo::OperandValueProperties Op1VP =
22532099 TargetTransformInfo::OP_None;
22542100 TargetTransformInfo::OperandValueProperties Op2VP =
22592105 // If instead not all operands are constants, then set the operand kind
22602106 // to OK_AnyValue. If all operands are constants but not the same,
22612107 // then set the operand kind to OK_NonUniformConstantValue.
2262 if (auto *CInt = dyn_cast<ConstantInt>(VL0->getOperand(1))) {
2263 Op2VK = TargetTransformInfo::OK_UniformConstantValue;
2264 const unsigned Opcode = E->State.Opcode;
2265 for (auto *V : VL) {
2266 auto *I = cast<Instruction>(V);
2267 if (I == VL0 || Opcode != I->getOpcode())
2268 continue;
2269 if (!isa<ConstantInt>(I->getOperand(1))) {
2270 Op2VK = TargetTransformInfo::OK_AnyValue;
2271 break;
2272 }
2273 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
2274 CInt != cast<ConstantInt>(I->getOperand(1)))
2275 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
2108 ConstantInt *CInt = nullptr;
2109 for (unsigned i = 0; i < VL.size(); ++i) {
2110 const Instruction *I = cast<Instruction>(VL[i]);
2111 if (!isa<ConstantInt>(I->getOperand(1))) {
2112 Op2VK = TargetTransformInfo::OK_AnyValue;
2113 break;
22762114 }
2277 // FIXME: Currently cost of model modification for division by power of
2278 // 2 is handled for X86 and AArch64. Add support for other targets.
2115 if (i == 0) {
2116 CInt = cast<ConstantInt>(I->getOperand(1));
2117 continue;
2118 }
22792119 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
2280 CInt->getValue().isPowerOf2())
2281 Op2VP = TargetTransformInfo::OP_PowerOf2;
2282 }
2283
2284 int ScalarCost = VecTy->getNumElements() *
2285 TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy,
2286 Op1VK, Op2VK, Op1VP, Op2VP);
2287 int VecCost = TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK,
2288 Op2VK, Op1VP, Op2VP);
2120 CInt != cast<ConstantInt>(I->getOperand(1)))
2121 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
2122 }
2123 // FIXME: Currently cost of model modification for division by power of
2124 // 2 is handled for X86 and AArch64. Add support for other targets.
2125 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
2126 CInt->getValue().isPowerOf2())
2127 Op2VP = TargetTransformInfo::OP_PowerOf2;
2128
2129 SmallVector<Value *, 4> Operands(VL0->operand_values());
2130 int ScalarCost =
2131 VecTy->getNumElements() *
2132 TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
2133 Op2VP, Operands);
2134 int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
2135 Op1VP, Op2VP, Operands);
22892136 return VecCost - ScalarCost;
22902137 }
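
Both sides of this hunk classify the vectorized operation's second operand across the lanes before querying TTI for the arithmetic cost: any non-constant lane forces OK_AnyValue, differing ConstantInts give OK_NonUniformConstantValue, and a uniform power-of-two constant additionally sets OP_PowerOf2. A condensed sketch of that classification, assuming the per-lane second operands have already been collected; the function name is illustrative.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Value.h"
#include <utility>
using namespace llvm;

// Sketch: derive (Op2VK, Op2VP) from the second operand of every lane.
static std::pair<TargetTransformInfo::OperandValueKind,
                 TargetTransformInfo::OperandValueProperties>
classifySecondOperandSketch(ArrayRef<Value *> SecondOps) {
  auto VK = TargetTransformInfo::OK_UniformConstantValue;
  auto VP = TargetTransformInfo::OP_None;
  ConstantInt *FirstCI = nullptr;
  for (Value *Op : SecondOps) {
    auto *CI = dyn_cast<ConstantInt>(Op);
    if (!CI) {
      VK = TargetTransformInfo::OK_AnyValue; // one non-constant lane is enough
      break;
    }
    if (!FirstCI)
      FirstCI = CI;
    else if (CI != FirstCI)
      VK = TargetTransformInfo::OK_NonUniformConstantValue;
  }
  if (VK == TargetTransformInfo::OK_UniformConstantValue && FirstCI &&
      FirstCI->getValue().isPowerOf2())
    VP = TargetTransformInfo::OP_PowerOf2;
  return {VK, VP};
}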
22912138 case Instruction::GetElementPtr: {
23512198 TargetTransformInfo::OK_AnyValue;
23522199 TargetTransformInfo::OperandValueKind Op2VK =
23532200 TargetTransformInfo::OK_AnyValue;
2354 unsigned AltOpcode = getAltOpcode(E->State.Opcode);
2355 int ScalarCost =
2356 TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy, Op1VK, Op2VK) *
2357 VL.size() / 2;
2358 ScalarCost +=
2359 TTI->getArithmeticInstrCost(AltOpcode, ScalarTy, Op1VK, Op2VK) *
2360 VL.size() / 2;
2201 int ScalarCost = 0;
2202 int VecCost = 0;
2203 for (Value *i : VL) {
2204 Instruction *I = cast<Instruction>(i);
2205 if (!I)
2206 break;
2207 ScalarCost +=
2208 TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
2209 }
23612210 // VecCost is equal to sum of the cost of creating 2 vectors
23622211 // and the cost of creating shuffle.
2363 int VecCost =
2364 TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK, Op2VK);
2365 VecCost += TTI->getArithmeticInstrCost(AltOpcode, VecTy, Op1VK, Op2VK);
2212 Instruction *I0 = cast<Instruction>(VL[0]);
2213 VecCost =
2214 TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
2215 Instruction *I1 = cast<Instruction>(VL[1]);
2216 VecCost +=
2217 TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
23662218 VecCost +=
23672219 TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
23682220 return VecCost - ScalarCost;
24282280 Instruction *PrevInst = nullptr;
24292281
24302282 for (const auto &N : VectorizableTree) {
2431 Instruction *Inst = dyn_cast<Instruction>(N.State.OpValue);
2283 Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
24322284 if (!Inst)
24332285 continue;
24342286
24882340 for (TreeEntry &TE : VectorizableTree) {
24892341 int C = getEntryCost(&TE);
24902342 DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
2491 << *TE.State.OpValue << ".\n");
2343 << *TE.Scalars[0] << ".\n");
24922344 Cost += C;
24932345 }
24942346
25092361 // extend the extracted value back to the original type. Here, we account
25102362 // for the extract and the added cost of the sign extend if needed.
25112363 auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
2512 auto *ScalarRoot = VectorizableTree[0].State.OpValue;
2364 auto *ScalarRoot = VectorizableTree[0].Scalars[0];
25132365 if (MinBWs.count(ScalarRoot)) {
25142366 auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
25152367 auto Extend =
25722424 SmallVectorImpl<Value *> &Right) {
25732425 // Push left and right operands of binary operation into Left and Right
25742426 unsigned AltOpcode = getAltOpcode(Opcode);
2427 (void)AltOpcode;
25752428 for (Value *V : VL) {
25762429 auto *I = cast<Instruction>(V);
2577 if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
2578 Left.push_back(I->getOperand(0));
2579 Right.push_back(I->getOperand(1));
2580 } else {
2581 Left.push_back(I);
2582 Right.push_back(getDefaultConstantForOpcode(Opcode, I->getType()));
2583 }
2430 assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
2431 "Incorrect instruction in vector");
2432 Left.push_back(I->getOperand(0));
2433 Right.push_back(I->getOperand(1));
25842434 }
25852435
25862436 // Reorder if we have a commutative operation and consecutive access
26292479 int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
26302480 ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
26312481 bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
2632 if (I.getOpcode() == Opcode) {
2633 VLeft = I.getOperand(0);
2634 VRight = I.getOperand(1);
2635 } else {
2636 VLeft = &I;
2637 VRight = getDefaultConstantForOpcode(Opcode, I.getType());
2638 }
2482 VLeft = I.getOperand(0);
2483 VRight = I.getOperand(1);
26392484 // If we have "SplatRight", try to see if commuting is needed to preserve it.
26402485 if (SplatRight) {
26412486 if (VRight == Right[i - 1])
26992544 // Peel the first iteration out of the loop since there's nothing
27002545 // interesting to do anyway and it simplifies the checks in the loop.
27012546 auto *I = cast<Instruction>(VL[0]);
2702 Value *VLeft;
2703 Value *VRight;
2704 if (I->getOpcode() == Opcode) {
2705 VLeft = I->getOperand(0);
2706 VRight = I->getOperand(1);
2707 } else {
2708 VLeft = I;
2709 VRight = getDefaultConstantForOpcode(Opcode, I->getType());
2710 }
2547 Value *VLeft = I->getOperand(0);
2548 Value *VRight = I->getOperand(1);
27112549 if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
27122550 // Favor having instruction to the right. FIXME: why?
27132551 std::swap(VLeft, VRight);
29122750 IRBuilder<>::InsertPointGuard Guard(Builder);
29132751
29142752 if (E->VectorizedValue) {
2915 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->State.OpValue << ".\n");
2753 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
29162754 return E->VectorizedValue;
29172755 }
29182756
2919 Instruction *VL0 = cast(E->State.OpValue);
2757 InstructionsState S = getSameOpcode(E->Scalars);
2758 Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
29202759 Type *ScalarTy = VL0->getType();
29212760 if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
29222761 ScalarTy = SI->getValueOperand()->getType();
29292768 return V;
29302769 }
29312770
2932 unsigned ShuffleOrOp = E->State.IsAltShuffle ?
2933 (unsigned) Instruction::ShuffleVector : E->State.Opcode;
2771 unsigned ShuffleOrOp = S.IsAltShuffle ?
2772 (unsigned) Instruction::ShuffleVector : S.Opcode;
29342773 switch (ShuffleOrOp) {
29352774 case Instruction::PHI: {
29352774 PHINode *PH = dyn_cast<PHINode>(VL0);
30402879
30412880 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
30422881 Value *V;
3043 if (E->State.Opcode == Instruction::FCmp)
2882 if (S.Opcode == Instruction::FCmp)
30442883 V = Builder.CreateFCmp(P0, L, R);
30452884 else
30462885 V = Builder.CreateICmp(P0, L, R);
30922931 case Instruction::Xor: {
30932932 ValueList LHSVL, RHSVL;
30942933 if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
3095 reorderInputsAccordingToOpcode(E->State.Opcode, E->Scalars, LHSVL,
2934 reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
30962935 RHSVL);
30972936 else
30982937 for (Value *V : E->Scalars) {
30992938 auto *I = cast<Instruction>(V);
3100 if (I->getOpcode() == E->State.Opcode) {
3101 LHSVL.push_back(I->getOperand(0));
3102 RHSVL.push_back(I->getOperand(1));
3103 } else {
3104 LHSVL.push_back(V);
3105 RHSVL.push_back(
3106 getDefaultConstantForOpcode(E->State.Opcode, I->getType()));
3107 }
2939 LHSVL.push_back(I->getOperand(0));
2940 RHSVL.push_back(I->getOperand(1));
31082941 }
31092942
31102943 setInsertPointAfterBundle(E->Scalars, VL0);
31162949 return V;
31172950
31182951 Value *V = Builder.CreateBinOp(
3119 static_cast<Instruction::BinaryOps>(E->State.Opcode), LHS, RHS);
2952 static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
31202953 E->VectorizedValue = V;
31212954 propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
31222955 ++NumVectorInstructions;
32663099 }
32673100 case Instruction::ShuffleVector: {
32683101 ValueList LHSVL, RHSVL;
3269 assert(Instruction::isBinaryOp(E->State.Opcode) &&
3102 assert(Instruction::isBinaryOp(S.Opcode) &&
32703103 "Invalid Shuffle Vector Operand");
3271 reorderAltShuffleOperands(E->State.Opcode, E->Scalars, LHSVL, RHSVL);
3104 reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
32723105 setInsertPointAfterBundle(E->Scalars, VL0);
32733106
32743107 Value *LHS = vectorizeTree(LHSVL);
32793112
32803113 // Create a vector of LHS op1 RHS
32813114 Value *V0 = Builder.CreateBinOp(
3282 static_cast<Instruction::BinaryOps>(E->State.Opcode), LHS, RHS);
3283
3284 unsigned AltOpcode = getAltOpcode(E->State.Opcode);
3115 static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
3116
3117 unsigned AltOpcode = getAltOpcode(S.Opcode);
32853118 // Create a vector of LHS op2 RHS
32863119 Value *V1 = Builder.CreateBinOp(
32873120 static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
33033136 }
33043137
33053138 Value *ShuffleMask = ConstantVector::get(Mask);
3306 InstructionsState S = getSameOpcode(EvenScalars);
3307 assert(!S.IsAltShuffle && "Unexpected alternate opcode");
3308 propagateIRFlags(V0, EvenScalars, S.OpValue);
3309
3310 S = getSameOpcode(OddScalars);
3311 assert(!S.IsAltShuffle && "Unexpected alternate opcode");
3312 propagateIRFlags(V1, OddScalars, S.OpValue);
3139 propagateIRFlags(V0, EvenScalars);
3140 propagateIRFlags(V1, OddScalars);
33133141
33143142 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
33153143 E->VectorizedValue = V;
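
For the ShuffleVector (alternating add/sub) entry, both sides of the diff build two full-width vectors, V0 with the main opcode and V1 with the alternate, and blend them with a mask that takes even lanes from V0 and odd lanes from V1 (e.g. <0, 5, 2, 7> for four lanes). A minimal mask-building sketch with IRBuilder; the helper, and the assumption that V0/V1 were already created, are illustrative.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Sketch: blend V0 (main opcode result, even lanes) with V1 (alternate
// opcode result, odd lanes) the same way the code above does.
static Value *blendEvenOddSketch(IRBuilder<> &Builder, Value *V0, Value *V1,
                                 unsigned NumLanes) {
  SmallVector<Constant *, 8> Mask(NumLanes);
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
    Mask[Lane] = Builder.getInt32(Lane % 2 ? NumLanes + Lane : Lane);
  Value *ShuffleMask = ConstantVector::get(Mask);
  return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
}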
33433171 // If the vectorized tree can be rewritten in a smaller type, we truncate the
33443172 // vectorized root. InstCombine will then rewrite the entire expression. We
33453173 // sign extend the extracted values below.
3346 auto *ScalarRoot = VectorizableTree[0].State.OpValue;
3174 auto *ScalarRoot = VectorizableTree[0].Scalars[0];
33473175 if (MinBWs.count(ScalarRoot)) {
33483176 if (auto *I = dyn_cast<Instruction>(VectorRoot))
33493177 Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
34543282 assert(Entry->VectorizedValue && "Can't find vectorizable value");
34553283
34563284 // For each lane:
3457 const unsigned Opcode = Entry->State.Opcode;
3458 const unsigned AltOpcode = getAltOpcode(Opcode);
34593285 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
34603286 Value *Scalar = Entry->Scalars[Lane];
3461
3462 if (!sameOpcodeOrAlt(Opcode, AltOpcode,
3463 cast<Instruction>(Scalar)->getOpcode()))
3464 continue;
34653287
34663288 Type *Ty = Scalar->getType();
34673289 if (!Ty->isVoidTy()) {
35943416 }
35953417
35963418 for (Value *V : VL) {
3597 ScheduleData *BundleMember = getScheduleData(V, isOneOf(OpValue, V));
3419 ScheduleData *BundleMember = getScheduleData(V);
35983420 assert(BundleMember &&
35993421 "no ScheduleData for bundle member (maybe not in same basic block)");
36003422 if (BundleMember->IsScheduled) {
36673489 if (isa<PHINode>(OpValue))
36683490 return;
36693491
3670 ScheduleData *Bundle = getScheduleData(OpValue)->FirstInBundle;
3492 ScheduleData *Bundle = getScheduleData(OpValue);
36713493 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
36723494 assert(!Bundle->IsScheduled &&
36733495 "Can't cancel bundle which is already scheduled");
39703792 I = I->getNextNode()) {
39713793 BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
39723794 assert(SD->isPartOfBundle() ==
3973 (getTreeEntry(SD->Inst, SD->OpValue) != nullptr) &&
3795 (getTreeEntry(SD->Inst) != nullptr) &&
39743796 "scheduler and vectorizer bundle mismatch");
39753797 SD->FirstInBundle->SchedulingPriority = Idx++;
39763798 if (SD->isSchedulingEntity()) {
39933815 ScheduleData *BundleMember = picked;
39943816 while (BundleMember) {
39953817 Instruction *pickedInst = BundleMember->Inst;
3996 if (pickedInst == BundleMember->OpValue) {
3997 if (LastScheduledInst->getNextNode() != pickedInst) {
3998 BS->BB->getInstList().remove(pickedInst);
3999 BS->BB->getInstList().insert(LastScheduledInst->getIterator(), pickedInst);
4000 }
4001 LastScheduledInst = pickedInst;
4002 }
3818 if (LastScheduledInst->getNextNode() != pickedInst) {
3819 BS->BB->getInstList().remove(pickedInst);
3820 BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
3821 pickedInst);
3822 }
3823 LastScheduledInst = pickedInst;
40033824 BundleMember = BundleMember->NextInBundle;
40043825 }
40053826
test/Transforms/SLPVectorizer/SystemZ/pr34619.ll (deleted: 0 additions, 52 deletions)
None ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
1 ; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -slp-vectorizer -S < %s | FileCheck %s
2
3 @bar = external global [4 x [4 x i32]], align 4
4 @dct_luma = external global [4 x [4 x i32]], align 4
5
6 define void @foo() local_unnamed_addr {
7 ; CHECK-LABEL: @foo(
8 ; CHECK-NEXT: entry:
9 ; CHECK-NEXT: [[ADD277:%.*]] = add nsw i32 undef, undef
10 ; CHECK-NEXT: store i32 [[ADD277]], i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4
11 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4
12 ; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0
13 ; CHECK-NEXT: [[ARRAYIDX372_1:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1
14 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2), align 4
15 ; CHECK-NEXT: [[ARRAYIDX372_2:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2
16 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 3), align 4
17 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP0]], i32 0
18 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[ADD277]], i32 1
19 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 2
20 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 3
21 ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> undef, [[TMP6]]
22 ; CHECK-NEXT: [[TMP8:%.*]] = ashr <4 x i32> [[TMP7]],
23 ; CHECK-NEXT: [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
24 ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>*
25 ; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4
26 ; CHECK-NEXT: unreachable
27 ;
28 entry:
29 %add277 = add nsw i32 undef, undef
30 store i32 %add277, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4
31 %0 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4
32 %sub355 = add nsw i32 undef, %0
33 %shr.i = ashr i32 %sub355, 6
34 %arrayidx372 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0
35 store i32 %shr.i, i32* %arrayidx372, align 4
36 %sub355.1 = add nsw i32 undef, %add277
37 %shr.i.1 = ashr i32 %sub355.1, 6
38 %arrayidx372.1 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1
39 store i32 %shr.i.1, i32* %arrayidx372.1, align 4
40 %1 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2), align 4
41 %sub355.2 = add nsw i32 undef, %1
42 %shr.i.2 = ashr i32 %sub355.2, 6
43 %arrayidx372.2 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2
44 store i32 %shr.i.2, i32* %arrayidx372.2, align 4
45 %2 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 3), align 4
46 %sub355.3 = add nsw i32 undef, %2
47 %shr.i.3 = ashr i32 %sub355.3, 6
48 %arrayidx372.3 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
49 store i32 %shr.i.3, i32* %arrayidx372.3, align 4
50 unreachable
51 }
4242 ; CHECK-LABEL: @add1(
4343 ; CHECK-NEXT: entry:
4444 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
45 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
4546 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
47 ; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4
4648 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
49 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
50 ; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1
4751 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
52 ; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4
4853 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
54 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
55 ; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2
4956 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
50 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
51 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
52 ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]]
53 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
54 ; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
57 ; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4
58 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
59 ; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
60 ; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4
5561 ; CHECK-NEXT: ret void
5662 ;
5763 entry:
7985 ; CHECK-LABEL: @sub0(
8086 ; CHECK-NEXT: entry:
8187 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
88 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
89 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
8290 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
91 ; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
8392 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
93 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
8494 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
95 ; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
8596 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
97 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
98 ; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
8699 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
87 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
88 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
89 ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]]
90 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
91 ; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
100 ; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
101 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
102 ; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3
103 ; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
92104 ; CHECK-NEXT: ret void
93105 ;
94106 entry:
192204 ; CHECK-LABEL: @addsub0(
193205 ; CHECK-NEXT: entry:
194206 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
207 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
208 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
195209 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
210 ; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
196211 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
212 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
197213 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
214 ; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
198215 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
216 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
217 ; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
199218 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
200 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
201 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
202 ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]],
203 ; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]],
204 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
205 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
206 ; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
219 ; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
220 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
221 ; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
222 ; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
207223 ; CHECK-NEXT: ret void
208224 ;
209225 entry:
231247 ; CHECK-LABEL: @addsub1(
232248 ; CHECK-NEXT: entry:
233249 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
250 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
251 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
234252 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
253 ; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
235254 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
255 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
256 ; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1
236257 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
258 ; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4
237259 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
260 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
238261 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
239 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
240 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
241 ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]],
242 ; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]],
243 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
244 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
245 ; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
262 ; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4
263 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
264 ; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
265 ; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
246266 ; CHECK-NEXT: ret void
247267 ;
248268 entry:
270290 ; CHECK-LABEL: @mul(
271291 ; CHECK-NEXT: entry:
272292 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
293 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
294 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257
273295 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
296 ; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4
274297 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
298 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
299 ; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3
275300 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
301 ; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4
276302 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
303 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
277304 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
278 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
279 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
280 ; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> , [[TMP1]]
281 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
282 ; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
305 ; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4
306 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
307 ; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
308 ; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4
283309 ; CHECK-NEXT: ret void
284310 ;
285311 entry:
307333 ; CHECK-LABEL: @shl0(
308334 ; CHECK-NEXT: entry:
309335 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
336 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
310337 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
338 ; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4
311339 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
340 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
341 ; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1
312342 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
343 ; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4
313344 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
345 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
346 ; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2
314347 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
315 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
316 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
317 ; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]],
318 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
319 ; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
348 ; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4
349 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
350 ; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3
351 ; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4
320352 ; CHECK-NEXT: ret void
321353 ;
322354 entry:
420452 ; CHECK-LABEL: @add1f(
421453 ; CHECK-NEXT: entry:
422454 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
423 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
424 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
455 ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
456 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
457 ; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4
458 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
459 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
460 ; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
425461 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
462 ; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
426463 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
464 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
465 ; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00
427466 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
428 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
429 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
430 ; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]]
431 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
432 ; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
467 ; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
468 ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
469 ; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
470 ; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
433471 ; CHECK-NEXT: ret void
434472 ;
435473 entry:
457495 ; CHECK-LABEL: @sub0f(
458496 ; CHECK-NEXT: entry:
459497 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
460 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
461 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
498 ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
499 ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
500 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
501 ; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
502 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
503 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
462504 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
505 ; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
463506 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
507 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
508 ; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
464509 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
465 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
466 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
467 ; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]]
468 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
469 ; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
510 ; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
511 ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
512 ; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00
513 ; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
470514 ; CHECK-NEXT: ret void
471515 ;
472516 entry:
570614 ; CHECK-LABEL: @addsub0f(
571615 ; CHECK-NEXT: entry:
572616 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
573 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
574 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
617 ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
618 ; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
619 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
620 ; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
621 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
622 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
575623 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
624 ; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
576625 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
626 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
627 ; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
577628 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
578 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
579 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
580 ; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]],
581 ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]],
582 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32>
583 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
584 ; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
629 ; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4
630 ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
631 ; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
632 ; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
585633 ; CHECK-NEXT: ret void
586634 ;
587635 entry:
609657 ; CHECK-LABEL: @addsub1f(
610658 ; CHECK-NEXT: entry:
611659 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
612 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
613 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
660 ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
661 ; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
662 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
663 ; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
664 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
665 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
666 ; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
614667 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
668 ; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
615669 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
670 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
616671 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
617 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
618 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
619 ; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]],
620 ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]],
621 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32>
622 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
623 ; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
672 ; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
673 ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
674 ; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
675 ; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
624676 ; CHECK-NEXT: ret void
625677 ;
626678 entry:
648700 ; CHECK-LABEL: @mulf(
649701 ; CHECK-NEXT: entry:
650702 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
651 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
652 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
703 ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
704 ; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
705 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
706 ; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
707 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
708 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
709 ; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
653710 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
711 ; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
654712 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
713 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
655714 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
656 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
657 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
658 ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> , [[TMP1]]
659 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
660 ; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
715 ; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
716 ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
717 ; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
718 ; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
661719 ; CHECK-NEXT: ret void
662720 ;
663721 entry:
766824 ; CHECK-LABEL: @sub0fn(
767825 ; CHECK-NEXT: entry:
768826 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
769 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
770 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
827 ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
828 ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
829 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
830 ; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
831 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
832 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
771833 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
834 ; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
772835 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
836 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
837 ; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
773838 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
774 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
775 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
776 ; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> , [[TMP1]]
777 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
778 ; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
839 ; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
840 ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
841 ; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
842 ; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
779843 ; CHECK-NEXT: ret void
780844 ;
781845 entry: