llvm.org GIT mirror llvm / b02403e
Revert r288115 for PR31847.

------------------------------------------------------------------------
r288115 | abataev | 2016-11-29 09:21:14 +0100 (Tue, 29 Nov 2016) | 8 lines

[SLPVectorizer] Improved support of partial tree vectorization.

Currently the SLP vectorizer tries to vectorize a binary operation and
gives up immediately after the first unsuccessful attempt. This patch
improves the situation by trying to vectorize the binary operations of
all child nodes in the binop tree.

Differential Revision: https://reviews.llvm.org/D25517
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_40@296185 91177308-0d34-0410-b5e6-96231b3b80d8

Hans Wennborg 2 years ago
3 changed file(s) with 174 addition(s) and 259 deletion(s).
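For context, here is a minimal, self-contained sketch (not the pass's actual code) of the traversal idea that r288115 introduced and this revert removes: instead of giving up after the first failed vectorization attempt on a binary operation, every binary-operator child of the expression tree is tried as well. The Node type and the TryVectorize callback below are hypothetical stand-ins.

#include <functional>
#include <vector>

// Hypothetical stand-in for an expression node; not an LLVM class.
struct Node {
  bool IsBinOp = false;
  std::vector<Node *> Operands;
};

// Try to vectorize the whole binop tree rooted at Root: first the root itself,
// then, even if that fails, each child subtree in turn.
static bool vectorizeBinOpTree(Node *Root,
                               const std::function<bool(Node *)> &TryVectorize) {
  if (!Root || !Root->IsBinOp)
    return false;
  if (TryVectorize(Root))
    return true;
  bool Changed = false;
  // Keep going into the children even though the parent could not be vectorized.
  for (Node *Op : Root->Operands)
    Changed |= vectorizeBinOpTree(Op, TryVectorize);
  return Changed;
}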
9191 /// collected in GEPs.
9292 bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
9393
94 /// Try to find horizontal reduction or otherwise vectorize a chain of binary
95 /// operators.
96 bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
97 slpvectorizer::BoUpSLP &R,
98 TargetTransformInfo *TTI);
99
10094 /// \brief Scan the basic block and look for patterns that are likely to start
10195 /// a vectorization chain.
10296 bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
40254025 if (!V)
40264026 return false;
40274027
4028 Value *P = V->getParent();
4029
4030 // Vectorize in current basic block only.
4031 auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
4032 auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
4033 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
4034 return false;
4035
40364028 // Try to vectorize V.
4037 if (tryToVectorizePair(Op0, Op1, R))
4029 if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
40384030 return true;
40394031
4040 auto *A = dyn_cast<BinaryOperator>(Op0);
4041 auto *B = dyn_cast<BinaryOperator>(Op1);
4032 BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
4033 BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
40424034 // Try to skip B.
40434035 if (B && B->hasOneUse()) {
4044 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
4045 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
4046 if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
4036 BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
4037 BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
4038 if (tryToVectorizePair(A, B0, R)) {
40474039 return true;
4048 if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
4040 }
4041 if (tryToVectorizePair(A, B1, R)) {
40494042 return true;
4043 }
40504044 }
40514045
40524046 // Try to skip A.
40534047 if (A && A->hasOneUse()) {
4054 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
4055 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
4056 if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
4048 BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
4049 BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
4050 if (tryToVectorizePair(A0, B, R)) {
40574051 return true;
4058 if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
4052 }
4053 if (tryToVectorizePair(A1, B, R)) {
40594054 return true;
4060 }
4061 return false;
4055 }
4056 }
4057 return 0;
40624058 }
40634059
40644060 /// \brief Generate a shuffle mask to be used in a reduction tree.
45104506 return nullptr;
45114507 }
45124508
4513 namespace {
4514 /// Tracks an instruction and its children.
4515 class WeakVHWithLevel final : public CallbackVH {
4516 /// Operand index of the instruction currently being analyzed.
4517 unsigned Level = 0;
4518 /// Is this the instruction that should be vectorized, or are we now
4519 /// processing children (i.e. operands of this instruction) for potential
4520 /// vectorization?
4521 bool IsInitial = true;
4522
4523 public:
4524 explicit WeakVHWithLevel() = default;
4525 WeakVHWithLevel(Value *V) : CallbackVH(V){};
4526 /// Restart the children analysis each time the instruction is replaced by a new one.
4527 void allUsesReplacedWith(Value *New) override {
4528 setValPtr(New);
4529 Level = 0;
4530 IsInitial = true;
4531 }
4532 /// Check if the instruction was not deleted during vectorization.
4533 bool isValid() const { return !getValPtr(); }
4534 /// Does the instruction itself still need to be vectorized?
4535 bool isInitial() const { return IsInitial; }
4536 /// Try to vectorize children.
4537 void clearInitial() { IsInitial = false; }
4538 /// Are all children processed already?
4539 bool isFinal() const {
4540 assert(getValPtr() &&
4541 (isa<Instruction>(getValPtr()) &&
4542 cast<Instruction>(getValPtr())->getNumOperands() >= Level));
4543 return getValPtr() &&
4544 cast<Instruction>(getValPtr())->getNumOperands() == Level;
4545 }
4546 /// Get next child operation.
4547 Value *nextOperand() {
4548 assert(getValPtr() && isa<Instruction>(getValPtr()) &&
4549 cast<Instruction>(getValPtr())->getNumOperands() > Level);
4550 return cast<Instruction>(getValPtr())->getOperand(Level++);
4551 }
4552 virtual ~WeakVHWithLevel() = default;
4553 };
4554 } // namespace
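The removed WeakVHWithLevel helper above pairs an LLVM value handle with a per-node operand cursor (Level) and an IsInitial flag so that canBeVectorized below can walk the expression tree with an explicit stack. Below is a minimal, self-contained sketch of that traversal pattern, using hypothetical plain types instead of LLVM's CallbackVH machinery; it only illustrates the bookkeeping, not the vectorizer itself.

#include <cstddef>
#include <vector>

// Hypothetical expression node; stands in for an LLVM instruction here.
struct Expr {
  std::vector<Expr *> Operands;
};

// One stack entry: which node, which operand comes next, and whether the node
// itself has already been processed (mirrors Level / IsInitial above).
struct Frame {
  Expr *E;
  std::size_t Level = 0;
  bool IsInitial = true;
};

// Pre-order walk of the tree without recursion.
template <typename Visitor>
void visitTree(Expr *Root, Visitor Visit) {
  if (!Root)
    return;
  std::vector<Frame> Stack;
  Stack.push_back({Root});
  while (!Stack.empty()) {
    Frame &Top = Stack.back();
    if (Top.IsInitial) {
      Top.IsInitial = false;
      Visit(Top.E);
    }
    if (Top.Level == Top.E->Operands.size()) { // all children visited
      Stack.pop_back();
      continue;
    }
    Expr *Next = Top.E->Operands[Top.Level++];
    Stack.push_back({Next}); // Top may be invalidated after this point
  }
}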
4555
45564509 /// \brief Attempt to reduce a horizontal reduction.
45574510 /// If it is legal to match a horizontal reduction feeding
4558 /// the phi node P with reduction operators Root in a basic block BB, then check
4559 /// if it can be done.
4511 /// the phi node P with reduction operators BI, then check if it
4512 /// can be done.
45604513 /// \returns true if a horizontal reduction was matched and reduced.
45614514 /// \returns false if a horizontal reduction was not matched.
4562 static bool canBeVectorized(
4563 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
4564 TargetTransformInfo *TTI,
4565 const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
4515 static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
4516 BoUpSLP &R, TargetTransformInfo *TTI,
4517 unsigned MinRegSize) {
45664518 if (!ShouldVectorizeHor)
45674519 return false;
45684520
4569 if (!Root)
4521 HorizontalReduction HorRdx(MinRegSize);
4522 if (!HorRdx.matchAssociativeReduction(P, BI))
45704523 return false;
45714524
4572 if (Root->getParent() != BB)
4573 return false;
4574 SmallVector<WeakVHWithLevel, 8> Stack(1, Root);
4575 SmallSet<Value *, 8> VisitedInstrs;
4576 bool Res = false;
4577 while (!Stack.empty()) {
4578 Value *V = Stack.back();
4579 if (!V) {
4580 Stack.pop_back();
4581 continue;
4582 }
4583 auto *Inst = dyn_cast<Instruction>(V);
4584 if (!Inst || isa<PHINode>(Inst)) {
4585 Stack.pop_back();
4586 continue;
4587 }
4588 if (Stack.back().isInitial()) {
4589 Stack.back().clearInitial();
4590 if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
4591 HorizontalReduction HorRdx(R.getMinVecRegSize());
4592 if (HorRdx.matchAssociativeReduction(P, BI)) {
4593 // If there is a sufficient number of reduction values, reduce
4594 // to a nearby power-of-2. Can safely generate oversized
4595 // vectors and rely on the backend to split them to legal sizes.
4596 HorRdx.ReduxWidth =
4597 std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
4598
4599 if (HorRdx.tryToReduce(R, TTI)) {
4600 Res = true;
4601 P = nullptr;
4602 continue;
4603 }
4604 }
4605 if (P) {
4606 Inst = dyn_cast<Instruction>(BI->getOperand(0));
4607 if (Inst == P)
4608 Inst = dyn_cast<Instruction>(BI->getOperand(1));
4609 if (!Inst) {
4610 P = nullptr;
4611 continue;
4612 }
4613 }
4614 }
4615 P = nullptr;
4616 if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
4617 Res = true;
4618 continue;
4619 }
4620 }
4621 if (Stack.back().isFinal()) {
4622 Stack.pop_back();
4623 continue;
4624 }
4625
4626 if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand()))
4627 if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second &&
4628 Stack.size() < RecursionMaxDepth)
4629 Stack.push_back(NextV);
4630 }
4631 return Res;
4632 }
4633
4634 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
4635 BasicBlock *BB, BoUpSLP &R,
4636 TargetTransformInfo *TTI) {
4637 if (!V)
4638 return false;
4639 auto *I = dyn_cast<Instruction>(V);
4640 if (!I)
4641 return false;
4642
4643 if (!isa<BinaryOperator>(I))
4644 P = nullptr;
4645 // Try to match and vectorize a horizontal reduction.
4646 return canBeVectorized(P, I, BB, R, TTI,
4647 [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
4648 return tryToVectorize(BI, R);
4649 });
4525 // If there is a sufficient number of reduction values, reduce
4526 // to a nearby power-of-2. Can safely generate oversized
4527 // vectors and rely on the backend to split them to legal sizes.
4528 HorRdx.ReduxWidth =
4529 std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
4530
4531 return HorRdx.tryToReduce(R, TTI);
46504532 }
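Both the removed and the restored code above clamp the reduction width to max(4, PowerOf2Floor(number of reduction values)) and rely on the backend to legalize oversized vectors. Below is a small self-contained sketch of that clamp, assuming PowerOf2Floor(n) returns the largest power of two not greater than n; the helper is a local reimplementation for illustration, not the LLVM one.

#include <algorithm>
#include <cstdint>

// Local stand-in for a power-of-two floor: largest power of two <= N (N >= 1).
static uint64_t powerOf2Floor(uint64_t N) {
  uint64_t P = 1;
  while (P * 2 <= N)
    P *= 2;
  return P;
}

// ReduxWidth = max(4, PowerOf2Floor(#reduction values)), as in the code above.
static uint64_t reduxWidth(uint64_t NumReductionValues) {
  return std::max<uint64_t>(4, powerOf2Floor(NumReductionValues));
}

// Examples: 3 values -> 4 (clamped up), 6 -> 4, 9 -> 8, 16 -> 16.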
46514533
46524534 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
47164598 if (P->getNumIncomingValues() != 2)
47174599 return Changed;
47184600
4601 Value *Rdx = getReductionValue(DT, P, BB, LI);
4602
4603 // Check if this is a Binary Operator.
4604 BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
4605 if (!BI)
4606 continue;
4607
47194608 // Try to match and vectorize a horizontal reduction.
4720 if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
4721 TTI)) {
4609 if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
47224610 Changed = true;
47234611 it = BB->begin();
47244612 e = BB->end();
47254613 continue;
47264614 }
4615
4616 Value *Inst = BI->getOperand(0);
4617 if (Inst == P)
4618 Inst = BI->getOperand(1);
4619
4620 if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
4621 // We would like to start over since some instructions are deleted
4622 // and the iterator may become invalid value.
4623 Changed = true;
4624 it = BB->begin();
4625 e = BB->end();
4626 continue;
4627 }
4628
47274629 continue;
47284630 }
47294631
4730 if (ShouldStartVectorizeHorAtStore) {
4731 if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
4732 // Try to match and vectorize a horizontal reduction.
4733 if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
4734 TTI)) {
4735 Changed = true;
4736 it = BB->begin();
4737 e = BB->end();
4738 continue;
4632 if (ShouldStartVectorizeHorAtStore)
4633 if (StoreInst *SI = dyn_cast<StoreInst>(it))
4634 if (BinaryOperator *BinOp =
4635 dyn_cast<BinaryOperator>(SI->getValueOperand())) {
4636 if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
4637 R.getMinVecRegSize()) ||
4638 tryToVectorize(BinOp, R)) {
4639 Changed = true;
4640 it = BB->begin();
4641 e = BB->end();
4642 continue;
4643 }
47394644 }
4740 }
4741 }
47424645
47434646 // Try to vectorize horizontal reductions feeding into a return.
4744 if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
4745 if (RI->getNumOperands() != 0) {
4746 // Try to match and vectorize a horizontal reduction.
4747 if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
4748 Changed = true;
4749 it = BB->begin();
4750 e = BB->end();
4751 continue;
4647 if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
4648 if (RI->getNumOperands() != 0)
4649 if (BinaryOperator *BinOp =
4650 dyn_cast<BinaryOperator>(RI->getOperand(0))) {
4651 DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
4652 if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
4653 R.getMinVecRegSize()) ||
4654 tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
4655 R)) {
4656 Changed = true;
4657 it = BB->begin();
4658 e = BB->end();
4659 continue;
4660 }
47524661 }
4753 }
4754 }
47554662
47564663 // Try to vectorize trees that start at compare instructions.
47574664 if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
47644671 continue;
47654672 }
47664673
4767 for (int I = 0; I < 2; ++I) {
4768 if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
4769 Changed = true;
4770 // We would like to start over since some instructions are deleted
4771 // and the iterator may become invalid value.
4772 it = BB->begin();
4773 e = BB->end();
4774 break;
4674 for (int i = 0; i < 2; ++i) {
4675 if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
4676 if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
4677 Changed = true;
4678 // We would like to start over since some instructions are deleted
4679 // and the iterator may become invalid value.
4680 it = BB->begin();
4681 e = BB->end();
4682 break;
4683 }
47754684 }
47764685 }
47774686 continue;
1111 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
1212 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
1313 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
14 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
15 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
16 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
17 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
18 ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
19 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
20 ; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
21 ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
22 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
23 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
24 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
25 ; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
26 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
27 ; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
14 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
15 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
16 ; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
17 ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]]
18 ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
19 ; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
20 ; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
21 ; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]]
22 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
23 ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
24 ; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], [[TMP5]]
25 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
26 ; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP8]], [[ADD_1]]
27 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
28 ; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP9]], [[ADD_2]]
2829 ; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]]
29 ; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]]
30 ; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]]
31 ; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]]
32 ; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]]
30 ; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL4]], [[ADD7]]
31 ; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD19]]
32 ; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP8]], [[ADD19_1]]
33 ; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP9]], [[ADD19_2]]
3334 ; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4
3435 ; CHECK-NEXT: ret float [[ADD19_3]]
3536 ;
6869 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
6970 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
7071 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
71 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
72 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
73 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
74 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
75 ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
76 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
77 ; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
78 ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
79 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
80 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
81 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
82 ; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
83 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
84 ; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
72 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
73 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
74 ; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
75 ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]]
76 ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
77 ; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
78 ; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
79 ; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]]
80 ; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
81 ; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
82 ; CHECK-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
83 ; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]]
84 ; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
85 ; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
86 ; CHECK-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
87 ; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]]
8588 ; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
8689 ; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
8790 ; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
88 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4) to <2 x float>*), align 16
89 ; CHECK-NEXT: [[TMP12:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4) to <2 x float>*), align 16
90 ; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x float> [[TMP12]], [[TMP11]]
91 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
92 ; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP14]], [[ADD7]]
93 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
94 ; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP15]], [[ADD19]]
95 ; CHECK-NEXT: [[TMP16:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
96 ; CHECK-NEXT: [[TMP17:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
97 ; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <2 x float> [[TMP17]], [[TMP16]]
98 ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
99 ; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP19]], [[ADD19_1]]
100 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
101 ; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP20]], [[ADD19_2]]
91 ; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4), align 16
92 ; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4), align 16
93 ; CHECK-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
94 ; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]]
95 ; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 5), align 4
96 ; CHECK-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 5), align 4
97 ; CHECK-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]]
98 ; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]]
99 ; CHECK-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
100 ; CHECK-NEXT: [[TMP14:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
101 ; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x float> [[TMP14]], [[TMP13]]
102 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
103 ; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP16]], [[ADD19_1]]
104 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
105 ; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP17]], [[ADD19_2]]
102106 ; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4
103107 ; CHECK-NEXT: ret float [[ADD19_3]]
104108 ;
150154 ; CHECK-NEXT: entry:
151155 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
152156 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
153 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
154 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
155 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
156 ; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef
157 ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
158 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
159 ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
160 ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
161 ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
162 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
163 ; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
164 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
165 ; CHECK-NEXT: store float [[TMP8]], float* @res, align 4
166 ; CHECK-NEXT: ret float [[TMP8]]
157 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
158 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
159 ; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
160 ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
161 ; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
162 ; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
163 ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
164 ; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
165 ; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
166 ; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
167 ; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP5]]
168 ; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
169 ; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
170 ; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
171 ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
172 ; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[CONV]], [[TMP11]]
173 ; CHECK-NEXT: store float [[TMP12]], float* @res, align 4
174 ; CHECK-NEXT: ret float [[TMP12]]
167175 ;
168176 entry:
169177 %0 = load i32, i32* @n, align 4
193201 ; CHECK-NEXT: entry:
194202 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
195203 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
196 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
197 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
198 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
199 ; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef
200 ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
201 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
202 ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
203 ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
204 ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
205 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
206 ; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
207 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
208 ; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP8]] to i32
204 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
205 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
206 ; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
207 ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
208 ; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
209 ; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
210 ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
211 ; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
212 ; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
213 ; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
214 ; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP5]]
215 ; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
216 ; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
217 ; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
218 ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
219 ; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[CONV]], [[TMP11]]
220 ; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP12]] to i32
209221 ; CHECK-NEXT: store i32 [[CONV4]], i32* @n, align 4
210222 ; CHECK-NEXT: ret i32 [[CONV4]]
211223 ;