llvm.org GIT mirror llvm / e126eb1
[SLP] Fix for PR6246: vectorization for scalar ops on vector elements. When trying to vectorize trees that start at insertelement instructions, the function tryToVectorizeList() uses a vectorization factor calculated as MinVecRegSize/ScalarTypeSize. But sometimes this does not work, because the tree cost for this fixed vectorization factor is too high. This patch tries to improve the situation: it tries different vectorization factors from max(PowerOf2Floor(NumberOfVectorizedValues), MinVecRegSize/ScalarTypeSize) down to MinVecRegSize/ScalarTypeSize and chooses the best one. Differential Revision: https://reviews.llvm.org/D27215 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288412 91177308-0d34-0410-b5e6-96231b3b80d8 Alexey Bataev 3 years ago
2 changed file(s) with 140 addition(s) and 186 deletion(s). Raw diff Collapse all Expand all
38693869
38703870 unsigned Opcode0 = I0->getOpcode();
38713871
3872 // FIXME: Register size should be a parameter to this function, so we can
3873 // try different vectorization factors.
38743872 unsigned Sz = R.getVectorElementSize(I0);
3875 unsigned VF = R.getMinVecRegSize() / Sz;
3873 unsigned MinVF = R.getMinVecRegSize() / Sz;
3874 unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
38763875
38773876 for (Value *V : VL) {
38783877 Type *Ty = V->getType();
38883887 // Keep track of values that were deleted by vectorizing in the loop below.
38893888 SmallVector<Value *, 4> TrackValues(VL.begin(), VL.end());
38903889
3891 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3892 unsigned OpsWidth = 0;
3893
3894 if (i + VF > e)
3895 OpsWidth = e - i;
3896 else
3897 OpsWidth = VF;
3898
3899 if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3900 break;
3901
3902 // Check that a previous iteration of this loop did not delete the Value.
3903 if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
3904 continue;
3905
3906 DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3907 << "\n");
3908 ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
3909
3910 ArrayRef<Value *> BuildVectorSlice;
3911 if (!BuildVector.empty())
3912 BuildVectorSlice = BuildVector.slice(i, OpsWidth);
3913
3914 R.buildTree(Ops, BuildVectorSlice);
3915 // TODO: check if we can allow reordering for more cases.
3916 if (AllowReorder && R.shouldReorder()) {
3917 // Conceptually, there is nothing actually preventing us from trying to
3918 // reorder a larger list. In fact, we do exactly this when vectorizing
3919 // reductions. However, at this point, we only expect to get here from
3920 // tryToVectorizePair().
3921 assert(Ops.size() == 2);
3922 assert(BuildVectorSlice.empty());
3923 Value *ReorderedOps[] = { Ops[1], Ops[0] };
3924 R.buildTree(ReorderedOps, None);
3925 }
3926 if (R.isTreeTinyAndNotFullyVectorizable())
3927 continue;
3928
3929 R.computeMinimumValueSizes();
3930 int Cost = R.getTreeCost();
3931
3932 if (Cost < -SLPCostThreshold) {
3933 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3934 Value *VectorizedRoot = R.vectorizeTree();
3935
3936 // Reconstruct the build vector by extracting the vectorized root. This
3937 // way we handle the case where some elements of the vector are undefined.
3938 // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
3939 if (!BuildVectorSlice.empty()) {
3940 // The insert point is the last build vector instruction. The vectorized
3941 // root will precede it. This guarantees that we get an instruction. The
3942 // vectorized tree could have been constant folded.
3943 Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3944 unsigned VecIdx = 0;
3945 for (auto &V : BuildVectorSlice) {
3946 IRBuilder<> Builder(InsertAfter->getParent(),
3947 ++BasicBlock::iterator(InsertAfter));
3948 Instruction *I = cast<Instruction>(V);
3949 assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
3950 Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
3951 VectorizedRoot, Builder.getInt32(VecIdx++)));
3952 I->setOperand(1, Extract);
3953 I->removeFromParent();
3954 I->insertAfter(Extract);
3955 InsertAfter = I;
3890 unsigned NextInst = 0, MaxInst = VL.size();
3891 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
3892 VF /= 2) {
3893 for (unsigned I = NextInst; I < MaxInst; ++I) {
3894 unsigned OpsWidth = 0;
3895
3896 if (I + VF > MaxInst)
3897 OpsWidth = MaxInst - I;
3898 else
3899 OpsWidth = VF;
3900
3901 if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3902 break;
3903
3904 // Check that a previous iteration of this loop did not delete the Value.
3905 if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
3906 continue;
3907
3908 DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3909 << "\n");
3910 ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
3911
3912 ArrayRef<Value *> BuildVectorSlice;
3913 if (!BuildVector.empty())
3914 BuildVectorSlice = BuildVector.slice(I, OpsWidth);
3915
3916 R.buildTree(Ops, BuildVectorSlice);
3917 // TODO: check if we can allow reordering for more cases.
3918 if (AllowReorder && R.shouldReorder()) {
3919 // Conceptually, there is nothing actually preventing us from trying to
3920 // reorder a larger list. In fact, we do exactly this when vectorizing
3921 // reductions. However, at this point, we only expect to get here from
3922 // tryToVectorizePair().
3923 assert(Ops.size() == 2);
3924 assert(BuildVectorSlice.empty());
3925 Value *ReorderedOps[] = {Ops[1], Ops[0]};
3926 R.buildTree(ReorderedOps, None);
3927 }
3928 if (R.isTreeTinyAndNotFullyVectorizable())
3929 continue;
3930
3931 R.computeMinimumValueSizes();
3932 int Cost = R.getTreeCost();
3933
3934 if (Cost < -SLPCostThreshold) {
3935 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3936 Value *VectorizedRoot = R.vectorizeTree();
3937
3938 // Reconstruct the build vector by extracting the vectorized root. This
3939 // way we handle the case where some elements of the vector are
3940 // undefined.
3941 // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
3942 if (!BuildVectorSlice.empty()) {
3943 // The insert point is the last build vector instruction. The
3944 // vectorized root will precede it. This guarantees that we get an
3945 // instruction. The vectorized tree could have been constant folded.
3946 Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3947 unsigned VecIdx = 0;
3948 for (auto &V : BuildVectorSlice) {
3949 IRBuilder<> Builder(InsertAfter->getParent(),
3950 ++BasicBlock::iterator(InsertAfter));
3951 Instruction *I = cast<Instruction>(V);
3952 assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
3953 Instruction *Extract =
3954 cast<Instruction>(Builder.CreateExtractElement(
3955 VectorizedRoot, Builder.getInt32(VecIdx++)));
3956 I->setOperand(1, Extract);
3957 I->removeFromParent();
3958 I->insertAfter(Extract);
3959 InsertAfter = I;
3960 }
39563961 }
3957 }
3958 // Move to the next bundle.
3959 i += VF - 1;
3960 Changed = true;
3962 // Move to the next bundle.
3963 I += VF - 1;
3964 NextInst = I + 1;
3965 Changed = true;
3966 }
39613967 }
39623968 }
39633969
615615 define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
616616 ; CHECK-LABEL: @multi_tree(
617617 ; CHECK-NEXT: entry:
618 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double %w, i32 0
619 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double %x, i32 1
620 ; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]],
621 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double %y, i32 0
622 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double %z, i32 1
623 ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]],
624 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> , [[TMP2]]
625 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
626 ; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP7]], i32 3
627 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
628 ; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP8]], i32 2
629 ; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> , [[TMP5]]
630 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
631 ; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP10]], i32 1
632 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
633 ; CHECK-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP11]], i32 0
618 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double %w, i32 0
619 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double %x, i32 1
620 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double %y, i32 2
621 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double %z, i32 3
622 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]],
623 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> , [[TMP4]]
624 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
625 ; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
626 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
627 ; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP7]], i32 2
628 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP5]], i32 2
629 ; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP8]], i32 1
630 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP5]], i32 3
631 ; CHECK-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP9]], i32 0
634632 ; CHECK-NEXT: ret <4 x double> [[I4]]
635633 ;
636634 ; ZEROTHRESH-LABEL: @multi_tree(
637635 ; ZEROTHRESH-NEXT: entry:
638 ; ZEROTHRESH-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double %w, i32 0
639 ; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double %x, i32 1
640 ; ZEROTHRESH-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]],
641 ; ZEROTHRESH-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double %y, i32 0
642 ; ZEROTHRESH-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double %z, i32 1
643 ; ZEROTHRESH-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]],
644 ; ZEROTHRESH-NEXT: [[TMP6:%.*]] = fmul <2 x double> , [[TMP2]]
645 ; ZEROTHRESH-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
646 ; ZEROTHRESH-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP7]], i32 3
647 ; ZEROTHRESH-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
648 ; ZEROTHRESH-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP8]], i32 2
649 ; ZEROTHRESH-NEXT: [[TMP9:%.*]] = fmul <2 x double> , [[TMP5]]
650 ; ZEROTHRESH-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
651 ; ZEROTHRESH-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP10]], i32 1
652 ; ZEROTHRESH-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
653 ; ZEROTHRESH-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP11]], i32 0
636 ; ZEROTHRESH-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double %w, i32 0
637 ; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double %x, i32 1
638 ; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double %y, i32 2
639 ; ZEROTHRESH-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double %z, i32 3
640 ; ZEROTHRESH-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]],
641 ; ZEROTHRESH-NEXT: [[TMP5:%.*]] = fmul <4 x double> , [[TMP4]]
642 ; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
643 ; ZEROTHRESH-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
644 ; ZEROTHRESH-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
645 ; ZEROTHRESH-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP7]], i32 2
646 ; ZEROTHRESH-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP5]], i32 2
647 ; ZEROTHRESH-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP8]], i32 1
648 ; ZEROTHRESH-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP5]], i32 3
649 ; ZEROTHRESH-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP9]], i32 0
654650 ; ZEROTHRESH-NEXT: ret <4 x double> [[I4]]
655651 ;
656652 entry:
672668 define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
673669 ; CHECK-LABEL: @_vadd256(
674670 ; CHECK-NEXT: entry:
675 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <8 x float> %a, i32 0
676 ; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <8 x float> %b, i32 0
677 ; CHECK-NEXT: [[VECEXT2:%.*]] = extractelement <8 x float> %a, i32 1
678 ; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <8 x float> %b, i32 1
679 ; CHECK-NEXT: [[VECEXT5:%.*]] = extractelement <8 x float> %a, i32 2
680 ; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <8 x float> %b, i32 2
681 ; CHECK-NEXT: [[VECEXT8:%.*]] = extractelement <8 x float> %a, i32 3
682 ; CHECK-NEXT: [[VECEXT9:%.*]] = extractelement <8 x float> %b, i32 3
683 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> undef, float [[VECEXT]], i32 0
684 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[VECEXT2]], i32 1
685 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[VECEXT5]], i32 2
686 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[VECEXT8]], i32 3
687 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> undef, float [[VECEXT1]], i32 0
688 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[VECEXT3]], i32 1
689 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[VECEXT6]], i32 2
690 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[VECEXT9]], i32 3
691 ; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP3]], [[TMP7]]
692 ; CHECK-NEXT: [[VECEXT11:%.*]] = extractelement <8 x float> %a, i32 4
693 ; CHECK-NEXT: [[VECEXT12:%.*]] = extractelement <8 x float> %b, i32 4
694 ; CHECK-NEXT: [[VECEXT14:%.*]] = extractelement <8 x float> %a, i32 5
695 ; CHECK-NEXT: [[VECEXT15:%.*]] = extractelement <8 x float> %b, i32 5
696 ; CHECK-NEXT: [[VECEXT17:%.*]] = extractelement <8 x float> %a, i32 6
697 ; CHECK-NEXT: [[VECEXT18:%.*]] = extractelement <8 x float> %b, i32 6
698 ; CHECK-NEXT: [[VECEXT20:%.*]] = extractelement <8 x float> %a, i32 7
699 ; CHECK-NEXT: [[VECEXT21:%.*]] = extractelement <8 x float> %b, i32 7
700 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[VECEXT11]], i32 0
701 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[VECEXT14]], i32 1
702 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[VECEXT17]], i32 2
703 ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[VECEXT20]], i32 3
704 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[VECEXT12]], i32 0
705 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[VECEXT15]], i32 1
706 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[VECEXT18]], i32 2
707 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[VECEXT21]], i32 3
708 ; CHECK-NEXT: [[TMP17:%.*]] = fadd <4 x float> [[TMP12]], [[TMP16]]
709 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP8]], i32 0
710 ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP18]], i32 0
711 ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP8]], i32 1
712 ; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP19]], i32 1
713 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP8]], i32 2
714 ; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP20]], i32 2
715 ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
716 ; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP21]], i32 3
717 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP17]], i32 0
718 ; CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP22]], i32 4
719 ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP17]], i32 1
720 ; CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP23]], i32 5
721 ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP17]], i32 2
722 ; CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP24]], i32 6
723 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP17]], i32 3
724 ; CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP25]], i32 7
671 ; CHECK-NEXT: [[TMP0:%.*]] = fadd <8 x float> %a, %b
672 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
673 ; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
674 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
675 ; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP2]], i32 1
676 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP0]], i32 2
677 ; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP3]], i32 2
678 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP0]], i32 3
679 ; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP4]], i32 3
680 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP0]], i32 4
681 ; CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP5]], i32 4
682 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP0]], i32 5
683 ; CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP6]], i32 5
684 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP0]], i32 6
685 ; CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP7]], i32 6
686 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP0]], i32 7
687 ; CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP8]], i32 7
725688 ; CHECK-NEXT: ret <8 x float> [[VECINIT7_I]]
726689 ;
727690 ; ZEROTHRESH-LABEL: @_vadd256(
728691 ; ZEROTHRESH-NEXT: entry:
729 ; ZEROTHRESH-NEXT: [[VECEXT:%.*]] = extractelement <8 x float> %a, i32 0
730 ; ZEROTHRESH-NEXT: [[VECEXT1:%.*]] = extractelement <8 x float> %b, i32 0
731 ; ZEROTHRESH-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT1]]
732 ; ZEROTHRESH-NEXT: [[VECEXT2:%.*]] = extractelement <8 x float> %a, i32 1
733 ; ZEROTHRESH-NEXT: [[VECEXT3:%.*]] = extractelement <8 x float> %b, i32 1
734 ; ZEROTHRESH-NEXT: [[ADD4:%.*]] = fadd float [[VECEXT2]], [[VECEXT3]]
735 ; ZEROTHRESH-NEXT: [[VECEXT5:%.*]] = extractelement <8 x float> %a, i32 2
736 ; ZEROTHRESH-NEXT: [[VECEXT6:%.*]] = extractelement <8 x float> %b, i32 2
737 ; ZEROTHRESH-NEXT: [[ADD7:%.*]] = fadd float [[VECEXT5]], [[VECEXT6]]
738 ; ZEROTHRESH-NEXT: [[VECEXT8:%.*]] = extractelement <8 x float> %a, i32 3
739 ; ZEROTHRESH-NEXT: [[VECEXT9:%.*]] = extractelement <8 x float> %b, i32 3
740 ; ZEROTHRESH-NEXT: [[ADD10:%.*]] = fadd float [[VECEXT8]], [[VECEXT9]]
741 ; ZEROTHRESH-NEXT: [[VECEXT11:%.*]] = extractelement <8 x float> %a, i32 4
742 ; ZEROTHRESH-NEXT: [[VECEXT12:%.*]] = extractelement <8 x float> %b, i32 4
743 ; ZEROTHRESH-NEXT: [[ADD13:%.*]] = fadd float [[VECEXT11]], [[VECEXT12]]
744 ; ZEROTHRESH-NEXT: [[VECEXT14:%.*]] = extractelement <8 x float> %a, i32 5
745 ; ZEROTHRESH-NEXT: [[VECEXT15:%.*]] = extractelement <8 x float> %b, i32 5
746 ; ZEROTHRESH-NEXT: [[ADD16:%.*]] = fadd float [[VECEXT14]], [[VECEXT15]]
747 ; ZEROTHRESH-NEXT: [[VECEXT17:%.*]] = extractelement <8 x float> %a, i32 6
748 ; ZEROTHRESH-NEXT: [[VECEXT18:%.*]] = extractelement <8 x float> %b, i32 6
749 ; ZEROTHRESH-NEXT: [[ADD19:%.*]] = fadd float [[VECEXT17]], [[VECEXT18]]
750 ; ZEROTHRESH-NEXT: [[VECEXT20:%.*]] = extractelement <8 x float> %a, i32 7
751 ; ZEROTHRESH-NEXT: [[VECEXT21:%.*]] = extractelement <8 x float> %b, i32 7
752 ; ZEROTHRESH-NEXT: [[ADD22:%.*]] = fadd float [[VECEXT20]], [[VECEXT21]]
753 ; ZEROTHRESH-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[ADD]], i32 0
754 ; ZEROTHRESH-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[ADD4]], i32 1
755 ; ZEROTHRESH-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[ADD7]], i32 2
756 ; ZEROTHRESH-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[ADD10]], i32 3
757 ; ZEROTHRESH-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[ADD13]], i32 4
758 ; ZEROTHRESH-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[ADD16]], i32 5
759 ; ZEROTHRESH-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[ADD19]], i32 6
760 ; ZEROTHRESH-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[ADD22]], i32 7
692 ; ZEROTHRESH-NEXT: [[TMP0:%.*]] = fadd <8 x float> %a, %b
693 ; ZEROTHRESH-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
694 ; ZEROTHRESH-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
695 ; ZEROTHRESH-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
696 ; ZEROTHRESH-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP2]], i32 1
697 ; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP0]], i32 2
698 ; ZEROTHRESH-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP3]], i32 2
699 ; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP0]], i32 3
700 ; ZEROTHRESH-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP4]], i32 3
701 ; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP0]], i32 4
702 ; ZEROTHRESH-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP5]], i32 4
703 ; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP0]], i32 5
704 ; ZEROTHRESH-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP6]], i32 5
705 ; ZEROTHRESH-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP0]], i32 6
706 ; ZEROTHRESH-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP7]], i32 6
707 ; ZEROTHRESH-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP0]], i32 7
708 ; ZEROTHRESH-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP8]], i32 7
761709 ; ZEROTHRESH-NEXT: ret <8 x float> [[VECINIT7_I]]
762710 ;
763711 entry: