llvm.org GIT mirror: llvm / d2ce939

Add support to recognize and vectorize non-SIMD instructions in SLPVectorizer.

This patch adds support for recognizing patterns such as fadd,fsub,fadd,fsub... or add,sub,add,sub... and vectorizing them as vector shuffles when profitable. The resulting vector shuffles can later be matched to instructions such as addsubpd on X86.

Thanks to Arnold and Hal for the reviews. http://reviews.llvm.org/D4015

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211339 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Karthik Bhat

6 changed files with 445 additions and 46 deletions.
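For orientation, here is a hypothetical scalar pattern of the kind this commit targets (illustration only, not taken from the patch): a sub/add alternation over four lanes that the SLP vectorizer can now turn into one vector fsub, one vector fadd, and a shufflevector with mask <0, 5, 2, 7>, which X86 can subsequently lower to an addsub-style instruction such as addsubps.

// Hypothetical example, not part of the patch.
void subadd4(float *a, const float *b, const float *c) {
  a[0] = b[0] - c[0]; // even lanes: sub
  a[1] = b[1] + c[1]; // odd lanes:  add
  a[2] = b[2] - c[2];
  a[3] = b[3] + c[3];
}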
 enum ShuffleKind {
   SK_Broadcast,       ///< Broadcast element 0 to all other elements.
   SK_Reverse,         ///< Reverse the order of the vector.
+  SK_Alternate,       ///< Choose alternate elements from vector.
   SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
   SK_ExtractSubvector ///< ExtractSubvector Index indicates start offset.
 };
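As an illustration of what the new SK_Alternate kind selects, in the way this patch uses it: even result lanes come from the first source vector and odd result lanes from the second (mask <0, 5, 2, 7> for four lanes). A minimal standalone sketch, not LLVM code:

#include <array>

// Illustration only: the element selection performed by an alternate shuffle
// of width 4 -- lanes 0 and 2 from A, lanes 1 and 3 from B.
template <typename T>
std::array<T, 4> alternate4(const std::array<T, 4> &A, const std::array<T, 4> &B) {
  return {A[0], B[1], A[2], B[3]};
}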
 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
 /// are set if the result needs to be inserted and/or extracted from vectors.
 unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+/// Estimate the cost overhead of SK_Alternate shuffle.
+unsigned getAltShuffleOverhead(Type *Ty) const;
 
 const TargetLoweringBase *getTLI() const { return TM->getTargetLowering(); }
 
...
   return OpCost;
 }
 
+unsigned BasicTTI::getAltShuffleOverhead(Type *Ty) const {
+  assert(Ty->isVectorTy() && "Can only shuffle vectors");
+  unsigned Cost = 0;
+  // Shuffle cost is equal to the cost of extracting each element from its
+  // source argument plus the cost of inserting it into the result vector.
+
+  // e.g. a <4 x float> shuffle has a mask of <0,5,2,7>, i.e. we need to
+  // extract index 0 of the first vector, index 1 of the second vector,
+  // index 2 of the first vector and finally index 3 of the second vector,
+  // and insert them at indices <0,1,2,3> of the result vector.
+  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+    Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+    Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+  }
+  return Cost;
+}
+
 unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                   Type *SubTp) const {
+  if (Kind == SK_Alternate) {
+    return getAltShuffleOverhead(Tp);
+  }
   return 1;
 }
 
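A quick way to see what this generic fallback charges is the following standalone model (not the LLVM API): the SK_Alternate overhead is simply one extract plus one insert per result element, here with assumed unit per-element costs.

// Standalone cost model mirroring the generic getAltShuffleOverhead loop above.
unsigned altShuffleOverheadModel(unsigned NumElts, unsigned InsertCost = 1,
                                 unsigned ExtractCost = 1) {
  unsigned Cost = 0;
  for (unsigned i = 0; i < NumElts; ++i)
    Cost += InsertCost + ExtractCost; // extract from a source, insert into the result
  return Cost; // e.g. 8 for a <4 x float> with unit per-element costs
}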
 
 unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                 Type *SubTp) const {
-  // We only handle costs of reverse shuffles for now.
-  if (Kind != SK_Reverse)
+  // We only handle costs of reverse and alternate shuffles for now.
+  if (Kind != SK_Reverse && Kind != SK_Alternate)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 
-  static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
-    // Reverse shuffle costs one instruction if we are shuffling within a double
-    // word (vrev) or two if we shuffle a quad word (vrev, vext).
-    { ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 },
-    { ISD::VECTOR_SHUFFLE, MVT::v2f32, 1 },
-    { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 },
-    { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 },
-
-    { ISD::VECTOR_SHUFFLE, MVT::v4i32, 2 },
-    { ISD::VECTOR_SHUFFLE, MVT::v4f32, 2 },
-    { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 },
-    { ISD::VECTOR_SHUFFLE, MVT::v16i8, 2 }
-  };
-
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
-
-  int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
-  if (Idx == -1)
-    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
-
-  return LT.first * NEONShuffleTbl[Idx].Cost;
+  if (Kind == SK_Reverse) {
+    static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
+        // Reverse shuffle costs one instruction if we are shuffling within a
+        // double word (vrev) or two if we shuffle a quad word (vrev, vext).
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+    int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+    return LT.first * NEONShuffleTbl[Idx].Cost;
+  }
+  if (Kind == SK_Alternate) {
+    static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = {
+        // Alt shuffle cost table for ARM. Cost is the number of instructions
+        // required to create the shuffled vector.
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+    int Idx =
+        CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+    return LT.first * NEONAltShuffleTbl[Idx].Cost;
+  }
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
 unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
...
 
 unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                 Type *SubTp) const {
-  // We only estimate the cost of reverse shuffles.
-  if (Kind != SK_Reverse)
+  // We only estimate the cost of reverse and alternate shuffles.
+  if (Kind != SK_Reverse && Kind != SK_Alternate)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
-  unsigned Cost = 1;
-  if (LT.second.getSizeInBits() > 128)
-    Cost = 3; // Extract + insert + copy.
-
-  // Multiply by the number of parts.
-  return Cost * LT.first;
+  if (Kind == SK_Reverse) {
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+    unsigned Cost = 1;
+    if (LT.second.getSizeInBits() > 128)
+      Cost = 3; // Extract + insert + copy.
+
+    // Multiply by the number of parts.
+    return Cost * LT.first;
+  }
+
+  if (Kind == SK_Alternate) {
+    static const CostTblEntry<MVT::SimpleValueType> X86AltShuffleTbl[] = {
+        // Alt shuffle cost table for X86. Cost is the number of instructions
+        // required to create the shuffled vector.
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 8},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 49}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+    int Idx = CostTableLookup(X86AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+    return LT.first * X86AltShuffleTbl[Idx].Cost;
+  }
+
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
 unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
...
   return true;
 }
 
+/// \returns the opcode that can be paired with \p Op to create an alternate
+/// sequence which can later be merged as a ShuffleVector instruction.
+static unsigned getAltOpcode(unsigned Op) {
+  switch (Op) {
+  case Instruction::FAdd:
+    return Instruction::FSub;
+  case Instruction::FSub:
+    return Instruction::FAdd;
+  case Instruction::Add:
+    return Instruction::Sub;
+  case Instruction::Sub:
+    return Instruction::Add;
+  default:
+    return 0;
+  }
+}
+
+/// \returns true if opcode \p Op can be part of an alternate sequence
+/// which can later be merged as a ShuffleVector instruction.
+static bool canCombineAsAltInst(unsigned Op) {
+  if (Op == Instruction::FAdd || Op == Instruction::FSub ||
+      Op == Instruction::Sub || Op == Instruction::Add)
+    return true;
+  return false;
+}
+
+/// \returns Instruction::ShuffleVector if the instructions in \p VL form an
+/// alternating fadd,fsub / fsub,fadd / add,sub / sub,add sequence
+/// (i.e. opcodes of the form fadd,fsub,fadd,fsub...), and zero otherwise.
+static unsigned isAltInst(ArrayRef<Value *> VL) {
+  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+  unsigned Opcode = I0->getOpcode();
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  for (int i = 1, e = VL.size(); i < e; i++) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
+      return 0;
+  }
+  return Instruction::ShuffleVector;
+}
+
 /// \returns The opcode if all of the Instructions in \p VL have the same
 /// opcode, or zero.
 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
...
   unsigned Opcode = I0->getOpcode();
   for (int i = 1, e = VL.size(); i < e; i++) {
     Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (!I || Opcode != I->getOpcode())
+    if (!I || Opcode != I->getOpcode()) {
+      if (canCombineAsAltInst(Opcode) && i == 1)
+        return isAltInst(VL);
       return 0;
+    }
   }
   return Opcode;
 }
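The detection rule above can be summarized in a small self-contained sketch (plain enum values stand in for LLVM opcodes, and the helper names are illustrative, not from the patch): element 0 fixes the "main" opcode, every even position must repeat it, and every odd position must carry its alternate.

#include <cstddef>
#include <vector>

enum Op { Add, Sub, FAdd, FSub, Other };

// Alternate opcode for Op, or Other if Op cannot start an alternating bundle.
static Op altOf(Op O) {
  switch (O) {
  case Add:  return Sub;
  case Sub:  return Add;
  case FAdd: return FSub;
  case FSub: return FAdd;
  default:   return Other;
  }
}

static bool isAlternating(const std::vector<Op> &Ops) {
  if (Ops.empty() || altOf(Ops[0]) == Other)
    return false;
  for (std::size_t i = 1; i < Ops.size(); ++i)
    if (Ops[i] != ((i & 1) ? altOf(Ops[0]) : Ops[0]))
      return false;
  return true;
}

// isAlternating({Add, Sub, Add, Sub}) -> true  (becomes a ShuffleVector entry)
// isAlternating({Add, Add, Sub, Sub}) -> false (falls back to gathering)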
...
 
 /// \brief Perform LICM and CSE on the newly generated gather sequences.
 void optimizeGatherSequence();
+
 private:
   struct TreeEntry;
 
...
 
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
   bool SameTy = getSameType(VL); (void)SameTy;
+  bool isAltShuffle = false;
   assert(SameTy && "Invalid types!");
 
   if (Depth == RecursionMaxDepth) {
...
     newTreeEntry(VL, false);
     return;
   }
+  unsigned Opcode = getSameOpcode(VL);
+
+  // Check whether this ShuffleVector opcode stands for an alternating
+  // sequence of opcodes rather than for actual shufflevector instructions.
+  if (Opcode == Instruction::ShuffleVector) {
+    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+    unsigned Op = I0->getOpcode();
+    if (Op != Instruction::ShuffleVector)
+      isAltShuffle = true;
+  }
 
   // If all of the operands are identical or constant we have a simple solution.
-  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
-      !getSameOpcode(VL)) {
+  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
     newTreeEntry(VL, false);
     return;
...
   }
 
   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
-
-  unsigned Opcode = getSameOpcode(VL);
 
   // Check if it is safe to sink the loads or the stores.
   if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
...
     }
     return;
   }
+  case Instruction::ShuffleVector: {
+    // If this is not an alternating sequence of opcodes like add,sub then do
+    // not vectorize this instruction.
+    if (!isAltShuffle) {
+      newTreeEntry(VL, false);
+      DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+      return;
+    }
+    newTreeEntry(VL, true);
+    DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      ValueList Operands;
+      // Prepare the operand vector.
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+      buildTree_rec(Operands, Depth + 1);
+    }
+    return;
+  }
   default:
     newTreeEntry(VL, false);
     DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
...
     }
     return getGatherCost(E->Scalars);
   }
-
-  assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
-         "Invalid VL");
+  unsigned Opcode = getSameOpcode(VL);
+  assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
   Instruction *VL0 = cast<Instruction>(VL[0]);
-  unsigned Opcode = VL0->getOpcode();
   switch (Opcode) {
   case Instruction::PHI: {
     return 0;
...
 
     return VecCallCost - ScalarCallCost;
   }
+  case Instruction::ShuffleVector: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_AnyValue;
+    int ScalarCost = 0;
+    int VecCost = 0;
+    for (unsigned i = 0; i < VL.size(); ++i) {
+      Instruction *I = cast<Instruction>(VL[i]);
+      if (!I)
+        break;
+      ScalarCost +=
+          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+    }
+    // VecCost is equal to the sum of the cost of creating the two vectors
+    // and the cost of creating the shuffle.
+    Instruction *I0 = cast<Instruction>(VL[0]);
+    VecCost =
+        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+    Instruction *I1 = cast<Instruction>(VL[1]);
+    VecCost +=
+        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+    VecCost +=
+        TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+    return VecCost - ScalarCost;
+  }
   default:
     llvm_unreachable("Unknown instruction");
   }
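For a feel of what this new cost case returns, a back-of-the-envelope example with assumed unit scalar and vector arithmetic costs and the v4i32 SK_Alternate cost of 2 from the X86 table above; whether the whole tree vectorizes still depends on the other nodes (vectorized loads and stores).

// Hypothetical numbers for a 4-wide add,sub,add,sub bundle on x86 (v4i32):
constexpr int ScalarCost = 4 * 1;   // four scalar add/sub instructions
constexpr int VecCost = 1 + 1 + 2;  // vector add + vector sub + alternate shuffle
constexpr int NodeDelta = VecCost - ScalarCost; // 0 for this node in isolation
static_assert(NodeDelta == 0, "this node alone is cost-neutral here");
// The SLP cost model sums such deltas over the whole tree (including the
// usually negative deltas of the vectorized loads and stores) before deciding.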
...
     setInsertPointAfterBundle(E->Scalars);
     return Gather(E->Scalars, VecTy);
   }
-
-  unsigned Opcode = VL0->getOpcode();
-  assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
+  unsigned Opcode = getSameOpcode(E->Scalars);
 
   switch (Opcode) {
   case Instruction::PHI: {
...
     E->VectorizedValue = V;
     return V;
   }
+  case Instruction::ShuffleVector: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+      LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+      RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+    }
+    setInsertPointAfterBundle(E->Scalars);
+
+    Value *LHS = vectorizeTree(LHSVL);
+    Value *RHS = vectorizeTree(RHSVL);
+
+    if (Value *V = alreadyVectorized(E->Scalars))
+      return V;
+
+    // Create a vector of LHS op1 RHS
+    BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
+    Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+
+    // Create a vector of LHS op2 RHS
+    Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
+    BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
+    Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+
+    // Create the appropriate shuffle to take the alternate operations from
+    // the two vectors.
+    std::vector<Constant *> Mask(E->Scalars.size());
+    unsigned e = E->Scalars.size();
+    for (unsigned i = 0; i < e; ++i) {
+      if (i & 1)
+        Mask[i] = Builder.getInt32(e + i);
+      else
+        Mask[i] = Builder.getInt32(i);
+    }
+
+    Value *ShuffleMask = ConstantVector::get(Mask);
+
+    Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+    E->VectorizedValue = V;
+    if (Instruction *I = dyn_cast<Instruction>(V))
+      return propagateMetadata(I, E->Scalars);
+
+    return V;
+  }
   default:
     llvm_unreachable("unknown inst");
   }
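The mask the loop above produces can be sketched on its own (illustrative helper, not LLVM code): even lanes select from the first generated vector V0, odd lanes from the second vector V1, whose elements live at positions e..2e-1 of the shuffle index space.

#include <vector>

// Sketch of the alternate-shuffle mask for a bundle of width e.
std::vector<int> altMask(unsigned e) {
  std::vector<int> Mask(e);
  for (unsigned i = 0; i < e; ++i)
    Mask[i] = (i & 1) ? int(e + i) : int(i);
  return Mask; // e == 4  ->  {0, 5, 2, 7}
}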
...
   // For each lane:
   for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
     Value *Scalar = Entry->Scalars[Lane];
-
     // No need to handle users of gathered values.
     if (Entry->NeedToGather)
       continue;
...
   for (po_iterator<BasicBlock *> it = po_begin(&F.getEntryBlock()),
        e = po_end(&F.getEntryBlock()); it != e; ++it) {
     BasicBlock *BB = *it;
-
     // Vectorize trees that end at stores.
     if (unsigned count = collectStores(BB, R)) {
       (void)count;
; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@b = common global [4 x i32] zeroinitializer, align 16
@c = common global [4 x i32] zeroinitializer, align 16
@d = common global [4 x i32] zeroinitializer, align 16
@e = common global [4 x i32] zeroinitializer, align 16
@a = common global [4 x i32] zeroinitializer, align 16
@fb = common global [4 x float] zeroinitializer, align 16
@fc = common global [4 x float] zeroinitializer, align 16
@fa = common global [4 x float] zeroinitializer, align 16

; CHECK-LABEL: @addsub
; CHECK: %5 = add <4 x i32> %3, %4
; CHECK: %6 = add <4 x i32> %2, %5
; CHECK: %7 = sub <4 x i32> %2, %5
; CHECK: %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>

; Function Attrs: nounwind uwtable
define void @addsub() #0 {
entry:
%0 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 0), align 4
%1 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 0), align 4
%add = add nsw i32 %0, %1
%2 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 0), align 4
%3 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 0), align 4
%add1 = add nsw i32 %2, %3
%add2 = add nsw i32 %add, %add1
store i32 %add2, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 0), align 4
%4 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 1), align 4
%5 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 1), align 4
%add3 = add nsw i32 %4, %5
%6 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 1), align 4
%7 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 1), align 4
%add4 = add nsw i32 %6, %7
%sub = sub nsw i32 %add3, %add4
store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 1), align 4
%8 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 2), align 4
%9 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 2), align 4
%add5 = add nsw i32 %8, %9
%10 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 2), align 4
%11 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 2), align 4
%add6 = add nsw i32 %10, %11
%add7 = add nsw i32 %add5, %add6
store i32 %add7, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 2), align 4
%12 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 3), align 4
%13 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 3), align 4
%add8 = add nsw i32 %12, %13
%14 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 3), align 4
%15 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 3), align 4
%add9 = add nsw i32 %14, %15
%sub10 = sub nsw i32 %add8, %add9
store i32 %sub10, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 3), align 4
ret void
}

; CHECK-LABEL: @subadd
; CHECK: %5 = add <4 x i32> %3, %4
; CHECK: %6 = sub <4 x i32> %2, %5
; CHECK: %7 = add <4 x i32> %2, %5
; CHECK: %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>

; Function Attrs: nounwind uwtable
define void @subadd() #0 {
entry:
%0 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 0), align 4
%1 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 0), align 4
%add = add nsw i32 %0, %1
%2 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 0), align 4
%3 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 0), align 4
%add1 = add nsw i32 %2, %3
%sub = sub nsw i32 %add, %add1
store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 0), align 4
%4 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 1), align 4
%5 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 1), align 4
%add2 = add nsw i32 %4, %5
%6 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 1), align 4
%7 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 1), align 4
%add3 = add nsw i32 %6, %7
%add4 = add nsw i32 %add2, %add3
store i32 %add4, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 1), align 4
%8 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 2), align 4
%9 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 2), align 4
%add5 = add nsw i32 %8, %9
%10 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 2), align 4
%11 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 2), align 4
%add6 = add nsw i32 %10, %11
%sub7 = sub nsw i32 %add5, %add6
store i32 %sub7, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 2), align 4
%12 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 3), align 4
%13 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 3), align 4
%add8 = add nsw i32 %12, %13
%14 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 3), align 4
%15 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 3), align 4
%add9 = add nsw i32 %14, %15
%add10 = add nsw i32 %add8, %add9
store i32 %add10, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 3), align 4
ret void
}

; CHECK-LABEL: @faddfsub
; CHECK: %2 = fadd <4 x float> %0, %1
; CHECK: %3 = fsub <4 x float> %0, %1
; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; Function Attrs: nounwind uwtable
define void @faddfsub() #0 {
entry:
%0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
%1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
%add = fadd float %0, %1
store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
%2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
%3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
%sub = fsub float %2, %3
store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
%4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
%5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
%add1 = fadd float %4, %5
store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
%6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
%7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
%sub2 = fsub float %6, %7
store float %sub2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
ret void
}

; CHECK-LABEL: @fsubfadd
; CHECK: %2 = fsub <4 x float> %0, %1
; CHECK: %3 = fadd <4 x float> %0, %1
; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; Function Attrs: nounwind uwtable
define void @fsubfadd() #0 {
entry:
%0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
%1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
%sub = fsub float %0, %1
store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
%2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
%3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
%add = fadd float %2, %3
store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
%4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
%5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
%sub1 = fsub float %4, %5
store float %sub1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
%6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
%7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
%add2 = fadd float %6, %7
store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
ret void
}

; CHECK-LABEL: @No_faddfsub
; CHECK-NOT: fadd <4 x float>
; CHECK-NOT: fsub <4 x float>
; CHECK-NOT: shufflevector
; Function Attrs: nounwind uwtable
define void @No_faddfsub() #0 {
entry:
%0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
%1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
%add = fadd float %0, %1
store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
%2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
%3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
%add1 = fadd float %2, %3
store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
%4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
%5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
%add2 = fadd float %4, %5
store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
%6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
%7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
%sub = fsub float %6, %7
store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
ret void
}

attributes #0 = { nounwind }