[SLPVectorizer] Schedule bundle with different opcodes.

This change lets us schedule a bundle with different opcodes in it, for example: [load, add, add, add].

Reviewers: mkuper, RKSimon, ABataev, mzolotukhin, spatel, filcab
Subscribers: llvm-commits, rengolin
Differential Revision: https://reviews.llvm.org/D36518
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310847 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Dinar Temirbulatov
2 changed files with 193 additions and 52 deletions.
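The heart of the change: ScheduleData can now be looked up by a second key, the bundle's leading value (OpValue), so one instruction can carry separate scheduling state for each mixed-opcode bundle it participates in. As an illustration of the lookup scheme only, here is a minimal self-contained sketch with deliberately simplified, hypothetical types (the real code uses ScheduleDataMap plus the new ExtraScheduleDataMap, shown in the diff below):

#include <map>

struct ScheduleData {
  int SchedulingRegionID = 0;
  // dependency bookkeeping elided
};

using Value = void; // stand-in for llvm::Value

// Primary map: the ScheduleData an instruction owns for its own bundle.
std::map<Value *, ScheduleData *> ScheduleDataMap;
// Extra map: per-instruction ScheduleData keyed by the leading value of a
// bundle whose opcode differs, e.g. the load in [load, add, add, add].
std::map<Value *, std::map<Value *, ScheduleData *>> ExtraScheduleDataMap;

ScheduleData *getScheduleData(Value *V, Value *Key, int RegionID) {
  if (V == Key) { // V leads its own bundle: use the primary entry
    auto I = ScheduleDataMap.find(V);
    return I == ScheduleDataMap.end() ? nullptr : I->second;
  }
  auto I = ExtraScheduleDataMap.find(V);
  if (I != ExtraScheduleDataMap.end()) {
    ScheduleData *SD = I->second[Key];
    if (SD && SD->SchedulingRegionID == RegionID) // ignore stale regions
      return SD;
  }
  return nullptr;
}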
       return nullptr;
     }

+    ScheduleData *getScheduleData(Value *V, Value *Key) {
+      if (V == Key)
+        return getScheduleData(V);
+      auto I = ExtraScheduleDataMap.find(V);
+      if (I != ExtraScheduleDataMap.end()) {
+        ScheduleData *SD = I->second[Key];
+        if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+          return SD;
+      }
+      return nullptr;
+    }
+
     bool isInSchedulingRegion(ScheduleData *SD) {
       return SD->SchedulingRegionID == SchedulingRegionID;
     }
 @@ ... @@
       ScheduleData *BundleMember = SD;
       while (BundleMember) {
+        if (BundleMember->Inst != BundleMember->OpValue) {
+          BundleMember = BundleMember->NextInBundle;
+          continue;
+        }
         // Handle the def-use chain dependencies.
         for (Use &U : BundleMember->Inst->operands()) {
-          ScheduleData *OpDef = getScheduleData(U.get());
-          if (OpDef && OpDef->hasValidDependencies() &&
-              OpDef->incrementUnscheduledDeps(-1) == 0) {
-            // There are no more unscheduled dependencies after decrementing,
-            // so we can put the dependent instruction into the ready list.
-            ScheduleData *DepBundle = OpDef->FirstInBundle;
-            assert(!DepBundle->IsScheduled &&
-                   "already scheduled bundle gets ready");
-            ReadyList.insert(DepBundle);
-            DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
-          }
+          auto *I = dyn_cast<Instruction>(U.get());
+          if (!I)
+            continue;
+          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
+            if (OpDef && OpDef->hasValidDependencies() &&
+                OpDef->incrementUnscheduledDeps(-1) == 0) {
+              // There are no more unscheduled dependencies after
+              // decrementing, so we can put the dependent instruction
+              // into the ready list.
+              ScheduleData *DepBundle = OpDef->FirstInBundle;
+              assert(!DepBundle->IsScheduled &&
+                     "already scheduled bundle gets ready");
+              ReadyList.insert(DepBundle);
+              DEBUG(dbgs()
+                    << "SLP: gets ready (def): " << *DepBundle << "\n");
+            }
+          });
         }
         // Handle the memory dependencies.
         for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
 @@ ... @@
             assert(!DepBundle->IsScheduled &&
                    "already scheduled bundle gets ready");
             ReadyList.insert(DepBundle);
-            DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
+            DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle
+                         << "\n");
           }
         }
         BundleMember = BundleMember->NextInBundle;
       }
+    }
+
+    void doForAllOpcodes(Value *V,
+                         function_ref<void(ScheduleData *SD)> Action) {
+      if (ScheduleData *SD = getScheduleData(V))
+        Action(SD);
+      auto I = ExtraScheduleDataMap.find(V);
+      if (I != ExtraScheduleDataMap.end())
+        for (auto &P : I->second)
+          if (P.second->SchedulingRegionID == SchedulingRegionID)
+            Action(P.second);
     }

     /// Put all instructions into the ReadyList which are ready for scheduling.
     template <typename ReadyListType>
     void initialFillReadyList(ReadyListType &ReadyList) {
       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
-        ScheduleData *SD = getScheduleData(I);
-        if (SD->isSchedulingEntity() && SD->isReady()) {
-          ReadyList.insert(SD);
-          DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
-        }
+        doForAllOpcodes(I, [&ReadyList, I](ScheduleData *SD) {
+          if (SD->isSchedulingEntity() && SD->isReady()) {
+            ReadyList.insert(SD);
+            DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
+          }
+        });
       }
     }

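doForAllOpcodes is the new traversal primitive used throughout the rest of the patch: it applies an action to the instruction's primary ScheduleData and then to every extra entry registered under another bundle leader. A minimal sketch of that pattern, continuing the simplified hypothetical types and maps from the sketch above (not the LLVM API):

#include <functional>

void doForAllOpcodes(Value *V, int RegionID,
                     const std::function<void(ScheduleData *)> &Action) {
  auto I = ScheduleDataMap.find(V);
  if (I != ScheduleDataMap.end())
    Action(I->second); // the primary entry first
  auto E = ExtraScheduleDataMap.find(V);
  if (E != ExtraScheduleDataMap.end())
    for (auto &P : E->second) // then each (leader -> ScheduleData) entry
      if (P.second->SchedulingRegionID == RegionID)
        Action(P.second);
}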
 @@ ... @@
     /// Un-bundles a group of instructions.
     void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);

+    /// Allocates schedule data chunk.
+    ScheduleData *allocateScheduleDataChunks();
+
     /// Extends the scheduling region so that V is inside the region.
     /// \returns true if the region size is within the limit.
-    bool extendSchedulingRegion(Value *V);
+    bool extendSchedulingRegion(Value *V, Value *OpValue);

     /// Initialize the ScheduleData structures for new instructions in the
     /// scheduling region.
 @@ ... @@
     /// Note that the mapping survives during all vectorization iterations, i.e.
     /// ScheduleData structures are recycled.
     DenseMap<Value *, ScheduleData *> ScheduleDataMap;
+
+    /// Attaches ScheduleData to Instruction with the leading key.
+    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
+        ExtraScheduleDataMap;

     struct ReadyList : SmallVector<ScheduleData *, 8> {
       void insert(ScheduleData *SD) { push_back(SD); }
 @@ ... @@
   // Make sure that the scheduling region contains all
   // instructions of the bundle.
   for (Value *V : VL) {
-    if (!extendSchedulingRegion(V))
+    if (!extendSchedulingRegion(V, OpValue))
       return false;
   }

 @@ ... @@
     // It is seldom that this needs to be done a second time after adding the
     // initial bundle to the region.
     for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
-      ScheduleData *SD = getScheduleData(I);
-      SD->clearDependencies();
+      doForAllOpcodes(I, [](ScheduleData *SD) {
+        SD->clearDependencies();
+      });
     }
     ReSchedule = true;
   }
 @@ ... @@
   }
 }

-bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
-  if (getScheduleData(V))
+BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
+  // Allocate a new ScheduleData for the instruction.
+  if (ChunkPos >= ChunkSize) {
+    ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize));
+    ChunkPos = 0;
+  }
+  return &(ScheduleDataChunks.back()[ChunkPos++]);
+}
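allocateScheduleDataChunks simply extracts the bump-pointer chunk pool that initScheduleData previously used inline, so the lambda in extendSchedulingRegion below can reuse it. A self-contained sketch of the pattern (hypothetical chunk size; the real ChunkSize and ChunkPos are members of BlockScheduling): slots are handed out from fixed-size arrays that never move, so outstanding ScheduleData pointers stay valid as the pool grows.

#include <memory>
#include <vector>

struct ScheduleData { /* scheduling state */ };

constexpr int ChunkSize = 256; // assumed size, for illustration only
std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
int ChunkPos = ChunkSize; // start "full" so the first call allocates a chunk

ScheduleData *allocateScheduleDataChunks() {
  if (ChunkPos >= ChunkSize) { // current chunk exhausted: add another
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &ScheduleDataChunks.back()[ChunkPos++]; // bump within the chunk
}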
+
+bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
+                                                      Value *OpValue) {
+  if (getScheduleData(V, isOneOf(OpValue, V)))
     return true;
   Instruction *I = dyn_cast<Instruction>(V);
   assert(I && "bundle member must be an instruction");
   assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
+  auto &&CheckScheduleForI = [this, OpValue](Instruction *I) -> bool {
+    ScheduleData *ISD = getScheduleData(I);
+    if (!ISD)
+      return false;
+    assert(isInSchedulingRegion(ISD) &&
+           "ScheduleData not in scheduling region");
+    ScheduleData *SD = allocateScheduleDataChunks();
+    SD->Inst = I;
+    SD->init(SchedulingRegionID, OpValue);
+    ExtraScheduleDataMap[I][OpValue] = SD;
+    return true;
+  };
+  if (CheckScheduleForI(I))
+    return true;
   if (!ScheduleStart) {
     // It's the first instruction in the new region.
     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
     ScheduleStart = I;
     ScheduleEnd = I->getNextNode();
+    if (isOneOf(OpValue, I) != I)
+      CheckScheduleForI(I);
     assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
     DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
     return true;
 @@ ... @@
     if (&*UpIter == I) {
       initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
       ScheduleStart = I;
+      if (isOneOf(OpValue, I) != I)
+        CheckScheduleForI(I);
       DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
       return true;
     }
 @@ ... @@
     initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                      nullptr);
     ScheduleEnd = I->getNextNode();
+    if (isOneOf(OpValue, I) != I)
+      CheckScheduleForI(I);
     assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
     DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
     return true;
 @@ ... @@
           llvm::make_unique<ScheduleData[]>(ChunkSize));
       ChunkPos = 0;
     }
-    SD = &(ScheduleDataChunks.back()[ChunkPos++]);
+    SD = allocateScheduleDataChunks();
     ScheduleDataMap[I] = SD;
     SD->Inst = I;
   }
 @@ ... @@
       BundleMember->resetUnscheduledDeps();

       // Handle def-use chain dependencies.
-      for (User *U : BundleMember->Inst->users()) {
-        if (isa<Instruction>(U)) {
-          ScheduleData *UseSD = getScheduleData(U);
-          if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+      if (BundleMember->OpValue != BundleMember->Inst) {
+        ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
+        if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+          BundleMember->Dependencies++;
+          ScheduleData *DestBundle = UseSD->FirstInBundle;
+          if (!DestBundle->IsScheduled)
+            BundleMember->incrementUnscheduledDeps(1);
+          if (!DestBundle->hasValidDependencies())
+            WorkList.push_back(DestBundle);
+        }
+      } else {
+        for (User *U : BundleMember->Inst->users()) {
+          if (isa<Instruction>(U)) {
+            ScheduleData *UseSD = getScheduleData(U);
+            if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+              BundleMember->Dependencies++;
+              ScheduleData *DestBundle = UseSD->FirstInBundle;
+              if (!DestBundle->IsScheduled)
+                BundleMember->incrementUnscheduledDeps(1);
+              if (!DestBundle->hasValidDependencies())
+                WorkList.push_back(DestBundle);
+            }
+          } else {
+            // I'm not sure if this can ever happen. But we need to be safe.
+            // This lets the instruction/bundle never be scheduled and
+            // eventually disable vectorization.
             BundleMember->Dependencies++;
-            ScheduleData *DestBundle = UseSD->FirstInBundle;
-            if (!DestBundle->IsScheduled)
-              BundleMember->incrementUnscheduledDeps(1);
-            if (!DestBundle->hasValidDependencies())
-              WorkList.push_back(DestBundle);
+            BundleMember->incrementUnscheduledDeps(1);
           }
-        } else {
-          // I'm not sure if this can ever happen. But we need to be safe.
-          // This lets the instruction/bundle never be scheduled and
-          // eventually disable vectorization.
-          BundleMember->Dependencies++;
-          BundleMember->incrementUnscheduledDeps(1);
         }
       }

 @@ ... @@
   assert(ScheduleStart &&
          "tried to reset schedule on block which has not been scheduled");
   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
-    ScheduleData *SD = getScheduleData(I);
-    assert(isInSchedulingRegion(SD));
-    SD->IsScheduled = false;
-    SD->resetUnscheduledDeps();
+    doForAllOpcodes(I, [this](ScheduleData *SD) {
+      assert(isInSchedulingRegion(SD) &&
+             "ScheduleData not in scheduling region");
+      SD->IsScheduled = false;
+      SD->resetUnscheduledDeps();
+    });
   }
   ReadyInsts.clear();
 }
 @@ ... @@
   int NumToSchedule = 0;
   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
        I = I->getNextNode()) {
-    ScheduleData *SD = BS->getScheduleData(I);
-    assert(
-        SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr) &&
-        "scheduler and vectorizer have different opinion on what is a bundle");
-    SD->FirstInBundle->SchedulingPriority = Idx++;
-    if (SD->isSchedulingEntity()) {
-      BS->calculateDependencies(SD, false, this);
-      NumToSchedule++;
-    }
+    BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
+      assert(SD->isPartOfBundle() ==
+                 (getTreeEntry(SD->Inst) != nullptr) &&
+             "scheduler and vectorizer bundle mismatch");
+      SD->FirstInBundle->SchedulingPriority = Idx++;
+      if (SD->isSchedulingEntity()) {
+        BS->calculateDependencies(SD, false, this);
+        NumToSchedule++;
+      }
+    });
   }
   BS->initialFillReadyList(ReadyInsts);

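The second changed file is the regression test added with this commit. The first four load/lshr/xor/store lanes form the bundle with mixed opcodes and are expected to collapse into <4 x i32> operations, while lanes 4 and 5 stay scalar, as the autogenerated CHECK lines verify: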
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -slp-vectorizer -slp-vectorizer -mcpu=bdver1 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@a = common local_unnamed_addr global [1 x i32] zeroinitializer, align 4
@b = common local_unnamed_addr global [1 x i32] zeroinitializer, align 4

define i32 @slp_schedule_bundle() local_unnamed_addr #0 {
; CHECK-LABEL: @slp_schedule_bundle(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([1 x i32]* @b to <4 x i32>*), align 4
; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], <i32 31, i32 31, i32 31, i32 31>
; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[TMP1]]
; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
; CHECK-NEXT:    [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31
; CHECK-NEXT:    [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1
; CHECK-NEXT:    store i32 [[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4
; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4
; CHECK-NEXT:    [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31
; CHECK-NEXT:    [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1
; CHECK-NEXT:    store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4
; CHECK-NEXT:    ret i32 undef
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i64 0, i64 0), align 4
  %.lobit = lshr i32 %0, 31
  %.lobit.not = xor i32 %.lobit, 1
  store i32 %.lobit.not, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i64 0, i64 0), align 4
  %1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i64 1, i64 0), align 4
  %.lobit.1 = lshr i32 %1, 31
  %.lobit.not.1 = xor i32 %.lobit.1, 1
  store i32 %.lobit.not.1, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i64 1, i64 0), align 4
  %2 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 2, i64 0), align 4
  %.lobit.2 = lshr i32 %2, 31
  %.lobit.not.2 = xor i32 %.lobit.2, 1
  store i32 %.lobit.not.2, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 2, i64 0), align 4
  %3 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 3, i64 0), align 4
  %.lobit.3 = lshr i32 %3, 31
  %.lobit.not.3 = xor i32 %.lobit.3, 1
  store i32 %.lobit.not.3, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 3, i64 0), align 4
  %4 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
  %.lobit.4 = lshr i32 %4, 31
  %.lobit.not.4 = xor i32 %.lobit.4, 1
  store i32 %.lobit.not.4, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4
  %5 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4
  %.lobit.5 = lshr i32 %5, 31
  %.lobit.not.5 = xor i32 %.lobit.5, 1
  store i32 %.lobit.not.5, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4
  ret i32 undef
}