llvm.org GIT mirror: llvm / b0c326b
[AArch64] Enable more load clustering in the MI Scheduler.

This patch adds unscaled loads and sign-extending loads to the TII
getMemOpBaseRegImmOfs API, which is used to control clustering in the MI
scheduler. This is done to create more opportunities for load pairing. I've
also added the scaled LDRSWui instruction, which was missing from the scaled
instructions. Finally, I've added support in shouldClusterLoads for
clustering adjacent sext and zext loads, which can also be paired by the
load/store optimizer.

Differential Revision: http://reviews.llvm.org/D18048

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@263819 91177308-0d34-0410-b5e6-96231b3b80d8

Chad Rosier, 3 years ago
4 changed files with 217 additions and 38 deletions.
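Before the diff, a minimal sketch of the codegen effect this enables
(illustrative only; registers and offsets are hypothetical, not taken from the
commit). Clustering places the two loads next to each other in the schedule,
so the load/store optimizer can later fuse them into a load pair:

    ldr w8, [x0, #4]        // first of two adjacent 32-bit loads
    ldr w9, [x0, #8]        // second load, at the next word
    // after pairing:
    ldp w8, w9, [x0, #4]    // one load-pair instruction replaces both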
lib/Target/AArch64/AArch64InstrInfo.cpp:

   return isUnscaledLdSt(MI->getOpcode());
 }

+// Is this a candidate for ld/st merging or pairing? For example, we don't
+// touch volatiles or load/stores that have a hint to avoid pair formation.
+bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr *MI) const {
+  // If this is a volatile load/store, don't mess with it.
+  if (MI->hasOrderedMemoryRef())
+    return false;
+
+  // Make sure this is a reg+imm (as opposed to an address reloc).
+  assert(MI->getOperand(1).isReg() && "Expected a reg operand.");
+  if (!MI->getOperand(2).isImm())
+    return false;
+
+  // Can't merge/pair if the instruction modifies the base register.
+  // e.g., ldr x0, [x0]
+  unsigned BaseReg = MI->getOperand(1).getReg();
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  if (MI->modifiesRegister(BaseReg, TRI))
+    return false;
+
+  // Check if this load/store has a hint to avoid pair formation.
+  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+  if (isLdStPairSuppressed(MI))
+    return false;
+
+  return true;
+}
+
 bool AArch64InstrInfo::getMemOpBaseRegImmOfs(
     MachineInstr *LdSt, unsigned &BaseReg, int64_t &Offset,
     const TargetRegisterInfo *TRI) const {
...
   case AArch64::LDRQui:
   case AArch64::LDRXui:
   case AArch64::LDRWui:
+  case AArch64::LDRSWui:
+  // Unscaled instructions.
+  case AArch64::LDURSi:
+  case AArch64::LDURDi:
+  case AArch64::LDURQi:
+  case AArch64::LDURWi:
+  case AArch64::LDURXi:
+  case AArch64::LDURSWi:
     unsigned Width;
     return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI);
   };
...
     break;
   case AArch64::LDRWui:
   case AArch64::LDRSui:
+  case AArch64::LDRSWui:
   case AArch64::STRWui:
   case AArch64::STRSui:
     Scale = Width = 4;
...
   BaseReg = LdSt->getOperand(1).getReg();
   Offset = LdSt->getOperand(2).getImm() * Scale;
   return true;
+}
+
+// Scale the unscaled offsets. Returns false if the unscaled offset can't be
+// scaled.
+static bool scaleOffset(unsigned Opc, int64_t &Offset) {
+  unsigned OffsetStride = 1;
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::LDURQi:
+    OffsetStride = 16;
+    break;
+  case AArch64::LDURXi:
+  case AArch64::LDURDi:
+    OffsetStride = 8;
+    break;
+  case AArch64::LDURWi:
+  case AArch64::LDURSi:
+  case AArch64::LDURSWi:
+    OffsetStride = 4;
+    break;
+  }
+  // If the byte-offset isn't a multiple of the stride, we can't scale this
+  // offset.
+  if (Offset % OffsetStride != 0)
+    return false;
+
+  // Convert the byte-offset used by unscaled into an "element" offset used
+  // by the scaled pair load/store instructions.
+  Offset /= OffsetStride;
+  return true;
+}
+
+static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
+  if (FirstOpc == SecondOpc)
+    return true;
+  // We can also pair sign-ext and zero-ext instructions.
+  switch (FirstOpc) {
+  default:
+    return false;
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+    return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
+  case AArch64::LDRSWui:
+  case AArch64::LDURSWi:
+    return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
+  }
+  // These instructions can't be paired based on their opcodes.
+  return false;
 }

 /// Detect opportunities for ldp/stp formation.
...
   // Only cluster up to a single pair.
   if (NumLoads > 1)
     return false;
-  if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode())
-    return false;
-  // getMemOpBaseRegImmOfs guarantees that oper 2 isImm.
-  unsigned Ofs1 = FirstLdSt->getOperand(2).getImm();
-  // Allow 6 bits of positive range.
-  if (Ofs1 > 64)
-    return false;
+
+  // Can we pair these instructions based on their opcodes?
+  unsigned FirstOpc = FirstLdSt->getOpcode();
+  unsigned SecondOpc = SecondLdSt->getOpcode();
+  if (!canPairLdStOpc(FirstOpc, SecondOpc))
+    return false;
+
+  // Can't merge volatiles or load/stores that have a hint to avoid pair
+  // formation, for example.
+  if (!isCandidateToMergeOrPair(FirstLdSt) ||
+      !isCandidateToMergeOrPair(SecondLdSt))
+    return false;
+
+  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
+  int64_t Offset1 = FirstLdSt->getOperand(2).getImm();
+  if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
+    return false;
+
+  int64_t Offset2 = SecondLdSt->getOperand(2).getImm();
+  if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
+    return false;
+
+  // Pairwise instructions have a 7-bit signed offset field.
+  if (Offset1 > 63 || Offset1 < -64)
+    return false;
+
   // The caller should already have ordered First/SecondLdSt by offset.
-  unsigned Ofs2 = SecondLdSt->getOperand(2).getImm();
-  return Ofs1 + 1 == Ofs2;
+  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+  return Offset1 + 1 == Offset2;
 }

 bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
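A worked example of the new unscaled-offset handling (mirroring the ldur_int
test added below): two LDURWi loads at byte offsets -8 and -4 both have
stride 4, so scaleOffset converts them to element offsets -2 and -1. Both
values fit the 7-bit signed field of the paired form, and -2 + 1 == -1, so
shouldClusterLoads treats them as adjacent and clusters them.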
lib/Target/AArch64/AArch64InstrInfo.h:


   /// Return true if this is an unscaled load/store.
   bool isUnscaledLdSt(MachineInstr *MI) const;
+
+  /// Return true if this is a load/store that can be potentially paired/merged.
+  bool isCandidateToMergeOrPair(MachineInstr *MI) const;

   /// Hint that pairing the given load or store is unprofitable.
   void suppressLdStPair(MachineInstr *MI) const;
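A minimal sketch of how a client pass might combine the new hook with the
existing TII query (a hypothetical caller; only the two signatures shown
above are assumed, with MI and TRI in scope):

    // Hypothetical caller: check whether MI is worth clustering/pairing.
    unsigned BaseReg;
    int64_t Offset;
    if (TII->isCandidateToMergeOrPair(MI) &&
        TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) {
      // MI is a pairable reg+imm load/store addressing BaseReg + Offset.
    }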
lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp:

   MachineBasicBlock::iterator
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
-
-  // Is this a candidate for ld/st merging or pairing? For example, we don't
-  // touch volatiles or load/stores that have a hint to avoid pair formation.
-  bool isCandidateToMergeOrPair(MachineInstr *MI);

   // Find and merge foldable ldr/str instructions.
   bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
...
   return false;
 }

-bool AArch64LoadStoreOpt::isCandidateToMergeOrPair(MachineInstr *MI) {
-  // If this is a volatile load/store, don't mess with it.
-  if (MI->hasOrderedMemoryRef())
-    return false;
-
-  // Make sure this is a reg+imm (as opposed to an address reloc).
-  if (!getLdStOffsetOp(MI).isImm())
-    return false;
-
-  // Can't merge/pair if the instruction modifies the base register.
-  // e.g., ldr x0, [x0]
-  unsigned BaseReg = getLdStBaseOp(MI).getReg();
-  if (MI->modifiesRegister(BaseReg, TRI))
-    return false;
-
-  // Check if this load/store has a hint to avoid pair formation.
-  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
-  if (TII->isLdStPairSuppressed(MI))
-    return false;
-
-  return true;
-}
-
 // Find narrow loads that can be converted into a single wider load with
 // bitfield extract instructions. Also merge adjacent zero stores into a wider
 // store.
...
   MachineInstr *MI = MBBI;
   MachineBasicBlock::iterator E = MI->getParent()->end();

-  if (!isCandidateToMergeOrPair(MI))
+  if (!TII->isCandidateToMergeOrPair(MI))
     return false;

   // For promotable zero stores, the stored value should be WZR.
...
   MachineInstr *MI = MBBI;
   MachineBasicBlock::iterator E = MI->getParent()->end();

-  if (!isCandidateToMergeOrPair(MI))
+  if (!TII->isCandidateToMergeOrPair(MI))
     return false;

   // Early exit if the offset is not possible to match. (6 bits of positive
...
test/CodeGen/AArch64/arm64-ldp-cluster.ll (new file):

; REQUIRES: asserts
; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s

; Test ldr clustering.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldr_int:BB#0
; CHECK: Cluster loads SU(1) - SU(2)
; CHECK: SU(1): %vreg{{[0-9]+}} = LDRWui
; CHECK: SU(2): %vreg{{[0-9]+}} = LDRWui
define i32 @ldr_int(i32* %a) nounwind {
  %p1 = getelementptr inbounds i32, i32* %a, i32 1
  %tmp1 = load i32, i32* %p1, align 2
  %p2 = getelementptr inbounds i32, i32* %a, i32 2
  %tmp2 = load i32, i32* %p2, align 2
  %tmp3 = add i32 %tmp1, %tmp2
  ret i32 %tmp3
}

; Test ldpsw clustering.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldp_sext_int:BB#0
; CHECK: Cluster loads SU(1) - SU(2)
; CHECK: SU(1): %vreg{{[0-9]+}} = LDRSWui
; CHECK: SU(2): %vreg{{[0-9]+}} = LDRSWui
define i64 @ldp_sext_int(i32* %p) nounwind {
  %tmp = load i32, i32* %p, align 4
  %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
  %tmp1 = load i32, i32* %add.ptr, align 4
  %sexttmp = sext i32 %tmp to i64
  %sexttmp1 = sext i32 %tmp1 to i64
  %add = add nsw i64 %sexttmp1, %sexttmp
  ret i64 %add
}

; Test ldur clustering.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldur_int:BB#0
; CHECK: Cluster loads SU(2) - SU(1)
; CHECK: SU(1): %vreg{{[0-9]+}} = LDURWi
; CHECK: SU(2): %vreg{{[0-9]+}} = LDURWi
define i32 @ldur_int(i32* %a) nounwind {
  %p1 = getelementptr inbounds i32, i32* %a, i32 -1
  %tmp1 = load i32, i32* %p1, align 2
  %p2 = getelementptr inbounds i32, i32* %a, i32 -2
  %tmp2 = load i32, i32* %p2, align 2
  %tmp3 = add i32 %tmp1, %tmp2
  ret i32 %tmp3
}

; Test sext + zext clustering.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldp_half_sext_zext_int:BB#0
; CHECK: Cluster loads SU(3) - SU(4)
; CHECK: SU(3): %vreg{{[0-9]+}} = LDRSWui
; CHECK: SU(4): %vreg{{[0-9]+}}:sub_32 = LDRWui
define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind {
  %tmp0 = load i64, i64* %q, align 4
  %tmp = load i32, i32* %p, align 4
  %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
  %tmp1 = load i32, i32* %add.ptr, align 4
  %sexttmp = sext i32 %tmp to i64
  %sexttmp1 = zext i32 %tmp1 to i64
  %add = add nsw i64 %sexttmp1, %sexttmp
  %add1 = add nsw i64 %add, %tmp0
  ret i64 %add1
}

; Test zext + sext clustering.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldp_half_zext_sext_int:BB#0
; CHECK: Cluster loads SU(3) - SU(4)
; CHECK: SU(3): %vreg{{[0-9]+}}:sub_32 = LDRWui
; CHECK: SU(4): %vreg{{[0-9]+}} = LDRSWui
define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind {
  %tmp0 = load i64, i64* %q, align 4
  %tmp = load i32, i32* %p, align 4
  %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
  %tmp1 = load i32, i32* %add.ptr, align 4
  %sexttmp = zext i32 %tmp to i64
  %sexttmp1 = sext i32 %tmp1 to i64
  %add = add nsw i64 %sexttmp1, %sexttmp
  %add1 = add nsw i64 %add, %tmp0
  ret i64 %add1
}

; Verify we don't cluster volatile loads.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldr_int_volatile:BB#0
; CHECK-NOT: Cluster loads
; CHECK: SU(1): %vreg{{[0-9]+}} = LDRWui
; CHECK: SU(2): %vreg{{[0-9]+}} = LDRWui
define i32 @ldr_int_volatile(i32* %a) nounwind {
  %p1 = getelementptr inbounds i32, i32* %a, i32 1
  %tmp1 = load volatile i32, i32* %p1, align 2
  %p2 = getelementptr inbounds i32, i32* %a, i32 2
  %tmp2 = load volatile i32, i32* %p2, align 2
  %tmp3 = add i32 %tmp1, %tmp2
  ret i32 %tmp3
}