llvm.org GIT mirror llvm / 27565d6
Thumb2: Modify codegen for memcpy intrinsic to prefer LDM/STM.

We were previously codegen'ing these as regular load/store operations and hoping that the register allocator would allocate registers in ascending order so that we could apply an LDM/STM combine after register allocation. According to the commit that first introduced this code (r37179), we planned to teach the register allocator to allocate the registers in ascending order. This never got implemented, and up to now we've been stuck with very poor codegen.

A much simpler approach for achieving better codegen is to create LDM/STM instructions with identical sets of virtual registers, let the register allocator pick arbitrary registers, and sort the register lists when printing an MCInst. This approach also avoids the need to repeatedly calculate offsets, which ultimately ought to be eliminated pre-RA in order to decrease register pressure.

This is implemented by lowering the memcpy intrinsic to a series of SD-only MCOPY pseudo-instructions, each of which performs a memory copy using a given number of registers. During SD->MI lowering, we lower MCOPY to LDM/STM. This is a little unusual, but it avoids the need to encode register lists in the SD, and we can take advantage of SD use lists to decide whether to use the _UPD variant of the instructions.

Fixes PR9199.

Differential Revision: http://reviews.llvm.org/D9508

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@238473 91177308-0d34-0410-b5e6-96231b3b80d8

Peter Collingbourne, 5 years ago
8 changed files with 205 additions and 42 deletions.
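The enabling trick mentioned in the message above — let the register allocator pick whichever registers it likes and only sort the register list when the MCInst is printed — is implemented in the ARMInstPrinter hunk below. A minimal standalone sketch of that sorting step, using plain register numbers in place of MCOperands and a stand-in for MCRegisterInfo::getEncodingValue (for the low ARM GPRs the hardware encoding simply equals the register number):

#include <algorithm>
#include <cstdio>
#include <vector>

// Stand-in for MCRegisterInfo::getEncodingValue on r0-r7 (illustrative only).
static unsigned encodingValue(unsigned Reg) { return Reg; }

int main() {
  // The registers as the allocator happened to assign them to an ldm/stm.
  std::vector<unsigned> Regs = {3, 1, 2};

  // Sort by encoding value so the printed list comes out in ascending order,
  // as the assembly syntax for LDM/STM register lists expects.
  std::sort(Regs.begin(), Regs.end(), [](unsigned A, unsigned B) {
    return encodingValue(A) < encodingValue(B);
  });

  std::printf("{");
  for (unsigned I = 0; I != Regs.size(); ++I)
    std::printf("%sr%u", I ? ", " : "", Regs[I]);
  std::printf("}\n"); // prints: {r1, r2, r3}
}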
11211121 case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
11221122 case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
11231123 case ARMISD::VBSL: return "ARMISD::VBSL";
1124 case ARMISD::MCOPY: return "ARMISD::MCOPY";
11241125 case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
11251126 case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
11261127 case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
76287629 }
76297630 }
76307631
7632 /// \brief Lowers MCOPY to either LDMIA/STMIA or LDMIA_UPD/STMIA_UPD depending
7633 /// on whether the result is used. This is done as a post-isel lowering instead
7634 /// of as a custom inserter because we need the use list from the SDNode.
7635 static void LowerMCOPY(const ARMSubtarget *Subtarget, MachineInstr *MI,
7636 SDNode *Node) {
7637 bool isThumb1 = Subtarget->isThumb1Only();
7638 bool isThumb2 = Subtarget->isThumb2();
7639 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
7640
7641 DebugLoc dl = MI->getDebugLoc();
7642 MachineBasicBlock *BB = MI->getParent();
7643 MachineFunction *MF = BB->getParent();
7644 MachineRegisterInfo &MRI = MF->getRegInfo();
7645
7646 MachineInstrBuilder LD, ST;
7647 if (isThumb1 || Node->hasAnyUseOfValue(1)) {
7648 LD = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD
7649 : isThumb1 ? ARM::tLDMIA_UPD
7650 : ARM::LDMIA_UPD))
7651 .addOperand(MI->getOperand(1));
7652 } else {
7653 LD = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA));
7654 }
7655
7656 if (isThumb1 || Node->hasAnyUseOfValue(0)) {
7657 ST = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD
7658 : isThumb1 ? ARM::tSTMIA_UPD
7659 : ARM::STMIA_UPD))
7660 .addOperand(MI->getOperand(0));
7661 } else {
7662 ST = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA));
7663 }
7664
7665 LD.addOperand(MI->getOperand(3)).addImm(ARMCC::AL).addReg(0);
7666 ST.addOperand(MI->getOperand(2)).addImm(ARMCC::AL).addReg(0);
7667
7668 for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) {
7669 unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
7670 : &ARM::GPRRegClass);
7671 LD.addReg(TmpReg, RegState::Define);
7672 ST.addReg(TmpReg, RegState::Kill);
7673 }
7674
7675 MI->eraseFromParent();
7676 }
7677
76317678 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
76327679 SDNode *Node) const {
7680 if (MI->getOpcode() == ARM::MCOPY) {
7681 LowerMCOPY(Subtarget, MI, Node);
7682 return;
7683 }
7684
76337685 const MCInstrDesc *MCID = &MI->getDesc();
76347686 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
76357687 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
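For reference when reading LowerMCOPY above: the operand indices it uses follow from the MCOPY pseudo-instruction added to ARMInstrInfo.td below (outs GPR:$newdst, GPR:$newsrc; ins GPR:$dst, GPR:$src, i32imm:$nreg). A hedged sketch naming those slots — the enum and its names are purely illustrative and not part of the patch:

// Illustrative only; the patch itself indexes these operands numerically.
enum MCOPYOperandIndex {
  MCOPY_NewDst  = 0, // def: updated dst pointer; becomes the STM writeback def
  MCOPY_NewSrc  = 1, // def: updated src pointer; becomes the LDM writeback def
  MCOPY_Dst     = 2, // use: destination base register (STM base)
  MCOPY_Src     = 3, // use: source base register (LDM base)
  MCOPY_NumRegs = 4  // use: immediate number of registers (words) to copy
};

The SDNode results follow the same order, which is why hasAnyUseOfValue(0) gates the STM writeback form and hasAnyUseOfValue(1) gates the LDM writeback form; the isThumb1 special case exists because the Thumb1 encodings used here (tLDMIA_UPD/tSTMIA_UPD) always write back the base register.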
187187
188188 // Vector bitwise select
189189 VBSL,
190
191 // Pseudo-instruction representing a memory copy using ldm/stm
192 // instructions.
193 MCOPY,
190194
191195 // Vector load N-element structure to all lanes:
192196 VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
7272 def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
7373 def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
7474
75 def SDT_ARMMCOPY : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
76 SDTCisVT<2, i32>, SDTCisVT<3, i32>,
77 SDTCisVT<4, i32>]>;
78
7579 def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
7680 [SDTCisSameAs<0, 2>,
7781 SDTCisSameAs<0, 3>,
177181
178182 def ARMvmaxnm : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>;
179183 def ARMvminnm : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>;
184
185 def ARMmcopy : SDNode<"ARMISD::MCOPY", SDT_ARMMCOPY,
186 [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
187 SDNPMayStore, SDNPMayLoad]>;
180188
181189 //===----------------------------------------------------------------------===//
182190 // ARM Instruction Predicate Definitions.
45514559 [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>;
45524560 }
45534561
4562 let hasPostISelHook = 1 in {
4563 def MCOPY : PseudoInst<
4564 (outs GPR:$newdst, GPR:$newsrc), (ins GPR:$dst, GPR:$src, i32imm:$nreg),
4565 NoItinerary,
4566 [(set GPR:$newdst, GPR:$newsrc, (ARMmcopy GPR:$dst, GPR:$src, imm:$nreg))]>;
4567 }
4568
45544569 def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
45554570 return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
45564571 }]>;
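Two notes on how the definitions above tie into the rest of the patch: the SDT_ARMMCOPY profile (two i32 results, three i32 operands) matches the ARMISD::MCOPY node built in ARMSelectionDAGInfo.cpp below — the results are the post-copy destination and source pointers, the operands are the current destination pointer, source pointer and register count, with the chain and glue carried by the extra MVT::Other/MVT::Glue result types. The hasPostISelHook flag is what routes each selected MCOPY through AdjustInstrPostInstrSelection, where LowerMCOPY above expands it into the LDM/STM pair.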
163163 unsigned VTSize = 4;
164164 unsigned i = 0;
165165 // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
166 const unsigned MAX_LOADS_IN_LDM = Subtarget.isThumb1Only() ? 4 : 6;
166 const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
167167 SDValue TFOps[6];
168168 SDValue Loads[6];
169169 uint64_t SrcOff = 0, DstOff = 0;
170170
171 // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
172 // same number of stores. The loads and stores will get combined into
173 // ldm/stm later on.
174 while (EmittedNumMemOps < NumMemOps) {
175 for (i = 0;
176 i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
177 Loads[i] = DAG.getLoad(VT, dl, Chain,
178 DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
179 DAG.getConstant(SrcOff, dl, MVT::i32)),
180 SrcPtrInfo.getWithOffset(SrcOff), isVolatile,
181 false, false, 0);
182 TFOps[i] = Loads[i].getValue(1);
183 SrcOff += VTSize;
184 }
185 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
186 makeArrayRef(TFOps, i));
187
188 for (i = 0;
189 i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
190 TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
191 DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
192 DAG.getConstant(DstOff, dl, MVT::i32)),
193 DstPtrInfo.getWithOffset(DstOff),
194 isVolatile, false, 0);
195 DstOff += VTSize;
196 }
197 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
198 makeArrayRef(TFOps, i));
199
200 EmittedNumMemOps += i;
171 // FIXME: We should invent a VMCOPY pseudo-instruction that lowers to
172 // VLDM/VSTM and make this code emit it when appropriate. This would reduce
173 // pressure on the general purpose registers. However this seems harder to map
174 // onto the register allocator's view of the world.
175
176 // The number of MCOPY pseudo-instructions to emit. We use up to MaxLoadsInLDM
177 // registers per mcopy, which will get lowered into ldm/stm later on. This is
178 // a lower bound on the number of MCOPY operations we must emit.
179 unsigned NumMCOPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
180
181 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
182
183 for (unsigned I = 0; I != NumMCOPYs; ++I) {
184 // Evenly distribute registers among MCOPY operations to reduce register
185 // pressure.
186 unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMCOPYs;
187 unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
188
189 Dst = DAG.getNode(ARMISD::MCOPY, dl, VTs, Chain, Dst, Src,
190 DAG.getConstant(NumRegs, dl, MVT::i32));
191 Src = Dst.getValue(1);
192 Chain = Dst.getValue(2);
193
194 DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
195 SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
196
197 EmittedNumMemOps = NextEmittedNumMemOps;
201198 }
202199
203200 if (BytesLeft == 0)
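The even-split loop above can be sanity-checked against the Thumb1 tests further down: Thumb1 uses MaxLoadsInLDM = 4, and each register moves 4 bytes. A minimal standalone sketch (not part of the patch) that reproduces the per-MCOPY register counts for those sizes:

#include <cstdio>

// Mirrors the NumMCOPYs / NextEmittedNumMemOps arithmetic above for a copy of
// 'Bytes' bytes made of 4-byte operations, with at most 4 registers per MCOPY
// (the Thumb1 value of MaxLoadsInLDM).
static void printSplit(unsigned Bytes) {
  const unsigned MaxLoadsInLDM = 4;
  unsigned NumMemOps = Bytes / 4;
  unsigned NumMCOPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
  std::printf("%u bytes:", Bytes);
  unsigned Emitted = 0;
  for (unsigned I = 0; I != NumMCOPYs; ++I) {
    unsigned Next = NumMemOps * (I + 1) / NumMCOPYs;
    std::printf(" %u", Next - Emitted); // registers in this ldm/stm pair
    Emitted = Next;
  }
  std::printf("\n");
}

int main() {
  printSplit(24); // 3 3   -- matches the two 3-register ldm/stm pairs in foo24
  printSplit(28); // 3 4   -- matches foo28
  printSplit(32); // 4 4   -- matches foo32
  printSplit(36); // 3 3 3 -- matches foo36
  printSplit(20); // 2 3   -- hypothetical size, shows the even split at work
  return 0;
}

For sizes that do not divide evenly, the even distribution keeps the peak number of scratch registers down: 20 bytes (5 word operations) becomes 2 + 3 rather than a greedy 4 + 1, so at most 3 scratch registers are live in any one MCOPY instead of 4.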
743743 const MCSubtargetInfo &STI,
744744 raw_ostream &O) {
745745 O << "{";
746 for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) {
747 if (i != OpNum)
746
747 // The backend may have given us a register list in non-ascending order. Sort
748 // it now.
749 std::vector<MCOperand> RegOps(MI->size() - OpNum);
750 std::copy(MI->begin() + OpNum, MI->end(), RegOps.begin());
751 std::sort(RegOps.begin(), RegOps.end(),
752 [this](const MCOperand &O1, const MCOperand &O2) -> bool {
753 return MRI.getEncodingValue(O1.getReg()) <
754 MRI.getEncodingValue(O2.getReg());
755 });
756
757 for (unsigned i = 0, e = RegOps.size(); i != e; ++i) {
758 if (i != 0)
748759 O << ", ";
749 printRegName(O, MI->getOperand(i).getReg());
760 printRegName(O, RegOps[i].getReg());
750761 }
751762 O << "}";
752763 }
124124 { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
125125 { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 },
126126 { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 },
127 // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent
127 // ARM::t2STMIA (with no basereg writeback) has no Thumb1 equivalent.
128 // Using tSTMIA_UPD changes the semantics (it writes back the base register),
129 // so it can only be used if the base register is killed. This difference is correctly handled elsewhere.
130 { ARM::t2STMIA, ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
128131 { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
129132 { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 }
130133 };
431434 isLdStMul = true;
432435 break;
433436 }
437 case ARM::t2STMIA: {
438 // If the base register is killed, we don't care what its value is after the
439 // instruction, so we can use an updating STMIA.
440 if (!MI->getOperand(0).isKill())
441 return false;
442
443 break;
444 }
434445 case ARM::t2LDMIA_RET: {
435446 unsigned BaseReg = MI->getOperand(1).getReg();
436447 if (BaseReg != ARM::SP)
488499 // Add the 16-bit load / store instruction.
489500 DebugLoc dl = MI->getDebugLoc();
490501 MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc));
502
503 // tSTMIA_UPD takes a defining register operand. We've already checked that
504 // the register is killed, so mark it as dead here.
505 if (Entry.WideOpc == ARM::t2STMIA)
506 MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead);
507
491508 if (!isLdStMul) {
492509 MIB.addOperand(MI->getOperand(0));
493510 MIB.addOperand(MI->getOperand(1));
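For example, a 32-bit 't2STMIA killed r0, {r1, r2, r3}' can now be narrowed to the 16-bit 'tSTMIA_UPD r0!, {r1, r2, r3}': since r0 is killed, nothing reads the written-back base value, so the extra definition of r0 on the narrowed instruction is simply marked dead (the RegState::Define | RegState::Dead operand added above).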
55 @b = external global i32*
66
77 ; Function Attrs: nounwind
8 define void @foo() #0 {
8 define void @foo24() #0 {
99 entry:
10 ; CHECK-LABEL: foo:
11 ; CHECK: ldr r[[SB:[0-9]]], .LCPI
10 ; CHECK-LABEL: foo24:
1211 ; CHECK: ldr r[[LB:[0-9]]], .LCPI
1312 ; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
14 ; CHECK-NEXT: ldm r[[NLB]],
13 ; CHECK: ldr r[[SB:[0-9]]], .LCPI
1514 ; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
16 ; CHECK-NEXT: stm r[[NSB]]
15 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
16 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
17 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
18 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
1719 %0 = load i32*, i32** @a, align 4
1820 %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
1921 %1 = bitcast i32* %arrayidx to i8*
2426 ret void
2527 }
2628
29 define void @foo28() #0 {
30 entry:
31 ; CHECK-LABEL: foo28:
32 ; CHECK: ldr r[[LB:[0-9]]], .LCPI
33 ; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
34 ; CHECK: ldr r[[SB:[0-9]]], .LCPI
35 ; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
36 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
37 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
38 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]}
39 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]}
40 %0 = load i32*, i32** @a, align 4
41 %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
42 %1 = bitcast i32* %arrayidx to i8*
43 %2 = load i32*, i32** @b, align 4
44 %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
45 %3 = bitcast i32* %arrayidx1 to i8*
46 tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 28, i32 4, i1 false)
47 ret void
48 }
49
50 define void @foo32() #0 {
51 entry:
52 ; CHECK-LABEL: foo32:
53 ; CHECK: ldr r[[LB:[0-9]]], .LCPI
54 ; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
55 ; CHECK: ldr r[[SB:[0-9]]], .LCPI
56 ; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
57 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]}
58 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]}
59 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]}
60 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]}
61 %0 = load i32*, i32** @a, align 4
62 %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
63 %1 = bitcast i32* %arrayidx to i8*
64 %2 = load i32*, i32** @b, align 4
65 %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
66 %3 = bitcast i32* %arrayidx1 to i8*
67 tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 32, i32 4, i1 false)
68 ret void
69 }
70
71 define void @foo36() #0 {
72 entry:
73 ; CHECK-LABEL: foo36:
74 ; CHECK: ldr r[[LB:[0-9]]], .LCPI
75 ; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
76 ; CHECK: ldr r[[SB:[0-9]]], .LCPI
77 ; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
78 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
79 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
80 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
81 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
82 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
83 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
84 %0 = load i32*, i32** @a, align 4
85 %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
86 %1 = bitcast i32* %arrayidx to i8*
87 %2 = load i32*, i32** @b, align 4
88 %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
89 %3 = bitcast i32* %arrayidx1 to i8*
90 tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 36, i32 4, i1 false)
91 ret void
92 }
93
2794 ; Function Attrs: nounwind
2895 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
66 entry:
77 ; CHECK-LABEL: t1:
88 ; CHECK: ldr r[[LB:[0-9]]],
9 ; CHECK-NEXT: ldr r[[SB:[0-9]]],
910 ; CHECK-NEXT: ldm r[[LB]]!,
10 ; CHECK-NEXT: ldr r[[SB:[0-9]]],
1111 ; CHECK-NEXT: stm r[[SB]]!,
1212 ; CHECK-NEXT: ldrb {{.*}}, [r[[LB]]]
1313 ; CHECK-NEXT: strb {{.*}}, [r[[SB]]]
2020 entry:
2121 ; CHECK-LABEL: t2:
2222 ; CHECK: ldr r[[LB:[0-9]]],
23 ; CHECK-NEXT: ldr r[[SB:[0-9]]],
2324 ; CHECK-NEXT: ldm r[[LB]]!,
24 ; CHECK-NEXT: ldr r[[SB:[0-9]]],
2525 ; CHECK-NEXT: stm r[[SB]]!,
2626 ; CHECK-NEXT: ldrh {{.*}}, [r[[LB]]]
2727 ; CHECK-NEXT: ldrb {{.*}}, [r[[LB]], #2]