llvm.org GIT mirror llvm / f5c04a9
Revert r238473, "Thumb2: Modify codegen for memcpy intrinsic to prefer LDM/STM." as it caused miscompilations and assertion failures (PR23768, http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20150601/280380.html). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239169 91177308-0d34-0410-b5e6-96231b3b80d8 Peter Collingbourne 5 years ago
8 changed file(s) with 42 addition(s) and 205 deletion(s). Raw diff Collapse all Expand all
11241124 case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
11251125 case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
11261126 case ARMISD::VBSL: return "ARMISD::VBSL";
1127 case ARMISD::MCOPY: return "ARMISD::MCOPY";
11281127 case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
11291128 case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
11301129 case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
76757674 }
76767675 }
76777676
7678 /// \brief Lowers MCOPY to either LDMIA/STMIA or LDMIA_UPD/STMIA_UPD depending
7679 /// on whether the result is used. This is done as a post-isel lowering instead
7680 /// of as a custom inserter because we need the use list from the SDNode.
7681 static void LowerMCOPY(const ARMSubtarget *Subtarget, MachineInstr *MI,
7682 SDNode *Node) {
7683 bool isThumb1 = Subtarget->isThumb1Only();
7684 bool isThumb2 = Subtarget->isThumb2();
7685 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
7686
7687 DebugLoc dl = MI->getDebugLoc();
7688 MachineBasicBlock *BB = MI->getParent();
7689 MachineFunction *MF = BB->getParent();
7690 MachineRegisterInfo &MRI = MF->getRegInfo();
7691
7692 MachineInstrBuilder LD, ST;
7693 if (isThumb1 || Node->hasAnyUseOfValue(1)) {
7694 LD = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD
7695 : isThumb1 ? ARM::tLDMIA_UPD
7696 : ARM::LDMIA_UPD))
7697 .addOperand(MI->getOperand(1));
7698 } else {
7699 LD = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA));
7700 }
7701
7702 if (isThumb1 || Node->hasAnyUseOfValue(0)) {
7703 ST = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD
7704 : isThumb1 ? ARM::tSTMIA_UPD
7705 : ARM::STMIA_UPD))
7706 .addOperand(MI->getOperand(0));
7707 } else {
7708 ST = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA));
7709 }
7710
7711 LD.addOperand(MI->getOperand(3)).addImm(ARMCC::AL).addReg(0);
7712 ST.addOperand(MI->getOperand(2)).addImm(ARMCC::AL).addReg(0);
7713
7714 for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) {
7715 unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
7716 : &ARM::GPRRegClass);
7717 LD.addReg(TmpReg, RegState::Define);
7718 ST.addReg(TmpReg, RegState::Kill);
7719 }
7720
7721 MI->eraseFromParent();
7722 }
7723
77247677 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
77257678 SDNode *Node) const {
7726 if (MI->getOpcode() == ARM::MCOPY) {
7727 LowerMCOPY(Subtarget, MI, Node);
7728 return;
7729 }
7730
77317679 const MCInstrDesc *MCID = &MI->getDesc();
77327680 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
77337681 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
187187
188188 // Vector bitwise select
189189 VBSL,
190
191 // Pseudo-instruction representing a memory copy using ldm/stm
192 // instructions.
193 MCOPY,
194190
195191 // Vector load N-element structure to all lanes:
196192 VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
7272 def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
7373 def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
7474
75 def SDT_ARMMCOPY : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
76 SDTCisVT<2, i32>, SDTCisVT<3, i32>,
77 SDTCisVT<4, i32>]>;
78
7975 def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
8076 [SDTCisSameAs<0, 2>,
8177 SDTCisSameAs<0, 3>,
181177
182178 def ARMvmaxnm : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>;
183179 def ARMvminnm : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>;
184
185 def ARMmcopy : SDNode<"ARMISD::MCOPY", SDT_ARMMCOPY,
186 [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
187 SDNPMayStore, SDNPMayLoad]>;
188180
189181 //===----------------------------------------------------------------------===//
190182 // ARM Instruction Predicate Definitions.
45854577 [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>;
45864578 }
45874579
4588 let hasPostISelHook = 1 in {
4589 def MCOPY : PseudoInst<
4590 (outs GPR:$newdst, GPR:$newsrc), (ins GPR:$dst, GPR:$src, i32imm:$nreg),
4591 NoItinerary,
4592 [(set GPR:$newdst, GPR:$newsrc, (ARMmcopy GPR:$dst, GPR:$src, imm:$nreg))]>;
4593 }
4594
45954580 def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
45964581 return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
45974582 }]>;
163163 unsigned VTSize = 4;
164164 unsigned i = 0;
165165 // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
166 const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
166 const unsigned MAX_LOADS_IN_LDM = Subtarget.isThumb1Only() ? 4 : 6;
167167 SDValue TFOps[6];
168168 SDValue Loads[6];
169169 uint64_t SrcOff = 0, DstOff = 0;
170170
171 // FIXME: We should invent a VMCOPY pseudo-instruction that lowers to
172 // VLDM/VSTM and make this code emit it when appropriate. This would reduce
173 // pressure on the general purpose registers. However this seems harder to map
174 // onto the register allocator's view of the world.
175
176 // The number of MCOPY pseudo-instructions to emit. We use up to MaxLoadsInLDM
177 // registers per mcopy, which will get lowered into ldm/stm later on. This is
178 // a lower bound on the number of MCOPY operations we must emit.
179 unsigned NumMCOPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
180
181 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
182
183 for (unsigned I = 0; I != NumMCOPYs; ++I) {
184 // Evenly distribute registers among MCOPY operations to reduce register
185 // pressure.
186 unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMCOPYs;
187 unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
188
189 Dst = DAG.getNode(ARMISD::MCOPY, dl, VTs, Chain, Dst, Src,
190 DAG.getConstant(NumRegs, dl, MVT::i32));
191 Src = Dst.getValue(1);
192 Chain = Dst.getValue(2);
193
194 DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
195 SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
196
197 EmittedNumMemOps = NextEmittedNumMemOps;
171 // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
172 // same number of stores. The loads and stores will get combined into
173 // ldm/stm later on.
174 while (EmittedNumMemOps < NumMemOps) {
175 for (i = 0;
176 i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
177 Loads[i] = DAG.getLoad(VT, dl, Chain,
178 DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
179 DAG.getConstant(SrcOff, dl, MVT::i32)),
180 SrcPtrInfo.getWithOffset(SrcOff), isVolatile,
181 false, false, 0);
182 TFOps[i] = Loads[i].getValue(1);
183 SrcOff += VTSize;
184 }
185 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
186 makeArrayRef(TFOps, i));
187
188 for (i = 0;
189 i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
190 TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
191 DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
192 DAG.getConstant(DstOff, dl, MVT::i32)),
193 DstPtrInfo.getWithOffset(DstOff),
194 isVolatile, false, 0);
195 DstOff += VTSize;
196 }
197 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
198 makeArrayRef(TFOps, i));
199
200 EmittedNumMemOps += i;
198201 }
199202
200203 if (BytesLeft == 0)
743743 const MCSubtargetInfo &STI,
744744 raw_ostream &O) {
745745 O << "{";
746
747 // The backend may have given us a register list in non-ascending order. Sort
748 // it now.
749 std::vector<MCOperand> RegOps(MI->size() - OpNum);
750 std::copy(MI->begin() + OpNum, MI->end(), RegOps.begin());
751 std::sort(RegOps.begin(), RegOps.end(),
752 [this](const MCOperand &O1, const MCOperand &O2) -> bool {
753 return MRI.getEncodingValue(O1.getReg()) <
754 MRI.getEncodingValue(O2.getReg());
755 });
756
757 for (unsigned i = 0, e = RegOps.size(); i != e; ++i) {
758 if (i != 0)
746 for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) {
747 if (i != OpNum)
759748 O << ", ";
760 printRegName(O, RegOps[i].getReg());
749 printRegName(O, MI->getOperand(i).getReg());
761750 }
762751 O << "}";
763752 }
124124 { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
125125 { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 },
126126 { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 },
127 // ARM::t2STMIA (with no basereg writeback) has no Thumb1 equivalent.
128 // tSTMIA_UPD is a change in semantics which can only be used if the base
129 // register is killed. This difference is correctly handled elsewhere.
130 { ARM::t2STMIA, ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
127 // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent
131128 { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
132129 { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 }
133130 };
434431 isLdStMul = true;
435432 break;
436433 }
437 case ARM::t2STMIA: {
438 // If the base register is killed, we don't care what its value is after the
439 // instruction, so we can use an updating STMIA.
440 if (!MI->getOperand(0).isKill())
441 return false;
442
443 break;
444 }
445434 case ARM::t2LDMIA_RET: {
446435 unsigned BaseReg = MI->getOperand(1).getReg();
447436 if (BaseReg != ARM::SP)
499488 // Add the 16-bit load / store instruction.
500489 DebugLoc dl = MI->getDebugLoc();
501490 MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc));
502
503 // tSTMIA_UPD takes a defining register operand. We've already checked that
504 // the register is killed, so mark it as dead here.
505 if (Entry.WideOpc == ARM::t2STMIA)
506 MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead);
507
508491 if (!isLdStMul) {
509492 MIB.addOperand(MI->getOperand(0));
510493 MIB.addOperand(MI->getOperand(1));
55 @b = external global i32*
66
77 ; Function Attrs: nounwind
8 define void @foo24() #0 {
8 define void @foo() #0 {
99 entry:
10 ; CHECK-LABEL: foo24:
10 ; CHECK-LABEL: foo:
11 ; CHECK: ldr r[[SB:[0-9]]], .LCPI
1112 ; CHECK: ldr r[[LB:[0-9]]], .LCPI
1213 ; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
13 ; CHECK: ldr r[[SB:[0-9]]], .LCPI
14 ; CHECK-NEXT: ldm r[[NLB]],
1415 ; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
15 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
16 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
17 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
18 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
16 ; CHECK-NEXT: stm r[[NSB]]
1917 %0 = load i32*, i32** @a, align 4
2018 %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
2119 %1 = bitcast i32* %arrayidx to i8*
2624 ret void
2725 }
2826
29 define void @foo28() #0 {
30 entry:
31 ; CHECK-LABEL: foo28:
32 ; CHECK: ldr r[[LB:[0-9]]], .LCPI
33 ; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
34 ; CHECK: ldr r[[SB:[0-9]]], .LCPI
35 ; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
36 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
37 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
38 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]}
39 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]}
40 %0 = load i32*, i32** @a, align 4
41 %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
42 %1 = bitcast i32* %arrayidx to i8*
43 %2 = load i32*, i32** @b, align 4
44 %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
45 %3 = bitcast i32* %arrayidx1 to i8*
46 tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 28, i32 4, i1 false)
47 ret void
48 }
49
50 define void @foo32() #0 {
51 entry:
52 ; CHECK-LABEL: foo32:
53 ; CHECK: ldr r[[LB:[0-9]]], .LCPI
54 ; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
55 ; CHECK: ldr r[[SB:[0-9]]], .LCPI
56 ; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
57 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]}
58 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]}
59 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]}
60 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]}
61 %0 = load i32*, i32** @a, align 4
62 %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
63 %1 = bitcast i32* %arrayidx to i8*
64 %2 = load i32*, i32** @b, align 4
65 %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
66 %3 = bitcast i32* %arrayidx1 to i8*
67 tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 32, i32 4, i1 false)
68 ret void
69 }
70
71 define void @foo36() #0 {
72 entry:
73 ; CHECK-LABEL: foo36:
74 ; CHECK: ldr r[[LB:[0-9]]], .LCPI
75 ; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
76 ; CHECK: ldr r[[SB:[0-9]]], .LCPI
77 ; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
78 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
79 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
80 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
81 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
82 ; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
83 ; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
84 %0 = load i32*, i32** @a, align 4
85 %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
86 %1 = bitcast i32* %arrayidx to i8*
87 %2 = load i32*, i32** @b, align 4
88 %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
89 %3 = bitcast i32* %arrayidx1 to i8*
90 tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 36, i32 4, i1 false)
91 ret void
92 }
93
9427 ; Function Attrs: nounwind
9528 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
66 entry:
77 ; CHECK-LABEL: t1:
88 ; CHECK: ldr r[[LB:[0-9]]],
9 ; CHECK-NEXT: ldm r[[LB]]!,
910 ; CHECK-NEXT: ldr r[[SB:[0-9]]],
10 ; CHECK-NEXT: ldm r[[LB]]!,
1111 ; CHECK-NEXT: stm r[[SB]]!,
1212 ; CHECK-NEXT: ldrb {{.*}}, [r[[LB]]]
1313 ; CHECK-NEXT: strb {{.*}}, [r[[SB]]]
2020 entry:
2121 ; CHECK-LABEL: t2:
2222 ; CHECK: ldr r[[LB:[0-9]]],
23 ; CHECK-NEXT: ldm r[[LB]]!,
2324 ; CHECK-NEXT: ldr r[[SB:[0-9]]],
24 ; CHECK-NEXT: ldm r[[LB]]!,
2525 ; CHECK-NEXT: stm r[[SB]]!,
2626 ; CHECK-NEXT: ldrh {{.*}}, [r[[LB]]]
2727 ; CHECK-NEXT: ldrb {{.*}}, [r[[LB]], #2]