llvm.org GIT mirror: llvm / commit 4102eb5

Fix memcpy lowering when addresses are 4-byte aligned but size is not a multiple of 4.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@43234 91177308-0d34-0410-b5e6-96231b3b80d8

Evan Cheng, 12 years ago
4 changed files with 103 additions and 42 deletions.
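The failure mode being fixed: LowerMEMCPYInline asserted that the copy size was a multiple of 4, so a word-aligned copy with an odd byte count tripped the assert (or, with assertions disabled, silently dropped the trailing bytes). A minimal C++ reproducer of that shape, as a hedged sketch (the type and function names are illustrative; the new test at the end of this commit exercises the same 11-byte, 8-byte-aligned case in LLVM IR):

#include <cstring>

// An 11-byte copy between 8-byte-aligned objects: the addresses are
// 4-byte aligned, but the size is not a multiple of 4. After this commit
// the ARM backend inlines it as two word load/store pairs (combinable
// into ldmia / stmia) plus ldrh/strh and ldrb/strb for the 3 trailing bytes.
struct alignas(8) X { char Bytes[11]; };

X Src, Dst;

void CopyX() {
  std::memcpy(Dst.Bytes, Src.Bytes, sizeof Src.Bytes); // Size = 11, Align = 8
}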
lib/Target/ARM/ARMISelLowering.cpp

@@ -1286,7 +1286,8 @@
   return DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Lo, Hi);
 }
 
-SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) {
+SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG,
+                                         const ARMSubtarget *ST) {
   SDOperand ChainOp = Op.getOperand(0);
   SDOperand DestOp = Op.getOperand(1);
   SDOperand SourceOp = Op.getOperand(2);
@@ -1304,25 +1305,18 @@
     assert(!AlwaysInline && "Cannot inline copy of unknown size");
     return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
   }
-  unsigned Size = I->getValue();
-
-  if (AlwaysInline)
-    return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG);
-
-  // The libc version is likely to be faster for the following cases. It can
+
+  // If not DWORD aligned or if size is more than the threshold, then call memcpy.
+  // The libc version is likely to be faster for these cases. It can
   // use the address value and run time information about the CPU.
   // With glibc 2.6.1 on a core 2, copying an array of 100M longs was 30% faster
-
-  // If not DWORD aligned, call memcpy.
-  if ((Align & 3) != 0)
-    return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
-
-  // If size is more than the threshold, call memcpy.
-  // if (Size > Subtarget->getMinRepStrSizeThreshold())
-  if (Size >= 64)
-    return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
-
-  return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG);
+  // FIXME: For now, we don't lower memcpy's to loads / stores for Thumb. Change
+  // this once Thumb ldmia / stmia support is added.
+  unsigned Size = I->getValue();
+  if (AlwaysInline ||
+      (!ST->isThumb() && Size < 64 && (Align & 3) == 0))
+    return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG);
+  return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
 }
 
 SDOperand ARMTargetLowering::LowerMEMCPYCall(SDOperand Chain,
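Restated outside the DAG machinery, the new dispatch reduces to a single predicate: inline when forced, or when the copy is small, word-aligned, and the target is not Thumb. A sketch as a reading aid, with names of my own choosing (this helper does not exist in LLVM):

// Sketch of the inline-vs-libcall decision above (names are mine, not LLVM's).
static bool ShouldInlineMemcpy(bool AlwaysInline, bool IsThumb,
                               unsigned Size, unsigned Align) {
  // Must inline when the caller demands it; otherwise only for small
  // (< 64 byte), 4-byte-aligned copies, and never for Thumb until
  // ldmia / stmia support is added there.
  return AlwaysInline || (!IsThumb && Size < 64 && (Align & 3) == 0);
}

Note that the predicate no longer tests Size & 3: with this commit the inline path handles the 1-3 trailing bytes itself, which is exactly the case the old assert in LowerMEMCPYInline rejected.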
@@ -1349,46 +1343,93 @@
                                              unsigned Size,
                                              unsigned Align,
                                              SelectionDAG &DAG) {
-
-  // Do repeated 4-byte loads and stores. To be improved.
-  assert((Size & 3) == 0);
-  assert((Align & 3) == 0);
+  // Do repeated 4-byte loads and stores. To be improved.
+  assert((Align & 3) == 0 && "Expected 4-byte aligned addresses!");
+  unsigned BytesLeft = Size & 3;
   unsigned NumMemOps = Size >> 2;
   unsigned EmittedNumMemOps = 0;
   unsigned SrcOff = 0, DstOff = 0;
   MVT::ValueType VT = MVT::i32;
   unsigned VTSize = 4;
+  unsigned i = 0;
   const unsigned MAX_LOADS_IN_LDM = 6;
-  SDOperand LoadChains[MAX_LOADS_IN_LDM];
+  SDOperand TFOps[MAX_LOADS_IN_LDM];
   SDOperand Loads[MAX_LOADS_IN_LDM];
 
-  // Emit up to 4 loads, then a TokenFactor barrier, then the same
-  // number of stores. The loads and stores will get combined into
+  // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
+  // same number of stores. The loads and stores will get combined into
   // ldm/stm later on.
-  while (EmittedNumMemOps < NumMemOps) {
-    unsigned i;
-    for (i = 0; i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+  while (EmittedNumMemOps < NumMemOps) {
+    for (i = 0;
+         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
       Loads[i] = DAG.getLoad(VT, Chain,
-                             DAG.getNode(ISD::ADD, VT, Source,
-                                         DAG.getConstant(SrcOff, VT)),
+                             DAG.getNode(ISD::ADD, MVT::i32, Source,
+                                         DAG.getConstant(SrcOff, MVT::i32)),
                              NULL, 0);
-      LoadChains[i] = Loads[i].getValue(1);
+      TFOps[i] = Loads[i].getValue(1);
      SrcOff += VTSize;
    }
-
-    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &LoadChains[0], i);
-
-    for (i = 0; i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
-      Chain = DAG.getStore(Chain, Loads[i],
-                           DAG.getNode(ISD::ADD, VT, Dest,
-                                       DAG.getConstant(DstOff, VT)),
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i);
+
+    for (i = 0;
+         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+      TFOps[i] = DAG.getStore(Chain, Loads[i],
+                              DAG.getNode(ISD::ADD, MVT::i32, Dest,
+                                          DAG.getConstant(DstOff, MVT::i32)),
                            NULL, 0);
      DstOff += VTSize;
    }
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i);
+
    EmittedNumMemOps += i;
  }
 
-  return Chain;
+  if (BytesLeft == 0)
+    return Chain;
+
+  // Issue loads / stores for the trailing (1 - 3) bytes.
+  unsigned BytesLeftSave = BytesLeft;
+  i = 0;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    Loads[i] = DAG.getLoad(VT, Chain,
+                           DAG.getNode(ISD::ADD, MVT::i32, Source,
+                                       DAG.getConstant(SrcOff, MVT::i32)),
+                           NULL, 0);
+    TFOps[i] = Loads[i].getValue(1);
+    ++i;
+    SrcOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i);
+
+  i = 0;
+  BytesLeft = BytesLeftSave;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    TFOps[i] = DAG.getStore(Chain, Loads[i],
+                            DAG.getNode(ISD::ADD, MVT::i32, Dest,
+                                        DAG.getConstant(DstOff, MVT::i32)),
+                            NULL, 0);
+    ++i;
+    DstOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  return DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i);
 }
 
 SDOperand ARMTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
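The rewritten LowerMEMCPYInline thus splits a copy into Size >> 2 word-sized load/store pairs, emitted in groups of up to MAX_LOADS_IN_LDM with a TokenFactor between the loads and the stores so they can later be merged into ldm/stm, followed by an i16 and/or i8 pair for the Size & 3 trailing bytes. A small self-contained sketch of that schedule (my helper, not code from the commit):

#include <cstdio>

// Prints the load/store schedule LowerMEMCPYInline would emit for a
// 4-byte-aligned copy of Size bytes (sketch only; not LLVM code).
static void PrintMemcpySchedule(unsigned Size) {
  unsigned NumMemOps = Size >> 2;  // 4-byte load/store pairs
  unsigned BytesLeft = Size & 3;   // 0-3 trailing bytes
  std::printf("%u x i32", NumMemOps);
  while (BytesLeft) {
    unsigned VTSize = BytesLeft >= 2 ? 2 : 1;  // an i16 op first, then an i8 op
    std::printf(" + 1 x i%u", VTSize * 8);
    BytesLeft -= VTSize;
  }
  std::printf("\n");
}

int main() {
  PrintMemcpySchedule(11);  // prints "2 x i32 + 1 x i16 + 1 x i8"
  return 0;
}

For the 11-byte copy in the new test below this yields two i32 pairs, one i16 pair, and one i8 pair, which is where the expected ldmia, ldrh, and ldrb come from.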
@@ -1418,7 +1459,7 @@
   case ISD::RETURNADDR: break;
   case ISD::FRAMEADDR: break;
   case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
-  case ISD::MEMCPY: return LowerMEMCPY(Op, DAG);
+  case ISD::MEMCPY: return LowerMEMCPY(Op, DAG, Subtarget);
   }
   return SDOperand();
 }
lib/Target/ARM/ARMISelLowering.h

@@ -129,11 +129,12 @@
     SDOperand LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                             SelectionDAG &DAG);
     SDOperand LowerToTLSExecModels(GlobalAddressSDNode *GA,
-                                    SelectionDAG &DAG);
+                                   SelectionDAG &DAG);
     SDOperand LowerGLOBAL_OFFSET_TABLE(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerBR_JT(SDOperand Op, SelectionDAG &DAG);
-    SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG,
+                          const ARMSubtarget *ST);
     SDOperand LowerMEMCPYCall(SDOperand Chain, SDOperand Dest,
                               SDOperand Source, SDOperand Count,
                               SelectionDAG &DAG);
lib/Target/ARM/README-Thumb.txt

@@ -220,3 +220,7 @@
 
 Make register allocator / spiller smarter so we can re-materialize "mov r, imm",
 etc. Almost all Thumb instructions clobber condition code.
+
+//===---------------------------------------------------------------------===//
+
+Add ldmia, stmia support.
test/CodeGen/ARM/memcpy-inline.ll (new file)

@@ -0,0 +1,15 @@
+; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldmia
+; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldrb
+; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldrh
+
+%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
+@src = external global %struct.x
+@dst = external global %struct.x
+
+define i32 @t() {
+entry:
+  call void @llvm.memcpy.i32( i8* getelementptr (%struct.x* @dst, i32 0, i32 0), i8* getelementptr (%struct.x* @src, i32 0, i32 0), i32 11, i32 8 )
+  ret i32 0
+}
+
+declare void @llvm.memcpy.i32(i8*, i8*, i32, i32)