Factor a bunch of functionality related to memcpy and memset transforms out of GVN and into its own pass.
Owen Anderson
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@49419 91177308-0d34-0410-b5e6-96231b3b80d8
19 changed files with 1039 additions and 877 deletions.
111111 (void) llvm::createPredicateSimplifierPass();
112112 (void) llvm::createCodeGenPreparePass();
113113 (void) llvm::createGVNPass();
114 (void) llvm::createMemCpyOptPass();
114115
115116 (void)new llvm::IntervalPartition();
116117 (void)new llvm::FindUsedTypes();
304304
305305 //===----------------------------------------------------------------------===//
306306 //
307 // MemCpyOpt - This pass performs optimizations related to eliminating memcpy
308 // calls and/or combining multiple stores into memset's.
309 //
310 FunctionPass *createMemCpyOptPass();
311
312 //===----------------------------------------------------------------------===//
313 //
307314 // CodeGenPrepare - This pass prepares a function for instruction selection.
308315 //
309316 FunctionPass *createCodeGenPreparePass(const TargetLowering *TLI = 0);
4141
4242 STATISTIC(NumGVNInstr, "Number of instructions deleted");
4343 STATISTIC(NumGVNLoad, "Number of loads deleted");
44 STATISTIC(NumMemSetInfer, "Number of memsets inferred");
45
46 namespace {
47   cl::opt<bool>
48 FormMemSet("form-memset-from-stores",
49 cl::desc("Transform straight-line stores to memsets"),
50 cl::init(true), cl::Hidden);
51 }
5244
5345 //===----------------------------------------------------------------------===//
5446 // ValueTable Class
667659     bool processLoad(LoadInst* L,
668660                      DenseMap<Value*, LoadInst*> &lastLoad,
669661                      SmallVectorImpl<Instruction*> &toErase);
670        bool processStore(StoreInst *SI, SmallVectorImpl<Instruction*> &toErase);
671662     bool processInstruction(Instruction* I,
672663                             ValueNumberedSet& currAvail,
673664                             DenseMap<Value*, LoadInst*>& lastSeenLoad,
674665                             SmallVectorImpl<Instruction*> &toErase);
675666     bool processNonLocalLoad(LoadInst* L,
676667                              SmallVectorImpl<Instruction*> &toErase);
677        bool processMemCpy(MemCpyInst* M, MemCpyInst* MDep,
678                           SmallVectorImpl<Instruction*> &toErase);
679        bool performCallSlotOptzn(MemCpyInst* cpy, CallInst* C,
680                                  SmallVectorImpl<Instruction*> &toErase);
681668     Value *GetValueForBlock(BasicBlock *BB, LoadInst* orig,
682669                             DenseMap<BasicBlock*, Value*> &Phis,
683670                             bool top_level = false);
982969 return deletedLoad;
983970 }
984971
985 /// isBytewiseValue - If the specified value can be set by repeating the same
986 /// byte in memory, return the i8 value that it is represented with. This is
987 /// true for all i8 values obviously, but is also true for i32 0, i32 -1,
988 /// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated
989 /// byte store (e.g. i16 0x1234), return null.
990 static Value *isBytewiseValue(Value *V) {
991 // All byte-wide stores are splatable, even of arbitrary variables.
992 if (V->getType() == Type::Int8Ty) return V;
993
994 // Constant float and double values can be handled as integer values if the
995 // corresponding integer value is "byteable". An important case is 0.0.
996 if (ConstantFP *CFP = dyn_cast(V)) {
997 if (CFP->getType() == Type::FloatTy)
998 V = ConstantExpr::getBitCast(CFP, Type::Int32Ty);
999 if (CFP->getType() == Type::DoubleTy)
1000 V = ConstantExpr::getBitCast(CFP, Type::Int64Ty);
1001 // Don't handle long double formats, which have strange constraints.
1002 }
1003
1004 // We can handle constant integers that are power of two in size and a
1005 // multiple of 8 bits.
1006 if (ConstantInt *CI = dyn_cast(V)) {
1007 unsigned Width = CI->getBitWidth();
1008 if (isPowerOf2_32(Width) && Width > 8) {
1009 // We can handle this value if the recursive binary decomposition is the
1010 // same at all levels.
1011 APInt Val = CI->getValue();
1012 APInt Val2;
1013 while (Val.getBitWidth() != 8) {
1014 unsigned NextWidth = Val.getBitWidth()/2;
1015 Val2 = Val.lshr(NextWidth);
1016 Val2.trunc(Val.getBitWidth()/2);
1017 Val.trunc(Val.getBitWidth()/2);
1018
1019 // If the top/bottom halves aren't the same, reject it.
1020 if (Val != Val2)
1021 return 0;
1022 }
1023 return ConstantInt::get(Val);
1024 }
1025 }
1026
1027 // Conceptually, we could handle things like:
1028 // %a = zext i8 %X to i16
1029 // %b = shl i16 %a, 8
1030 // %c = or i16 %a, %b
1031 // but until there is an example that actually needs this, it doesn't seem
1032 // worth worrying about.
1033 return 0;
1034 }
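
A standalone restatement of the halving check above may help: the function keeps comparing the top and bottom halves of the value until only one byte is left. The sketch below is the editor's illustration in plain C++ (no LLVM types; the function name, the use of uint64_t, and the test values other than 0xF0F0/0x1234 are illustrative, not from the commit).

//===--- Editor's illustrative sketch (not part of r49419) ---===//
#include <cassert>
#include <cstdint>
#include <cstdio>

// Return true and set Byte if Val (of width Bits, a power of two >= 8)
// is a splat of one repeated byte, mirroring the recursive halving above.
static bool isBytewiseSplat(uint64_t Val, unsigned Bits, uint8_t &Byte) {
  while (Bits != 8) {
    unsigned Half = Bits / 2;
    uint64_t Mask = (1ULL << Half) - 1;
    uint64_t Lo = Val & Mask;
    uint64_t Hi = (Val >> Half) & Mask;
    if (Lo != Hi)
      return false;              // top/bottom halves differ -> not a splat
    Val = Lo;
    Bits = Half;
  }
  Byte = static_cast<uint8_t>(Val);
  return true;
}

int main() {
  uint8_t B;
  assert(isBytewiseSplat(0x0000000000000000ULL, 64, B) && B == 0x00); // i64 0
  assert(isBytewiseSplat(0xF0F0, 16, B) && B == 0xF0);                // i16 0xF0F0
  assert(!isBytewiseSplat(0x1234, 16, B));                            // not splatable
  std::printf("ok\n");
  return 0;
}
//===----------------------------------------------------------===//
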
1035
1036 static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx,
1037 bool &VariableIdxFound, TargetData &TD) {
1038 // Skip over the first indices.
1039 gep_type_iterator GTI = gep_type_begin(GEP);
1040 for (unsigned i = 1; i != Idx; ++i, ++GTI)
1041 /*skip along*/;
1042
1043 // Compute the offset implied by the rest of the indices.
1044 int64_t Offset = 0;
1045 for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
1046 ConstantInt *OpC = dyn_cast(GEP->getOperand(i));
1047 if (OpC == 0)
1048 return VariableIdxFound = true;
1049 if (OpC->isZero()) continue; // No offset.
1050
1051 // Handle struct indices, which add their field offset to the pointer.
1052 if (const StructType *STy = dyn_cast(*GTI)) {
1053 Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
1054 continue;
1055 }
1056
1057 // Otherwise, we have a sequential type like an array or vector. Multiply
1058 // the index by the ElementSize.
1059 uint64_t Size = TD.getABITypeSize(GTI.getIndexedType());
1060 Offset += Size*OpC->getSExtValue();
1061 }
1062
1063 return Offset;
1064 }
1065
1066 /// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a
1067 /// constant offset, and return that constant offset. For example, Ptr1 might
1068 /// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8.
1069 static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
1070 TargetData &TD) {
1071 // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical
1072 // base. After that base, they may have some number of common (and
1073 // potentially variable) indices. After that they handle some constant
1074 // offset, which determines their offset from each other. At this point, we
1075 // handle no other case.
1076 GetElementPtrInst *GEP1 = dyn_cast(Ptr1);
1077 GetElementPtrInst *GEP2 = dyn_cast(Ptr2);
1078 if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0))
1079 return false;
1080
1081 // Skip any common indices and track the GEP types.
1082 unsigned Idx = 1;
1083 for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
1084 if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
1085 break;
1086
1087 bool VariableIdxFound = false;
1088 int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD);
1089 int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD);
1090 if (VariableIdxFound) return false;
1091
1092 Offset = Offset2-Offset1;
1093 return true;
1094 }
1095
1096
1097 /// MemsetRange - Represents a range of memset'd bytes with the ByteVal value.
1098 /// This allows us to analyze stores like:
1099 /// store 0 -> P+1
1100 /// store 0 -> P+0
1101 /// store 0 -> P+3
1102 /// store 0 -> P+2
1103 /// which sometimes happens with stores to arrays of structs etc. When we see
1104 /// the first store, we make a range [1, 2). The second store extends the range
1105 /// to [0, 2). The third makes a new range [3, 4). The fourth store joins the
1106 /// two ranges into [0, 4) which is memset'able.
1107 namespace {
1108 struct MemsetRange {
1109 // Start/End - A semi range that describes the span that this range covers.
1110 // The range is closed at the start and open at the end: [Start, End).
1111 int64_t Start, End;
1112
1113 /// StartPtr - The getelementptr instruction that points to the start of the
1114 /// range.
1115 Value *StartPtr;
1116
1117 /// Alignment - The known alignment of the first store.
1118 unsigned Alignment;
1119
1120 /// TheStores - The actual stores that make up this range.
1121 SmallVector TheStores;
1122
1123 bool isProfitableToUseMemset(const TargetData &TD) const;
1124
1125 };
1126 } // end anon namespace
1127
1128 bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const {
1129 // If we found more than 8 stores to merge or 64 bytes, use memset.
1130 if (TheStores.size() >= 8 || End-Start >= 64) return true;
1131
1132 // Assume that the code generator is capable of merging pairs of stores
1133 // together if it wants to.
1134 if (TheStores.size() <= 2) return false;
1135
1136 // If we have fewer than 8 stores, it can still be worthwhile to do this.
1137 // For example, merging 4 i8 stores into an i32 store is useful almost always.
1138 // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
1139 // memset will be split into 2 32-bit stores anyway) and doing so can
1140 // pessimize the llvm optimizer.
1141 //
1142 // Since we don't have perfect knowledge here, make some assumptions: assume
1143 // the maximum GPR width is the same size as the pointer size and assume that
1144 // this width can be stored. If so, check to see whether we will end up
1145 // actually reducing the number of stores used.
1146 unsigned Bytes = unsigned(End-Start);
1147 unsigned NumPointerStores = Bytes/TD.getPointerSize();
1148
1149 // Assume the remaining bytes if any are done a byte at a time.
1150 unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize();
1151
1152 // If we will reduce the # stores (according to this heuristic), do the
1153 // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
1154 // etc.
1155 return TheStores.size() > NumPointerStores+NumByteStores;
1156 }
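
To make the heuristic concrete, a small standalone calculation (editor's sketch in plain C++; the helper name and the assumed 4-byte pointer, matching the 32-bit targets in the tests below, are the editor's): four adjacent i8 stores of the same byte give Bytes = 4, NumPointerStores = 1, NumByteStores = 0, and 4 > 1, so the range is worth a memset; three such stores give 3 > 3, which fails, so they are left alone.

//===--- Editor's illustrative sketch (not part of r49419) ---===//
#include <cstdio>

// Rough restatement of isProfitableToUseMemset with an assumed pointer size.
static bool worthMemset(unsigned NumStores, unsigned Bytes, unsigned PtrSize) {
  if (NumStores >= 8 || Bytes >= 64) return true;
  if (NumStores <= 2) return false;
  unsigned NumPointerStores = Bytes / PtrSize;
  unsigned NumByteStores = Bytes - NumPointerStores * PtrSize;
  return NumStores > NumPointerStores + NumByteStores;
}

int main() {
  // Four adjacent i8 stores vs. one i32 store on a 32-bit target: profitable.
  std::printf("4 x i8 : %d\n", worthMemset(4, 4, 4));   // 1
  // Three adjacent i8 stores vs. three byte stores: not profitable.
  std::printf("3 x i8 : %d\n", worthMemset(3, 3, 4));   // 0
  // Nineteen i8 stores (the form-memset.ll test below): trivially profitable.
  std::printf("19 x i8: %d\n", worthMemset(19, 19, 4)); // 1
  return 0;
}
//===----------------------------------------------------------===//
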
1157
1158
1159 namespace {
1160 class MemsetRanges {
1161 /// Ranges - A sorted list of the memset ranges. We use std::list here
1162 /// because each element is relatively large and expensive to copy.
1163 std::list Ranges;
1164 typedef std::list::iterator range_iterator;
1165 TargetData &TD;
1166 public:
1167 MemsetRanges(TargetData &td) : TD(td) {}
1168
1169 typedef std::list::const_iterator const_iterator;
1170 const_iterator begin() const { return Ranges.begin(); }
1171 const_iterator end() const { return Ranges.end(); }
1172 bool empty() const { return Ranges.empty(); }
1173
1174 void addStore(int64_t OffsetFromFirst, StoreInst *SI);
1175 };
1176
1177 } // end anon namespace
1178
1179
1180 /// addStore - Add a new store to the MemsetRanges data structure. This adds a
1181 /// new range for the specified store at the specified offset, merging into
1182 /// existing ranges as appropriate.
1183 void MemsetRanges::addStore(int64_t Start, StoreInst *SI) {
1184 int64_t End = Start+TD.getTypeStoreSize(SI->getOperand(0)->getType());
1185
1186 // Do a linear search of the ranges to see if this can be joined and/or to
1187 // find the insertion point in the list. We keep the ranges sorted for
1188 // simplicity here. This is a linear search of a linked list, which is ugly,
1189 // however the number of ranges is limited, so this won't get crazy slow.
1190 range_iterator I = Ranges.begin(), E = Ranges.end();
1191
1192 while (I != E && Start > I->End)
1193 ++I;
1194
1195 // We now know that I == E, in which case we didn't find anything to merge
1196 // with, or that Start <= I->End. If End < I->Start or I == E, then we need
1197 // to insert a new range. Handle this now.
1198 if (I == E || End < I->Start) {
1199 MemsetRange &R = *Ranges.insert(I, MemsetRange());
1200 R.Start = Start;
1201 R.End = End;
1202 R.StartPtr = SI->getPointerOperand();
1203 R.Alignment = SI->getAlignment();
1204 R.TheStores.push_back(SI);
1205 return;
1206 }
1207
1208 // This store overlaps with I, add it.
1209 I->TheStores.push_back(SI);
1210
1211 // At this point, we may have an interval that completely contains our store.
1212 // If so, just add it to the interval and return.
1213 if (I->Start <= Start && I->End >= End)
1214 return;
1215
1216 // Now we know that Start <= I->End and End >= I->Start so the range overlaps
1217 // but is not entirely contained within the range.
1218
1219 // See if the range extends the start of the range. In this case, it couldn't
1220 // possibly cause it to join the prior range, because otherwise we would have
1221 // stopped on *it*.
1222 if (Start < I->Start) {
1223 I->Start = Start;
1224 I->StartPtr = SI->getPointerOperand();
1225 }
1226
1227 // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
1228 // is in or right at the end of I), and that End >= I->Start. Extend I out to
1229 // End.
1230 if (End > I->End) {
1231 I->End = End;
1232 range_iterator NextI = I;;
1233 while (++NextI != E && End >= NextI->Start) {
1234 // Merge the range in.
1235 I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
1236 if (NextI->End > I->End)
1237 I->End = NextI->End;
1238 Ranges.erase(NextI);
1239 NextI = I;
1240 }
1241 }
1242 }
1243
1244
1245
1246 /// processStore - When GVN is scanning forward over instructions, we look for
1247 /// some other patterns to fold away. In particular, this looks for stores to
1248 /// neighboring locations of memory. If it sees enough consecutive ones
1249 /// (currently 4) it attempts to merge them together into a memcpy/memset.
1250 bool GVN::processStore(StoreInst *SI, SmallVectorImpl<Instruction*> &toErase) {
1251 if (!FormMemSet) return false;
1252 if (SI->isVolatile()) return false;
1253
1254 // There are two cases that are interesting for this code to handle: memcpy
1255 // and memset. Right now we only handle memset.
1256
1257 // Ensure that the value being stored is something that can be memset'able a
1258 // byte at a time like "0" or "-1" or any width, as well as things like
1259 // 0xA0A0A0A0 and 0.0.
1260 Value *ByteVal = isBytewiseValue(SI->getOperand(0));
1261 if (!ByteVal)
1262 return false;
1263
1264 TargetData &TD = getAnalysis();
1265 AliasAnalysis &AA = getAnalysis();
1266
1267 // Okay, so we now have a single store that can be splatable. Scan to find
1268 // all subsequent stores of the same value to offset from the same pointer.
1269 // Join these together into ranges, so we can decide whether contiguous blocks
1270 // are stored.
1271 MemsetRanges Ranges(TD);
1272
1273 Value *StartPtr = SI->getPointerOperand();
1274
1275 BasicBlock::iterator BI = SI;
1276 for (++BI; !isa(BI); ++BI) {
1277 if (isa(BI) || isa(BI)) {
1278 // If the call is readnone, ignore it, otherwise bail out. We don't even
1279 // allow readonly here because we don't want something like:
1280 // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
1281 if (AA.getModRefBehavior(CallSite::get(BI)) ==
1282 AliasAnalysis::DoesNotAccessMemory)
1283 continue;
1284
1285 // TODO: If this is a memset, try to join it in.
1286
1287 break;
1288 } else if (isa(BI) || isa(BI))
1289 break;
1290
1291 // If this is a non-store instruction it is fine, ignore it.
1292 StoreInst *NextStore = dyn_cast(BI);
1293 if (NextStore == 0) continue;
1294
1295 // If this is a store, see if we can merge it in.
1296 if (NextStore->isVolatile()) break;
1297
1298 // Check to see if this stored value is of the same byte-splattable value.
1299 if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
1300 break;
1301
1302 // Check to see if this store is to a constant offset from the start ptr.
1303 int64_t Offset;
1304 if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, TD))
1305 break;
1306
1307 Ranges.addStore(Offset, NextStore);
1308 }
1309
1310 // If we have no ranges, then we just had a single store with nothing that
1311 // could be merged in. This is a very common case of course.
1312 if (Ranges.empty())
1313 return false;
1314
1315 // If we had at least one store that could be merged in, add the starting
1316 // store as well. We try to avoid this unless there is at least something
1317 // interesting as a small compile-time optimization.
1318 Ranges.addStore(0, SI);
1319
1320
1321 Function *MemSetF = 0;
1322
1323 // Now that we have full information about ranges, loop over the ranges and
1324 // emit memset's for anything big enough to be worthwhile.
1325 bool MadeChange = false;
1326 for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
1327 I != E; ++I) {
1328 const MemsetRange &Range = *I;
1329
1330 if (Range.TheStores.size() == 1) continue;
1331
1332 // If it is profitable to lower this range to memset, do so now.
1333 if (!Range.isProfitableToUseMemset(TD))
1334 continue;
1335
1336 // Otherwise, we do want to transform this! Create a new memset. We put
1337 // the memset right before the first instruction that isn't part of this
1338 // memset block. This ensure that the memset is dominated by any addressing
1339 // instruction needed by the start of the block.
1340 BasicBlock::iterator InsertPt = BI;
1341
1342 if (MemSetF == 0)
1343 MemSetF = Intrinsic::getDeclaration(SI->getParent()->getParent()
1344 ->getParent(), Intrinsic::memset_i64);
1345
1346 // Get the starting pointer of the block.
1347 StartPtr = Range.StartPtr;
1348
1349 // Cast the start ptr to be i8* as memset requires.
1350 const Type *i8Ptr = PointerType::getUnqual(Type::Int8Ty);
1351 if (StartPtr->getType() != i8Ptr)
1352 StartPtr = new BitCastInst(StartPtr, i8Ptr, StartPtr->getNameStart(),
1353 InsertPt);
1354
1355 Value *Ops[] = {
1356 StartPtr, ByteVal, // Start, value
1357 ConstantInt::get(Type::Int64Ty, Range.End-Range.Start), // size
1358 ConstantInt::get(Type::Int32Ty, Range.Alignment) // align
1359 };
1360 Value *C = CallInst::Create(MemSetF, Ops, Ops+4, "", InsertPt);
1361 DEBUG(cerr << "Replace stores:\n";
1362 for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
1363 cerr << *Range.TheStores[i];
1364 cerr << "With: " << *C); C=C;
1365
1366 // Zap all the stores.
1367 toErase.append(Range.TheStores.begin(), Range.TheStores.end());
1368 ++NumMemSetInfer;
1369 MadeChange = true;
1370 }
1371
1372 return MadeChange;
1373 }
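
For reference, the source-level shape this transform targets is a run of adjacent scalar stores of the same byte-splattable value, such as field-by-field zero initialization. The sketch below is the editor's plain C++ illustration (the struct and function names are made up); it shows code whose lowering produces exactly the neighboring-store sequence that gets merged into one memset.

//===--- Editor's illustrative sketch (not part of r49419) ---===//
#include <cstdio>
#include <cstring>

struct Header { int a, b, c, d; };

// Field-by-field zeroing: after lowering this is four adjacent i32 stores of
// the byte-splattable value 0, which can be merged into a single memset.
static void clearByFields(Header &H) {
  H.a = 0; H.b = 0; H.c = 0; H.d = 0;
}

// What the merged form is equivalent to.
static void clearByMemset(Header &H) {
  std::memset(&H, 0, sizeof(H));
}

int main() {
  Header X, Y;
  clearByFields(X);
  clearByMemset(Y);
  std::printf("%d\n", std::memcmp(&X, &Y, sizeof(Header)) == 0); // 1
  return 0;
}
//===----------------------------------------------------------===//
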
1374
1375
1376 /// performCallSlotOptzn - takes a memcpy and a call that it depends on,
1377 /// and checks for the possibility of a call slot optimization by having
1378 /// the call write its result directly into the destination of the memcpy.
1379 bool GVN::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C,
1380 SmallVectorImpl &toErase) {
1381 // The general transformation to keep in mind is
1382 //
1383 // call @func(..., src, ...)
1384 // memcpy(dest, src, ...)
1385 //
1386 // ->
1387 //
1388 // memcpy(dest, src, ...)
1389 // call @func(..., dest, ...)
1390 //
1391 // Since moving the memcpy is technically awkward, we additionally check that
1392 // src only holds uninitialized values at the moment of the call, meaning that
1393 // the memcpy can be discarded rather than moved.
1394
1395 // Deliberately get the source and destination with bitcasts stripped away,
1396 // because we'll need to do type comparisons based on the underlying type.
1397 Value* cpyDest = cpy->getDest();
1398 Value* cpySrc = cpy->getSource();
1399 CallSite CS = CallSite::get(C);
1400
1401 // We need to be able to reason about the size of the memcpy, so we require
1402 // that it be a constant.
1403 ConstantInt* cpyLength = dyn_cast(cpy->getLength());
1404 if (!cpyLength)
1405 return false;
1406
1407 // Require that src be an alloca. This simplifies the reasoning considerably.
1408 AllocaInst* srcAlloca = dyn_cast(cpySrc);
1409 if (!srcAlloca)
1410 return false;
1411
1412 // Check that all of src is copied to dest.
1413 TargetData& TD = getAnalysis();
1414
1415 ConstantInt* srcArraySize = dyn_cast(srcAlloca->getArraySize());
1416 if (!srcArraySize)
1417 return false;
1418
1419 uint64_t srcSize = TD.getABITypeSize(srcAlloca->getAllocatedType()) *
1420 srcArraySize->getZExtValue();
1421
1422 if (cpyLength->getZExtValue() < srcSize)
1423 return false;
1424
1425 // Check that accessing the first srcSize bytes of dest will not cause a
1426 // trap. Otherwise the transform is invalid since it might cause a trap
1427 // to occur earlier than it otherwise would.
1428 if (AllocaInst* A = dyn_cast(cpyDest)) {
1429 // The destination is an alloca. Check it is larger than srcSize.
1430 ConstantInt* destArraySize = dyn_cast(A->getArraySize());
1431 if (!destArraySize)
1432 return false;
1433
1434 uint64_t destSize = TD.getABITypeSize(A->getAllocatedType()) *
1435 destArraySize->getZExtValue();
1436
1437 if (destSize < srcSize)
1438 return false;
1439 } else if (Argument* A = dyn_cast(cpyDest)) {
1440 // If the destination is an sret parameter then only accesses that are
1441 // outside of the returned struct type can trap.
1442 if (!A->hasStructRetAttr())
1443 return false;
1444
1445 const Type* StructTy = cast(A->getType())->getElementType();
1446 uint64_t destSize = TD.getABITypeSize(StructTy);
1447
1448 if (destSize < srcSize)
1449 return false;
1450 } else {
1451 return false;
1452 }
1453
1454 // Check that src is not accessed except via the call and the memcpy. This
1455 // guarantees that it holds only undefined values when passed in (so the final
1456 // memcpy can be dropped), that it is not read or written between the call and
1457 // the memcpy, and that writing beyond the end of it is undefined.
1458 SmallVector srcUseList(srcAlloca->use_begin(),
1459 srcAlloca->use_end());
1460 while (!srcUseList.empty()) {
1461 User* UI = srcUseList.back();
1462 srcUseList.pop_back();
1463
1464 if (isa(UI) || isa(UI)) {
1465 for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
1466 I != E; ++I)
1467 srcUseList.push_back(*I);
1468 } else if (UI != C && UI != cpy) {
1469 return false;
1470 }
1471 }
1472
1473 // Since we're changing the parameter to the callsite, we need to make sure
1474 // that what would be the new parameter dominates the callsite.
1475 DominatorTree& DT = getAnalysis();
1476 if (Instruction* cpyDestInst = dyn_cast(cpyDest))
1477 if (!DT.dominates(cpyDestInst, C))
1478 return false;
1479
1480 // In addition to knowing that the call does not access src in some
1481 // unexpected manner, for example via a global, which we deduce from
1482 // the use analysis, we also need to know that it does not sneakily
1483 // access dest. We rely on AA to figure this out for us.
1484 AliasAnalysis& AA = getAnalysis();
1485 if (AA.getModRefInfo(C, cpy->getRawDest(), srcSize) !=
1486 AliasAnalysis::NoModRef)
1487 return false;
1488
1489 // All the checks have passed, so do the transformation.
1490 for (unsigned i = 0; i < CS.arg_size(); ++i)
1491 if (CS.getArgument(i) == cpySrc) {
1492 if (cpySrc->getType() != cpyDest->getType())
1493 cpyDest = CastInst::createPointerCast(cpyDest, cpySrc->getType(),
1494 cpyDest->getName(), C);
1495 CS.setArgument(i, cpyDest);
1496 }
1497
1498 // Drop any cached information about the call, because we may have changed
1499 // its dependence information by changing its parameter.
1500 MemoryDependenceAnalysis& MD = getAnalysis();
1501 MD.dropInstruction(C);
1502
1503 // Remove the memcpy
1504 MD.removeInstruction(cpy);
1505 toErase.push_back(cpy);
1506
1507 return true;
1508 }
1509
1510 /// processMemCpy - perform simplification of memcpy's. If we have memcpy A which
1511 /// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be
1512 /// a memcpy from X to Z (or potentially a memmove, depending on circumstances).
1513 /// This allows later passes to remove the first memcpy altogether.
1514 bool GVN::processMemCpy(MemCpyInst* M, MemCpyInst* MDep,
1515 SmallVectorImpl &toErase) {
1516   // We can only transform memcpy's where the dest of one is the source of the
1517 // other
1518 if (M->getSource() != MDep->getDest())
1519 return false;
1520
1521   // Second, the length of the memcpy's must be the same, or the preceding one
1522 // must be larger than the following one.
1523 ConstantInt* C1 = dyn_cast(MDep->getLength());
1524 ConstantInt* C2 = dyn_cast(M->getLength());
1525 if (!C1 || !C2)
1526 return false;
1527
1528 uint64_t DepSize = C1->getValue().getZExtValue();
1529 uint64_t CpySize = C2->getValue().getZExtValue();
1530
1531 if (DepSize < CpySize)
1532 return false;
1533
1534 // Finally, we have to make sure that the dest of the second does not
1535 // alias the source of the first
1536 AliasAnalysis& AA = getAnalysis();
1537 if (AA.alias(M->getRawDest(), CpySize, MDep->getRawSource(), DepSize) !=
1538 AliasAnalysis::NoAlias)
1539 return false;
1540 else if (AA.alias(M->getRawDest(), CpySize, M->getRawSource(), CpySize) !=
1541 AliasAnalysis::NoAlias)
1542 return false;
1543 else if (AA.alias(MDep->getRawDest(), DepSize, MDep->getRawSource(), DepSize)
1544 != AliasAnalysis::NoAlias)
1545 return false;
1546
1547 // If all checks passed, then we can transform these memcpy's
1548 Function* MemCpyFun = Intrinsic::getDeclaration(
1549 M->getParent()->getParent()->getParent(),
1550 M->getIntrinsicID());
1551
1552 std::vector args;
1553 args.push_back(M->getRawDest());
1554 args.push_back(MDep->getRawSource());
1555 args.push_back(M->getLength());
1556 args.push_back(M->getAlignment());
1557
1558 CallInst* C = CallInst::Create(MemCpyFun, args.begin(), args.end(), "", M);
1559
1560 MemoryDependenceAnalysis& MD = getAnalysis();
1561 if (MD.getDependency(C) == MDep) {
1562 MD.dropInstruction(M);
1563 toErase.push_back(M);
1564 return true;
1565 }
1566
1567 MD.removeInstruction(C);
1568 toErase.push_back(C);
1569 return false;
1570 }
1571
1572972 /// processInstruction - When calculating availability, handle an instruction
1573973 /// by inserting it into the appropriate sets
1574974 bool GVN::processInstruction(Instruction *I, ValueNumberedSet &currAvail,
1577977   if (LoadInst* L = dyn_cast<LoadInst>(I))
1578978     return processLoad(L, lastSeenLoad, toErase);
1579979
1580      if (StoreInst *SI = dyn_cast<StoreInst>(I))
1581 return processStore(SI, toErase);
1582
1583980 // Allocations are always uniquely numbered, so we can save time and memory
1584981 // by fast failing them.
1585982   if (isa<AllocationInst>(I))
1586983 return false;
1587
1588      if (MemCpyInst* M = dyn_cast<MemCpyInst>(I)) {
1589        MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
1590
1591        // There are two possible optimizations we can do for memcpy:
1592        //   a) memcpy-memcpy xform which exposes redundancy for DSE
1593 // b) call-memcpy xform for return slot optimization
1594 Instruction* dep = MD.getDependency(M);
1595 if (dep == MemoryDependenceAnalysis::None ||
1596 dep == MemoryDependenceAnalysis::NonLocal)
1597 return false;
1598        if (MemCpyInst *MemCpy = dyn_cast<MemCpyInst>(dep))
1599          return processMemCpy(M, MemCpy, toErase);
1600        if (CallInst* C = dyn_cast<CallInst>(dep))
1601 return performCallSlotOptzn(M, C, toErase);
1602 return false;
1603 }
1604984
1605985 unsigned num = VN.lookup_or_add(I);
1606986
0 //===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs various transformations related to eliminating memcpy
10 // calls, or transforming sets of stores into memset's.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #define DEBUG_TYPE "memcpyopt"
15 #include "llvm/Transforms/Scalar.h"
16 #include "llvm/BasicBlock.h"
17 #include "llvm/Constants.h"
18 #include "llvm/DerivedTypes.h"
19 #include "llvm/Function.h"
20 #include "llvm/IntrinsicInst.h"
21 #include "llvm/Instructions.h"
22 #include "llvm/ParameterAttributes.h"
23 #include "llvm/Value.h"
24 #include "llvm/ADT/DepthFirstIterator.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/Analysis/Dominators.h"
28 #include "llvm/Analysis/AliasAnalysis.h"
29 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
30 #include "llvm/Support/CFG.h"
31 #include "llvm/Support/CommandLine.h"
32 #include "llvm/Support/Compiler.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/GetElementPtrTypeIterator.h"
35 #include "llvm/Target/TargetData.h"
36 #include <list>
37 using namespace llvm;
38
39 STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
40 STATISTIC(NumMemSetInfer, "Number of memsets inferred");
41
42 namespace {
43   cl::opt<bool>
44 FormMemSet("form-memset-from-stores",
45 cl::desc("Transform straight-line stores to memsets"),
46 cl::init(true), cl::Hidden);
47 }
48
49 /// isBytewiseValue - If the specified value can be set by repeating the same
50 /// byte in memory, return the i8 value that it is represented with. This is
51 /// true for all i8 values obviously, but is also true for i32 0, i32 -1,
52 /// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated
53 /// byte store (e.g. i16 0x1234), return null.
54 static Value *isBytewiseValue(Value *V) {
55 // All byte-wide stores are splatable, even of arbitrary variables.
56 if (V->getType() == Type::Int8Ty) return V;
57
58 // Constant float and double values can be handled as integer values if the
59 // corresponding integer value is "byteable". An important case is 0.0.
60   if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
61 if (CFP->getType() == Type::FloatTy)
62 V = ConstantExpr::getBitCast(CFP, Type::Int32Ty);
63 if (CFP->getType() == Type::DoubleTy)
64 V = ConstantExpr::getBitCast(CFP, Type::Int64Ty);
65 // Don't handle long double formats, which have strange constraints.
66 }
67
68 // We can handle constant integers that are power of two in size and a
69 // multiple of 8 bits.
70   if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
71 unsigned Width = CI->getBitWidth();
72 if (isPowerOf2_32(Width) && Width > 8) {
73 // We can handle this value if the recursive binary decomposition is the
74 // same at all levels.
75 APInt Val = CI->getValue();
76 APInt Val2;
77 while (Val.getBitWidth() != 8) {
78 unsigned NextWidth = Val.getBitWidth()/2;
79 Val2 = Val.lshr(NextWidth);
80 Val2.trunc(Val.getBitWidth()/2);
81 Val.trunc(Val.getBitWidth()/2);
82
83 // If the top/bottom halves aren't the same, reject it.
84 if (Val != Val2)
85 return 0;
86 }
87 return ConstantInt::get(Val);
88 }
89 }
90
91 // Conceptually, we could handle things like:
92 // %a = zext i8 %X to i16
93 // %b = shl i16 %a, 8
94 // %c = or i16 %a, %b
95 // but until there is an example that actually needs this, it doesn't seem
96 // worth worrying about.
97 return 0;
98 }
99
100 static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx,
101 bool &VariableIdxFound, TargetData &TD) {
102 // Skip over the first indices.
103 gep_type_iterator GTI = gep_type_begin(GEP);
104 for (unsigned i = 1; i != Idx; ++i, ++GTI)
105 /*skip along*/;
106
107 // Compute the offset implied by the rest of the indices.
108 int64_t Offset = 0;
109 for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
110     ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i));
111 if (OpC == 0)
112 return VariableIdxFound = true;
113 if (OpC->isZero()) continue; // No offset.
114
115 // Handle struct indices, which add their field offset to the pointer.
116     if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
117 Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
118 continue;
119 }
120
121 // Otherwise, we have a sequential type like an array or vector. Multiply
122 // the index by the ElementSize.
123 uint64_t Size = TD.getABITypeSize(GTI.getIndexedType());
124 Offset += Size*OpC->getSExtValue();
125 }
126
127 return Offset;
128 }
129
130 /// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a
131 /// constant offset, and return that constant offset. For example, Ptr1 might
132 /// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8.
133 static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
134 TargetData &TD) {
135 // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical
136 // base. After that base, they may have some number of common (and
137 // potentially variable) indices. After that they handle some constant
138 // offset, which determines their offset from each other. At this point, we
139 // handle no other case.
140   GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
141   GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
142 if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0))
143 return false;
144
145 // Skip any common indices and track the GEP types.
146 unsigned Idx = 1;
147 for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
148 if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
149 break;
150
151 bool VariableIdxFound = false;
152 int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD);
153 int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD);
154 if (VariableIdxFound) return false;
155
156 Offset = Offset2-Offset1;
157 return true;
158 }
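
As a concrete instance of the &A[42] / &A[40] example in the comment above, the editor's standalone check below (plain C++; ordinary pointer arithmetic stands in for the GEP walk, and the array name is illustrative) shows that the returned value follows the Offset2 - Offset1 convention in the code, i.e. -8 for these two pointers with 4-byte elements.

//===--- Editor's illustrative sketch (not part of r49419) ---===//
#include <cstdint>
#include <cstdio>

int main() {
  int A[64] = {};
  // Ptr1 = &A[42], Ptr2 = &A[40]: Offset = Offset2 - Offset1 = (40 - 42) * sizeof(int).
  const char *Ptr1 = reinterpret_cast<const char *>(&A[42]);
  const char *Ptr2 = reinterpret_cast<const char *>(&A[40]);
  std::int64_t Offset = Ptr2 - Ptr1;                      // byte offset, as above
  std::printf("%lld\n", static_cast<long long>(Offset));  // -8 with 4-byte int
  return 0;
}
//===----------------------------------------------------------===//
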
159
160
161 /// MemsetRange - Represents a range of memset'd bytes with the ByteVal value.
162 /// This allows us to analyze stores like:
163 /// store 0 -> P+1
164 /// store 0 -> P+0
165 /// store 0 -> P+3
166 /// store 0 -> P+2
167 /// which sometimes happens with stores to arrays of structs etc. When we see
168 /// the first store, we make a range [1, 2). The second store extends the range
169 /// to [0, 2). The third makes a new range [3, 4). The fourth store joins the
170 /// two ranges into [0, 4) which is memset'able.
171 namespace {
172 struct MemsetRange {
173 // Start/End - A semi range that describes the span that this range covers.
174 // The range is closed at the start and open at the end: [Start, End).
175 int64_t Start, End;
176
177 /// StartPtr - The getelementptr instruction that points to the start of the
178 /// range.
179 Value *StartPtr;
180
181 /// Alignment - The known alignment of the first store.
182 unsigned Alignment;
183
184 /// TheStores - The actual stores that make up this range.
185   SmallVector<StoreInst*, 16> TheStores;
186
187 bool isProfitableToUseMemset(const TargetData &TD) const;
188
189 };
190 } // end anon namespace
191
192 bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const {
193 // If we found more than 8 stores to merge or 64 bytes, use memset.
194 if (TheStores.size() >= 8 || End-Start >= 64) return true;
195
196 // Assume that the code generator is capable of merging pairs of stores
197 // together if it wants to.
198 if (TheStores.size() <= 2) return false;
199
200 // If we have fewer than 8 stores, it can still be worthwhile to do this.
201 // For example, merging 4 i8 stores into an i32 store is useful almost always.
202 // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
203 // memset will be split into 2 32-bit stores anyway) and doing so can
204 // pessimize the llvm optimizer.
205 //
206 // Since we don't have perfect knowledge here, make some assumptions: assume
207 // the maximum GPR width is the same size as the pointer size and assume that
208 // this width can be stored. If so, check to see whether we will end up
209 // actually reducing the number of stores used.
210 unsigned Bytes = unsigned(End-Start);
211 unsigned NumPointerStores = Bytes/TD.getPointerSize();
212
213 // Assume the remaining bytes if any are done a byte at a time.
214 unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize();
215
216 // If we will reduce the # stores (according to this heuristic), do the
217 // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
218 // etc.
219 return TheStores.size() > NumPointerStores+NumByteStores;
220 }
221
222
223 namespace {
224 class MemsetRanges {
225 /// Ranges - A sorted list of the memset ranges. We use std::list here
226 /// because each element is relatively large and expensive to copy.
227   std::list<MemsetRange> Ranges;
228   typedef std::list<MemsetRange>::iterator range_iterator;
229 TargetData &TD;
230 public:
231 MemsetRanges(TargetData &td) : TD(td) {}
232
233   typedef std::list<MemsetRange>::const_iterator const_iterator;
234 const_iterator begin() const { return Ranges.begin(); }
235 const_iterator end() const { return Ranges.end(); }
236 bool empty() const { return Ranges.empty(); }
237
238 void addStore(int64_t OffsetFromFirst, StoreInst *SI);
239 };
240
241 } // end anon namespace
242
243
244 /// addStore - Add a new store to the MemsetRanges data structure. This adds a
245 /// new range for the specified store at the specified offset, merging into
246 /// existing ranges as appropriate.
247 void MemsetRanges::addStore(int64_t Start, StoreInst *SI) {
248 int64_t End = Start+TD.getTypeStoreSize(SI->getOperand(0)->getType());
249
250 // Do a linear search of the ranges to see if this can be joined and/or to
251 // find the insertion point in the list. We keep the ranges sorted for
252 // simplicity here. This is a linear search of a linked list, which is ugly,
253 // however the number of ranges is limited, so this won't get crazy slow.
254 range_iterator I = Ranges.begin(), E = Ranges.end();
255
256 while (I != E && Start > I->End)
257 ++I;
258
259 // We now know that I == E, in which case we didn't find anything to merge
260 // with, or that Start <= I->End. If End < I->Start or I == E, then we need
261 // to insert a new range. Handle this now.
262 if (I == E || End < I->Start) {
263 MemsetRange &R = *Ranges.insert(I, MemsetRange());
264 R.Start = Start;
265 R.End = End;
266 R.StartPtr = SI->getPointerOperand();
267 R.Alignment = SI->getAlignment();
268 R.TheStores.push_back(SI);
269 return;
270 }
271
272 // This store overlaps with I, add it.
273 I->TheStores.push_back(SI);
274
275 // At this point, we may have an interval that completely contains our store.
276 // If so, just add it to the interval and return.
277 if (I->Start <= Start && I->End >= End)
278 return;
279
280 // Now we know that Start <= I->End and End >= I->Start so the range overlaps
281 // but is not entirely contained within the range.
282
283 // See if the range extends the start of the range. In this case, it couldn't
284 // possibly cause it to join the prior range, because otherwise we would have
285 // stopped on *it*.
286 if (Start < I->Start) {
287 I->Start = Start;
288 I->StartPtr = SI->getPointerOperand();
289 }
290
291 // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
292 // is in or right at the end of I), and that End >= I->Start. Extend I out to
293 // End.
294 if (End > I->End) {
295 I->End = End;
296     range_iterator NextI = I;
297 while (++NextI != E && End >= NextI->Start) {
298 // Merge the range in.
299 I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
300 if (NextI->End > I->End)
301 I->End = NextI->End;
302 Ranges.erase(NextI);
303 NextI = I;
304 }
305 }
306 }
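
The [1,2) / [0,2) / [3,4) / [0,4) walk-through in the MemsetRange comment can be reproduced with a stripped-down interval list. The sketch below is the editor's plain C++ reduction of the merging logic above (it drops the store bookkeeping, alignment, and StartPtr handling; names are the editor's).

//===--- Editor's illustrative sketch (not part of r49419) ---===//
#include <cstdint>
#include <cstdio>
#include <list>
#include <utility>

typedef std::pair<int64_t, int64_t> Range;   // half-open [first, second)

// Minimal version of MemsetRanges::addStore: keep the list sorted, start a new
// range when there is no overlap, otherwise grow the hit range and fold in any
// later ranges the new end now reaches.
static void addRange(std::list<Range> &Ranges, int64_t Start, int64_t End) {
  std::list<Range>::iterator I = Ranges.begin(), E = Ranges.end();
  while (I != E && Start > I->second)
    ++I;
  if (I == E || End < I->first) {              // disjoint: insert a new range
    Ranges.insert(I, Range(Start, End));
    return;
  }
  if (Start < I->first) I->first = Start;      // extend the start
  if (End > I->second) {                       // extend the end and merge
    I->second = End;
    std::list<Range>::iterator NextI = I;
    while (++NextI != E && End >= NextI->first) {
      if (NextI->second > I->second) I->second = NextI->second;
      Ranges.erase(NextI);
      NextI = I;
    }
  }
}

int main() {
  std::list<Range> Ranges;
  // The example from the MemsetRange comment: byte stores to P+1, P+0, P+3, P+2.
  addRange(Ranges, 1, 2);
  addRange(Ranges, 0, 1);
  addRange(Ranges, 3, 4);
  addRange(Ranges, 2, 3);
  for (std::list<Range>::iterator I = Ranges.begin(); I != Ranges.end(); ++I)
    std::printf("[%lld, %lld)\n", (long long)I->first, (long long)I->second);
  // Prints the single merged, memset'able range [0, 4).
  return 0;
}
//===----------------------------------------------------------===//
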
307
308 //===----------------------------------------------------------------------===//
309 // MemCpyOpt Pass
310 //===----------------------------------------------------------------------===//
311
312 namespace {
313
314 class VISIBILITY_HIDDEN MemCpyOpt : public FunctionPass {
315 bool runOnFunction(Function &F);
316 public:
317 static char ID; // Pass identification, replacement for typeid
318 MemCpyOpt() : FunctionPass((intptr_t)&ID) { }
319
320 private:
321     // This transformation requires dominator, alias, and memory dependence info
322     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
323       AU.setPreservesCFG();
324       AU.addRequired<DominatorTree>();
325       AU.addRequired<AliasAnalysis>();
326       AU.addRequired<TargetData>();
327       AU.addRequired<MemoryDependenceAnalysis>();
328       AU.addPreserved<DominatorTree>();
329       AU.addPreserved<AliasAnalysis>();
330       AU.addPreserved<MemoryDependenceAnalysis>();
331 }
332
333     // Helper functions
334     bool processInstruction(Instruction* I,
335                             SmallVectorImpl<Instruction*> &toErase);
336     bool processStore(StoreInst *SI, SmallVectorImpl<Instruction*> &toErase);
337     bool processMemCpy(MemCpyInst* M, MemCpyInst* MDep,
338                        SmallVectorImpl<Instruction*> &toErase);
339     bool performCallSlotOptzn(MemCpyInst* cpy, CallInst* C,
340                               SmallVectorImpl<Instruction*> &toErase);
341 bool iterateOnFunction(Function &F);
342 };
343
344 char MemCpyOpt::ID = 0;
345 }
346
347 // createMemCpyOptPass - The public interface to this file...
348 FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
349
350 static RegisterPass<MemCpyOpt> X("memcpyopt",
351 "MemCpy Optimization");
352
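
With the registration above, the pass becomes available to opt under the -memcpyopt flag and can be scheduled like any other FunctionPass. The driver below is the editor's best-effort sketch against the LLVM C++ API of this era: the header paths, the Module and TargetData constructors, and the assumption that the remaining required analyses are scheduled automatically are recalled from the 2.x tree and may not match this exact revision; the tests in this commit exercise the code through opt -gvn instead.

//===--- Editor's illustrative sketch (not part of r49419) ---===//
#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;

int main() {
  Module M("example");            // normally parsed from bitcode or .ll input
  PassManager PM;
  PM.add(new TargetData(&M));     // MemCpyOpt requires TargetData
  PM.add(createMemCpyOptPass());  // the pass registered above
  PM.run(M);                      // other required analyses are scheduled as needed
  return 0;
}
//===----------------------------------------------------------===//
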
353
354
355 /// processStore - When scanning forward over instructions, we look for
356 /// patterns to fold away. In particular, this looks for stores to
357 /// neighboring locations of memory. If it sees enough consecutive ones
358 /// (currently 4) it attempts to merge them together into a memcpy/memset.
359 bool MemCpyOpt::processStore(StoreInst *SI, SmallVectorImpl<Instruction*> &toErase) {
360 if (!FormMemSet) return false;
361 if (SI->isVolatile()) return false;
362
363 // There are two cases that are interesting for this code to handle: memcpy
364 // and memset. Right now we only handle memset.
365
366 // Ensure that the value being stored is something that can be memset'able a
367 // byte at a time like "0" or "-1" or any width, as well as things like
368 // 0xA0A0A0A0 and 0.0.
369 Value *ByteVal = isBytewiseValue(SI->getOperand(0));
370 if (!ByteVal)
371 return false;
372
373   TargetData &TD = getAnalysis<TargetData>();
374   AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
375
376 // Okay, so we now have a single store that can be splatable. Scan to find
377 // all subsequent stores of the same value to offset from the same pointer.
378 // Join these together into ranges, so we can decide whether contiguous blocks
379 // are stored.
380 MemsetRanges Ranges(TD);
381
382 Value *StartPtr = SI->getPointerOperand();
383
384 BasicBlock::iterator BI = SI;
385   for (++BI; !isa<TerminatorInst>(BI); ++BI) {
386     if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) {
387 // If the call is readnone, ignore it, otherwise bail out. We don't even
388 // allow readonly here because we don't want something like:
389 // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
390 if (AA.getModRefBehavior(CallSite::get(BI)) ==
391 AliasAnalysis::DoesNotAccessMemory)
392 continue;
393
394 // TODO: If this is a memset, try to join it in.
395
396 break;
397     } else if (isa<LoadInst>(BI) || isa<VAArgInst>(BI))
398 break;
399
400 // If this is a non-store instruction it is fine, ignore it.
401     StoreInst *NextStore = dyn_cast<StoreInst>(BI);
402 if (NextStore == 0) continue;
403
404 // If this is a store, see if we can merge it in.
405 if (NextStore->isVolatile()) break;
406
407 // Check to see if this stored value is of the same byte-splattable value.
408 if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
409 break;
410
411 // Check to see if this store is to a constant offset from the start ptr.
412 int64_t Offset;
413 if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, TD))
414 break;
415
416 Ranges.addStore(Offset, NextStore);
417 }
418
419 // If we have no ranges, then we just had a single store with nothing that
420 // could be merged in. This is a very common case of course.
421 if (Ranges.empty())
422 return false;
423
424 // If we had at least one store that could be merged in, add the starting
425 // store as well. We try to avoid this unless there is at least something
426 // interesting as a small compile-time optimization.
427 Ranges.addStore(0, SI);
428
429
430 Function *MemSetF = 0;
431
432 // Now that we have full information about ranges, loop over the ranges and
433 // emit memset's for anything big enough to be worthwhile.
434 bool MadeChange = false;
435 for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
436 I != E; ++I) {
437 const MemsetRange &Range = *I;
438
439 if (Range.TheStores.size() == 1) continue;
440
441 // If it is profitable to lower this range to memset, do so now.
442 if (!Range.isProfitableToUseMemset(TD))
443 continue;
444
445 // Otherwise, we do want to transform this! Create a new memset. We put
446 // the memset right before the first instruction that isn't part of this
447 // memset block. This ensure that the memset is dominated by any addressing
448 // instruction needed by the start of the block.
449 BasicBlock::iterator InsertPt = BI;
450
451 if (MemSetF == 0)
452 MemSetF = Intrinsic::getDeclaration(SI->getParent()->getParent()
453 ->getParent(), Intrinsic::memset_i64);
454
455 // Get the starting pointer of the block.
456 StartPtr = Range.StartPtr;
457
458 // Cast the start ptr to be i8* as memset requires.
459 const Type *i8Ptr = PointerType::getUnqual(Type::Int8Ty);
460 if (StartPtr->getType() != i8Ptr)
461 StartPtr = new BitCastInst(StartPtr, i8Ptr, StartPtr->getNameStart(),
462 InsertPt);
463
464 Value *Ops[] = {
465 StartPtr, ByteVal, // Start, value
466 ConstantInt::get(Type::Int64Ty, Range.End-Range.Start), // size
467 ConstantInt::get(Type::Int32Ty, Range.Alignment) // align
468 };
469 Value *C = CallInst::Create(MemSetF, Ops, Ops+4, "", InsertPt);
470 DEBUG(cerr << "Replace stores:\n";
471 for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
472 cerr << *Range.TheStores[i];
473 cerr << "With: " << *C); C=C;
474
475 // Zap all the stores.
476 toErase.append(Range.TheStores.begin(), Range.TheStores.end());
477 ++NumMemSetInfer;
478 MadeChange = true;
479 }
480
481 return MadeChange;
482 }
483
484
485 /// performCallSlotOptzn - takes a memcpy and a call that it depends on,
486 /// and checks for the possibility of a call slot optimization by having
487 /// the call write its result directly into the destination of the memcpy.
488 bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C,
489                                      SmallVectorImpl<Instruction*> &toErase) {
490 // The general transformation to keep in mind is
491 //
492 // call @func(..., src, ...)
493 // memcpy(dest, src, ...)
494 //
495 // ->
496 //
497 // memcpy(dest, src, ...)
498 // call @func(..., dest, ...)
499 //
500 // Since moving the memcpy is technically awkward, we additionally check that
501 // src only holds uninitialized values at the moment of the call, meaning that
502 // the memcpy can be discarded rather than moved.
503
504 // Deliberately get the source and destination with bitcasts stripped away,
505 // because we'll need to do type comparisons based on the underlying type.
506 Value* cpyDest = cpy->getDest();
507 Value* cpySrc = cpy->getSource();
508 CallSite CS = CallSite::get(C);
509
510 // We need to be able to reason about the size of the memcpy, so we require
511 // that it be a constant.
512   ConstantInt* cpyLength = dyn_cast<ConstantInt>(cpy->getLength());
513 if (!cpyLength)
514 return false;
515
516 // Require that src be an alloca. This simplifies the reasoning considerably.
517   AllocaInst* srcAlloca = dyn_cast<AllocaInst>(cpySrc);
518 if (!srcAlloca)
519 return false;
520
521 // Check that all of src is copied to dest.
522   TargetData& TD = getAnalysis<TargetData>();
523
524   ConstantInt* srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
525 if (!srcArraySize)
526 return false;
527
528 uint64_t srcSize = TD.getABITypeSize(srcAlloca->getAllocatedType()) *
529 srcArraySize->getZExtValue();
530
531 if (cpyLength->getZExtValue() < srcSize)
532 return false;
533
534 // Check that accessing the first srcSize bytes of dest will not cause a
535 // trap. Otherwise the transform is invalid since it might cause a trap
536 // to occur earlier than it otherwise would.
537   if (AllocaInst* A = dyn_cast<AllocaInst>(cpyDest)) {
538     // The destination is an alloca. Check it is larger than srcSize.
539     ConstantInt* destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
540 if (!destArraySize)
541 return false;
542
543 uint64_t destSize = TD.getABITypeSize(A->getAllocatedType()) *
544 destArraySize->getZExtValue();
545
546 if (destSize < srcSize)
547 return false;
548   } else if (Argument* A = dyn_cast<Argument>(cpyDest)) {
549 // If the destination is an sret parameter then only accesses that are
550 // outside of the returned struct type can trap.
551 if (!A->hasStructRetAttr())
552 return false;
553
554     const Type* StructTy = cast<PointerType>(A->getType())->getElementType();
555 uint64_t destSize = TD.getABITypeSize(StructTy);
556
557 if (destSize < srcSize)
558 return false;
559 } else {
560 return false;
561 }
562
563 // Check that src is not accessed except via the call and the memcpy. This
564 // guarantees that it holds only undefined values when passed in (so the final
565 // memcpy can be dropped), that it is not read or written between the call and
566 // the memcpy, and that writing beyond the end of it is undefined.
567   SmallVector<User*, 8> srcUseList(srcAlloca->use_begin(),
568                                    srcAlloca->use_end());
569 while (!srcUseList.empty()) {
570 User* UI = srcUseList.back();
571 srcUseList.pop_back();
572
573     if (isa<BitCastInst>(UI) || isa<GetElementPtrInst>(UI)) {
574 for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
575 I != E; ++I)
576 srcUseList.push_back(*I);
577 } else if (UI != C && UI != cpy) {
578 return false;
579 }
580 }
581
582 // Since we're changing the parameter to the callsite, we need to make sure
583 // that what would be the new parameter dominates the callsite.
584   DominatorTree& DT = getAnalysis<DominatorTree>();
585   if (Instruction* cpyDestInst = dyn_cast<Instruction>(cpyDest))
586 if (!DT.dominates(cpyDestInst, C))
587 return false;
588
589 // In addition to knowing that the call does not access src in some
590 // unexpected manner, for example via a global, which we deduce from
591 // the use analysis, we also need to know that it does not sneakily
592 // access dest. We rely on AA to figure this out for us.
593   AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
594 if (AA.getModRefInfo(C, cpy->getRawDest(), srcSize) !=
595 AliasAnalysis::NoModRef)
596 return false;
597
598 // All the checks have passed, so do the transformation.
599 for (unsigned i = 0; i < CS.arg_size(); ++i)
600 if (CS.getArgument(i) == cpySrc) {
601 if (cpySrc->getType() != cpyDest->getType())
602 cpyDest = CastInst::createPointerCast(cpyDest, cpySrc->getType(),
603 cpyDest->getName(), C);
604 CS.setArgument(i, cpyDest);
605 }
606
607 // Drop any cached information about the call, because we may have changed
608 // its dependence information by changing its parameter.
609   MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
610 MD.dropInstruction(C);
611
612 // Remove the memcpy
613 MD.removeInstruction(cpy);
614 toErase.push_back(cpy);
615
616 return true;
617 }
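
The shape of code this targets is the sret-temporary pattern exercised by the 2008-02-24-MultipleUseofSRet.ll test below. In source terms it looks roughly like the editor's plain C++ sketch that follows (the struct, the functions, and the use of a manual memcpy are illustrative only; the transform itself works on the IR-level memcpy).

//===--- Editor's illustrative sketch (not part of r49419) ---===//
#include <cstdio>
#include <cstring>

struct Pair { double re, im; };

// Callee that writes its result through an out-parameter (the sret pattern).
static void initialize(Pair *out) { out->re = 0.0; out->im = 0.0; }

// Before: the call fills a local temporary, which is then copied into *dest.
static void before(Pair *dest) {
  Pair tmp;
  initialize(&tmp);
  std::memcpy(dest, &tmp, sizeof(Pair));
}

// After call slot optimization: the call writes straight into *dest, so the
// temporary and the memcpy disappear. This is legal because tmp holds no
// meaningful data before the call and is never used after the memcpy.
static void after(Pair *dest) {
  initialize(dest);
}

int main() {
  Pair a, b;
  before(&a);
  after(&b);
  std::printf("%d\n", std::memcmp(&a, &b, sizeof(Pair)) == 0); // 1
  return 0;
}
//===----------------------------------------------------------===//
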
618
619 /// processMemCpy - perform simplification of memcpy's. If we have memcpy A which
620 /// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be
621 /// a memcpy from X to Z (or potentially a memmove, depending on circumstances).
622 /// This allows later passes to remove the first memcpy altogether.
623 bool MemCpyOpt::processMemCpy(MemCpyInst* M, MemCpyInst* MDep,
624                               SmallVectorImpl<Instruction*> &toErase) {
625   // We can only transform memcpy's where the dest of one is the source of
626   // the other.
627 if (M->getSource() != MDep->getDest())
628 return false;
629
630   // Second, the length of the memcpy's must be the same, or the preceding one
631   // must be larger than the following one.
632   ConstantInt* C1 = dyn_cast<ConstantInt>(MDep->getLength());
633   ConstantInt* C2 = dyn_cast<ConstantInt>(M->getLength());
634 if (!C1 || !C2)
635 return false;
636
637 uint64_t DepSize = C1->getValue().getZExtValue();
638 uint64_t CpySize = C2->getValue().getZExtValue();
639
640 if (DepSize < CpySize)
641 return false;
642
643 // Finally, we have to make sure that the dest of the second does not
644 // alias the source of the first
645   AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
646 if (AA.alias(M->getRawDest(), CpySize, MDep->getRawSource(), DepSize) !=
647 AliasAnalysis::NoAlias)
648 return false;
649 else if (AA.alias(M->getRawDest(), CpySize, M->getRawSource(), CpySize) !=
650 AliasAnalysis::NoAlias)
651 return false;
652 else if (AA.alias(MDep->getRawDest(), DepSize, MDep->getRawSource(), DepSize)
653 != AliasAnalysis::NoAlias)
654 return false;
655
656 // If all checks passed, then we can transform these memcpy's
657 Function* MemCpyFun = Intrinsic::getDeclaration(
658 M->getParent()->getParent()->getParent(),
659 M->getIntrinsicID());
660
661   std::vector<Value*> args;
662 args.push_back(M->getRawDest());
663 args.push_back(MDep->getRawSource());
664 args.push_back(M->getLength());
665 args.push_back(M->getAlignment());
666
667 CallInst* C = CallInst::Create(MemCpyFun, args.begin(), args.end(), "", M);
668
669   MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
670 if (MD.getDependency(C) == MDep) {
671 MD.dropInstruction(M);
672 toErase.push_back(M);
673 return true;
674 }
675
676 MD.removeInstruction(C);
677 toErase.push_back(C);
678 return false;
679 }
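
A concrete instance of the memcpy-memcpy rewrite described above, as an editor's standalone C++ sketch (buffer names are illustrative): when A is copied to B and B is then copied to C, the second copy can read from A directly, which leaves the first memcpy dead if B has no other readers.

//===--- Editor's illustrative sketch (not part of r49419) ---===//
#include <cstdio>
#include <cstring>

int main() {
  char A[16] = "source bytes", B[16], C[16], D[16];

  // Original: the second memcpy depends on the first (B is the dest of the
  // first copy and the source of the second).
  std::memcpy(B, A, sizeof(A));
  std::memcpy(C, B, sizeof(A));

  // Rewritten: copy from the original source instead. Valid because the
  // copies have the same length and the buffers do not alias; the first
  // memcpy is now a dead-store-elimination candidate if B is otherwise unused.
  std::memcpy(D, A, sizeof(A));

  std::printf("%d\n", std::memcmp(C, D, sizeof(A)) == 0); // 1
  return 0;
}
//===----------------------------------------------------------===//
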
680
681 /// processInstruction - When calculating availability, handle an instruction
682 /// by inserting it into the appropriate sets
683 bool MemCpyOpt::processInstruction(Instruction *I,
684                                    SmallVectorImpl<Instruction*> &toErase) {
685   if (StoreInst *SI = dyn_cast<StoreInst>(I))
686 return processStore(SI, toErase);
687
688   if (MemCpyInst* M = dyn_cast<MemCpyInst>(I)) {
689     MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
690
691     // There are two possible optimizations we can do for memcpy:
692     //   a) memcpy-memcpy xform which exposes redundancy for DSE
693 // b) call-memcpy xform for return slot optimization
694 Instruction* dep = MD.getDependency(M);
695 if (dep == MemoryDependenceAnalysis::None ||
696 dep == MemoryDependenceAnalysis::NonLocal)
697 return false;
698     if (MemCpyInst *MemCpy = dyn_cast<MemCpyInst>(dep))
699       return processMemCpy(M, MemCpy, toErase);
700     if (CallInst* C = dyn_cast<CallInst>(dep))
701 return performCallSlotOptzn(M, C, toErase);
702 return false;
703 }
704
705 return false;
706 }
707
708 // MemCpyOpt::runOnFunction - This is the main transformation entry point for a
709 // function.
710 //
711 bool MemCpyOpt::runOnFunction(Function& F) {
712
713 bool changed = false;
714 bool shouldContinue = true;
715
716 while (shouldContinue) {
717 shouldContinue = iterateOnFunction(F);
718 changed |= shouldContinue;
719 }
720
721 return changed;
722 }
723
724
725 // MemCpyOpt::iterateOnFunction - Executes one iteration of MemCpyOpt.
726 bool MemCpyOpt::iterateOnFunction(Function &F) {
727 bool changed_function = false;
728
729   DominatorTree &DT = getAnalysis<DominatorTree>();
730
731   SmallVector<Instruction*, 8> toErase;
732
733 // Top-down walk of the dominator tree
734   for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
735        E = df_end(DT.getRootNode()); DI != E; ++DI) {
736
737 BasicBlock* BB = DI->getBlock();
738 for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
739 BI != BE;) {
740 changed_function |= processInstruction(BI, toErase);
741 if (toErase.empty()) {
742 ++BI;
743 continue;
744 }
745
746 // If we need some instructions deleted, do it now.
747 NumMemCpyInstr += toErase.size();
748
749 // Avoid iterator invalidation.
750 bool AtStart = BI == BB->begin();
751 if (!AtStart)
752 --BI;
753
754       for (SmallVector<Instruction*, 8>::iterator I = toErase.begin(),
755 E = toErase.end(); I != E; ++I)
756 (*I)->eraseFromParent();
757
758 if (AtStart)
759 BI = BB->begin();
760 else
761 ++BI;
762
763 toErase.clear();
764 }
765 }
766
767 return changed_function;
768 }
test/Transforms/GVN/2008-02-24-MultipleUseofSRet.ll (+0, -34; file removed)
None ; RUN: llvm-as < %s | opt -gvn -dse | llvm-dis | grep {call.*initialize} | not grep memtmp
1 ; PR2077
2
3 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
4 target triple = "i386-pc-linux-gnu"
5
6 define internal fastcc void @initialize({ x86_fp80, x86_fp80 }* noalias sret %agg.result) nounwind {
7 entry:
8 %agg.result.03 = getelementptr { x86_fp80, x86_fp80 }* %agg.result, i32 0, i32 0 ; [#uses=1]
9 store x86_fp80 0xK00000000000000000000, x86_fp80* %agg.result.03
10 %agg.result.15 = getelementptr { x86_fp80, x86_fp80 }* %agg.result, i32 0, i32 1 ; [#uses=1]
11 store x86_fp80 0xK00000000000000000000, x86_fp80* %agg.result.15
12 ret void
13 }
14
15 declare fastcc x86_fp80 @passed_uninitialized({ x86_fp80, x86_fp80 }* %x) nounwind
16
17 define fastcc void @badly_optimized() nounwind {
18 entry:
19 %z = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
20 %tmp = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
21 %memtmp = alloca { x86_fp80, x86_fp80 }, align 8 ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
22 call fastcc void @initialize( { x86_fp80, x86_fp80 }* noalias sret %memtmp )
23 %tmp1 = bitcast { x86_fp80, x86_fp80 }* %tmp to i8* ; [#uses=1]
24 %memtmp2 = bitcast { x86_fp80, x86_fp80 }* %memtmp to i8* ; [#uses=1]
25 call void @llvm.memcpy.i32( i8* %tmp1, i8* %memtmp2, i32 24, i32 8 )
26 %z3 = bitcast { x86_fp80, x86_fp80 }* %z to i8* ; [#uses=1]
27 %tmp4 = bitcast { x86_fp80, x86_fp80 }* %tmp to i8* ; [#uses=1]
28 call void @llvm.memcpy.i32( i8* %z3, i8* %tmp4, i32 24, i32 8 )
29 %tmp5 = call fastcc x86_fp80 @passed_uninitialized( { x86_fp80, x86_fp80 }* %z ) ; [#uses=0]
30 ret void
31 }
32
33 declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind
test/Transforms/GVN/2008-03-13-ReturnSlotBitcast.ll (+0, -19; file removed)
None ; RUN: llvm-as < %s | opt -gvn | llvm-dis | not grep {call.*memcpy.}
1 %a = type { i32 }
2 %b = type { float }
3
4 declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind
5 declare void @g(%a*)
6
7 define float @f() {
8 entry:
9 %a_var = alloca %a
10 %b_var = alloca %b
11 call void @g(%a *%a_var)
12 %a_i8 = bitcast %a* %a_var to i8*
13 %b_i8 = bitcast %b* %b_var to i8*
14 call void @llvm.memcpy.i32(i8* %b_i8, i8* %a_i8, i32 4, i32 4)
15 %tmp1 = getelementptr %b* %b_var, i32 0, i32 0
16 %tmp2 = load float* %tmp1
17 ret float %tmp2
18 }
+0
-55
test/Transforms/GVN/form-memset.ll
None ; RUN: llvm-as < %s | opt -gvn -form-memset-from-stores | llvm-dis | not grep store
1 ; RUN: llvm-as < %s | opt -gvn -form-memset-from-stores | llvm-dis | grep {call.*llvm.memset}
2
3 ; All the stores in this example should be merged into a single memset.
4
5 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
6 target triple = "i386-apple-darwin8"
7
8 define void @foo(i8 signext %c) nounwind {
9 entry:
10 %x = alloca [19 x i8] ; <[19 x i8]*> [#uses=20]
11 %tmp = getelementptr [19 x i8]* %x, i32 0, i32 0 ; [#uses=1]
12 store i8 %c, i8* %tmp, align 1
13 %tmp5 = getelementptr [19 x i8]* %x, i32 0, i32 1 ; [#uses=1]
14 store i8 %c, i8* %tmp5, align 1
15 %tmp9 = getelementptr [19 x i8]* %x, i32 0, i32 2 ; [#uses=1]
16 store i8 %c, i8* %tmp9, align 1
17 %tmp13 = getelementptr [19 x i8]* %x, i32 0, i32 3 ; [#uses=1]
18 store i8 %c, i8* %tmp13, align 1
19 %tmp17 = getelementptr [19 x i8]* %x, i32 0, i32 4 ; [#uses=1]
20 store i8 %c, i8* %tmp17, align 1
21 %tmp21 = getelementptr [19 x i8]* %x, i32 0, i32 5 ; [#uses=1]
22 store i8 %c, i8* %tmp21, align 1
23 %tmp25 = getelementptr [19 x i8]* %x, i32 0, i32 6 ; [#uses=1]
24 store i8 %c, i8* %tmp25, align 1
25 %tmp29 = getelementptr [19 x i8]* %x, i32 0, i32 7 ; [#uses=1]
26 store i8 %c, i8* %tmp29, align 1
27 %tmp33 = getelementptr [19 x i8]* %x, i32 0, i32 8 ; [#uses=1]
28 store i8 %c, i8* %tmp33, align 1
29 %tmp37 = getelementptr [19 x i8]* %x, i32 0, i32 9 ; [#uses=1]
30 store i8 %c, i8* %tmp37, align 1
31 %tmp41 = getelementptr [19 x i8]* %x, i32 0, i32 10 ; [#uses=1]
32 store i8 %c, i8* %tmp41, align 1
33 %tmp45 = getelementptr [19 x i8]* %x, i32 0, i32 11 ; [#uses=1]
34 store i8 %c, i8* %tmp45, align 1
35 %tmp49 = getelementptr [19 x i8]* %x, i32 0, i32 12 ; [#uses=1]
36 store i8 %c, i8* %tmp49, align 1
37 %tmp53 = getelementptr [19 x i8]* %x, i32 0, i32 13 ; [#uses=1]
38 store i8 %c, i8* %tmp53, align 1
39 %tmp57 = getelementptr [19 x i8]* %x, i32 0, i32 14 ; [#uses=1]
40 store i8 %c, i8* %tmp57, align 1
41 %tmp61 = getelementptr [19 x i8]* %x, i32 0, i32 15 ; [#uses=1]
42 store i8 %c, i8* %tmp61, align 1
43 %tmp65 = getelementptr [19 x i8]* %x, i32 0, i32 16 ; [#uses=1]
44 store i8 %c, i8* %tmp65, align 1
45 %tmp69 = getelementptr [19 x i8]* %x, i32 0, i32 17 ; [#uses=1]
46 store i8 %c, i8* %tmp69, align 1
47 %tmp73 = getelementptr [19 x i8]* %x, i32 0, i32 18 ; [#uses=1]
48 store i8 %c, i8* %tmp73, align 1
49 %tmp76 = call i32 (...)* @bar( [19 x i8]* %x ) nounwind ; [#uses=0]
50 ret void
51 }
52
53 declare i32 @bar(...)
54
+0
-99
test/Transforms/GVN/form-memset2.ll
None ; RUN: llvm-as < %s | opt -gvn -form-memset-from-stores | llvm-dis | not grep store
1 ; RUN: llvm-as < %s | opt -gvn -form-memset-from-stores | llvm-dis | grep {call.*llvm.memset} | count 3
2
3 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
4 target triple = "i386-apple-darwin8"
5 %struct.MV = type { i16, i16 }
6
7 define i32 @t() nounwind {
8 entry:
9 %ref_idx = alloca [8 x i8] ; <[8 x i8]*> [#uses=8]
10 %left_mvd = alloca [8 x %struct.MV] ; <[8 x %struct.MV]*> [#uses=17]
11 %up_mvd = alloca [8 x %struct.MV] ; <[8 x %struct.MV]*> [#uses=17]
12 %tmp20 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 7 ; [#uses=1]
13 store i8 -1, i8* %tmp20, align 1
14 %tmp23 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 6 ; [#uses=1]
15 store i8 -1, i8* %tmp23, align 1
16 %tmp26 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 5 ; [#uses=1]
17 store i8 -1, i8* %tmp26, align 1
18 %tmp29 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 4 ; [#uses=1]
19 store i8 -1, i8* %tmp29, align 1
20 %tmp32 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 3 ; [#uses=1]
21 store i8 -1, i8* %tmp32, align 1
22 %tmp35 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 2 ; [#uses=1]
23 store i8 -1, i8* %tmp35, align 1
24 %tmp38 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 1 ; [#uses=1]
25 store i8 -1, i8* %tmp38, align 1
26 %tmp41 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 0 ; [#uses=2]
27 store i8 -1, i8* %tmp41, align 1
28 %tmp43 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 7, i32 0 ; [#uses=1]
29 store i16 0, i16* %tmp43, align 2
30 %tmp46 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 7, i32 1 ; [#uses=1]
31 store i16 0, i16* %tmp46, align 2
32 %tmp57 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 6, i32 0 ; [#uses=1]
33 store i16 0, i16* %tmp57, align 2
34 %tmp60 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 6, i32 1 ; [#uses=1]
35 store i16 0, i16* %tmp60, align 2
36 %tmp71 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 5, i32 0 ; [#uses=1]
37 store i16 0, i16* %tmp71, align 2
38 %tmp74 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 5, i32 1 ; [#uses=1]
39 store i16 0, i16* %tmp74, align 2
40 %tmp85 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 4, i32 0 ; [#uses=1]
41 store i16 0, i16* %tmp85, align 2
42 %tmp88 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 4, i32 1 ; [#uses=1]
43 store i16 0, i16* %tmp88, align 2
44 %tmp99 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 3, i32 0 ; [#uses=1]
45 store i16 0, i16* %tmp99, align 2
46 %tmp102 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 3, i32 1 ; [#uses=1]
47 store i16 0, i16* %tmp102, align 2
48 %tmp113 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 2, i32 0 ; [#uses=1]
49 store i16 0, i16* %tmp113, align 2
50 %tmp116 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 2, i32 1 ; [#uses=1]
51 store i16 0, i16* %tmp116, align 2
52 %tmp127 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 1, i32 0 ; [#uses=1]
53 store i16 0, i16* %tmp127, align 2
54 %tmp130 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 1, i32 1 ; [#uses=1]
55 store i16 0, i16* %tmp130, align 2
56 %tmp141 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 0, i32 0 ; [#uses=1]
57 store i16 0, i16* %tmp141, align 8
58 %tmp144 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 0, i32 1 ; [#uses=1]
59 store i16 0, i16* %tmp144, align 2
60 %tmp148 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 7, i32 0 ; [#uses=1]
61 store i16 0, i16* %tmp148, align 2
62 %tmp151 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 7, i32 1 ; [#uses=1]
63 store i16 0, i16* %tmp151, align 2
64 %tmp162 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 6, i32 0 ; [#uses=1]
65 store i16 0, i16* %tmp162, align 2
66 %tmp165 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 6, i32 1 ; [#uses=1]
67 store i16 0, i16* %tmp165, align 2
68 %tmp176 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 5, i32 0 ; [#uses=1]
69 store i16 0, i16* %tmp176, align 2
70 %tmp179 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 5, i32 1 ; [#uses=1]
71 store i16 0, i16* %tmp179, align 2
72 %tmp190 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 4, i32 0 ; [#uses=1]
73 store i16 0, i16* %tmp190, align 2
74 %tmp193 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 4, i32 1 ; [#uses=1]
75 store i16 0, i16* %tmp193, align 2
76 %tmp204 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 3, i32 0 ; [#uses=1]
77 store i16 0, i16* %tmp204, align 2
78 %tmp207 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 3, i32 1 ; [#uses=1]
79 store i16 0, i16* %tmp207, align 2
80 %tmp218 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 2, i32 0 ; [#uses=1]
81 store i16 0, i16* %tmp218, align 2
82 %tmp221 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 2, i32 1 ; [#uses=1]
83 store i16 0, i16* %tmp221, align 2
84 %tmp232 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 1, i32 0 ; [#uses=1]
85 store i16 0, i16* %tmp232, align 2
86 %tmp235 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 1, i32 1 ; [#uses=1]
87 store i16 0, i16* %tmp235, align 2
88 %tmp246 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 0, i32 0 ; [#uses=1]
89 store i16 0, i16* %tmp246, align 8
90 %tmp249 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 0, i32 1 ; [#uses=1]
91 store i16 0, i16* %tmp249, align 2
92 %up_mvd252 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 0 ; <%struct.MV*> [#uses=1]
93 %left_mvd253 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 0 ; <%struct.MV*> [#uses=1]
94 call void @foo( %struct.MV* %up_mvd252, %struct.MV* %left_mvd253, i8* %tmp41 ) nounwind
95 ret i32 undef
96 }
97
98 declare void @foo(%struct.MV*, %struct.MV*, i8*)
+0
-22
test/Transforms/GVN/memcpy.ll
None ; RUN: llvm-as < %s | opt -gvn -dse | llvm-dis | grep {call.*memcpy} | count 1
1
2 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
3 target triple = "i686-apple-darwin9"
4
5 define void @ccosl({ x86_fp80, x86_fp80 }* sret %agg.result, x86_fp80 %z.0, x86_fp80 %z.1) nounwind {
6 entry:
7 %tmp2 = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=1]
8 %memtmp = alloca { x86_fp80, x86_fp80 }, align 16 ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
9 %tmp5 = sub x86_fp80 0xK80000000000000000000, %z.1 ; [#uses=1]
10 call void @ccoshl( { x86_fp80, x86_fp80 }* sret %memtmp, x86_fp80 %tmp5, x86_fp80 %z.0 ) nounwind
11 %tmp219 = bitcast { x86_fp80, x86_fp80 }* %tmp2 to i8* ; [#uses=2]
12 %memtmp20 = bitcast { x86_fp80, x86_fp80 }* %memtmp to i8* ; [#uses=1]
13 call void @llvm.memcpy.i32( i8* %tmp219, i8* %memtmp20, i32 32, i32 16 )
14 %agg.result21 = bitcast { x86_fp80, x86_fp80 }* %agg.result to i8* ; [#uses=1]
15 call void @llvm.memcpy.i32( i8* %agg.result21, i8* %tmp219, i32 32, i32 16 )
16 ret void
17 }
18
19 declare void @ccoshl({ x86_fp80, x86_fp80 }* sret , x86_fp80, x86_fp80) nounwind
20
21 declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind
+0
-28
test/Transforms/GVN/sret.ll
None ; RUN: llvm-as < %s | opt -gvn | llvm-dis | not grep {call.*memcpy}
1
2 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
3 target triple = "i686-apple-darwin9"
4
5 define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval %z) nounwind {
6 entry:
7 %iz = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
8 %memtmp = alloca { x86_fp80, x86_fp80 }, align 16 ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
9 %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1 ; [#uses=1]
10 %tmp2 = load x86_fp80* %tmp1, align 16 ; [#uses=1]
11 %tmp3 = sub x86_fp80 0xK80000000000000000000, %tmp2 ; [#uses=1]
12 %tmp4 = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 1 ; [#uses=1]
13 %real = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 0 ; [#uses=1]
14 %tmp7 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 0 ; [#uses=1]
15 %tmp8 = load x86_fp80* %tmp7, align 16 ; [#uses=1]
16 store x86_fp80 %tmp3, x86_fp80* %real, align 16
17 store x86_fp80 %tmp8, x86_fp80* %tmp4, align 16
18 call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %memtmp, { x86_fp80, x86_fp80 }* byval %iz ) nounwind
19 %memtmp14 = bitcast { x86_fp80, x86_fp80 }* %memtmp to i8* ; [#uses=1]
20 %agg.result15 = bitcast { x86_fp80, x86_fp80 }* %agg.result to i8* ; [#uses=1]
21 call void @llvm.memcpy.i32( i8* %agg.result15, i8* %memtmp14, i32 32, i32 16 )
22 ret void
23 }
24
25 declare void @ccoshl({ x86_fp80, x86_fp80 }* noalias sret , { x86_fp80, x86_fp80 }* byval ) nounwind
26
27 declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind
0 ; RUN: llvm-as < %s | opt -memcpyopt -dse | llvm-dis | grep {call.*initialize} | not grep memtmp
1 ; PR2077
2
3 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
4 target triple = "i386-pc-linux-gnu"
5
6 define internal fastcc void @initialize({ x86_fp80, x86_fp80 }* noalias sret %agg.result) nounwind {
7 entry:
8 %agg.result.03 = getelementptr { x86_fp80, x86_fp80 }* %agg.result, i32 0, i32 0 ; [#uses=1]
9 store x86_fp80 0xK00000000000000000000, x86_fp80* %agg.result.03
10 %agg.result.15 = getelementptr { x86_fp80, x86_fp80 }* %agg.result, i32 0, i32 1 ; [#uses=1]
11 store x86_fp80 0xK00000000000000000000, x86_fp80* %agg.result.15
12 ret void
13 }
14
15 declare fastcc x86_fp80 @passed_uninitialized({ x86_fp80, x86_fp80 }* %x) nounwind
16
17 define fastcc void @badly_optimized() nounwind {
18 entry:
19 %z = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
20 %tmp = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
21 %memtmp = alloca { x86_fp80, x86_fp80 }, align 8 ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
22 call fastcc void @initialize( { x86_fp80, x86_fp80 }* noalias sret %memtmp )
23 %tmp1 = bitcast { x86_fp80, x86_fp80 }* %tmp to i8* ; [#uses=1]
24 %memtmp2 = bitcast { x86_fp80, x86_fp80 }* %memtmp to i8* ; [#uses=1]
25 call void @llvm.memcpy.i32( i8* %tmp1, i8* %memtmp2, i32 24, i32 8 )
26 %z3 = bitcast { x86_fp80, x86_fp80 }* %z to i8* ; [#uses=1]
27 %tmp4 = bitcast { x86_fp80, x86_fp80 }* %tmp to i8* ; [#uses=1]
28 call void @llvm.memcpy.i32( i8* %z3, i8* %tmp4, i32 24, i32 8 )
29 %tmp5 = call fastcc x86_fp80 @passed_uninitialized( { x86_fp80, x86_fp80 }* %z ) ; [#uses=0]
30 ret void
31 }
32
33 declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind
0 ; RUN: llvm-as < %s | opt -memcpyopt | llvm-dis | not grep {call.*memcpy.}
1 %a = type { i32 }
2 %b = type { float }
3
4 declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind
5 declare void @g(%a*)
6
7 define float @f() {
8 entry:
9 %a_var = alloca %a
10 %b_var = alloca %b
11 call void @g(%a *%a_var)
12 %a_i8 = bitcast %a* %a_var to i8*
13 %b_i8 = bitcast %b* %b_var to i8*
14 call void @llvm.memcpy.i32(i8* %b_i8, i8* %a_i8, i32 4, i32 4)
15 %tmp1 = getelementptr %b* %b_var, i32 0, i32 0
16 %tmp2 = load float* %tmp1
17 ret float %tmp2
18 }
0 load_lib llvm.exp
1
2 RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,llx,c,cpp,tr}]]
0 ; RUN: llvm-as < %s | opt -memcpyopt -form-memset-from-stores | llvm-dis | not grep store
1 ; RUN: llvm-as < %s | opt -memcpyopt -form-memset-from-stores | llvm-dis | grep {call.*llvm.memset}
2
3 ; All the stores in this example should be merged into a single memset.
4
5 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
6 target triple = "i386-apple-darwin8"
7
8 define void @foo(i8 signext %c) nounwind {
9 entry:
10 %x = alloca [19 x i8] ; <[19 x i8]*> [#uses=20]
11 %tmp = getelementptr [19 x i8]* %x, i32 0, i32 0 ; [#uses=1]
12 store i8 %c, i8* %tmp, align 1
13 %tmp5 = getelementptr [19 x i8]* %x, i32 0, i32 1 ; [#uses=1]
14 store i8 %c, i8* %tmp5, align 1
15 %tmp9 = getelementptr [19 x i8]* %x, i32 0, i32 2 ; [#uses=1]
16 store i8 %c, i8* %tmp9, align 1
17 %tmp13 = getelementptr [19 x i8]* %x, i32 0, i32 3 ; [#uses=1]
18 store i8 %c, i8* %tmp13, align 1
19 %tmp17 = getelementptr [19 x i8]* %x, i32 0, i32 4 ; [#uses=1]
20 store i8 %c, i8* %tmp17, align 1
21 %tmp21 = getelementptr [19 x i8]* %x, i32 0, i32 5 ; [#uses=1]
22 store i8 %c, i8* %tmp21, align 1
23 %tmp25 = getelementptr [19 x i8]* %x, i32 0, i32 6 ; [#uses=1]
24 store i8 %c, i8* %tmp25, align 1
25 %tmp29 = getelementptr [19 x i8]* %x, i32 0, i32 7 ; [#uses=1]
26 store i8 %c, i8* %tmp29, align 1
27 %tmp33 = getelementptr [19 x i8]* %x, i32 0, i32 8 ; [#uses=1]
28 store i8 %c, i8* %tmp33, align 1
29 %tmp37 = getelementptr [19 x i8]* %x, i32 0, i32 9 ; [#uses=1]
30 store i8 %c, i8* %tmp37, align 1
31 %tmp41 = getelementptr [19 x i8]* %x, i32 0, i32 10 ; [#uses=1]
32 store i8 %c, i8* %tmp41, align 1
33 %tmp45 = getelementptr [19 x i8]* %x, i32 0, i32 11 ; [#uses=1]
34 store i8 %c, i8* %tmp45, align 1
35 %tmp49 = getelementptr [19 x i8]* %x, i32 0, i32 12 ; [#uses=1]
36 store i8 %c, i8* %tmp49, align 1
37 %tmp53 = getelementptr [19 x i8]* %x, i32 0, i32 13 ; [#uses=1]
38 store i8 %c, i8* %tmp53, align 1
39 %tmp57 = getelementptr [19 x i8]* %x, i32 0, i32 14 ; [#uses=1]
40 store i8 %c, i8* %tmp57, align 1
41 %tmp61 = getelementptr [19 x i8]* %x, i32 0, i32 15 ; [#uses=1]
42 store i8 %c, i8* %tmp61, align 1
43 %tmp65 = getelementptr [19 x i8]* %x, i32 0, i32 16 ; [#uses=1]
44 store i8 %c, i8* %tmp65, align 1
45 %tmp69 = getelementptr [19 x i8]* %x, i32 0, i32 17 ; [#uses=1]
46 store i8 %c, i8* %tmp69, align 1
47 %tmp73 = getelementptr [19 x i8]* %x, i32 0, i32 18 ; [#uses=1]
48 store i8 %c, i8* %tmp73, align 1
49 %tmp76 = call i32 (...)* @bar( [19 x i8]* %x ) nounwind ; [#uses=0]
50 ret void
51 }
52
53 declare i32 @bar(...)
54
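For orientation, the straight-line stores in form-memset.ll correspond to source that hand-initializes a small local buffer one byte at a time; the pass proves the stores cover a contiguous range with a single splat value and emits one llvm.memset in their place. A hedged C++ approximation (the bar stub stands in for the external @bar(...) callee in the test):

static int bar(const char *) { return 0; }   // stub for the test's @bar(...)

void foo(signed char c) {
  char x[19];
  // The test's IR is the fully unrolled form of this initialization:
  // nineteen adjacent byte stores of the same value %c.
  x[0] = c;  x[1] = c;  x[2] = c;  x[3] = c;  x[4] = c;
  x[5] = c;  x[6] = c;  x[7] = c;  x[8] = c;  x[9] = c;
  x[10] = c; x[11] = c; x[12] = c; x[13] = c; x[14] = c;
  x[15] = c; x[16] = c; x[17] = c; x[18] = c;
  bar(x);
}
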
0 ; RUN: llvm-as < %s | opt -memcpyopt -form-memset-from-stores | llvm-dis | not grep store
1 ; RUN: llvm-as < %s | opt -memcpyopt -form-memset-from-stores | llvm-dis | grep {call.*llvm.memset} | count 3
2
3 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
4 target triple = "i386-apple-darwin8"
5 %struct.MV = type { i16, i16 }
6
7 define i32 @t() nounwind {
8 entry:
9 %ref_idx = alloca [8 x i8] ; <[8 x i8]*> [#uses=8]
10 %left_mvd = alloca [8 x %struct.MV] ; <[8 x %struct.MV]*> [#uses=17]
11 %up_mvd = alloca [8 x %struct.MV] ; <[8 x %struct.MV]*> [#uses=17]
12 %tmp20 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 7 ; [#uses=1]
13 store i8 -1, i8* %tmp20, align 1
14 %tmp23 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 6 ; [#uses=1]
15 store i8 -1, i8* %tmp23, align 1
16 %tmp26 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 5 ; [#uses=1]
17 store i8 -1, i8* %tmp26, align 1
18 %tmp29 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 4 ; [#uses=1]
19 store i8 -1, i8* %tmp29, align 1
20 %tmp32 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 3 ; [#uses=1]
21 store i8 -1, i8* %tmp32, align 1
22 %tmp35 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 2 ; [#uses=1]
23 store i8 -1, i8* %tmp35, align 1
24 %tmp38 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 1 ; [#uses=1]
25 store i8 -1, i8* %tmp38, align 1
26 %tmp41 = getelementptr [8 x i8]* %ref_idx, i32 0, i32 0 ; [#uses=2]
27 store i8 -1, i8* %tmp41, align 1
28 %tmp43 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 7, i32 0 ; [#uses=1]
29 store i16 0, i16* %tmp43, align 2
30 %tmp46 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 7, i32 1 ; [#uses=1]
31 store i16 0, i16* %tmp46, align 2
32 %tmp57 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 6, i32 0 ; [#uses=1]
33 store i16 0, i16* %tmp57, align 2
34 %tmp60 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 6, i32 1 ; [#uses=1]
35 store i16 0, i16* %tmp60, align 2
36 %tmp71 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 5, i32 0 ; [#uses=1]
37 store i16 0, i16* %tmp71, align 2
38 %tmp74 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 5, i32 1 ; [#uses=1]
39 store i16 0, i16* %tmp74, align 2
40 %tmp85 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 4, i32 0 ; [#uses=1]
41 store i16 0, i16* %tmp85, align 2
42 %tmp88 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 4, i32 1 ; [#uses=1]
43 store i16 0, i16* %tmp88, align 2
44 %tmp99 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 3, i32 0 ; [#uses=1]
45 store i16 0, i16* %tmp99, align 2
46 %tmp102 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 3, i32 1 ; [#uses=1]
47 store i16 0, i16* %tmp102, align 2
48 %tmp113 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 2, i32 0 ; [#uses=1]
49 store i16 0, i16* %tmp113, align 2
50 %tmp116 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 2, i32 1 ; [#uses=1]
51 store i16 0, i16* %tmp116, align 2
52 %tmp127 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 1, i32 0 ; [#uses=1]
53 store i16 0, i16* %tmp127, align 2
54 %tmp130 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 1, i32 1 ; [#uses=1]
55 store i16 0, i16* %tmp130, align 2
56 %tmp141 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 0, i32 0 ; [#uses=1]
57 store i16 0, i16* %tmp141, align 8
58 %tmp144 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 0, i32 1 ; [#uses=1]
59 store i16 0, i16* %tmp144, align 2
60 %tmp148 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 7, i32 0 ; [#uses=1]
61 store i16 0, i16* %tmp148, align 2
62 %tmp151 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 7, i32 1 ; [#uses=1]
63 store i16 0, i16* %tmp151, align 2
64 %tmp162 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 6, i32 0 ; [#uses=1]
65 store i16 0, i16* %tmp162, align 2
66 %tmp165 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 6, i32 1 ; [#uses=1]
67 store i16 0, i16* %tmp165, align 2
68 %tmp176 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 5, i32 0 ; [#uses=1]
69 store i16 0, i16* %tmp176, align 2
70 %tmp179 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 5, i32 1 ; [#uses=1]
71 store i16 0, i16* %tmp179, align 2
72 %tmp190 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 4, i32 0 ; [#uses=1]
73 store i16 0, i16* %tmp190, align 2
74 %tmp193 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 4, i32 1 ; [#uses=1]
75 store i16 0, i16* %tmp193, align 2
76 %tmp204 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 3, i32 0 ; [#uses=1]
77 store i16 0, i16* %tmp204, align 2
78 %tmp207 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 3, i32 1 ; [#uses=1]
79 store i16 0, i16* %tmp207, align 2
80 %tmp218 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 2, i32 0 ; [#uses=1]
81 store i16 0, i16* %tmp218, align 2
82 %tmp221 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 2, i32 1 ; [#uses=1]
83 store i16 0, i16* %tmp221, align 2
84 %tmp232 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 1, i32 0 ; [#uses=1]
85 store i16 0, i16* %tmp232, align 2
86 %tmp235 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 1, i32 1 ; [#uses=1]
87 store i16 0, i16* %tmp235, align 2
88 %tmp246 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 0, i32 0 ; [#uses=1]
89 store i16 0, i16* %tmp246, align 8
90 %tmp249 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 0, i32 1 ; [#uses=1]
91 store i16 0, i16* %tmp249, align 2
92 %up_mvd252 = getelementptr [8 x %struct.MV]* %up_mvd, i32 0, i32 0 ; <%struct.MV*> [#uses=1]
93 %left_mvd253 = getelementptr [8 x %struct.MV]* %left_mvd, i32 0, i32 0 ; <%struct.MV*> [#uses=1]
94 call void @foo( %struct.MV* %up_mvd252, %struct.MV* %left_mvd253, i8* %tmp41 ) nounwind
95 ret i32 undef
96 }
97
98 declare void @foo(%struct.MV*, %struct.MV*, i8*)
0 ; RUN: llvm-as < %s | opt -memcpyopt -dse | llvm-dis | grep {call.*memcpy} | count 1
1
2 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
3 target triple = "i686-apple-darwin9"
4
5 define void @ccosl({ x86_fp80, x86_fp80 }* sret %agg.result, x86_fp80 %z.0, x86_fp80 %z.1) nounwind {
6 entry:
7 %tmp2 = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=1]
8 %memtmp = alloca { x86_fp80, x86_fp80 }, align 16 ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
9 %tmp5 = sub x86_fp80 0xK80000000000000000000, %z.1 ; [#uses=1]
10 call void @ccoshl( { x86_fp80, x86_fp80 }* sret %memtmp, x86_fp80 %tmp5, x86_fp80 %z.0 ) nounwind
11 %tmp219 = bitcast { x86_fp80, x86_fp80 }* %tmp2 to i8* ; [#uses=2]
12 %memtmp20 = bitcast { x86_fp80, x86_fp80 }* %memtmp to i8* ; [#uses=1]
13 call void @llvm.memcpy.i32( i8* %tmp219, i8* %memtmp20, i32 32, i32 16 )
14 %agg.result21 = bitcast { x86_fp80, x86_fp80 }* %agg.result to i8* ; [#uses=1]
15 call void @llvm.memcpy.i32( i8* %agg.result21, i8* %tmp219, i32 32, i32 16 )
16 ret void
17 }
18
19 declare void @ccoshl({ x86_fp80, x86_fp80 }* sret , x86_fp80, x86_fp80) nounwind
20
21 declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind
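
The two memcpys in memcpy.ll are the pattern a front end emits when a small complex aggregate is returned by value through sret temporaries: the callee fills %memtmp, which is copied into the local %tmp2, which is in turn copied into the caller's %agg.result. MemCpyOpt forwards the source of the first copy so that, together with -dse, only a single memcpy remains. A rough C++ rendering of where those temporaries come from (names mirror the test, but the code is only an approximation):

struct cld { long double re, im; };            // stands in for { x86_fp80, x86_fp80 }

cld ccoshl_approx(long double a, long double b);   // assumed helper, like @ccoshl

cld ccosl_approx(long double z0, long double z1) {
  // The front end keeps the callee's sret result in a temporary and then
  // copies it toward the caller's return slot; those copies are the
  // memcpy chain the pass collapses.
  cld tmp = ccoshl_approx(-z1, z0);
  return tmp;
}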
0 ; RUN: llvm-as < %s | opt -memcpyopt | llvm-dis | not grep {call.*memcpy}
1
2 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
3 target triple = "i686-apple-darwin9"
4
5 define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval %z) nounwind {
6 entry:
7 %iz = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
8 %memtmp = alloca { x86_fp80, x86_fp80 }, align 16 ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
9 %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1 ; [#uses=1]
10 %tmp2 = load x86_fp80* %tmp1, align 16 ; [#uses=1]
11 %tmp3 = sub x86_fp80 0xK80000000000000000000, %tmp2 ; [#uses=1]
12 %tmp4 = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 1 ; [#uses=1]
13 %real = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 0 ; [#uses=1]
14 %tmp7 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 0 ; [#uses=1]
15 %tmp8 = load x86_fp80* %tmp7, align 16 ; [#uses=1]
16 store x86_fp80 %tmp3, x86_fp80* %real, align 16
17 store x86_fp80 %tmp8, x86_fp80* %tmp4, align 16
18 call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %memtmp, { x86_fp80, x86_fp80 }* byval %iz ) nounwind
19 %memtmp14 = bitcast { x86_fp80, x86_fp80 }* %memtmp to i8* ; [#uses=1]
20 %agg.result15 = bitcast { x86_fp80, x86_fp80 }* %agg.result to i8* ; [#uses=1]
21 call void @llvm.memcpy.i32( i8* %agg.result15, i8* %memtmp14, i32 32, i32 16 )
22 ret void
23 }
24
25 declare void @ccoshl({ x86_fp80, x86_fp80 }* noalias sret , { x86_fp80, x86_fp80 }* byval ) nounwind
26
27 declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind
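
sret.ll covers the call slot side of the pass: the memcpy from %memtmp into %agg.result can be dropped by having @ccoshl construct its result directly in the caller's return slot. The rewrite, sketched on plain pointers rather than IR (the names and the ccoshl_like signature are illustrative only):

#include <cstring>

struct cld { long double re, im; };

void ccoshl_like(cld *dest, const cld *z);   // assumed callee that writes *dest

// Before: build the result in a temporary, then copy it to the destination.
void before(cld *agg_result, const cld *z) {
  cld memtmp;
  ccoshl_like(&memtmp, z);
  std::memcpy(agg_result, &memtmp, sizeof(cld));
}

// After the call slot optimization: the callee writes the destination
// directly, and both the temporary and the memcpy disappear.
void after(cld *agg_result, const cld *z) {
  ccoshl_like(agg_result, z);
}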
168168 addPass(Passes, createGlobalsModRefPass()); // IP alias analysis
169169
170170 addPass(Passes, createLICMPass()); // Hoist loop invariants
171 addPass(Passes, createMemCpyOptPass()); // Remove dead memcpys
171172 addPass(Passes, createGVNPass()); // Remove redundancies
172173 addPass(Passes, createDeadStoreEliminationPass()); // Nuke dead stores
173174
281281 addPass(PM, createIndVarSimplifyPass()); // Canonicalize indvars
282282 addPass(PM, createLoopUnrollPass()); // Unroll small loops
283283 addPass(PM, createInstructionCombiningPass()); // Clean up after the unroller
284 addPass(PM, createMemCpyOptPass()); // Remove unneeded memcpys
284285 addPass(PM, createGVNPass()); // Remove redundancies
285286 addPass(PM, createSCCPPass()); // Constant prop with SCCP
286287
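
Out-of-tree clients that build their own pipelines can schedule the new pass the same way these hunks do: create it with createMemCpyOptPass() and run it just ahead of GVN and dead store elimination. A minimal sketch against the 2.x-era legacy PassManager (exact header locations are assumed from that release and may need adjusting):

#include "llvm/PassManager.h"
#include "llvm/Transforms/Scalar.h"

using namespace llvm;

// Mirror the additions in this commit: MemCpyOpt exposes redundancies for
// GVN to clean up, and DSE removes any stores the transforms left dead.
static void addMemCpyCleanupPasses(PassManager &PM) {
  PM.add(createMemCpyOptPass());               // remove/shorten memcpys
  PM.add(createGVNPass());                     // remove redundancies
  PM.add(createDeadStoreEliminationPass());    // nuke dead stores
}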