llvm.org GIT mirror llvm / 3a39372
Implement rdar://9009151, transforming strided loop stores of unsplatable values into memset_pattern16 when it is available (recent darwins). This transforms lots of strided loop stores of ints for example, like 5 in vpr: Formed memset: call void @memset_pattern16(i8* %4, i8* getelementptr inbounds ([16 x i8]* @.memset_pattern9, i32 0, i32 0), i64 %tmp25) from store to: {%3,+,4}<%11> at: store i32 3, i32* %scevgep, align 4, !tbaa !4 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@126040 91177308-0d34-0410-b5e6-96231b3b80d8 Chris Lattner 8 years ago
2 changed file(s) with 154 addition(s) and 34 deletion(s). Raw diff Collapse all Expand all
3838 #define DEBUG_TYPE "loop-idiom"
3939 #include "llvm/Transforms/Scalar.h"
4040 #include "llvm/IntrinsicInst.h"
41 #include "llvm/Module.h"
4142 #include "llvm/Analysis/AliasAnalysis.h"
4243 #include "llvm/Analysis/LoopPass.h"
4344 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
7576 bool processLoopStore(StoreInst *SI, const SCEV *BECount);
7677 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
7778
78 bool processLoopStoreOfSplatValue(Value *DestPtr, unsigned StoreSize,
79 unsigned StoreAlignment,
80 Value *SplatValue, Instruction *TheStore,
81 const SCEVAddRecExpr *Ev,
82 const SCEV *BECount);
79 bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
80 unsigned StoreAlignment,
81 Value *SplatValue, Instruction *TheStore,
82 const SCEVAddRecExpr *Ev,
83 const SCEV *BECount);
8384 bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
8485 const SCEVAddRecExpr *StoreEv,
8586 const SCEVAddRecExpr *LoadEv,
274275 // validity check in mayLoopAccessLocation to be updated though.
275276 if (Stride == 0 || StoreSize != Stride->getValue()->getValue())
276277 return false;
277
278 // If the stored value is a byte-wise value (like i32 -1), then it may be
279 // turned into a memset of i8 -1, assuming that all the consecutive bytes
280 // are stored. A store of i32 0x01020304 can never be turned into a memset.
281 if (Value *SplatValue = isBytewiseValue(StoredVal))
282 if (processLoopStoreOfSplatValue(StorePtr, StoreSize, SI->getAlignment(),
283 SplatValue, SI, StoreEv, BECount))
284 return true;
278
279 // See if we can optimize just this store in isolation.
280 if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(),
281 StoredVal, SI, StoreEv, BECount))
282 return true;
285283
286284 // If the stored value is a strided load in the same loop with the same stride
287285 // then this may be transformable into a memcpy. This kicks in for stuff like
332330 if (Stride == 0 || MSI->getLength() != Stride->getValue())
333331 return false;
334332
335 return processLoopStoreOfSplatValue(Pointer, (unsigned)SizeInBytes,
336 MSI->getAlignment(), MSI->getValue(),
337 MSI, Ev, BECount);
333 return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
334 MSI->getAlignment(), MSI->getValue(),
335 MSI, Ev, BECount);
338336 }
339337
340338
371369 return false;
372370 }
373371
374 /// processLoopStoreOfSplatValue - We see a strided store of a memsetable value.
375 /// If we can transform this into a memset in the loop preheader, do so.
372 /// getMemSetPatternValue - If a strided store of the specified value is safe to
373 /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
374 /// be passed in. Otherwise, return null.
375 ///
376 /// Note that we don't ever attempt to use memset_pattern8 or 4, because these
377 /// just replicate their input array and then pass on to memset_pattern16.
378 static Constant *getMemSetPatternValue(Value *V, const TargetData &TD) {
379 // If the value isn't a constant, we can't promote it to being in a constant
380 // array. We could theoretically do a store to an alloca or something, but
381 // that doesn't seem worthwhile.
382 Constant *C = dyn_cast(V);
383 if (C == 0) return 0;
384
385 // Only handle simple values that are a power of two bytes in size.
386 uint64_t Size = TD.getTypeSizeInBits(V->getType());
387 if (Size == 0 || (Size & 7) || (Size & (Size-1)))
388 return 0;
389
390 // Convert the constant to an integer type of the appropriate size so we can
391 // start hacking on it.
392 if (isa(V->getType()))
393 C = ConstantExpr::getPtrToInt(C, IntegerType::get(C->getContext(), Size));
394 else if (isa(V->getType()) || V->getType()->isFloatingPointTy())
395 C = ConstantExpr::getBitCast(C, IntegerType::get(C->getContext(), Size));
396 else if (!isa(V->getType()))
397 return 0; // Unhandled type.
398
399 // Convert to size in bytes.
400 Size /= 8;
401
402 // If we couldn't fold this to an integer, we fail. We don't bother to handle
403 // relocatable expressions like the address of a global yet.
404 // FIXME!
405 ConstantInt *CI = dyn_cast(C);
406 if (CI == 0) return 0;
407
408 APInt CVal = CI->getValue();
409
410 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
411 // if the top and bottom are the same.
412 if (Size > 16) return 0;
413
414 // If this is a big endian target (PPC) then we need to bswap.
415 if (TD.isBigEndian())
416 CVal = CVal.byteSwap();
417
418 // Determine what each byte of the pattern value should be.
419 char Value[16];
420 for (unsigned i = 0; i != 16; ++i) {
421 // Get the byte value we're indexing into.
422 unsigned CByte = i % Size;
423 Value[i] = (unsigned char)(CVal.getZExtValue() >> CByte);
424 }
425
426 return ConstantArray::get(V->getContext(), StringRef(Value, 16), false);
427 }
428
429
430 /// processLoopStridedStore - We see a strided store of some value. If we can
431 /// transform this into a memset or memset_pattern in the loop preheader, do so.
376432 bool LoopIdiomRecognize::
377 processLoopStoreOfSplatValue(Value *DestPtr, unsigned StoreSize,
378 unsigned StoreAlignment, Value *SplatValue,
379 Instruction *TheStore,
380 const SCEVAddRecExpr *Ev, const SCEV *BECount) {
381 // If we're not allowed to form memset, we fail.
382 if (!TLI->has(LibFunc::memset))
383 return false;
384
385
386 // Verify that the stored value is loop invariant. If not, we can't promote
387 // the memset.
388 if (!CurLoop->isLoopInvariant(SplatValue))
389 return false;
433 processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
434 unsigned StoreAlignment, Value *StoredVal,
435 Instruction *TheStore, const SCEVAddRecExpr *Ev,
436 const SCEV *BECount) {
437
438 // If the stored value is a byte-wise value (like i32 -1), then it may be
439 // turned into a memset of i8 -1, assuming that all the consecutive bytes
440 // are stored. A store of i32 0x01020304 can never be turned into a memset,
441 // but it can be turned into memset_pattern if the target supports it.
442 Value *SplatValue = isBytewiseValue(StoredVal);
443 Constant *PatternValue = 0;
444
445 // If we're allowed to form a memset, and the stored value would be acceptable
446 // for memset, use it.
447 if (SplatValue && TLI->has(LibFunc::memset) &&
448 // Verify that the stored value is loop invariant. If not, we can't
449 // promote the memset.
450 CurLoop->isLoopInvariant(SplatValue)) {
451 // Keep and use SplatValue.
452 PatternValue = 0;
453 } else if (TLI->has(LibFunc::memset_pattern16) &&
454 (PatternValue = getMemSetPatternValue(StoredVal, *TD))) {
455 // It looks like we can use PatternValue!
456 SplatValue = 0;
457 } else {
458 // Otherwise, this isn't an idiom we can transform. For example, we can't
459 // do anything with a 3-byte store.
460 return false;
461 }
462
390463
391464 // Okay, we have a strided store "p[i]" of a splattable value. We can turn
392465 // this into a memset in the loop preheader now if we want. However, this
414487
415488 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
416489 // pointer size if it isn't already.
417 const Type *IntPtr = TD->getIntPtrType(SplatValue->getContext());
490 const Type *IntPtr = TD->getIntPtrType(DestPtr->getContext());
418491 BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
419492
420493 const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
426499 Value *NumBytes =
427500 Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
428501
429 Value *NewCall =
430 Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
502 Value *NewCall;
503 if (SplatValue)
504 NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment);
505 else {
506 Module *M = TheStore->getParent()->getParent()->getParent();
507 Value *MSP = M->getOrInsertFunction("memset_pattern16",
508 Builder.getVoidTy(),
509 Builder.getInt8PtrTy(),
510 Builder.getInt8PtrTy(), IntPtr,
511 (void*)0);
512
513 // Otherwise we should form a memset_pattern16. PatternValue is known to be
514 // a constant array of 16 bytes. Plop the value into a mergeable global.
515 GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
516 GlobalValue::InternalLinkage,
517 PatternValue, ".memset_pattern");
518 GV->setUnnamedAddr(true); // Ok to merge these.
519 GV->setAlignment(16);
520 Value *PatternPtr = Builder.CreateConstInBoundsGEP2_32(GV, 0, 0, "pattern");
521
522 NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes);
523 }
431524
432525 DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
433526 << " from store to: " << *Ev << " at: " << *TheStore << "\n");
272272 ; CHECK-NOT: store
273273 ; CHECK: ret void
274274 }
275
276 ; On darwin10 (which is the triple in this .ll file) this loop can be turned
277 ; into a memset_pattern call.
278 ; rdar://9009151
279 define void @test11(i32* nocapture %P) nounwind ssp {
280 entry:
281 br label %for.body
282
283 for.body: ; preds = %entry, %for.body
284 %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ]
285 %arrayidx = getelementptr i32* %P, i64 %indvar
286 store i32 1, i32* %arrayidx, align 4 ; the bytes of i32 1 are not all equal, so this is not a byte splat
287 %indvar.next = add i64 %indvar, 1
288 %exitcond = icmp eq i64 %indvar.next, 10000 ; exits once 10000 consecutive i32 elements have been stored
289 br i1 %exitcond, label %for.end, label %for.body
290
291 for.end: ; preds = %for.body
292 ret void
293 ; CHECK: @test11
294 ; CHECK-NEXT: entry:
295 ; CHECK-NEXT: bitcast
296 ; CHECK-NEXT: memset_pattern
297 ; CHECK-NOT: store
298 ; CHECK: ret void
299 }
300
301