llvm.org GIT mirror llvm / 37ac608
Turn a memcpy from a double* into a load/store of double instead of a load/store of i64. The latter prevents promotion/scalarrepl of the source and dest in many cases. This fixes the 300% performance regression of the byval stuff on stepanov_v1p2. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@45945 91177308-0d34-0410-b5e6-96231b3b80d8 Chris Lattner 11 years ago
2 changed file(s) with 61 addition(s) and 10 deletion(s). Raw diff Collapse all Expand all
78257825 ConstantInt *MemOpLength = dyn_cast(MI->getOperand(3));
78267826 if (MemOpLength == 0) return 0;
78277827
7828 // Source and destination pointer types are always "i8*" for intrinsic.
7829 // If Size is 8 then use Int64Ty
7830 // If Size is 4 then use Int32Ty
7831 // If Size is 2 then use Int16Ty
7832 // If Size is 1 then use Int8Ty
7828 // Source and destination pointer types are always "i8*" for intrinsic. See
7829 // if the size is something we can handle with a single primitive load/store.
7830 // A single load+store correctly handles overlapping memory in the memmove
7831 // case.
78337832 unsigned Size = MemOpLength->getZExtValue();
78347833 if (Size == 0 || Size > 8 || (Size&(Size-1)))
7835 return 0; // If not 1/2/4/8, exit.
7836
7834 return 0; // If not 1/2/4/8 bytes, exit.
7835
7836 // Use an integer load+store unless we can find something better.
78377837 Type *NewPtrTy = PointerType::getUnqual(IntegerType::get(Size<<3));
7838
7839 // Memcpy forces the use of i8* for the source and destination. That means
7840 // that if you're using memcpy to move one double around, you'll get a cast
7841 // from double* to i8*. We'd much rather use a double load+store than
7842 // an i64 load+store here, because this improves the odds that the source or
7843 // dest address will be promotable. See if we can find a better type than the
7844 // integer datatype.
7845 if (Value *Op = getBitCastOperand(MI->getOperand(1))) {
7846 const Type *SrcETy = cast(Op->getType())->getElementType();
7847 if (SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) {
7848 // The SrcETy might be something like {{{double}}} or [1 x double]. Rip
7849 // down through these levels if so.
7850 while (!SrcETy->isFirstClassType()) {
7851 if (const StructType *STy = dyn_cast(SrcETy)) {
7852 if (STy->getNumElements() == 1)
7853 SrcETy = STy->getElementType(0);
7854 else
7855 break;
7856 } else if (const ArrayType *ATy = dyn_cast(SrcETy)) {
7857 if (ATy->getNumElements() == 1)
7858 SrcETy = ATy->getElementType();
7859 else
7860 break;
7861 } else
7862 break;
7863 }
7864
7865 if (SrcETy->isFirstClassType())
7866 NewPtrTy = PointerType::getUnqual(SrcETy);
7867 }
7868 }
7869
7870
78387871 // If the memcpy/memmove provides better alignment info than we can
78397872 // infer, use it.
78407873 SrcAlign = std::max(SrcAlign, CopyAlign);
78427875
78437876 Value *Src = InsertBitCastBefore(MI->getOperand(2), NewPtrTy, *MI);
78447877 Value *Dest = InsertBitCastBefore(MI->getOperand(1), NewPtrTy, *MI);
7845 Value *L = new LoadInst(Src, "tmp", false, SrcAlign, MI);
7846 new StoreInst(L, Dest, false, DstAlign, MI);
7847 return EraseInstFromFunction(*MI);
7878 Instruction *L = new LoadInst(Src, "tmp", false, SrcAlign);
7879 InsertNewInstBefore(L, *MI);
7880 InsertNewInstBefore(new StoreInst(L, Dest, false, DstAlign), *MI);
7881
7882 // Set the size of the copy to 0, it will be deleted on the next iteration.
7883 MI->setOperand(3, Constant::getNullValue(MemOpLength->getType()));
7884 return MI;
78487885 }
78497886
78507887 /// visitCallInst - CallInst simplification. This mostly only handles folding
0 ; RUN: llvm-as < %s | opt -instcombine | llvm-dis | grep {load double}
1 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
2 target triple = "i686-apple-darwin8"
3
4 define void @foo(double* %X, double* %Y) {
5 entry:
6 %"alloca point" = bitcast i32 0 to i32 ; [#uses=0]
7 %tmp2 = bitcast double* %X to i8* ; [#uses=1]
8 %tmp13 = bitcast double* %Y to i8* ; [#uses=1]
9 call void @llvm.memcpy.i32( i8* %tmp2, i8* %tmp13, i32 8, i32 1 )
10 ret void
11 }
12
13 declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind