llvm.org GIT mirror: llvm @ 4b97731
Optimized load + SIGN_EXTEND patterns in the X86 backend.
Elena Demikhovsky
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170506 91177308-0d34-0410-b5e6-96231b3b80d8
6 changed files with 202 additions and 7 deletions.
                                      LN0->getAlignment());
       CombineTo(N, ExtLoad);
       CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+      AddToWorkList(ExtLoad.getNode());
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
     // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
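The new AddToWorkList call re-queues the freshly created extending load so later combines, including the sextload fold named in the last context line, get to look at it. A minimal DAG-level sketch of that fold, in illustrative notation rather than code from the commit:

; before: t0 = zextload <4 x i8> from %p       ; t0 has one use
;         t1 = sign_extend_inreg t0, <4 x i8>
; after:  t1 = sextload <4 x i8> from %p       ; the zero-extending load and the
;                                              ; in-register sign-extend fuse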
 
   // If this is a vector EXT Load then attempt to optimize it using a
   // shuffle. We need SSSE3 shuffles.
+  // SEXT loads are supported starting with SSE41.
+  // We generate X86ISD::VSEXT for them.
   // TODO: It is possible to support ZExt by zeroing the undef values
   // during the shuffle phase or after the shuffle.
   if (RegVT.isVector() && RegVT.isInteger() &&
-      Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) {
+      (Ext == ISD::EXTLOAD && Subtarget->hasSSSE3() ||
+       Ext == ISD::SEXTLOAD && Subtarget->hasSSE41())) {
     assert(MemVT != RegVT && "Cannot extend to the same type");
     assert(MemVT.isVector() && "Must load a vector from memory");
 
     unsigned RegSz = RegVT.getSizeInBits();
     unsigned MemSz = MemVT.getSizeInBits();
     assert(RegSz > MemSz && "Register size must be greater than the mem size");
+
+    if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
+      return SDValue();
 
     // All sizes must be a power of two.
     if (!isPowerOf2_32(RegSz * MemSz * NumElems))
     // Calculate the number of scalar loads that we need to perform
     // in order to load our vector from memory.
     unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+    if (Ext == ISD::SEXTLOAD && NumLoads > 1)
+      return SDValue();
+
+    unsigned loadRegZize = RegSz;
+    if (Ext == ISD::SEXTLOAD && RegSz == 256)
+      loadRegZize /= 2;
 
     // Represent our vector as a sequence of elements which are the
     // largest scalar that we can load.
     EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
-                                         RegSz/SclrLoadTy.getSizeInBits());
+                                         loadRegZize/SclrLoadTy.getSizeInBits());
 
     // Represent the data using the same element type that is stored in
     // memory. In practice, we ''widen'' MemVT.
-    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
-                                     RegSz/MemVT.getScalarType().getSizeInBits());
+    EVT WideVecVT =
+      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                       loadRegZize/MemVT.getScalarType().getSizeInBits());
 
     assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
            "Invalid vector type");
     SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
     unsigned SizeRatio = RegSz/MemSz;
 
+    if (Ext == ISD::SEXTLOAD) {
+      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+      return DCI.CombineTo(N, Sext, TF, true);
+    }
     // Redistribute the loaded elements into the different locations.
     SmallVector<int,8> ShuffleVec(NumElems * SizeRatio, -1);
     for (unsigned i = 0; i != NumElems; ++i)
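The X86ISD::VSEXT node built by this early return is what the new TableGen patterns below select into PMOVSX instructions. For a <4 x i8> to <4 x i32> sextload on SSE41, the combine's output looks roughly like this (illustrative DAG notation):

; t0 = load i32 from addr          ; SclrLoadTy = i32, NumLoads = 1
; t1 = scalar_to_vector t0         ; v4i32 (LoadUnitVecVT)
; t2 = bitcast t1 to v16i8         ; SlicedVec in WideVecVT
; t3 = X86ISD::VSEXT t2            ; v4i32 result
; -> matched by (v4i32 (X86vsext (v16i8 (bitconvert (v4i32
;    (scalar_to_vector (loadi32 addr:$src))))))) => PMOVSXBDrm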
 defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
 defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
 
+let Predicates = [HasAVX2] in {
+  def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
+            (VPMOVSXWDYrm addr:$src)>;
+  def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
+            (VPMOVSXDQYrm addr:$src)>;
+
+  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64
+                              (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXBDYrm addr:$src)>;
+  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64
+                              (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXBDYrm addr:$src)>;
+
+  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64
+                              (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXWQYrm addr:$src)>;
+  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64
+                              (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXWQYrm addr:$src)>;
+
+  def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32
+                              (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXBQYrm addr:$src)>;
+}
+
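The X86vsmovl patterns above differ from the X86vsext ones: they match sign extension fed by a full 128-bit vector load, a case the DAG combine skips because it would take NumLoads > 1 scalar loads. An IR shape that should reach the VPMOVSXWDYrm pattern, mirroring load_sext_test4 of the AVX2 test below (function name illustrative):

define <8 x i32> @sext_loaded_8i16(<8 x i16>* %p) {
  %x = load <8 x i16>* %p                ; full 128-bit load
  %y = sext <8 x i16> %x to <8 x i32>
  ret <8 x i32> %y                       ; expected: vpmovsxwd from memory into %ymm
}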
 let Predicates = [HasAVX] in {
   // Common patterns involving scalar load
   def : Pat<(int_x86_sse41_pmovsxbq
@@ ... @@
                   (bitconvert (v4i32 (X86vzmovl
                     (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
             (PMOVZXBQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
+                              (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVSXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
+                              (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVSXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
+                              (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVSXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
+                              (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVSXWQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
+                              (scalar_to_vector (extloadi32i16 addr:$src))))))),
+            (PMOVSXBQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
+                              (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVSXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
+                              (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVSXDQrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
+                              (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVSXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
+                              (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVSXBWrm addr:$src)>;
 }
 
 let Predicates = [HasAVX2] in {
@@ ... @@
             (VPMOVZXDQrm addr:$src)>;
   def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
             (VPMOVZXDQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
+                              (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
+                              (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
+                              (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
+                              (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
+                              (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
+                              (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXBWrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
+                              (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
+                              (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXWQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
+                              (scalar_to_vector (extloadi32i16 addr:$src))))))),
+            (VPMOVSXBQrm addr:$src)>;
 }
 
 let Predicates = [UseSSE41] in {
@@ ... @@
 ; CHECK: main
 define i32 @main() nounwind uwtable {
 entry:
-; CHECK: movsbq j(%rip), %
-; CHECK: movsbq i(%rip), %
+; CHECK: pmovsxbq j(%rip), %
+; CHECK: pmovsxbq i(%rip), %
   %0 = load <2 x i8>* @i, align 8
   %1 = load <2 x i8>* @j, align 8
   %div = sdiv <2 x i8> %1, %0
@@ ... @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
 
 define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
 ;CHECK: sext_8i16_to_8i32
@@ ... @@
   %B = sext <4 x i32> %A to <4 x i64>
   ret <4 x i64>%B
 }
+
+; CHECK: load_sext_test1
+; CHECK: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK: ret
+define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
+  %X = load <4 x i16>* %ptr
+  %Y = sext <4 x i16> %X to <4 x i32>
+  ret <4 x i32>%Y
+}
+
+; CHECK: load_sext_test2
+; CHECK: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK: ret
+define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
+  %X = load <4 x i8>* %ptr
+  %Y = sext <4 x i8> %X to <4 x i32>
+  ret <4 x i32>%Y
+}
+
+; CHECK: load_sext_test3
+; CHECK: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK: ret
+define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
+  %X = load <2 x i8>* %ptr
+  %Y = sext <2 x i8> %X to <2 x i64>
+  ret <2 x i64>%Y
+}
+
+; CHECK: load_sext_test4
+; CHECK: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK: ret
+define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
+  %X = load <2 x i16>* %ptr
+  %Y = sext <2 x i16> %X to <2 x i64>
+  ret <2 x i64>%Y
+}
+
+; CHECK: load_sext_test5
+; CHECK: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK: ret
+define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
+  %X = load <2 x i32>* %ptr
+  %Y = sext <2 x i32> %X to <2 x i64>
+  ret <2 x i64>%Y
+}
+
+; CHECK: load_sext_test6
+; CHECK: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK: ret
+define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
+  %X = load <8 x i8>* %ptr
+  %Y = sext <8 x i8> %X to <8 x i16>
+  ret <8 x i16>%Y
+}
@@ ... @@
   ret <8 x i32>%B
 }
 
+; CHECK: load_sext_test1
+; CHECK: vpmovsxdq (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @load_sext_test1(<4 x i32> *%ptr) {
+  %X = load <4 x i32>* %ptr
+  %Y = sext <4 x i32> %X to <4 x i64>
+  ret <4 x i64>%Y
+}
 
+; CHECK: load_sext_test2
+; CHECK: vpmovsxbq (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @load_sext_test2(<4 x i8> *%ptr) {
+  %X = load <4 x i8>* %ptr
+  %Y = sext <4 x i8> %X to <4 x i64>
+  ret <4 x i64>%Y
+}
 
+; CHECK: load_sext_test3
+; CHECK: vpmovsxwq (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @load_sext_test3(<4 x i16> *%ptr) {
+  %X = load <4 x i16>* %ptr
+  %Y = sext <4 x i16> %X to <4 x i64>
+  ret <4 x i64>%Y
+}
 
+; CHECK: load_sext_test4
+; CHECK: vpmovsxwd (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <8 x i32> @load_sext_test4(<8 x i16> *%ptr) {
+  %X = load <8 x i16>* %ptr
+  %Y = sext <8 x i16> %X to <8 x i32>
+  ret <8 x i32>%Y
+}
+
+; CHECK: load_sext_test5
+; CHECK: vpmovsxbd (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <8 x i32> @load_sext_test5(<8 x i8> *%ptr) {
+  %X = load <8 x i8>* %ptr
+  %Y = sext <8 x i8> %X to <8 x i32>
+  ret <8 x i32>%Y
+}