X86: Emit vector sext as shuffle + sra if vpmovsx is not available.

Also loosen the SSSE3 dependency a bit; expanded pshufb + psra is still better than scalarized loads. Fixes PR14590.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170984 91177308-0d34-0410-b5e6-96231b3b80d8

Benjamin Kramer, 7 years ago
2 changed files with 136 additions and 32 deletions.
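The trick the commit implements shows up in the SSSE3/SSE2 CHECK lines of the test below: instead of scalarizing the extending load, the narrow elements are shuffled into the high bits of the wider lanes and then shifted back down with an arithmetic shift, which replicates the sign bit. Here is a minimal sketch of that idea with plain SSE2 intrinsics (not part of the commit; the function name and driver values are invented) for the <4 x i16> -> <4 x i32> case covered by load_sext_test1:

#include <emmintrin.h>  // SSE2 intrinsics
#include <cstdio>

// Sign-extend the low four i16 elements of |v| to i32 without pmovsxwd:
// place each element in the high half of a 32-bit lane, then shift it back
// down arithmetically so the upper 16 bits become copies of the sign bit.
static __m128i sext_4i16_to_4i32_sse2(__m128i v) {
  __m128i hi = _mm_unpacklo_epi16(_mm_setzero_si128(), v);  // punpcklwd
  return _mm_srai_epi32(hi, 16);                            // psrad $16
}

int main() {
  __m128i v = _mm_setr_epi16(-1, 2, -3, 4, 0, 0, 0, 0);
  int out[4];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(out), sext_4i16_to_4i32_sse2(v));
  std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // -1 2 -3 4
  return 0;
}

With SSE4.1 the same conversion is a single pmovsxwd, which is why the DAG combine below still emits X86ISD::VSEXT when it is available.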
@@ -16042,14 +16042,14 @@
   ISD::LoadExtType Ext = Ld->getExtensionType();

   // If this is a vector EXT Load then attempt to optimize it using a
-  // shuffle. We need SSSE3 shuffles.
-  // SEXT loads are suppoted starting SSE41.
-  // We generate X86ISD::VSEXT for them.
+  // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
+  // expansion is still better than scalar code.
+  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
+  // emit a shuffle and an arithmetic shift.
   // TODO: It is possible to support ZExt by zeroing the undef values
   // during the shuffle phase or after the shuffle.
-  if (RegVT.isVector() && RegVT.isInteger() &&
-      ((Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) ||
-       (Ext == ISD::SEXTLOAD && Subtarget->hasSSE41()))){
+  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
+      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
     assert(MemVT != RegVT && "Cannot extend to the same type");
     assert(MemVT.isVector() && "Must load a vector from memory");

@@ -16142,9 +16142,40 @@
     unsigned SizeRatio = RegSz/MemSz;

     if (Ext == ISD::SEXTLOAD) {
-      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
-      return DCI.CombineTo(N, Sext, TF, true);
-    }
+      // If we have SSE4.1 we can directly emit a VSEXT node.
+      if (Subtarget->hasSSE41()) {
+        SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+        return DCI.CombineTo(N, Sext, TF, true);
+      }
+
+      // Otherwise we'll shuffle the small elements in the high bits of the
+      // larger type and perform an arithmetic shift. If the shift is not legal
+      // it's better to scalarize.
+      if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
+        return SDValue();
+
+      // Redistribute the loaded elements into the different locations.
+      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+      for (unsigned i = 0; i != NumElems; ++i)
+        ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
+
+      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                           DAG.getUNDEF(WideVecVT),
+                                           &ShuffleVec[0]);
+
+      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+
+      // Build the arithmetic shift.
+      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
+                     MemVT.getVectorElementType().getSizeInBits();
+      SmallVector<SDValue, 8> C(NumElems,
+                                DAG.getConstant(Amt, RegVT.getScalarType()));
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, RegVT, &C[0], C.size());
+      Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, BV);
+
+      return DCI.CombineTo(N, Shuff, TF, true);
+    }
+
     // Redistribute the loaded elements into the different locations.
     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
     for (unsigned i = 0; i != NumElems; ++i)
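For concreteness, here is a small standalone sketch (plain C++, not LLVM code; the driver values are picked to match load_sext_test1's <4 x i16> -> <4 x i32> load) of the shuffle-mask and shift-amount computation that the SEXTLOAD path above performs. It also hints at why the i64 cases in the test still scalarize: the combine bails out when ISD::SRA is not legal or custom for the wide type, and the movsbq/movswq/movslq sequences in those CHECK lines are the scalarized fallback.

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical values for a <4 x i16> -> <4 x i32> extending load.
  const unsigned NumElems = 4;                // elements in the memory type
  const unsigned RegBits = 32, MemBits = 16;  // wide / narrow element widths
  const unsigned SizeRatio = RegBits / MemBits;

  // Same recurrence as the combine: loaded element i lands in the slot that
  // forms the high MemBits of wide lane i; every other slot stays undef (-1).
  std::vector<int> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;

  // Arithmetic shift that moves each element back down and restores its sign.
  const unsigned Amt = RegBits - MemBits;

  for (int M : ShuffleVec)
    std::printf("%d ", M);                    // prints: -1 0 -1 1 -1 2 -1 3
  std::printf("\nshift amount = %u\n", Amt);  // prints: shift amount = 16
  return 0;
}

On a little-endian target slot i*SizeRatio + SizeRatio-1 is the high half of 32-bit lane i, so after the bitcast a psrad $16 completes the sign extension, which is exactly what the SSSE3/SSE2 CHECK lines for load_sext_test1 expect. The second file in the diff is the accompanying FileCheck test.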
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=pentium4 | FileCheck %s -check-prefix=SSE2

 define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-;CHECK: sext_8i16_to_8i32
-;CHECK: vpmovsxwd
+; AVX: sext_8i16_to_8i32
+; AVX: vpmovsxwd

 %B = sext <8 x i16> %A to <8 x i32>
 ret <8 x i32>%B
 }

 define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-;CHECK: sext_4i32_to_4i64
-;CHECK: vpmovsxdq
+; AVX: sext_4i32_to_4i64
+; AVX: vpmovsxdq

 %B = sext <4 x i32> %A to <4 x i64>
 ret <4 x i64>%B
 }

-; CHECK: load_sext_test1
-; CHECK: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test1
+; AVX: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test1
+; SSSE3: movq
+; SSSE3: punpcklwd %xmm{{.*}}, %xmm{{.*}}
+; SSSE3: psrad $16
+; SSSE3: ret
+
+; SSE2: load_sext_test1
+; SSE2: movq
+; SSE2: punpcklwd %xmm{{.*}}, %xmm{{.*}}
+; SSE2: psrad $16
+; SSE2: ret
 define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
 %X = load <4 x i16>* %ptr
 %Y = sext <4 x i16> %X to <4 x i32>
 ret <4 x i32>%Y
 }

-; CHECK: load_sext_test2
-; CHECK: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test2
+; AVX: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test2
+; SSSE3: movd
+; SSSE3: pshufb
+; SSSE3: psrad $24
+; SSSE3: ret
+
+; SSE2: load_sext_test2
+; SSE2: movl
+; SSE2: psrad $24
+; SSE2: ret
 define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
 %X = load <4 x i8>* %ptr
 %Y = sext <4 x i8> %X to <4 x i32>
 ret <4 x i32>%Y
 }

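The <4 x i8> case above is where the loosened SSSE3 dependency from the commit message shows up: with SSSE3 the bytes can be placed with a single pshufb before the psrad $24, while the SSE2 run line only checks that the load is still followed by a psrad $24 rather than four scalar loads. A hedged intrinsics sketch of the pshufb + psra form (illustrative only, not the compiler's output; the function name and shuffle constant are mine):

#include <tmmintrin.h>  // SSSE3 (pshufb)

// Sign-extend the low four i8 elements of |v| to i32: pshufb moves byte i
// into the top byte of 32-bit lane i (the -1 mask bytes have their MSB set,
// so pshufb zeroes those slots), then an arithmetic shift by 24 pulls each
// byte back down together with its sign.
static __m128i sext_4i8_to_4i32_ssse3(__m128i v) {
  const __m128i mask = _mm_setr_epi8(-1, -1, -1, 0, -1, -1, -1, 1,
                                     -1, -1, -1, 2, -1, -1, -1, 3);
  return _mm_srai_epi32(_mm_shuffle_epi8(v, mask), 24);  // pshufb + psrad $24
}

This is the "expanded pshufb + psra is still better than scalarized loads" trade-off from the commit message.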
-; CHECK: load_sext_test3
-; CHECK: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test3
+; AVX: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test3
+; SSSE3: movsbq
+; SSSE3: movsbq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test3
+; SSE2: movsbq
+; SSE2: movsbq
+; SSE2: punpcklqdq
+; SSE2: ret
 define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
 %X = load <2 x i8>* %ptr
 %Y = sext <2 x i8> %X to <2 x i64>
 ret <2 x i64>%Y
 }

-; CHECK: load_sext_test4
-; CHECK: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test4
+; AVX: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test4
+; SSSE3: movswq
+; SSSE3: movswq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test4
+; SSE2: movswq
+; SSE2: movswq
+; SSE2: punpcklqdq
+; SSE2: ret
 define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
 %X = load <2 x i16>* %ptr
 %Y = sext <2 x i16> %X to <2 x i64>
 ret <2 x i64>%Y
 }

-; CHECK: load_sext_test5
-; CHECK: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test5
+; AVX: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test5
+; SSSE3: movslq
+; SSSE3: movslq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test5
+; SSE2: movslq
+; SSE2: movslq
+; SSE2: punpcklqdq
+; SSE2: ret
 define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
 %X = load <2 x i32>* %ptr
 %Y = sext <2 x i32> %X to <2 x i64>
 ret <2 x i64>%Y
 }

-; CHECK: load_sext_test6
-; CHECK: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test6
+; AVX: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test6
+; SSSE3: movq
+; SSSE3: punpcklbw
+; SSSE3: psraw $8
+; SSSE3: ret
+
+; SSE2: load_sext_test6
+; SSE2: movq
+; SSE2: punpcklbw
+; SSE2: psraw $8
+; SSE2: ret
 define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
 %X = load <8 x i8>* %ptr
 %Y = sext <8 x i8> %X to <8 x i16>