llvm.org GIT mirror llvm / 0ea813a
[SLPVectorizer][X86] Add other tests described in PR28474
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@362297 91177308-0d34-0410-b5e6-96231b3b80d8
Simon Pilgrim, 3 months ago
1 changed file(s) with 256 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.10.0 -mattr=+sse4.2 | FileCheck %s

; PR28474

;void foo();
;
;int test1(unsigned int *p) {
;  int sum = 0;
;  #pragma nounroll
;  for (int y = 0; y < 2; y++) {
;    // Inner loop gets unrolled
;    for (int x = 0; x < 8; x++) {
;      sum += p[x] * 42;
;    }
;    // Dummy call to keep outer loop alive
;    foo();
;  }
;  return sum;
;}

421 define i32 @test(i32* nocapture readonly %p) {
522 ; CHECK-LABEL: @test(
623 ; CHECK-NEXT: entry:
7895 for.end:
7996 ret i32 %add.7
8097 }

;void foo();
;
;int test2(unsigned int *p, unsigned int *q) {
;  int sum = 0;
;  #pragma nounroll
;  for (int y = 0; y < 2; y++) {
;    // Inner loop gets unrolled
;    for (int x = 0; x < 8; x++) {
;      sum += p[x] * q[x];
;    }
;    // Dummy call to keep outer loop alive
;    foo();
;  }
;  return sum;
;}

; test2: eight-element dot product of %p and %q, already unrolled.
;
; Input IR: the entry block materialises the addresses of elements 1..7 of
; both arrays. for.body loads p[k] and q[k] for k = 0..7, multiplies each pair,
; and folds the products into the %sum phi through a chain of eight scalar
; adds. The latch condition is the constant true, so control always leaves for
; for.end after one pass through the body.
;
; Expected output (the FileCheck lines inside the function): each group of
; eight scalar loads becomes a single <8 x i32> load, the multiplies become one
; vector mul, and the add chain becomes a three-step shuffle/add reduction
; (upper half onto lower half, three times) whose lane 0 is added to the
; incoming sum. The original scalar adds are left behind with undef operands,
; and the phi and the return both use the reduced value instead of %add.7.
define i32 @test2(i32* nocapture readonly %p, i32* nocapture readonly %q) {
; CHECK-LABEL: @test2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX_P_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX_P_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
; CHECK-NEXT:    [[ARRAYIDX_P_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
; CHECK-NEXT:    [[ARRAYIDX_P_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
; CHECK-NEXT:    [[ARRAYIDX_P_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
; CHECK-NEXT:    [[ARRAYIDX_P_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
; CHECK-NEXT:    [[ARRAYIDX_P_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
; CHECK-NEXT:    [[ARRAYIDX_Q_1:%.*]] = getelementptr inbounds i32, i32* [[Q:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX_Q_2:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 2
; CHECK-NEXT:    [[ARRAYIDX_Q_3:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 3
; CHECK-NEXT:    [[ARRAYIDX_Q_4:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 4
; CHECK-NEXT:    [[ARRAYIDX_Q_5:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 5
; CHECK-NEXT:    [[ARRAYIDX_Q_6:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 6
; CHECK-NEXT:    [[ARRAYIDX_Q_7:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 7
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[Q]] to <8 x i32>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[ADD:%.*]] = add i32 undef, [[SUM]]
; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 undef, [[ADD]]
; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 undef, [[ADD_1]]
; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 undef, [[ADD_2]]
; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 undef, [[ADD_3]]
; CHECK-NEXT:    [[ADD_5:%.*]] = add i32 undef, [[ADD_4]]
; CHECK-NEXT:    [[ADD_6:%.*]] = add i32 undef, [[ADD_5]]
; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP4]], [[RDX_SHUF]]
; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]]
; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 undef, [[ADD_6]]
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK:       for.end:
; CHECK-NEXT:    ret i32 [[OP_EXTRA]]
;
entry:
  ; Addresses of p[1..7]; p[0] is %p itself.
  %arrayidx.p.1 = getelementptr inbounds i32, i32* %p, i64 1
  %arrayidx.p.2 = getelementptr inbounds i32, i32* %p, i64 2
  %arrayidx.p.3 = getelementptr inbounds i32, i32* %p, i64 3
  %arrayidx.p.4 = getelementptr inbounds i32, i32* %p, i64 4
  %arrayidx.p.5 = getelementptr inbounds i32, i32* %p, i64 5
  %arrayidx.p.6 = getelementptr inbounds i32, i32* %p, i64 6
  %arrayidx.p.7 = getelementptr inbounds i32, i32* %p, i64 7

  ; Addresses of q[1..7]; q[0] is %q itself.
  %arrayidx.q.1 = getelementptr inbounds i32, i32* %q, i64 1
  %arrayidx.q.2 = getelementptr inbounds i32, i32* %q, i64 2
  %arrayidx.q.3 = getelementptr inbounds i32, i32* %q, i64 3
  %arrayidx.q.4 = getelementptr inbounds i32, i32* %q, i64 4
  %arrayidx.q.5 = getelementptr inbounds i32, i32* %q, i64 5
  %arrayidx.q.6 = getelementptr inbounds i32, i32* %q, i64 6
  %arrayidx.q.7 = getelementptr inbounds i32, i32* %q, i64 7
  br label %for.body

for.body:
  ; Step k loads p[k] and q[k], multiplies them, and adds the product to the
  ; running total carried in from the previous step (or from %sum for k = 0).
  %sum = phi i32 [ 0, %entry ], [ %add.7, %for.body ]
  %tmpp = load i32, i32* %p, align 4
  %tmpq = load i32, i32* %q, align 4
  %mul = mul i32 %tmpp, %tmpq
  %add = add i32 %mul, %sum
  %tmp5p = load i32, i32* %arrayidx.p.1, align 4
  %tmp5q = load i32, i32* %arrayidx.q.1, align 4
  %mul.1 = mul i32 %tmp5p, %tmp5q
  %add.1 = add i32 %mul.1, %add
  %tmp6p = load i32, i32* %arrayidx.p.2, align 4
  %tmp6q = load i32, i32* %arrayidx.q.2, align 4
  %mul.2 = mul i32 %tmp6p, %tmp6q
  %add.2 = add i32 %mul.2, %add.1
  %tmp7p = load i32, i32* %arrayidx.p.3, align 4
  %tmp7q = load i32, i32* %arrayidx.q.3, align 4
  %mul.3 = mul i32 %tmp7p, %tmp7q
  %add.3 = add i32 %mul.3, %add.2
  %tmp8p = load i32, i32* %arrayidx.p.4, align 4
  %tmp8q = load i32, i32* %arrayidx.q.4, align 4
  %mul.4 = mul i32 %tmp8p, %tmp8q
  %add.4 = add i32 %mul.4, %add.3
  %tmp9p = load i32, i32* %arrayidx.p.5, align 4
  %tmp9q = load i32, i32* %arrayidx.q.5, align 4
  %mul.5 = mul i32 %tmp9p, %tmp9q
  %add.5 = add i32 %mul.5, %add.4
  %tmp10p = load i32, i32* %arrayidx.p.6, align 4
  %tmp10q = load i32, i32* %arrayidx.q.6, align 4
  %mul.6 = mul i32 %tmp10p, %tmp10q
  %add.6 = add i32 %mul.6, %add.5
  %tmp11p = load i32, i32* %arrayidx.p.7, align 4
  %tmp11q = load i32, i32* %arrayidx.q.7, align 4
  %mul.7 = mul i32 %tmp11p, %tmp11q
  %add.7 = add i32 %mul.7, %add.6
  ; Constant-true condition: the branch always goes to for.end.
  br i1 true, label %for.end, label %for.body

for.end:
  ret i32 %add.7
}

;void foo();
;
;int test3(unsigned int *p, unsigned int *q) {
;  int sum = 0;
;  #pragma nounroll
;  for (int y = 0; y < 2; y++) {
;    // Inner loop gets unrolled
;    for (int x = 0; x < 8; x++) {
;      sum += p[x] * q[7-x];
;    }
;    // Dummy call to keep outer loop alive
;    foo();
;  }
;  return sum;
;}

; test3: eight-element dot product of %p with %q traversed in reverse order,
; already unrolled.
;
; Input IR: the entry block materialises the addresses of elements 1..7 of
; both arrays. for.body pairs p[k] with q[7-k] for k = 0..7, multiplies each
; pair, and folds the products into the %sum phi through a chain of eight
; scalar adds. The latch condition is the constant true, so control always
; leaves for for.end after one pass through the body.
;
; Expected output (the FileCheck lines inside the function): both arrays are
; loaded as one <8 x i32> vector each in memory order, the vector loaded from
; %p is lane-reversed by a shuffle, and the reversed vector is multiplied by
; the vector loaded from %q. The products are then summed with a three-step
; shuffle/add reduction whose lane 0 is added to the incoming sum. The original
; scalar adds are left behind with undef operands, and the phi and the return
; both use the reduced value instead of %add.7.
define i32 @test3(i32* nocapture readonly %p, i32* nocapture readonly %q) {
; CHECK-LABEL: @test3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX_P_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX_P_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
; CHECK-NEXT:    [[ARRAYIDX_P_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
; CHECK-NEXT:    [[ARRAYIDX_P_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
; CHECK-NEXT:    [[ARRAYIDX_P_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
; CHECK-NEXT:    [[ARRAYIDX_P_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
; CHECK-NEXT:    [[ARRAYIDX_P_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
; CHECK-NEXT:    [[ARRAYIDX_Q_1:%.*]] = getelementptr inbounds i32, i32* [[Q:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX_Q_2:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 2
; CHECK-NEXT:    [[ARRAYIDX_Q_3:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 3
; CHECK-NEXT:    [[ARRAYIDX_Q_4:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 4
; CHECK-NEXT:    [[ARRAYIDX_Q_5:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 5
; CHECK-NEXT:    [[ARRAYIDX_Q_6:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 6
; CHECK-NEXT:    [[ARRAYIDX_Q_7:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 7
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[Q]] to <8 x i32>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> [[REORDER_SHUFFLE]], [[TMP3]]
; CHECK-NEXT:    [[ADD:%.*]] = add i32 undef, [[SUM]]
; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 undef, [[ADD]]
; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 undef, [[ADD_1]]
; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 undef, [[ADD_2]]
; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 undef, [[ADD_3]]
; CHECK-NEXT:    [[ADD_5:%.*]] = add i32 undef, [[ADD_4]]
; CHECK-NEXT:    [[ADD_6:%.*]] = add i32 undef, [[ADD_5]]
; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP4]], [[RDX_SHUF]]
; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]]
; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 undef, [[ADD_6]]
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK:       for.end:
; CHECK-NEXT:    ret i32 [[OP_EXTRA]]
;
entry:
  ; Addresses of p[1..7]; p[0] is %p itself.
  %arrayidx.p.1 = getelementptr inbounds i32, i32* %p, i64 1
  %arrayidx.p.2 = getelementptr inbounds i32, i32* %p, i64 2
  %arrayidx.p.3 = getelementptr inbounds i32, i32* %p, i64 3
  %arrayidx.p.4 = getelementptr inbounds i32, i32* %p, i64 4
  %arrayidx.p.5 = getelementptr inbounds i32, i32* %p, i64 5
  %arrayidx.p.6 = getelementptr inbounds i32, i32* %p, i64 6
  %arrayidx.p.7 = getelementptr inbounds i32, i32* %p, i64 7

  ; Addresses of q[1..7]; q[0] is %q itself.
  %arrayidx.q.1 = getelementptr inbounds i32, i32* %q, i64 1
  %arrayidx.q.2 = getelementptr inbounds i32, i32* %q, i64 2
  %arrayidx.q.3 = getelementptr inbounds i32, i32* %q, i64 3
  %arrayidx.q.4 = getelementptr inbounds i32, i32* %q, i64 4
  %arrayidx.q.5 = getelementptr inbounds i32, i32* %q, i64 5
  %arrayidx.q.6 = getelementptr inbounds i32, i32* %q, i64 6
  %arrayidx.q.7 = getelementptr inbounds i32, i32* %q, i64 7
  br label %for.body

for.body:
  ; Step k loads p[k] and q[7-k]: the p index ascends while the q index
  ; descends, ending with p[7] * q[0].
  %sum = phi i32 [ 0, %entry ], [ %add.7, %for.body ]
  %tmpp = load i32, i32* %p, align 4
  %tmpq = load i32, i32* %arrayidx.q.7, align 4
  %mul = mul i32 %tmpp, %tmpq
  %add = add i32 %mul, %sum
  %tmp5p = load i32, i32* %arrayidx.p.1, align 4
  %tmp5q = load i32, i32* %arrayidx.q.6, align 4
  %mul.1 = mul i32 %tmp5p, %tmp5q
  %add.1 = add i32 %mul.1, %add
  %tmp6p = load i32, i32* %arrayidx.p.2, align 4
  %tmp6q = load i32, i32* %arrayidx.q.5, align 4
  %mul.2 = mul i32 %tmp6p, %tmp6q
  %add.2 = add i32 %mul.2, %add.1
  %tmp7p = load i32, i32* %arrayidx.p.3, align 4
  %tmp7q = load i32, i32* %arrayidx.q.4, align 4
  %mul.3 = mul i32 %tmp7p, %tmp7q
  %add.3 = add i32 %mul.3, %add.2
  %tmp8p = load i32, i32* %arrayidx.p.4, align 4
  %tmp8q = load i32, i32* %arrayidx.q.3, align 4
  %mul.4 = mul i32 %tmp8p, %tmp8q
  %add.4 = add i32 %mul.4, %add.3
  %tmp9p = load i32, i32* %arrayidx.p.5, align 4
  %tmp9q = load i32, i32* %arrayidx.q.2, align 4
  %mul.5 = mul i32 %tmp9p, %tmp9q
  %add.5 = add i32 %mul.5, %add.4
  %tmp10p = load i32, i32* %arrayidx.p.6, align 4
  %tmp10q = load i32, i32* %arrayidx.q.1, align 4
  %mul.6 = mul i32 %tmp10p, %tmp10q
  %add.6 = add i32 %mul.6, %add.5
  %tmp11p = load i32, i32* %arrayidx.p.7, align 4
  %tmp11q = load i32, i32* %q, align 4
  %mul.7 = mul i32 %tmp11p, %tmp11q
  %add.7 = add i32 %mul.7, %add.6
  ; Constant-true condition: the branch always goes to for.end.
  br i1 true, label %for.end, label %for.body

for.end:
  ret i32 %add.7
}