llvm.org GIT mirror llvm / 2cd2d0a
[X86][SSE] Added 16i8 -> 8i64 sext test. Shows poor codegen for AVX2. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@266560 91177308-0d34-0410-b5e6-96231b3b80d8 (Simon Pilgrim, 4 years ago)
1 changed file(s) with 125 addition(s) and 1 deletion(s). Raw diff Collapse all Expand all
294294 %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
295295 %C = sext <4 x i8> %B to <4 x i64>
296296 ret <4 x i64> %C
297 }
298
299 define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
300 ; SSE2-LABEL: sext_16i8_to_8i64:
301 ; SSE2: # BB#0: # %entry
302 ; SSE2-NEXT: movdqa %xmm0, %xmm1
303 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
304 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
305 ; SSE2-NEXT: movdqa %xmm0, %xmm2
306 ; SSE2-NEXT: psrad $31, %xmm2
307 ; SSE2-NEXT: psrad $24, %xmm0
308 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
309 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
310 ; SSE2-NEXT: psrld $16, %xmm1
311 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
312 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
313 ; SSE2-NEXT: movdqa %xmm1, %xmm2
314 ; SSE2-NEXT: psrad $31, %xmm2
315 ; SSE2-NEXT: psrad $24, %xmm1
316 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
317 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
318 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
319 ; SSE2-NEXT: movdqa %xmm2, %xmm4
320 ; SSE2-NEXT: psrad $31, %xmm4
321 ; SSE2-NEXT: psrad $24, %xmm2
322 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
323 ; SSE2-NEXT: psrld $16, %xmm3
324 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
325 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
326 ; SSE2-NEXT: movdqa %xmm3, %xmm4
327 ; SSE2-NEXT: psrad $31, %xmm4
328 ; SSE2-NEXT: psrad $24, %xmm3
329 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
330 ; SSE2-NEXT: retq
331 ;
332 ; SSSE3-LABEL: sext_16i8_to_8i64:
333 ; SSSE3: # BB#0: # %entry
334 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
335 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
336 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
337 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
338 ; SSSE3-NEXT: psrad $31, %xmm2
339 ; SSSE3-NEXT: psrad $24, %xmm0
340 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
341 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
342 ; SSSE3-NEXT: psrld $16, %xmm1
343 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
344 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
345 ; SSSE3-NEXT: movdqa %xmm1, %xmm2
346 ; SSSE3-NEXT: psrad $31, %xmm2
347 ; SSSE3-NEXT: psrad $24, %xmm1
348 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
349 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
350 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
351 ; SSSE3-NEXT: movdqa %xmm2, %xmm4
352 ; SSSE3-NEXT: psrad $31, %xmm4
353 ; SSSE3-NEXT: psrad $24, %xmm2
354 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
355 ; SSSE3-NEXT: psrld $16, %xmm3
356 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
357 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
358 ; SSSE3-NEXT: movdqa %xmm3, %xmm4
359 ; SSSE3-NEXT: psrad $31, %xmm4
360 ; SSSE3-NEXT: psrad $24, %xmm3
361 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
362 ; SSSE3-NEXT: retq
363 ;
364 ; SSE41-LABEL: sext_16i8_to_8i64:
365 ; SSE41: # BB#0: # %entry
366 ; SSE41-NEXT: pmovsxbq %xmm0, %xmm4
367 ; SSE41-NEXT: movdqa %xmm0, %xmm1
368 ; SSE41-NEXT: psrld $16, %xmm1
369 ; SSE41-NEXT: pmovsxbq %xmm1, %xmm1
370 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
371 ; SSE41-NEXT: pmovsxbq %xmm2, %xmm2
372 ; SSE41-NEXT: psrlq $48, %xmm0
373 ; SSE41-NEXT: pmovsxbq %xmm0, %xmm3
374 ; SSE41-NEXT: movdqa %xmm4, %xmm0
375 ; SSE41-NEXT: retq
376 ;
377 ; AVX1-LABEL: sext_16i8_to_8i64:
378 ; AVX1: # BB#0: # %entry
379 ; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
380 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
381 ; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2
382 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
383 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
384 ; AVX1-NEXT: vpmovsxbq %xmm1, %xmm1
385 ; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
386 ; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
387 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
388 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
389 ; AVX1-NEXT: retq
390 ;
391 ; AVX2-LABEL: sext_16i8_to_8i64:
392 ; AVX2: # BB#0: # %entry
393 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
394 ; AVX2-NEXT: vpslld $24, %xmm1, %xmm1
395 ; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
396 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm2
397 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
398 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
399 ; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
400 ; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
401 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1
402 ; AVX2-NEXT: vmovdqa %ymm2, %ymm0
403 ; AVX2-NEXT: retq
404 ;
405 ; X32-SSE41-LABEL: sext_16i8_to_8i64:
406 ; X32-SSE41: # BB#0: # %entry
407 ; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm4
408 ; X32-SSE41-NEXT: movdqa %xmm0, %xmm1
409 ; X32-SSE41-NEXT: psrld $16, %xmm1
410 ; X32-SSE41-NEXT: pmovsxbq %xmm1, %xmm1
411 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
412 ; X32-SSE41-NEXT: pmovsxbq %xmm2, %xmm2
413 ; X32-SSE41-NEXT: psrlq $48, %xmm0
414 ; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm3
415 ; X32-SSE41-NEXT: movdqa %xmm4, %xmm0
416 ; X32-SSE41-NEXT: retl
417 entry:
418 %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
419 %C = sext <8 x i8> %B to <8 x i64>
420 ret <8 x i64> %C
297421 }
298422
299423 define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
9581082 ; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm1
9591083 ; X32-SSE41-NEXT: shrl $3, %eax
9601084 ; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
961 ; X32-SSE41-NEXT: pand .LCPI16_0, %xmm1
1085 ; X32-SSE41-NEXT: pand .LCPI17_0, %xmm1
9621086 ; X32-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
9631087 ; X32-SSE41-NEXT: psllq $63, %xmm0
9641088 ; X32-SSE41-NEXT: psrad $31, %xmm0