llvm.org GIT mirror llvm / 03c079d
One readme entry is done, one is really easy (Evan, want to investigate eliminating the llvm.x86.sse2.loadl.pd intrinsic?), one shuffle optzn may be done (if shufps is better than pinsw, Evan, please review), and we already know about LICM of simple instructions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@45407 91177308-0d34-0410-b5e6-96231b3b80d8 Chris Lattner 11 years ago
2 changed file(s) with 36 addition(s) and 78 deletion(s). Raw diff Collapse all Expand all
455455 So icc is smart enough to know that B is in memory so it doesn't load it and
456456 store it back to stack.
457457
458 This should be fixed by eliminating the llvm.x86.sse2.loadl.pd intrinsic,
459 lowering it to a load+insertelement instead. Already match the load+shuffle
460 as movlpd, so this should be easy. We already get optimal code for:
461
462 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) {
463 entry:
464 %tmp2 = load <2 x double>* %A, align 16
465 %tmp8 = insertelement <2 x double> %tmp2, double %B, i32 0
466 store <2 x double> %tmp8, <2 x double>* %r, align 16
467 ret void
468 }
469
458470 //===---------------------------------------------------------------------===//
459471
460472 __m128d test1( __m128d A, __m128d B) {
475487
476488 This code generates ugly code, probably due to costs being off or something:
477489
478 void %test(float* %P, <4 x float>* %P2 ) {
490 define void @test(float* %P, <4 x float>* %P2 ) {
479491 %xFloat0.688 = load float* %P
480 %loadVector37.712 = load <4 x float>* %P2
481 %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
492 %tmp = load <4 x float>* %P2
493 %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
482494 store <4 x float> %inFloat3.713, <4 x float>* %P2
483495 ret void
484496 }
486498 Generates:
487499
488500 _test:
489 pxor %xmm0, %xmm0
490 movd %xmm0, %eax ;; EAX = 0!
491 movl 8(%esp), %ecx
492 movaps (%ecx), %xmm0
493 pinsrw $6, %eax, %xmm0
494 shrl $16, %eax ;; EAX = 0 again!
495 pinsrw $7, %eax, %xmm0
496 movaps %xmm0, (%ecx)
497 ret
498
499 It would be better to generate:
501 movl 8(%esp), %eax
502 movaps (%eax), %xmm0
503 pxor %xmm1, %xmm1
504 movaps %xmm0, %xmm2
505 shufps $50, %xmm1, %xmm2
506 shufps $132, %xmm2, %xmm0
507 movaps %xmm0, (%eax)
508 ret
509
510 Would it be better to generate:
500511
501512 _test:
502513 movl 8(%esp), %ecx
507518 movaps %xmm0, (%ecx)
508519 ret
509520
510 or use pxor (to make a zero vector) and shuffle (to insert it).
521 ?
511522
512523 //===---------------------------------------------------------------------===//
513524
575586
576587 //===---------------------------------------------------------------------===//
577588
578 This code:
579
580 #include <emmintrin.h>
581 __m128i test(long long i) { return _mm_cvtsi64x_si128(i); }
582
583 Should turn into a single 'movq %rdi, %xmm0' instruction. Instead, we
584 get this (on x86-64):
585
586 _test:
587 movd %rdi, %xmm1
588 xorps %xmm0, %xmm0
589 movsd %xmm1, %xmm0
590 ret
591
592 The LLVM IR is:
593
594 target triple = "x86_64-apple-darwin8"
595 define <2 x i64> @test(i64 %i) {
596 entry:
597 %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
598 %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
599 ret <2 x i64> %tmp11
600 }
601
602 //===---------------------------------------------------------------------===//
603
604589 These functions should produce the same code:
605590
606591 #include
667652 x86-64 indeed has an instruction to load a 32-bit float from memory and convert
668653 it into a 64-bit float in a register, however it doesn't notice that this isn't
669654 beneficial because it prevents the load from being folded into the multiply.
670
671 //===---------------------------------------------------------------------===//
672
673 In this loop:
674
675 bb49: ; preds = %bb49, %bb49.preheader
676 %indvar = phi i32 [ 0, %bb49.preheader ], [ %indvar.next, %bb49 ] ; [#uses=2]
677 %dp.089.0.rec = shl i32 %indvar, 3 ; [#uses=2]
678 %dp.089.0 = getelementptr i32* %tmp89, i32 %dp.089.0.rec ; [#uses=1]
679 %tmp5051 = bitcast i32* %dp.089.0 to <2 x i64>* ; <<2 x i64>*> [#uses=1]
680 store <2 x i64> zeroinitializer, <2 x i64>* %tmp5051, align 16
681 %dp.089.0.sum105 = or i32 %dp.089.0.rec, 4 ; [#uses=1]
682 %tmp56 = getelementptr i32* %tmp89, i32 %dp.089.0.sum105 ; [#uses=1]
683 %tmp5657 = bitcast i32* %tmp56 to <2 x i64>* ; <<2 x i64>*> [#uses=1]
684 store <2 x i64> zeroinitializer, <2 x i64>* %tmp5657, align 16
685 %indvar.next = add i32 %indvar, 1 ; [#uses=2]
686 %exitcond = icmp eq i32 %indvar.next, %tmp98 ; [#uses=1]
687 br i1 %exitcond, label %bb72, label %bb49
688
689 we get:
690
691 LBB5_6: # bb49.preheader
692 shlw $2, %si
693 decw %si
694 movzwl %si, %eax
695 incl %eax
696 xorl %ecx, %ecx
697 LBB5_7: # bb49
698 xorps %xmm0, %xmm0 # (1)
699 movaps %xmm0, (%edx)
700 movaps %xmm0, 16(%edx)
701 addl $32, %edx
702 incl %ecx
703 cmpl %eax, %ecx
704 jne LBB4_7 # bb47
705
706 The instruction at (1) can be moved out of the main body of the loop.
707655
708656 //===---------------------------------------------------------------------===//
709657
0 ; RUN: llvm-as < %s | llc -march=x86-64 | not grep movsd
1 ; RUN: llvm-as < %s | llc -march=x86-64 | grep {movd.*%rdi,.*%xmm0}
2
3 define <2 x i64> @test(i64 %i) nounwind {
4 entry:
5 %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0 ; put %i into lane 0 of an undef vector
6 %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1 ; zero lane 1, yielding <%i, 0>
7 ret <2 x i64> %tmp11 ; should lower to a single movd %rdi, %xmm0 (per RUN lines), not movd+xorps+movsd
8 }
9