llvm.org GIT mirror llvm / 9516b8f
AMDGPU: Select DS insts without m0 initialization GFX9 stopped using m0 for most DS instructions. Select a different instruction without the use. I think this will be less error prone than trying to manually maintain m0 uses as needed. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319270 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 2 years ago
31 changed file(s) with 1803 addition(s) and 603 deletion(s). Raw diff Collapse all Expand all
720720 def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
721721 AssemblerPredicate<"FeatureGFX9Insts">;
722722
723
724 def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
725 def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
726
723727 def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
724728 AssemblerPredicate<"FeatureGFX9Insts">;
725729
336336 }
337337
338338 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
339 if (cast(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS)
339 if (cast(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS ||
340 !Subtarget->ldsRequiresM0Init())
340341 return N;
341342
342343 const SITargetLowering& Lowering =
461461 return getGeneration() >= GFX9;
462462 }
463463
464 /// Return true if most LDS instructions have an m0 use that requires m0 to be
465 /// initialized.
466 bool ldsRequiresM0Init() const {
467 return getGeneration() < GFX9;
468 }
469
464470 bool hasAddNoCarry() const {
465471 return AddNoCarryInsts;
466472 }
599599 (inst $ptr, (as_i16imm $offset), (i1 0))
600600 >;
601601
602 // FIXME: Passing name of PatFrag in workaround. Why doesn't
603 // !cast<PatFrag>(frag.NAME#"_m0") work!?
604 multiclass DSReadPat_mc {
605
606 let OtherPredicates = [LDSRequiresM0Init] in {
607 def : DSReadPat(frag#"_m0")>;
608 }
609
610 let OtherPredicates = [NotLDSRequiresM0Init] in {
611 def : DSReadPat(inst.NAME#"_gfx9"), vt, !cast(frag)>;
612 }
613 }
614
615
602616 multiclass DSReadPat_Hi16 {
603617 def : GCNPat <
604618 (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))),
623637 >;
624638 }
625639
626
627 def : DSReadPat ;
628 def : DSReadPat ;
629 def : DSReadPat ;
630 def : DSReadPat ;
631 def : DSReadPat ;
632 def : DSReadPat ;
633 def : DSReadPat ;
634 def : DSReadPat ;
635 def : DSReadPat >;
640 defm : DSReadPat_mc >;
641 defm : DSReadPat_mc ;
642 defm : DSReadPat_mc ;
643 defm : DSReadPat_mc ;
644 defm : DSReadPat_mc ;
645 defm : DSReadPat_mc ;
646 defm : DSReadPat_mc ;
647 defm : DSReadPat_mc ;
648 defm : DSReadPat_mc ;
636649
637650 let AddedComplexity = 100 in {
638651
639 def : DSReadPat >;
652 defm : DSReadPat_mc >;
640653
641654 } // End AddedComplexity = 100
642
643 def : GCNPat <
644 (v2i32 (load_local_m0 (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
645 i8:$offset1))),
646 (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
647 >;
648
649655
650656 let OtherPredicates = [HasD16LoadStore] in {
651657 let AddedComplexity = 100 in {
665671 (inst $ptr, $value, (as_i16imm $offset), (i1 0))
666672 >;
667673
668 def : DSWritePat ;
669 def : DSWritePat ;
670 def : DSWritePat ;
671 def : DSWritePat ;
672 def : DSWritePat ;
674 multiclass DSWritePat_mc {
675 let OtherPredicates = [LDSRequiresM0Init] in {
676 def : DSWritePat(frag#"_m0")>;
677 }
678
679 let OtherPredicates = [NotLDSRequiresM0Init] in {
680 def : DSWritePat(inst.NAME#"_gfx9"), vt, !cast(frag)>;
681 }
682 }
683
684 defm : DSWritePat_mc ;
685 defm : DSWritePat_mc ;
686 defm : DSWritePat_mc ;
687 defm : DSWritePat_mc ;
688 defm : DSWritePat_mc ;
673689
674690 let OtherPredicates = [HasD16LoadStore] in {
675691 def : DSWritePat ;
676692 def : DSWritePat ;
677693 }
678694
695
696 class DS64Bit4ByteAlignedReadPat : GCNPat <
697 (v2i32 (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
698 (inst $ptr, $offset0, $offset1, (i1 0))
699 >;
700
701 class DS64Bit4ByteAlignedWritePat : GCNPat<
702 (frag v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
703 (inst $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
704 (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
705 (i1 0))
706 >;
707
708 let OtherPredicates = [LDSRequiresM0Init] in {
709 def : DS64Bit4ByteAlignedReadPat;
710 def : DS64Bit4ByteAlignedWritePat;
711 }
712
713 let OtherPredicates = [NotLDSRequiresM0Init] in {
714 def : DS64Bit4ByteAlignedReadPat;
715 def : DS64Bit4ByteAlignedWritePat;
716 }
717
718
679719 let AddedComplexity = 100 in {
680720
681 def : DSWritePat >;
721 defm : DSWritePat_mc >;
682722 } // End AddedComplexity = 100
683
684 def : GCNPat <
685 (store_local_m0 v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
686 i8:$offset1)),
687 (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
688 (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
689 (i1 0))
690 >;
691
692723 class DSAtomicRetPat : GCNPat <
693724 (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
694725 (inst $ptr, $value, (as_i16imm $offset), (i1 0))
695726 >;
696727
728 multiclass DSAtomicRetPat_mc {
729 let OtherPredicates = [LDSRequiresM0Init] in {
730 def : DSAtomicRetPat(frag#"_m0")>;
731 }
732
733 let OtherPredicates = [NotLDSRequiresM0Init] in {
734 def : DSAtomicRetPat(inst.NAME#"_gfx9"), vt, !cast(frag)>;
735 }
736 }
737
738
739
697740 class DSAtomicCmpXChg : GCNPat <
698741 (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
699742 (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
700743 >;
701744
745 multiclass DSAtomicCmpXChg_mc {
746 let OtherPredicates = [LDSRequiresM0Init] in {
747 def : DSAtomicCmpXChg(frag#"_m0")>;
748 }
749
750 let OtherPredicates = [NotLDSRequiresM0Init] in {
751 def : DSAtomicCmpXChg(inst.NAME#"_gfx9"), vt, !cast(frag)>;
752 }
753 }
754
755
702756
703757 // 32-bit atomics.
704 def : DSAtomicRetPat;
705 def : DSAtomicRetPat;
706 def : DSAtomicRetPat;
707 def : DSAtomicRetPat;
708 def : DSAtomicRetPat;
709 def : DSAtomicRetPat;
710 def : DSAtomicRetPat;
711 def : DSAtomicRetPat;
712 def : DSAtomicRetPat;
713 def : DSAtomicRetPat;
714 def : DSAtomicRetPat;
715 def : DSAtomicRetPat;
716 def : DSAtomicCmpXChg>;
758 defm : DSAtomicRetPat_mc>;
759 defm : DSAtomicRetPat_mc;
760 defm : DSAtomicRetPat_mc;
761 defm : DSAtomicRetPat_mc;
762 defm : DSAtomicRetPat_mc;
763 defm : DSAtomicRetPat_mc;
764 defm : DSAtomicRetPat_mc;
765 defm : DSAtomicRetPat_mc;
766 defm : DSAtomicRetPat_mc;
767 defm : DSAtomicRetPat_mc;
768 defm : DSAtomicRetPat_mc;
769 defm : DSAtomicRetPat_mc;
770 defm : DSAtomicCmpXChg_mc;
717771
718772 // 64-bit atomics.
719 def : DSAtomicRetPat;
720 def : DSAtomicRetPat;
721 def : DSAtomicRetPat;
722 def : DSAtomicRetPat;
723 def : DSAtomicRetPat;
724 def : DSAtomicRetPat;
725 def : DSAtomicRetPat;
726 def : DSAtomicRetPat;
727 def : DSAtomicRetPat;
728 def : DSAtomicRetPat;
729 def : DSAtomicRetPat;
730 def : DSAtomicRetPat;
731
732 def : DSAtomicCmpXChg>;
773 defm : DSAtomicRetPat_mc>;
774 defm : DSAtomicRetPat_mc;
775 defm : DSAtomicRetPat_mc;
776 defm : DSAtomicRetPat_mc;
777 defm : DSAtomicRetPat_mc;
778 defm : DSAtomicRetPat_mc;
779 defm : DSAtomicRetPat_mc;
780 defm : DSAtomicRetPat_mc;
781 defm : DSAtomicRetPat_mc;
782 defm : DSAtomicRetPat_mc;
783 defm : DSAtomicRetPat_mc;
784 defm : DSAtomicRetPat_mc;
785
786 defm : DSAtomicCmpXChg_mc;
733787
734788 //===----------------------------------------------------------------------===//
735789 // Real instructions
159159
160160 defm atomic_inc_global : global_binary_atomic_op;
161161 defm atomic_dec_global : global_binary_atomic_op;
162
163 def atomic_inc_local : local_binary_atomic_op;
164 def atomic_dec_local : local_binary_atomic_op;
162165
163166 //===----------------------------------------------------------------------===//
164167 // SDNodes PatFrags for loads/stores with a glue input.
111111 static bool offsetsCanBeCombined(CombineInfo &CI);
112112
113113 bool findMatchingInst(CombineInfo &CI);
114
115 unsigned read2Opcode(unsigned EltSize) const;
116 unsigned read2ST64Opcode(unsigned EltSize) const;
114117 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
118
119 unsigned write2Opcode(unsigned EltSize) const;
120 unsigned write2ST64Opcode(unsigned EltSize) const;
115121 MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
116122 MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
117123 MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
435441 return false;
436442 }
437443
444 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
445 if (STM->ldsRequiresM0Init())
446 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
447 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
448 }
449
450 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
451 if (STM->ldsRequiresM0Init())
452 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
453
454 return (EltSize == 4) ?
455 AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
456 }
457
438458 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
439459 CombineInfo &CI) {
440460 MachineBasicBlock *MBB = CI.I->getParent();
448468
449469 unsigned NewOffset0 = CI.Offset0;
450470 unsigned NewOffset1 = CI.Offset1;
451 unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32
452 : AMDGPU::DS_READ2_B64;
453
454 if (CI.UseST64)
455 Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
456 : AMDGPU::DS_READ2ST64_B64;
471 unsigned Opc = CI.UseST64 ?
472 read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
457473
458474 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
459475 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
516532 return Next;
517533 }
518534
535 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
536 if (STM->ldsRequiresM0Init())
537 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
538 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
539 }
540
541 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
542 if (STM->ldsRequiresM0Init())
543 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
544
545 return (EltSize == 4) ?
546 AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
547 }
548
519549 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
520550 CombineInfo &CI) {
521551 MachineBasicBlock *MBB = CI.I->getParent();
529559
530560 unsigned NewOffset0 = CI.Offset0;
531561 unsigned NewOffset1 = CI.Offset1;
532 unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32
533 : AMDGPU::DS_WRITE2_B64;
534
535 if (CI.UseST64)
536 Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
537 : AMDGPU::DS_WRITE2ST64_B64;
562 unsigned Opc = CI.UseST64 ?
563 write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
538564
539565 if (NewOffset0 > NewOffset1) {
540566 // Canonicalize the merged instruction so the smaller offset comes first.
785811 CombineInfo CI;
786812 CI.I = I;
787813 unsigned Opc = MI.getOpcode();
788 if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
814 if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
815 Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
816
789817 CI.InstClass = DS_READ_WRITE;
790 CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
818 CI.EltSize =
819 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
820
791821 if (findMatchingInst(CI)) {
792822 Modified = true;
793823 I = mergeRead2Pair(CI);
796826 }
797827
798828 continue;
799 }
800 if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
829 } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
830 Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
831 Opc == AMDGPU::DS_WRITE_B64_gfx9) {
801832 CI.InstClass = DS_READ_WRITE;
802 CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
833 CI.EltSize
834 = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
835
803836 if (findMatchingInst(CI)) {
804837 Modified = true;
805838 I = mergeWrite2Pair(CI);
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SICI,SICIVI,GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SICI,CIVI,SICIVI,GCN %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,CIVI,SICIVI,GFX89,GCN %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX89,GCN %s
34
4 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
5 ; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
6 ; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
7 ; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
8 ; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
9 ; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
5 ; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
6 ; GFX9-NOT: m0
7 ; SICIVI-DAG: s_mov_b32 m0
8
9 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
10 ; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
11 ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
12 ; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
13 ; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
1014 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
1115 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
1216 ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
1923 ret void
2024 }
2125
22 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
26 ; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
27 ; GFX9-NOT: m0
28 ; SICIVI-DAG: s_mov_b32 m0
29
2330 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
2431 ; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
25 ; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
26 ; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
32 ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
33 ; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
2734 ; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
2835 ; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
2936 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
3037 ; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
3138 ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
3239 ; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
33 ; GCN: buffer_store_dwordx2 [[RESULT]],
40 ; GCN: [[RESULT]]
3441 ; GCN: s_endpgm
3542 define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
3643 %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
4047 ret void
4148 }
4249
43 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
50 ; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
51 ; GFX9-NOT: m0
4452 ; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
4553 ; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
54 ; GFX9: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
4655 ; GCN: s_endpgm
4756 define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
4857 %sub = sub i32 %a, %b
5463 ret void
5564 }
5665
57 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
58 ; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
59 ; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
60 ; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
61 ; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
66 ; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
67 ; GFX9-NOT: m0
68 ; SICIVI-DAG: s_mov_b32 m0
69
70
71 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
72 ; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
73 ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
74 ; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
6275 ; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
6376 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
6477 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
7184 ret void
7285 }
7386
74 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
75 ; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
76 ; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
77 ; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
78 ; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
87 ; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
88 ; GFX9-NOT: m0
89 ; SICIVI-DAG: s_mov_b32 m0
90
91 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
92 ; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
93 ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
94 ; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
7995 ; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
8096 ; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
8197 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
3 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=R600,FUNC %s
34
45 ; FUNC-LABEL: {{^}}atomic_add_local:
6 ; SICIVI: s_mov_b32 m0
7 ; GFX9-NOT: m0
58 ; R600: LDS_ADD *
6 ; SI: ds_add_u32
9 ; GCN: ds_add_u32
710 define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) {
811 %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
912 ret void
1013 }
1114
1215 ; FUNC-LABEL: {{^}}atomic_add_local_const_offset:
16 ; SICIVI: s_mov_b32 m0
17 ; GFX9-NOT: m0
18
1319 ; R600: LDS_ADD *
14 ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
20 ; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
1521 define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
1622 %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
1723 %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
1925 }
2026
2127 ; FUNC-LABEL: {{^}}atomic_add_ret_local:
28 ; SICIVI: s_mov_b32 m0
29 ; GFX9-NOT: m0
30
2231 ; R600: LDS_ADD_RET *
23 ; SI: ds_add_rtn_u32
32 ; GCN: ds_add_rtn_u32
2433 define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
2534 %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
2635 store i32 %val, i32 addrspace(1)* %out
2837 }
2938
3039 ; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset:
40 ; SICIVI: s_mov_b32 m0
41 ; GFX9-NOT: m0
42
3143 ; R600: LDS_ADD_RET *
32 ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
44 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
3345 define amdgpu_kernel void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
3446 %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
3547 %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
3 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s
34
45 ; FUNC-LABEL: {{^}}atomic_sub_local:
6 ; SICIVI: s_mov_b32 m0
7 ; GFX9-NOT: m0
8
59 ; R600: LDS_SUB *
6 ; SI: ds_sub_u32
10 ; GCN: ds_sub_u32
711 define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) {
812 %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
913 ret void
1014 }
1115
1216 ; FUNC-LABEL: {{^}}atomic_sub_local_const_offset:
17 ; SICIVI: s_mov_b32 m0
18 ; GFX9-NOT: m0
19
1320 ; R600: LDS_SUB *
14 ; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
21 ; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
1522 define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
1623 %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
1724 %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
1926 }
2027
2128 ; FUNC-LABEL: {{^}}atomic_sub_ret_local:
29 ; SICIVI: s_mov_b32 m0
30 ; GFX9-NOT: m0
31
2232 ; R600: LDS_SUB_RET *
23 ; SI: ds_sub_rtn_u32
33 ; GCN: ds_sub_rtn_u32
2434 define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
2535 %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
2636 store i32 %val, i32 addrspace(1)* %out
2838 }
2939
3040 ; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset:
41 ; SICIVI: s_mov_b32 m0
42 ; GFX9-NOT: m0
43
3144 ; R600: LDS_SUB_RET *
32 ; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
45 ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
3346 define amdgpu_kernel void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
3447 %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
3548 %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
None ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,CI %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX9 %s
12
23 ; FIXME: We don't get cases where the address was an SGPR because we
34 ; get a copy to the address register for each one.
56 @lds = addrspace(3) global [512 x float] undef, align 4
67 @lds.f64 = addrspace(3) global [512 x double] undef, align 8
78
8 ; SI-LABEL: @simple_read2_f32
9 ; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
10 ; SI: s_waitcnt lgkmcnt(0)
11 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
12 ; SI: buffer_store_dword [[RESULT]]
13 ; SI: s_endpgm
9 ; GCN-LABEL: {{^}}simple_read2_f32:
10 ; CI-DAG: s_mov_b32 m0
11 ; GFX9-NOT: m0
12
13 ; GCN: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
14 ; GCN: s_waitcnt lgkmcnt(0)
15 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
16 ; CI: buffer_store_dword [[RESULT]]
17 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
18 ; GCN: s_endpgm
1419 define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
1520 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
1621 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
2429 ret void
2530 }
2631
27 ; SI-LABEL: @simple_read2_f32_max_offset
28 ; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255
29 ; SI: s_waitcnt lgkmcnt(0)
30 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
31 ; SI: buffer_store_dword [[RESULT]]
32 ; SI: s_endpgm
32 ; GCN-LABEL: {{^}}simple_read2_f32_max_offset:
33 ; CI-DAG: s_mov_b32 m0
34 ; GFX9-NOT: m0
35
36 ; GCN: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255
37 ; GCN: s_waitcnt lgkmcnt(0)
38 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
39
40 ; CI: buffer_store_dword [[RESULT]]
41 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
3342 define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
3443 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
3544 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
4352 ret void
4453 }
4554
46 ; SI-LABEL: @simple_read2_f32_too_far
47 ; SI-NOT ds_read2_b32
48 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
49 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
50 ; SI: s_endpgm
55 ; GCN-LABEL: @simple_read2_f32_too_far
56 ; CI-DAG: s_mov_b32 m0
57 ; GFX9-NOT: m0
58
59 ; GCN-NOT: ds_read2_b32
60 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
61 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
62 ; GCN: s_endpgm
5163 define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
5264 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
5365 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
6173 ret void
6274 }
6375
64 ; SI-LABEL: @simple_read2_f32_x2
65 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
66 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
67 ; SI: s_endpgm
76 ; GCN-LABEL: @simple_read2_f32_x2
77 ; CI-DAG: s_mov_b32 m0
78 ; GFX9-NOT: m0
79
80 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
81 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
82 ; GCN: s_endpgm
6883 define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
6984 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
7085 %idx.0 = add nsw i32 %tid.x, 0
92107 }
93108
94109 ; Make sure there is an instruction between the two sets of reads.
95 ; SI-LABEL: @simple_read2_f32_x2_barrier
96 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
97 ; SI: s_barrier
98 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
99 ; SI: s_endpgm
110 ; GCN-LABEL: @simple_read2_f32_x2_barrier
111 ; CI-DAG: s_mov_b32 m0
112 ; GFX9-NOT: m0
113
114 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
115 ; GCN: s_barrier
116 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
117 ; GCN: s_endpgm
100118 define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
101119 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
102120 %idx.0 = add nsw i32 %tid.x, 0
128146 ; For some reason adding something to the base address for the first
129147 ; element results in only folding the inner pair.
130148
131 ; SI-LABEL: @simple_read2_f32_x2_nonzero_base
132 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8
133 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
134 ; SI: s_endpgm
149 ; GCN-LABEL: @simple_read2_f32_x2_nonzero_base
150 ; CI-DAG: s_mov_b32 m0
151 ; GFX9-NOT: m0
152
153 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8
154 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
155 ; GCN: s_endpgm
135156 define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
136157 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
137158 %idx.0 = add nsw i32 %tid.x, 2
164185 ; Base pointers come from different subregister of same super
165186 ; register. We can't safely merge this.
166187
167 ; SI-LABEL: @read2_ptr_is_subreg_arg_f32
168 ; SI-NOT: ds_read2_b32
169 ; SI: ds_read_b32
170 ; SI: ds_read_b32
171 ; SI: s_endpgm
188 ; GCN-LABEL: @read2_ptr_is_subreg_arg_f32
189 ; CI-DAG: s_mov_b32 m0
190 ; GFX9-NOT: m0
191
192 ; GCN-NOT: ds_read2_b32
193 ; GCN: ds_read_b32
194 ; GCN: ds_read_b32
195 ; GCN: s_endpgm
172196 define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
173197 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
174198 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
190214 ; sure we are really rejecting it because of the different
191215 ; subregisters.
192216
193 ; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32
194 ; SI-NOT: ds_read2_b32
195 ; SI: ds_read_b32
196 ; SI: ds_read_b32
197 ; SI: s_endpgm
217 ; GCN-LABEL: @read2_ptr_is_subreg_arg_offset_f32
218 ; CI-DAG: s_mov_b32 m0
219 ; GFX9-NOT: m0
220
221 ; GCN-NOT: ds_read2_b32
222 ; GCN: ds_read_b32
223 ; GCN: ds_read_b32
224 ; GCN: s_endpgm
198225 define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
199226 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
200227 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
215242 ret void
216243 }
217244
218 ; SI-LABEL: {{^}}read2_ptr_is_subreg_f32:
219 ; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
220 ; SI: s_endpgm
245 ; GCN-LABEL: {{^}}read2_ptr_is_subreg_f32:
246 ; CI-DAG: s_mov_b32 m0
247 ; GFX9-NOT: m0
248
249 ; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
250 ; GCN: s_endpgm
221251 define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
222252 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
223253 %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
237267 ret void
238268 }
239269
240 ; SI-LABEL: @simple_read2_f32_volatile_0
241 ; SI-NOT ds_read2_b32
242 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
243 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
244 ; SI: s_endpgm
270 ; GCN-LABEL: @simple_read2_f32_volatile_0
271 ; CI-DAG: s_mov_b32 m0
272 ; GFX9-NOT: m0
273
274 ; GCN-NOT: ds_read2_b32
275 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
276 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
277 ; GCN: s_endpgm
245278 define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
246279 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
247280 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
255288 ret void
256289 }
257290
258 ; SI-LABEL: @simple_read2_f32_volatile_1
259 ; SI-NOT ds_read2_b32
260 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
261 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
262 ; SI: s_endpgm
291 ; GCN-LABEL: @simple_read2_f32_volatile_1
292 ; CI-DAG: s_mov_b32 m0
293 ; GFX9-NOT: m0
294
295 ; GCN-NOT: ds_read2_b32
296 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
297 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
298 ; GCN: s_endpgm
263299 define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
264300 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
265301 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
276312 ; Can't fold since not correctly aligned.
277313 ; XXX: This isn't really testing anything useful now. I think CI
278314 ; allows unaligned LDS accesses, which would be a problem here.
279 ; SI-LABEL: @unaligned_read2_f32
280 ; SI-NOT: ds_read2_b32
281 ; SI: s_endpgm
315 ; GCN-LABEL: @unaligned_read2_f32
316 ; CI-DAG: s_mov_b32 m0
317 ; GFX9-NOT: m0
318
319 ; GCN-NOT: ds_read2_b32
320 ; GCN: s_endpgm
282321 define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
283322 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
284323 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
292331 ret void
293332 }
294333
295 ; SI-LABEL: @misaligned_2_simple_read2_f32
296 ; SI-NOT: ds_read2_b32
297 ; SI: s_endpgm
334 ; GCN-LABEL: @misaligned_2_simple_read2_f32
335 ; CI-DAG: s_mov_b32 m0
336 ; GFX9-NOT: m0
337
338 ; GCN-NOT: ds_read2_b32
339 ; GCN: s_endpgm
298340 define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
299341 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
300342 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
308350 ret void
309351 }
310352
311 ; SI-LABEL: @simple_read2_f64
312 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
313 ; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
314 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
315 ; SI: buffer_store_dwordx2 [[RESULT]]
316 ; SI: s_endpgm
353 ; GCN-LABEL: @simple_read2_f64
354 ; CI-DAG: s_mov_b32 m0
355 ; GFX9-NOT: m0
356
357 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
358 ; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
359 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
360
361 ; CI: buffer_store_dwordx2 [[RESULT]]
362 ; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
317363 define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
318364 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
319365 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
327373 ret void
328374 }
329375
330 ; SI-LABEL: @simple_read2_f64_max_offset
331 ; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
332 ; SI: s_endpgm
376 ; GCN-LABEL: @simple_read2_f64_max_offset
377 ; CI-DAG: s_mov_b32 m0
378 ; GFX9-NOT: m0
379
380 ; GCN: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
381 ; GCN: s_endpgm
333382 define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
334383 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
335384 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
343392 ret void
344393 }
345394
346 ; SI-LABEL: @simple_read2_f64_too_far
347 ; SI-NOT ds_read2_b64
348 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
349 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
350 ; SI: s_endpgm
395 ; GCN-LABEL: @simple_read2_f64_too_far
396 ; CI-DAG: s_mov_b32 m0
397 ; GFX9-NOT: m0
398
399 ; GCN-NOT: ds_read2_b64
400 ; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
401 ; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
402 ; GCN: s_endpgm
351403 define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
352404 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
353405 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
362414 }
363415
364416 ; Alignment only 4
365 ; SI-LABEL: @misaligned_read2_f64
366 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
367 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
368 ; SI: s_endpgm
417 ; GCN-LABEL: @misaligned_read2_f64
418 ; CI-DAG: s_mov_b32 m0
419 ; GFX9-NOT: m0
420
421 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
422 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
423 ; GCN: s_endpgm
369424 define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
370425 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
371426 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
381436
382437 @foo = addrspace(3) global [4 x i32] undef, align 4
383438
384 ; SI-LABEL: @load_constant_adjacent_offsets
385 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
386 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
439 ; GCN-LABEL: @load_constant_adjacent_offsets
440 ; CI-DAG: s_mov_b32 m0
441 ; GFX9-NOT: m0
442
443 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
444 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
387445 define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
388446 %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
389447 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
392450 ret void
393451 }
394452
395 ; SI-LABEL: @load_constant_disjoint_offsets
396 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
397 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
453 ; GCN-LABEL: @load_constant_disjoint_offsets
454 ; CI-DAG: s_mov_b32 m0
455 ; GFX9-NOT: m0
456
457 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
458 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
398459 define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
399460 %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
400461 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
405466
406467 @bar = addrspace(3) global [4 x i64] undef, align 4
407468
408 ; SI-LABEL: @load_misaligned64_constant_offsets
409 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
410 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
411 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
469 ; GCN-LABEL: @load_misaligned64_constant_offsets
470 ; CI-DAG: s_mov_b32 m0
471 ; GFX9-NOT: m0
472
473 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
474 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
475 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
412476 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
413477 %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
414478 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
419483
420484 @bar.large = addrspace(3) global [4096 x i64] undef, align 4
421485
422 ; SI-LABEL: @load_misaligned64_constant_large_offsets
423 ; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
424 ; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
425 ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
426 ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
427 ; SI: s_endpgm
486 ; GCN-LABEL: @load_misaligned64_constant_large_offsets
487 ; CI-DAG: s_mov_b32 m0
488 ; GFX9-NOT: m0
489
490 ; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
491 ; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
492 ; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
493 ; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
494 ; GCN: s_endpgm
428495 define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
429496 %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
430497 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
435502
436503 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
437504 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
505
506 ; GCN-LABEL: {{^}}sgemm_inner_loop_read2_sequence:
507 ; CI-DAG: s_mov_b32 m0
508 ; GFX9-NOT: m0
438509
439510 define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
440511 %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
480551 ret void
481552 }
482553
554 ; GCN-LABEL: {{^}}misaligned_read2_v2i32:
555 ; CI-DAG: s_mov_b32 m0
556 ; GFX9-NOT: m0
483557 define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
484558 %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
485559 store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
486560 ret void
487561 }
488562
563 ; GCN-LABEL: {{^}}misaligned_read2_i64:
564 ; CI-DAG: s_mov_b32 m0
565 ; GFX9-NOT: m0
489566 define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
490567 %load = load i64, i64 addrspace(3)* %in, align 4
491568 store i64 %load, i64 addrspace(1)* %out, align 8
492569 ret void
493570 }
494571
495 ; SI-LABEL: ds_read_diff_base_interleaving
496 ; SI-NOT: ds_read_b32
572 ; GCN-LABEL: ds_read_diff_base_interleaving
573 ; CI-DAG: s_mov_b32 m0
574 ; GFX9-NOT: m0
575
576 ; GCN-NOT: ds_read_b32
497577 define amdgpu_kernel void @ds_read_diff_base_interleaving(
498578 float addrspace(1)* nocapture %arg,
499579 [4 x [4 x float]] addrspace(3)* %arg1,
532612 ret void
533613 }
534614
535 ; Function Attrs: nounwind readnone
536615 declare i32 @llvm.amdgcn.workgroup.id.x() #1
537
538 ; Function Attrs: nounwind readnone
539616 declare i32 @llvm.amdgcn.workgroup.id.y() #1
540
541 ; Function Attrs: nounwind readnone
542617 declare i32 @llvm.amdgcn.workitem.id.x() #1
543
544 ; Function Attrs: nounwind readnone
545618 declare i32 @llvm.amdgcn.workitem.id.y() #1
546
547 ; Function Attrs: convergent nounwind
548619 declare void @llvm.amdgcn.s.barrier() #2
549620
550621 attributes #0 = { nounwind }
None ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefixes=GCN,CI %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefixes=GCN,GFX9 %s
12
23 @lds = addrspace(3) global [512 x float] undef, align 4
34 @lds.f64 = addrspace(3) global [512 x double] undef, align 8
45
56
6 ; SI-LABEL: @simple_read2st64_f32_0_1
7 ; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
8 ; SI: s_waitcnt lgkmcnt(0)
9 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
10 ; SI: buffer_store_dword [[RESULT]]
11 ; SI: s_endpgm
7 ; GCN-LABEL: @simple_read2st64_f32_0_1
8 ; CI: s_mov_b32 m0
9 ; GFX9-NOT: m0
10
11 ; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
12 ; GCN: s_waitcnt lgkmcnt(0)
13 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
14 ; CI: buffer_store_dword [[RESULT]]
15 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1216 define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
1317 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
1418 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
2226 ret void
2327 }
2428
25 ; SI-LABEL: @simple_read2st64_f32_1_2
26 ; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
27 ; SI: s_waitcnt lgkmcnt(0)
28 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
29 ; SI: buffer_store_dword [[RESULT]]
30 ; SI: s_endpgm
29 ; GCN-LABEL: @simple_read2st64_f32_1_2
30 ; CI: s_mov_b32 m0
31 ; GFX9-NOT: m0
32
33 ; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
34 ; GCN: s_waitcnt lgkmcnt(0)
35 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
36 ; CI: buffer_store_dword [[RESULT]]
37 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
3138 define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
3239 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
3340 %add.x.0 = add nsw i32 %x.i, 64
4249 ret void
4350 }
4451
45 ; SI-LABEL: @simple_read2st64_f32_max_offset
46 ; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
47 ; SI: s_waitcnt lgkmcnt(0)
48 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
49 ; SI: buffer_store_dword [[RESULT]]
50 ; SI: s_endpgm
52 ; GCN-LABEL: @simple_read2st64_f32_max_offset
53 ; CI: s_mov_b32 m0
54 ; GFX9-NOT: m0
55
56 ; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
57 ; GCN: s_waitcnt lgkmcnt(0)
58 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
59 ; CI: buffer_store_dword [[RESULT]]
60 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
5161 define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
5262 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
5363 %add.x.0 = add nsw i32 %x.i, 64
6272 ret void
6373 }
6474
65 ; SI-LABEL: @simple_read2st64_f32_over_max_offset
66 ; SI-NOT: ds_read2st64_b32
67 ; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
68 ; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
69 ; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
70 ; SI: s_endpgm
75 ; GCN-LABEL: @simple_read2st64_f32_over_max_offset
76 ; CI: s_mov_b32 m0
77 ; GFX9-NOT: m0
78
79 ; GCN-NOT: ds_read2st64_b32
80 ; GCN-DAG: v_add{{(_co)?}}_{{i|u}}32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
81 ; GCN-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
82 ; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
83 ; GCN: s_endpgm
7184 define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
7285 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
7386 %add.x.0 = add nsw i32 %x.i, 64
8295 ret void
8396 }
8497
85 ; SI-LABEL: @odd_invalid_read2st64_f32_0
86 ; SI-NOT: ds_read2st64_b32
87 ; SI: s_endpgm
98 ; GCN-LABEL: @odd_invalid_read2st64_f32_0
99 ; CI: s_mov_b32 m0
100 ; GFX9-NOT: m0
101
102 ; GCN-NOT: ds_read2st64_b32
103 ; GCN: s_endpgm
88104 define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
89105 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
90106 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
98114 ret void
99115 }
100116
101 ; SI-LABEL: @odd_invalid_read2st64_f32_1
102 ; SI-NOT: ds_read2st64_b32
103 ; SI: s_endpgm
117 ; GCN-LABEL: @odd_invalid_read2st64_f32_1
118 ; CI: s_mov_b32 m0
119 ; GFX9-NOT: m0
120
121 ; GCN-NOT: ds_read2st64_b32
122 ; GCN: s_endpgm
104123 define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
105124 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
106125 %add.x.0 = add nsw i32 %x.i, 64
115134 ret void
116135 }
117136
118 ; SI-LABEL: @simple_read2st64_f64_0_1
119 ; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
120 ; SI: s_waitcnt lgkmcnt(0)
121 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
122 ; SI: buffer_store_dwordx2 [[RESULT]]
123 ; SI: s_endpgm
137 ; GCN-LABEL: @simple_read2st64_f64_0_1
138 ; CI: s_mov_b32 m0
139 ; GFX9-NOT: m0
140
141 ; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
142 ; GCN: s_waitcnt lgkmcnt(0)
143 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
144 ; CI: buffer_store_dwordx2 [[RESULT]]
145 ; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
124146 define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
125147 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
126148 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
134156 ret void
135157 }
136158
137 ; SI-LABEL: @simple_read2st64_f64_1_2
138 ; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
139 ; SI: s_waitcnt lgkmcnt(0)
140 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
141 ; SI: buffer_store_dwordx2 [[RESULT]]
142 ; SI: s_endpgm
159 ; GCN-LABEL: @simple_read2st64_f64_1_2
160 ; CI: s_mov_b32 m0
161 ; GFX9-NOT: m0
162
163 ; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
164 ; GCN: s_waitcnt lgkmcnt(0)
165 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
166
167 ; CI: buffer_store_dwordx2 [[RESULT]]
168 ; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
143169 define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
144170 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
145171 %add.x.0 = add nsw i32 %x.i, 64
156182
157183 ; Alignment only 4, so the 64-bit loads are split into ds_read2_b32 pairs.
158184
159 ; SI-LABEL: @misaligned_read2st64_f64
160 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
161 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
162 ; SI: s_endpgm
185 ; GCN-LABEL: @misaligned_read2st64_f64
186 ; CI: s_mov_b32 m0
187 ; GFX9-NOT: m0
188
189 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
190 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
191 ; GCN: s_endpgm
163192 define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
164193 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
165194 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
174203 }
175204
176205 ; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff
177 ; SI-LABEL: @simple_read2st64_f64_max_offset
178 ; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
179 ; SI: s_waitcnt lgkmcnt(0)
180 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
181 ; SI: buffer_store_dwordx2 [[RESULT]]
182 ; SI: s_endpgm
206 ; GCN-LABEL: @simple_read2st64_f64_max_offset
207 ; CI: s_mov_b32 m0
208 ; GFX9-NOT: m0
209
210 ; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
211 ; GCN: s_waitcnt lgkmcnt(0)
212 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
213
214 ; CI: buffer_store_dwordx2 [[RESULT]]
215 ; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
183216 define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
184217 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
185218 %add.x.0 = add nsw i32 %x.i, 256
194227 ret void
195228 }
196229
197 ; SI-LABEL: @simple_read2st64_f64_over_max_offset
198 ; SI-NOT: ds_read2st64_b64
199 ; SI-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
200 ; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
201 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
202 ; SI: s_endpgm
230 ; GCN-LABEL: @simple_read2st64_f64_over_max_offset
231 ; CI: s_mov_b32 m0
232 ; GFX9-NOT: m0
233
234 ; GCN-NOT: ds_read2st64_b64
235 ; GCN-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
236 ; GCN-DAG: v_add_{{(co_)?}}{{i|u}}32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
237 ; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
238 ; GCN: s_endpgm
203239 define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
204240 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
205241 %add.x.0 = add nsw i32 %x.i, 64
214250 ret void
215251 }
216252
217 ; SI-LABEL: @invalid_read2st64_f64_odd_offset
218 ; SI-NOT: ds_read2st64_b64
219 ; SI: s_endpgm
253 ; GCN-LABEL: @invalid_read2st64_f64_odd_offset
254 ; CI: s_mov_b32 m0
255 ; GFX9-NOT: m0
256
257 ; GCN-NOT: ds_read2st64_b64
258 ; GCN: s_endpgm
220259 define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
221260 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
222261 %add.x.0 = add nsw i32 %x.i, 64
234273 ; The stride of 8 elements is 8 * 8 bytes. We need to make sure the
235274 ; stride in elements, not bytes, is a multiple of 64.
236275
237 ; SI-LABEL: @byte_size_only_divisible_64_read2_f64
238 ; SI-NOT: ds_read2st_b64
239 ; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
240 ; SI: s_endpgm
276 ; GCN-LABEL: @byte_size_only_divisible_64_read2_f64
277 ; CI: s_mov_b32 m0
278 ; GFX9-NOT: m0
279
280 ; GCN-NOT: ds_read2st64_b64
281 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
282 ; GCN: s_endpgm
241283 define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
242284 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
243285 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
251293 ret void
252294 }
253295
254 ; Function Attrs: nounwind readnone
255296 declare i32 @llvm.amdgcn.workitem.id.x() #1
256
257 ; Function Attrs: nounwind readnone
258297 declare i32 @llvm.amdgcn.workitem.id.y() #1
259298
260299 attributes #0 = { nounwind }
None ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,CI %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX9 %s
12
23 @lds = addrspace(3) global [512 x float] undef, align 4
34 @lds.f64 = addrspace(3) global [512 x double] undef, align 8
45
56
6 ; SI-LABEL: @simple_write2_one_val_f32
7 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
8 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
9 ; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
10 ; SI: s_endpgm
7 ; GCN-LABEL: {{^}}simple_write2_one_val_f32:
8 ; CI-DAG: s_mov_b32 m0
9 ; GFX9-NOT: m0
10
11 ; GCN-DAG: {{buffer|global}}_load_dword [[VAL:v[0-9]+]]
12 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
13 ; GCN: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
14 ; GCN: s_endpgm
1115 define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
1216 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
1317 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
2024 ret void
2125 }
2226
23 ; SI-LABEL: @simple_write2_two_val_f32
24 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
25 ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
26 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
27 ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
28 ; SI: s_endpgm
27 ; GCN-LABEL: {{^}}simple_write2_two_val_f32:
28 ; CI-DAG: s_mov_b32 m0
29 ; GFX9-NOT: m0
30
31 ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
32 ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
33
34 ; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
35 ; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
36
37 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
38 ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
39 ; GCN: s_endpgm
2940 define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
3041 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
3142 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
4051 ret void
4152 }
4253
43 ; SI-LABEL: @simple_write2_two_val_f32_volatile_0
44 ; SI-NOT: ds_write2_b32
45 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
46 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
47 ; SI: s_endpgm
54 ; GCN-LABEL: @simple_write2_two_val_f32_volatile_0
55 ; CI-DAG: s_mov_b32 m0
56 ; GFX9-NOT: m0
57
58 ; GCN-NOT: ds_write2_b32
59 ; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
60 ; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
61 ; GCN: s_endpgm
4862 define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
4963 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
5064 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
5973 ret void
6074 }
6175
62 ; SI-LABEL: @simple_write2_two_val_f32_volatile_1
63 ; SI-NOT: ds_write2_b32
64 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
65 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
66 ; SI: s_endpgm
76 ; GCN-LABEL: @simple_write2_two_val_f32_volatile_1
77 ; CI-DAG: s_mov_b32 m0
78 ; GFX9-NOT: m0
79
80 ; GCN-NOT: ds_write2_b32
81 ; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
82 ; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
83 ; GCN: s_endpgm
6784 define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
6885 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
6986 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
7996 }
8097
8198 ; 2 data subregisters from different super registers.
82 ; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32
83 ; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
84 ; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
85 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
86 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
87 ; SI: s_endpgm
99 ; GCN-LABEL: {{^}}simple_write2_two_val_subreg2_mixed_f32:
100 ; GFX9-NOT: m0
101
102 ; CI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
103 ; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
104 ; CI-DAG: s_mov_b32 m0
105
106 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
107
108 ; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
109 ; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
110 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
111 ; GCN: s_endpgm
88112 define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
89113 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
90114 %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
101125 ret void
102126 }
103127
104 ; SI-LABEL: @simple_write2_two_val_subreg2_f32
105 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
106 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
107 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
108 ; SI: s_endpgm
128 ; GCN-LABEL: @simple_write2_two_val_subreg2_f32
129 ; CI-DAG: s_mov_b32 m0
130 ; GFX9-NOT: m0
131
132 ; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
133 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
134 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
135 ; GCN: s_endpgm
109136 define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
110137 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
111138 %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
120147 ret void
121148 }
122149
123 ; SI-LABEL: @simple_write2_two_val_subreg4_f32
124 ; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
125 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
126 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
127 ; SI: s_endpgm
150 ; GCN-LABEL: @simple_write2_two_val_subreg4_f32
151 ; CI-DAG: s_mov_b32 m0
152 ; GFX9-NOT: m0
153
154 ; GCN-DAG: {{buffer|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
155 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
156 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
157 ; GCN: s_endpgm
128158 define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
129159 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
130160 %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
139169 ret void
140170 }
141171
142 ; SI-LABEL: @simple_write2_two_val_max_offset_f32
143 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
144 ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
145 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
146 ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
147 ; SI: s_endpgm
172 ; GCN-LABEL: @simple_write2_two_val_max_offset_f32
173 ; CI-DAG: s_mov_b32 m0
174 ; GFX9-NOT: m0
175
176 ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
177 ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
178
179 ; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
180 ; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
181
182 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
183 ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
184 ; GCN: s_endpgm
148185 define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
149186 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
150187 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
159196 ret void
160197 }
161198
162 ; SI-LABEL: @simple_write2_two_val_too_far_f32
163 ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
164 ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
165 ; SI: s_endpgm
199 ; GCN-LABEL: @simple_write2_two_val_too_far_f32
200 ; CI-DAG: s_mov_b32 m0
201 ; GFX9-NOT: m0
202
203 ; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
204 ; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
205 ; GCN: s_endpgm
166206 define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
167207 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
168208 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
177217 ret void
178218 }
179219
180 ; SI-LABEL: @simple_write2_two_val_f32_x2
181 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
182 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
183 ; SI: s_endpgm
220 ; GCN-LABEL: @simple_write2_two_val_f32_x2
221 ; CI-DAG: s_mov_b32 m0
222 ; GFX9-NOT: m0
223
224 ; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
225 ; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
226 ; GCN: s_endpgm
184227 define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
185228 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
186229 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
207250 ret void
208251 }
209252
210 ; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
211 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
212 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
213 ; SI: s_endpgm
253 ; GCN-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
254 ; CI-DAG: s_mov_b32 m0
255 ; GFX9-NOT: m0
256
257 ; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
258 ; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
259 ; GCN: s_endpgm
214260 define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
215261 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
216262 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
237283 ret void
238284 }
239285
240 ; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32
241 ; SI-NOT: ds_write2_b32
242 ; SI: ds_write_b32
243 ; SI: ds_write_b32
244 ; SI: s_endpgm
286 ; GCN-LABEL: @write2_ptr_subreg_arg_two_val_f32
287 ; CI-DAG: s_mov_b32 m0
288 ; GFX9-NOT: m0
289
290 ; GCN-NOT: ds_write2_b32
291 ; GCN: ds_write_b32
292 ; GCN: ds_write_b32
293 ; GCN: s_endpgm
245294 define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
246295 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
247296 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
264313 ret void
265314 }
266315
267 ; SI-LABEL: @simple_write2_one_val_f64
268 ; SI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
269 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
270 ; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
271 ; SI: s_endpgm
316 ; GCN-LABEL: @simple_write2_one_val_f64
317 ; CI-DAG: s_mov_b32 m0
318 ; GFX9-NOT: m0
319
320 ; GCN-DAG: {{buffer|global}}_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
321 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
322 ; GCN: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
323 ; GCN: s_endpgm
272324 define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
273325 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
274326 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
281333 ret void
282334 }
283335
284 ; SI-LABEL: @misaligned_simple_write2_one_val_f64
285 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
286 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
287 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
288 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
289 ; SI: s_endpgm
336 ; GCN-LABEL: @misaligned_simple_write2_one_val_f64
337 ; CI-DAG: s_mov_b32 m0
338 ; GFX9-NOT: m0
339
340 ; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
341 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
342 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
343 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
344 ; GCN: s_endpgm
290345 define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
291346 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
292347 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
299354 ret void
300355 }
301356
302 ; SI-LABEL: @simple_write2_two_val_f64
303 ; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
304 ; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
305 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
306 ; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
307 ; SI: s_endpgm
357 ; GCN-LABEL: @simple_write2_two_val_f64
358 ; CI-DAG: s_mov_b32 m0
359 ; GFX9-NOT: m0
360
361 ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
362 ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
363
364 ; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
365 ; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
366
367
368 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
369 ; GCN: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
370 ; GCN: s_endpgm
308371 define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
309372 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
310373 %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
321384
322385 @foo = addrspace(3) global [4 x i32] undef, align 4
323386
324 ; SI-LABEL: @store_constant_adjacent_offsets
325 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
326 ; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
387 ; GCN-LABEL: @store_constant_adjacent_offsets
388 ; CI-DAG: s_mov_b32 m0
389 ; GFX9-NOT: m0
390
391 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
392 ; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
327393 define amdgpu_kernel void @store_constant_adjacent_offsets() {
328394 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
329395 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
330396 ret void
331397 }
332398
333 ; SI-LABEL: @store_constant_disjoint_offsets
334 ; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
335 ; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
336 ; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
399 ; GCN-LABEL: @store_constant_disjoint_offsets
400 ; CI-DAG: s_mov_b32 m0
401 ; GFX9-NOT: m0
402
403 ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
404 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
405 ; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
337406 define amdgpu_kernel void @store_constant_disjoint_offsets() {
338407 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
339408 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
342411
343412 @bar = addrspace(3) global [4 x i64] undef, align 4
344413
345 ; SI-LABEL: @store_misaligned64_constant_offsets
346 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
347 ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
348 ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
349 ; SI: s_endpgm
414 ; GCN-LABEL: @store_misaligned64_constant_offsets
415 ; CI-DAG: s_mov_b32 m0
416 ; GFX9-NOT: m0
417
418 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
419 ; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
420 ; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
421 ; GCN: s_endpgm
350422 define amdgpu_kernel void @store_misaligned64_constant_offsets() {
351423 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
352424 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
355427
356428 @bar.large = addrspace(3) global [4096 x i64] undef, align 4
357429
358 ; SI-LABEL: @store_misaligned64_constant_large_offsets
359 ; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
360 ; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
361 ; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
362 ; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
363 ; SI: s_endpgm
430 ; GCN-LABEL: @store_misaligned64_constant_large_offsets
431 ; CI-DAG: s_mov_b32 m0
432 ; GFX9-NOT: m0
433
434 ; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
435 ; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
436 ; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
437 ; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
438 ; GCN: s_endpgm
364439 define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
365440 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
366441 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
405480 ret void
406481 }
407482
408 ; CI-LABEL: {{^}}simple_write2_v4f32_superreg_align4:
409 ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}}
410 ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}}
411 ; CI: s_endpgm
483 ; GCN-LABEL: {{^}}simple_write2_v4f32_superreg_align4:
484 ; CI: s_mov_b32 m0
485 ; GFX9-NOT: m0
486
487 ; GCN: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}}
488 ; GCN: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
412489 define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
413490 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
414491 %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
418495 ret void
419496 }
420497
421 ; Function Attrs: nounwind readnone
422498 declare i32 @llvm.amdgcn.workgroup.id.x() #1
423
424 ; Function Attrs: nounwind readnone
425499 declare i32 @llvm.amdgcn.workgroup.id.y() #1
426
427 ; Function Attrs: nounwind readnone
428500 declare i32 @llvm.amdgcn.workitem.id.x() #1
429
430 ; Function Attrs: nounwind readnone
431501 declare i32 @llvm.amdgcn.workitem.id.y() #1
432502
433503 attributes #0 = { nounwind }
None ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
12
23 @lds = addrspace(3) global [512 x float] undef, align 4
34
4 ; SI-LABEL: @simple_write2st64_one_val_f32_0_1
5 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
6 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
7 ; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
8 ; SI: s_endpgm
5 ; GCN-LABEL: @simple_write2st64_one_val_f32_0_1
6 ; CI-DAG: s_mov_b32 m0
7 ; GFX9-NOT: m0
8
9 ; GCN-DAG: {{buffer|global}}_load_dword [[VAL:v[0-9]+]]
10 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
11 ; GCN: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
12 ; GCN: s_endpgm
913 define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
1014 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
1115 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
1822 ret void
1923 }
2024
21 ; SI-LABEL: @simple_write2st64_two_val_f32_2_5
22 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
23 ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
24 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
25 ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
26 ; SI: s_endpgm
25 ; GCN-LABEL: @simple_write2st64_two_val_f32_2_5
26 ; CI-DAG: s_mov_b32 m0
27 ; GFX9-NOT: m0
28
29 ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
30 ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
31
32 ; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
33 ; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
34
35
36 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
37 ; GCN: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
38 ; GCN: s_endpgm
2739 define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
2840 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
2941 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
3951 ret void
4052 }
4153
42 ; SI-LABEL: @simple_write2st64_two_val_max_offset_f32
43 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
44 ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
45 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
46 ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
47 ; SI: s_endpgm
54 ; GCN-LABEL: @simple_write2st64_two_val_max_offset_f32
55 ; CI-DAG: s_mov_b32 m0
56 ; GFX9-NOT: m0
57
58 ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
59 ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
60
61 ; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
62 ; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
63
64 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
65 ; GCN: v_add{{(_co)?}}_{{i|u}}32_e32 [[VPTR:v[0-9]+]], vcc, s{{[0-9]+}}, [[SHL]]
66 ; GCN: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
67 ; GCN: s_endpgm
4868 define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
4969 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
5070 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
5979 ret void
6080 }
6181
62 ; SI-LABEL: @simple_write2st64_two_val_max_offset_f64
63 ; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
64 ; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
65 ; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]],
66 ; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
67 ; SI: s_endpgm
82 ; GCN-LABEL: @simple_write2st64_two_val_max_offset_f64
83 ; CI-DAG: s_mov_b32 m0
84 ; GFX9-NOT: m0
85
86 ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
87 ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
88
89 ; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
90 ; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
91
92 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}}
93 ; GCN: v_add_{{(co_)?}}{{i|u}}32_e32 [[VPTR:v[0-9]+]], vcc, s{{[0-9]+}}, [[SHL]]
94 ; GCN: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
95 ; GCN: s_endpgm
6896 define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
6997 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
7098 %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
80108 ret void
81109 }
82110
83 ; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64
84 ; SI-NOT: ds_write2st64_b64
85 ; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
86 ; SI: s_endpgm
111 ; GCN-LABEL: @byte_size_only_divisible_64_write2st64_f64
112 ; CI-DAG: s_mov_b32 m0
113 ; GFX9-NOT: m0
114
115 ; GCN-NOT: ds_write2st64_b64
116 ; GCN: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
117 ; GCN: s_endpgm
87118 define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
88119 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
89120 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
96127 ret void
97128 }
98129
99 ; Function Attrs: nounwind readnone
100130 declare i32 @llvm.amdgcn.workitem.id.x() #1
101
102 ; Function Attrs: nounwind readnone
103131 declare i32 @llvm.amdgcn.workitem.id.y() #1
104132
105133 attributes #0 = { nounwind }
1515 ; CHECK: ReservedNumVGPRs: 4
1616 ; GFX700: ReservedFirstVGPR: 8
1717 ; GFX800: ReservedFirstVGPR: 8
18 ; GFX900: ReservedFirstVGPR: 11
18 ; GFX900: ReservedFirstVGPR: 10
1919 ; CHECK: PrivateSegmentBufferSGPR: 0
2020 ; CHECK: WavefrontPrivateSegmentOffsetSGPR: 11
2121 define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !7 !kernel_arg_addr_space !12 !kernel_arg_access_qual !13 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !15 {
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s
44
55 ; Tests for indirect addressing on SI, which is implemented using dynamic
66 ; indexing of vectors.
602602 ; IDXMODE: v_mov_b32_e32 v[[VEC0_ELT2]], -4.0
603603 ; IDXMODE: s_set_gpr_idx_off
604604
605 ; GCN: s_mov_b32 m0, -1
605 ; PREGFX9: s_mov_b32 m0, -1
606 ; GFX9-NOT: s_mov_b32 m0
606607 ; GCN: ds_write_b32
607608 ; GCN: ds_write_b32
608609 ; GCN: s_endpgm
1313
1414 ; Make sure no crash on invalid non-constant
1515 ; GCN-LABEL: {{^}}invalid_variable_order_lds_atomic_dec_ret_i32:
16 ; CIVI-DAG: s_mov_b32 m0
17 ; GFX9-NOT: m0
1618 define amdgpu_kernel void @invalid_variable_order_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %order.var) #0 {
1719 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 %order.var, i32 0, i1 false)
1820 store i32 %result, i32 addrspace(1)* %out
2123
2224 ; Make sure no crash on invalid non-constant
2325 ; GCN-LABEL: {{^}}invalid_variable_scope_lds_atomic_dec_ret_i32:
26 ; CIVI-DAG: s_mov_b32 m0
27 ; GFX9-NOT: m0
2428 define amdgpu_kernel void @invalid_variable_scope_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %scope.var) #0 {
2529 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 %scope.var, i1 false)
2630 store i32 %result, i32 addrspace(1)* %out
3640 }
3741
3842 ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32:
39 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
43 ; CIVI-DAG: s_mov_b32 m0
44 ; GFX9-NOT: m0
45
46 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
4047 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
4148 define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
4249 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
4552 }
4653
4754 ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
48 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
55 ; CIVI-DAG: s_mov_b32 m0
56 ; GFX9-NOT: m0
57
58 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
4959 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
5060 define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
5161 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
5565 }
5666
5767 ; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32:
58 ; GCN: s_load_dword [[SPTR:s[0-9]+]],
59 ; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
60 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
68 ; CIVI-DAG: s_mov_b32 m0
69 ; GFX9-NOT: m0
70
71 ; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
72 ; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
73 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
6174 ; GCN: ds_dec_u32 [[VPTR]], [[DATA]]
6275 define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
6376 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
6578 }
6679
6780 ; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
68 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
81 ; CIVI-DAG: s_mov_b32 m0
82 ; GFX9-NOT: m0
83
84 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
6985 ; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16
7086 define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
7187 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
276292 @lds0 = addrspace(3) global [512 x i32] undef
277293
278294 ; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0:
279 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
295 ; CIVI-DAG: s_mov_b32 m0
296 ; GFX9-NOT: m0
297
298 ; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
280299 ; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
281300 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
282301 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
289308 }
290309
291310 ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64:
311 ; CIVI-DAG: s_mov_b32 m0
312 ; GFX9-NOT: m0
313
292314 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
293315 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
294316 ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
299321 }
300322
301323 ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
324 ; CIVI-DAG: s_mov_b32 m0
325 ; GFX9-NOT: m0
326
302327 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
303328 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
304329 ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
310335 }
311336
312337 ; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64:
338 ; CIVI-DAG: s_mov_b32 m0
339 ; GFX9-NOT: m0
340
313341 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
314342 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
315343 ; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
319347 }
320348
321349 ; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64_offset:
350 ; CIVI-DAG: s_mov_b32 m0
351 ; GFX9-NOT: m0
352
322353 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
323354 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
324355 ; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
405436 @lds1 = addrspace(3) global [512 x i64] undef, align 8
406437
407438 ; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64:
408 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
439 ; CIVI-DAG: s_mov_b32 m0
440 ; GFX9-NOT: m0
441
442 ; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
409443 ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
410444 define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
411445 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
1212 declare i32 @llvm.amdgcn.workitem.id.x() #1
1313
1414 ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32:
15 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
15 ; CIVI-DAG: s_mov_b32 m0
16 ; GFX9-NOT: m0
17
18 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
1619 ; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
1720 define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
1821 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
2124 }
2225
2326 ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
24 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
27 ; CIVI-DAG: s_mov_b32 m0
28 ; GFX9-NOT: m0
29
30 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
2531 ; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
2632 define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
2733 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
3137 }
3238
3339 ; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32:
34 ; GCN: s_load_dword [[SPTR:s[0-9]+]],
35 ; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
36 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
40 ; CIVI-DAG: s_mov_b32 m0
41 ; GFX9-NOT: m0
42
43 ; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
44 ; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
45 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
3746 ; GCN: ds_inc_u32 [[VPTR]], [[DATA]]
3847 define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
3948 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
4150 }
4251
4352 ; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
44 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
53 ; CIVI-DAG: s_mov_b32 m0
54 ; GFX9-NOT: m0
55
56 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
4557 ; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16
4658 define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
4759 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
6868 ; FIXME: Remove m0 initialization
6969 ; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
7070 ; GCN: s_waitcnt
71 ; GFX9-NEXT: s_mov_b32 m0, -1
7271 ; GFX9-NEXT: ds_read_u16 v0, v0
7372 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7473 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
562561 ; FIXME: Is there a cost to using the extload over not?
563562 ; GCN-LABEL: {{^}}load_local_v2i16_split:
564563 ; GCN: s_waitcnt
565 ; GFX9-NEXT: s_mov_b32 m0, -1
566564 ; GFX9-NEXT: ds_read_u16 v1, v0
567565 ; GFX9-NEXT: s_waitcnt
568566 ; GFX9-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
33
44 ; FUNC-LABEL: {{^}}load_f32_local:
5 ; GCN: s_mov_b32 m0
5 ; SICIVI: s_mov_b32 m0
6 ; GFX9-NOT: m0
67 ; GCN: ds_read_b32
78
89 ; EG: LDS_READ_RET
1415 }
1516
1617 ; FUNC-LABEL: {{^}}load_v2f32_local:
17 ; GCN: s_mov_b32 m0
18 ; SICIVI: s_mov_b32 m0
19 ; GFX9-NOT: m0
20
1821 ; GCN: ds_read_b64
1922
2023 ; EG: LDS_READ_RET
2831
2932 ; FIXME: should this do a read2_b64?
3033 ; FUNC-LABEL: {{^}}local_load_v3f32:
34 ; SICIVI: s_mov_b32 m0
35 ; GFX9-NOT: m0
36
3137 ; GCN-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8
3238 ; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
3339 ; GCN: s_waitcnt
4551 }
4652
4753 ; FUNC-LABEL: {{^}}local_load_v4f32:
54 ; SICIVI: s_mov_b32 m0
55 ; GFX9-NOT: m0
56
4857 ; GCN: ds_read2_b64
4958
5059 ; EG: LDS_READ_RET
5968 }
6069
6170 ; FUNC-LABEL: {{^}}local_load_v8f32:
71 ; SICIVI: s_mov_b32 m0
72 ; GFX9-NOT: m0
73
6274 ; GCN: ds_read2_b64
6375 ; GCN: ds_read2_b64
6476
7890 }
7991
8092 ; FUNC-LABEL: {{^}}local_load_v16f32:
93 ; SICIVI: s_mov_b32 m0
94 ; GFX9-NOT: m0
95
8196 ; GCN: ds_read2_b64
8297 ; GCN: ds_read2_b64
8398 ; GCN: ds_read2_b64
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
3 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
4 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
45
56 ; FUNC-LABEL: {{^}}local_load_f64:
7 ; SICIVI: s_mov_b32 m0
8 ; GFX9-NOT: m0
9
610 ; GCN: ds_read_b64 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}}
711 ; GCN: ds_write_b64 v{{[0-9]+}}, [[VAL]]
812
1519 }
1620
1721 ; FUNC-LABEL: {{^}}local_load_v2f64:
22 ; SICIVI: s_mov_b32 m0
23 ; GFX9-NOT: m0
24
1825 ; GCN: ds_read2_b64
1926
2027 ; EG: LDS_READ_RET
2936 }
3037
3138 ; FUNC-LABEL: {{^}}local_load_v3f64:
39 ; SICIVI: s_mov_b32 m0
40 ; GFX9-NOT: m0
41
3242 ; GCN-DAG: ds_read2_b64
3343 ; GCN-DAG: ds_read_b64
3444
4656 }
4757
4858 ; FUNC-LABEL: {{^}}local_load_v4f64:
59 ; SICIVI: s_mov_b32 m0
60 ; GFX9-NOT: m0
61
4962 ; GCN: ds_read2_b64
5063 ; GCN: ds_read2_b64
5164
6679 }
6780
6881 ; FUNC-LABEL: {{^}}local_load_v8f64:
82 ; SICIVI: s_mov_b32 m0
83 ; GFX9-NOT: m0
84
6985 ; GCN: ds_read2_b64
7086 ; GCN: ds_read2_b64
7187 ; GCN: ds_read2_b64
95111 }
96112
97113 ; FUNC-LABEL: {{^}}local_load_v16f64:
114 ; SICIVI: s_mov_b32 m0
115 ; GFX9-NOT: m0
116
98117 ; GCN: ds_read2_b64
99118 ; GCN: ds_read2_b64
100119 ; GCN: ds_read2_b64
None ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
1 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
2 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
3 ; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress < %s | FileCheck -check-prefixes=EG,FUNC %s
34
45 ; FUNC-LABEL: {{^}}local_load_i1:
6 ; SICIVI: s_mov_b32 m0
7 ; GFX9-NOT: m0
8
59 ; GCN: ds_read_u8
610 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 1
711 ; GCN: ds_write_b8
1620 }
1721
1822 ; FUNC-LABEL: {{^}}local_load_v2i1:
23 ; SICIVI: s_mov_b32 m0
24 ; GFX9-NOT: m0
1925 define amdgpu_kernel void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
2026 %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
2127 store <2 x i1> %load, <2 x i1> addrspace(3)* %out
2329 }
2430
2531 ; FUNC-LABEL: {{^}}local_load_v3i1:
32 ; SICIVI: s_mov_b32 m0
33 ; GFX9-NOT: m0
2634 define amdgpu_kernel void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
2735 %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
2836 store <3 x i1> %load, <3 x i1> addrspace(3)* %out
3038 }
3139
3240 ; FUNC-LABEL: {{^}}local_load_v4i1:
41 ; SICIVI: s_mov_b32 m0
42 ; GFX9-NOT: m0
3343 define amdgpu_kernel void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
3444 %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
3545 store <4 x i1> %load, <4 x i1> addrspace(3)* %out
3747 }
3848
3949 ; FUNC-LABEL: {{^}}local_load_v8i1:
50 ; SICIVI: s_mov_b32 m0
51 ; GFX9-NOT: m0
4052 define amdgpu_kernel void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
4153 %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
4254 store <8 x i1> %load, <8 x i1> addrspace(3)* %out
4456 }
4557
4658 ; FUNC-LABEL: {{^}}local_load_v16i1:
59 ; SICIVI: s_mov_b32 m0
60 ; GFX9-NOT: m0
4761 define amdgpu_kernel void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
4862 %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
4963 store <16 x i1> %load, <16 x i1> addrspace(3)* %out
5165 }
5266
5367 ; FUNC-LABEL: {{^}}local_load_v32i1:
68 ; SICIVI: s_mov_b32 m0
69 ; GFX9-NOT: m0
5470 define amdgpu_kernel void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
5571 %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
5672 store <32 x i1> %load, <32 x i1> addrspace(3)* %out
5874 }
5975
6076 ; FUNC-LABEL: {{^}}local_load_v64i1:
77 ; SICIVI: s_mov_b32 m0
78 ; GFX9-NOT: m0
6179 define amdgpu_kernel void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
6280 %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
6381 store <64 x i1> %load, <64 x i1> addrspace(3)* %out
6583 }
6684
6785 ; FUNC-LABEL: {{^}}local_zextload_i1_to_i32:
86 ; SICIVI: s_mov_b32 m0
87 ; GFX9-NOT: m0
88
6889 ; GCN: ds_read_u8
6990 ; GCN: ds_write_b32
7091 define amdgpu_kernel void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
7596 }
7697
7798 ; FUNC-LABEL: {{^}}local_sextload_i1_to_i32:
99 ; SICIVI: s_mov_b32 m0
100 ; GFX9-NOT: m0
101
78102 ; GCN: ds_read_u8
79103 ; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}
80104 ; GCN: ds_write_b32
89113 }
90114
91115 ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i32:
116 ; SICIVI: s_mov_b32 m0
117 ; GFX9-NOT: m0
92118 define amdgpu_kernel void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
93119 %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
94120 %ext = zext <1 x i1> %load to <1 x i32>
97123 }
98124
99125 ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i32:
126 ; SICIVI: s_mov_b32 m0
127 ; GFX9-NOT: m0
100128 define amdgpu_kernel void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
101129 %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
102130 %ext = sext <1 x i1> %load to <1 x i32>
105133 }
106134
107135 ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i32:
136 ; SICIVI: s_mov_b32 m0
137 ; GFX9-NOT: m0
108138 define amdgpu_kernel void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
109139 %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
110140 %ext = zext <2 x i1> %load to <2 x i32>
113143 }
114144
115145 ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i32:
146 ; SICIVI: s_mov_b32 m0
147 ; GFX9-NOT: m0
116148 define amdgpu_kernel void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
117149 %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
118150 %ext = sext <2 x i1> %load to <2 x i32>
121153 }
122154
123155 ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i32:
156 ; SICIVI: s_mov_b32 m0
157 ; GFX9-NOT: m0
124158 define amdgpu_kernel void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
125159 %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
126160 %ext = zext <3 x i1> %load to <3 x i32>
129163 }
130164
131165 ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i32:
166 ; SICIVI: s_mov_b32 m0
167 ; GFX9-NOT: m0
132168 define amdgpu_kernel void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
133169 %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
134170 %ext = sext <3 x i1> %load to <3 x i32>
137173 }
138174
139175 ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i32:
176 ; SICIVI: s_mov_b32 m0
177 ; GFX9-NOT: m0
140178 define amdgpu_kernel void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
141179 %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
142180 %ext = zext <4 x i1> %load to <4 x i32>
145183 }
146184
147185 ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i32:
186 ; SICIVI: s_mov_b32 m0
187 ; GFX9-NOT: m0
148188 define amdgpu_kernel void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
149189 %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
150190 %ext = sext <4 x i1> %load to <4 x i32>
153193 }
154194
155195 ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i32:
196 ; SICIVI: s_mov_b32 m0
197 ; GFX9-NOT: m0
156198 define amdgpu_kernel void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
157199 %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
158200 %ext = zext <8 x i1> %load to <8 x i32>
161203 }
162204
163205 ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i32:
206 ; SICIVI: s_mov_b32 m0
207 ; GFX9-NOT: m0
164208 define amdgpu_kernel void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
165209 %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
166210 %ext = sext <8 x i1> %load to <8 x i32>
169213 }
170214
171215 ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i32:
216 ; SICIVI: s_mov_b32 m0
217 ; GFX9-NOT: m0
172218 define amdgpu_kernel void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
173219 %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
174220 %ext = zext <16 x i1> %load to <16 x i32>
177223 }
178224
179225 ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i32:
226 ; SICIVI: s_mov_b32 m0
227 ; GFX9-NOT: m0
180228 define amdgpu_kernel void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
181229 %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
182230 %ext = sext <16 x i1> %load to <16 x i32>
185233 }
186234
187235 ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i32:
236 ; SICIVI: s_mov_b32 m0
237 ; GFX9-NOT: m0
188238 define amdgpu_kernel void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
189239 %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
190240 %ext = zext <32 x i1> %load to <32 x i32>
193243 }
194244
195245 ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i32:
246 ; SICIVI: s_mov_b32 m0
247 ; GFX9-NOT: m0
196248 define amdgpu_kernel void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
197249 %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
198250 %ext = sext <32 x i1> %load to <32 x i32>
201253 }
202254
203255 ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i32:
256 ; SICIVI: s_mov_b32 m0
257 ; GFX9-NOT: m0
204258 define amdgpu_kernel void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
205259 %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
206260 %ext = zext <64 x i1> %load to <64 x i32>
209263 }
210264
211265 ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i32:
266 ; SICIVI: s_mov_b32 m0
267 ; GFX9-NOT: m0
212268 define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
213269 %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
214270 %ext = sext <64 x i1> %load to <64 x i32>
217273 }
218274
219275 ; FUNC-LABEL: {{^}}local_zextload_i1_to_i64:
276 ; SICIVI: s_mov_b32 m0
277 ; GFX9-NOT: m0
278
220279 ; GCN-DAG: ds_read_u8 [[LOAD:v[0-9]+]],
221280 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
222281 ; GCN: ds_write_b64
228287 }
229288
230289 ; FUNC-LABEL: {{^}}local_sextload_i1_to_i64:
290 ; SICIVI: s_mov_b32 m0
291 ; GFX9-NOT: m0
292
231293 ; GCN: ds_read_u8 [[LOAD:v[0-9]+]],
232294 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
233295 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
240302 }
241303
242304 ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i64:
305 ; SICIVI: s_mov_b32 m0
306 ; GFX9-NOT: m0
243307 define amdgpu_kernel void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
244308 %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
245309 %ext = zext <1 x i1> %load to <1 x i64>
248312 }
249313
250314 ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i64:
315 ; SICIVI: s_mov_b32 m0
316 ; GFX9-NOT: m0
251317 define amdgpu_kernel void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
252318 %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
253319 %ext = sext <1 x i1> %load to <1 x i64>
256322 }
257323
258324 ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i64:
325 ; SICIVI: s_mov_b32 m0
326 ; GFX9-NOT: m0
259327 define amdgpu_kernel void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
260328 %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
261329 %ext = zext <2 x i1> %load to <2 x i64>
264332 }
265333
266334 ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i64:
335 ; SICIVI: s_mov_b32 m0
336 ; GFX9-NOT: m0
267337 define amdgpu_kernel void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
268338 %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
269339 %ext = sext <2 x i1> %load to <2 x i64>
272342 }
273343
274344 ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i64:
345 ; SICIVI: s_mov_b32 m0
346 ; GFX9-NOT: m0
275347 define amdgpu_kernel void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
276348 %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
277349 %ext = zext <3 x i1> %load to <3 x i64>
280352 }
281353
282354 ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i64:
355 ; SICIVI: s_mov_b32 m0
356 ; GFX9-NOT: m0
283357 define amdgpu_kernel void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
284358 %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
285359 %ext = sext <3 x i1> %load to <3 x i64>
288362 }
289363
290364 ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i64:
365 ; SICIVI: s_mov_b32 m0
366 ; GFX9-NOT: m0
291367 define amdgpu_kernel void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
292368 %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
293369 %ext = zext <4 x i1> %load to <4 x i64>
296372 }
297373
298374 ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i64:
375 ; SICIVI: s_mov_b32 m0
376 ; GFX9-NOT: m0
299377 define amdgpu_kernel void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
300378 %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
301379 %ext = sext <4 x i1> %load to <4 x i64>
304382 }
305383
306384 ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i64:
385 ; SICIVI: s_mov_b32 m0
386 ; GFX9-NOT: m0
307387 define amdgpu_kernel void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
308388 %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
309389 %ext = zext <8 x i1> %load to <8 x i64>
312392 }
313393
314394 ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i64:
395 ; SICIVI: s_mov_b32 m0
396 ; GFX9-NOT: m0
315397 define amdgpu_kernel void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
316398 %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
317399 %ext = sext <8 x i1> %load to <8 x i64>
320402 }
321403
322404 ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i64:
405 ; SICIVI: s_mov_b32 m0
406 ; GFX9-NOT: m0
323407 define amdgpu_kernel void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
324408 %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
325409 %ext = zext <16 x i1> %load to <16 x i64>
328412 }
329413
330414 ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i64:
415 ; SICIVI: s_mov_b32 m0
416 ; GFX9-NOT: m0
331417 define amdgpu_kernel void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
332418 %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
333419 %ext = sext <16 x i1> %load to <16 x i64>
336422 }
337423
338424 ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i64:
425 ; SICIVI: s_mov_b32 m0
426 ; GFX9-NOT: m0
339427 define amdgpu_kernel void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
340428 %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
341429 %ext = zext <32 x i1> %load to <32 x i64>
344432 }
345433
346434 ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i64:
435 ; SICIVI: s_mov_b32 m0
436 ; GFX9-NOT: m0
347437 define amdgpu_kernel void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
348438 %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
349439 %ext = sext <32 x i1> %load to <32 x i64>
352442 }
353443
354444 ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i64:
445 ; SICIVI: s_mov_b32 m0
446 ; GFX9-NOT: m0
355447 define amdgpu_kernel void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
356448 %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
357449 %ext = zext <64 x i1> %load to <64 x i64>
360452 }
361453
362454 ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i64:
455 ; SICIVI: s_mov_b32 m0
456 ; GFX9-NOT: m0
363457 define amdgpu_kernel void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
364458 %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
365459 %ext = sext <64 x i1> %load to <64 x i64>
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,GFX89,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89,FUNC %s
23 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
34
45 ; FUNC-LABEL: {{^}}local_load_i16:
6 ; GFX9-NOT: m0
7 ; SICIVI: s_mov_b32 m0
8
59 ; GCN: ds_read_u16 v{{[0-9]+}}
610
711 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
1721 }
1822
1923 ; FUNC-LABEL: {{^}}local_load_v2i16:
24 ; GFX9-NOT: m0
25 ; SICIVI: s_mov_b32 m0
26
2027 ; GCN: ds_read_b32
2128
2229 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
3239 }
3340
3441 ; FUNC-LABEL: {{^}}local_load_v3i16:
42 ; GFX9-NOT: m0
43 ; SICIVI: s_mov_b32 m0
44
3545 ; GCN: ds_read_b64
3646 ; GCN-DAG: ds_write_b32
3747 ; GCN-DAG: ds_write_b16
4656 }
4757
4858 ; FUNC-LABEL: {{^}}local_load_v4i16:
59 ; GFX9-NOT: m0
60 ; SICIVI: s_mov_b32 m0
61
4962 ; GCN: ds_read_b64
5063
5164 ; EG: LDS_READ_RET
5871 }
5972
6073 ; FUNC-LABEL: {{^}}local_load_v8i16:
74 ; GFX9-NOT: m0
75 ; SICIVI: s_mov_b32 m0
76
6177 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
6278
6379 ; EG: LDS_READ_RET
7288 }
7389
7490 ; FUNC-LABEL: {{^}}local_load_v16i16:
91 ; GFX9-NOT: m0
92 ; SICIVI: s_mov_b32 m0
93
7594 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}}
7695 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
7796
93112 }
94113
95114 ; FUNC-LABEL: {{^}}local_zextload_i16_to_i32:
115 ; GFX9-NOT: m0
116 ; SICIVI: s_mov_b32 m0
117
96118 ; GCN: ds_read_u16
97119 ; GCN: ds_write_b32
98120
110132
111133 ; FUNC-LABEL: {{^}}local_sextload_i16_to_i32:
112134 ; GCN-NOT: s_wqm_b64
113 ; GCN: s_mov_b32 m0
135
136 ; GFX9-NOT: m0
137 ; SICIVI: s_mov_b32 m0
138
114139 ; GCN: ds_read_i16
115140
116141 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
128153 }
129154
130155 ; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i32:
156 ; GFX9-NOT: m0
157 ; SICIVI: s_mov_b32 m0
158
131159 ; GCN: ds_read_u16
132160
133161 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
143171 }
144172
145173 ; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i32:
174 ; GFX9-NOT: m0
175 ; SICIVI: s_mov_b32 m0
176
146177 ; GCN: ds_read_i16
147178
148179 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
161192
162193 ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32:
163194 ; GCN-NOT: s_wqm_b64
164 ; GCN: s_mov_b32 m0
195 ; GFX9-NOT: m0
196 ; SICIVI: s_mov_b32 m0
197
165198 ; GCN: ds_read_b32
166199
167200 ; EG: LDS_READ_RET
174207
175208 ; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32:
176209 ; GCN-NOT: s_wqm_b64
177 ; GCN: s_mov_b32 m0
210 ; GFX9-NOT: m0
211 ; SICIVI: s_mov_b32 m0
212
178213 ; GCN: ds_read_b32
179214
180215 ; EG: LDS_READ_RET
188223 }
189224
190225 ; FUNC-LABEL: {{^}}local_local_zextload_v3i16_to_v3i32:
226 ; GFX9-NOT: m0
227 ; SICIVI: s_mov_b32 m0
228
191229 ; GCN: ds_read_b64
192230 ; GCN-DAG: ds_write_b32
193231 ; GCN-DAG: ds_write_b64
202240 }
203241
204242 ; FUNC-LABEL: {{^}}local_local_sextload_v3i16_to_v3i32:
243 ; GFX9-NOT: m0
244 ; SICIVI: s_mov_b32 m0
245
205246 ; GCN: ds_read_b64
206247 ; GCN-DAG: ds_write_b32
207248 ; GCN-DAG: ds_write_b64
220261
221262 ; FUNC-LABEL: {{^}}local_local_zextload_v4i16_to_v4i32:
222263 ; GCN-NOT: s_wqm_b64
223 ; GCN: s_mov_b32 m0
264 ; GFX9-NOT: m0
265 ; SICIVI: s_mov_b32 m0
266
224267 ; GCN: ds_read_b64
225268
226269 ; EG: LDS_READ_RET
234277
235278 ; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32:
236279 ; GCN-NOT: s_wqm_b64
237 ; GCN: s_mov_b32 m0
280 ; GFX9-NOT: m0
281 ; SICIVI: s_mov_b32 m0
282
238283 ; GCN: ds_read_b64
239284
240285 ; EG: LDS_READ_RET
251296 }
252297
253298 ; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32:
299 ; GFX9-NOT: m0
300 ; SICIVI: s_mov_b32 m0
301
254302 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
255303
256304 ; EG: LDS_READ_RET
265313 }
266314
267315 ; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32:
316 ; GFX9-NOT: m0
317 ; SICIVI: s_mov_b32 m0
318
268319 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
269320
270321 ; EG: LDS_READ_RET
287338 }
288339
289340 ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
341 ; GFX9-NOT: m0
342 ; SICIVI: s_mov_b32 m0
343
290344 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
291345 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
292346
311365 }
312366
313367 ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
368 ; GFX9-NOT: m0
369 ; SICIVI: s_mov_b32 m0
370
314371
315372 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
316373 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
347404 }
348405
349406 ; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32:
407 ; GFX9-NOT: m0
408 ; SICIVI: s_mov_b32 m0
409
350410 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
351411 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
352412 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
376436 }
377437
378438 ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
439 ; GFX9-NOT: m0
440 ; SICIVI: s_mov_b32 m0
441
379442 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
380443 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
381444 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
413476 }
414477
415478 ; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
479 ; GFX9-NOT: m0
480 ; SICIVI: s_mov_b32 m0
481
416482 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
417483 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
418484 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
478544 }
479545
480546 ; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32:
547 ; GFX9-NOT: m0
548 ; SICIVI: s_mov_b32 m0
481549
482550 ; EG: LDS_READ_RET
483551 ; EG: LDS_READ_RET
519587 }
520588
521589 ; FUNC-LABEL: {{^}}local_zextload_i16_to_i64:
590 ; GFX9-NOT: m0
591 ; SICIVI: s_mov_b32 m0
592
522593 ; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]],
523594 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
524595
537608 }
538609
539610 ; FUNC-LABEL: {{^}}local_sextload_i16_to_i64:
611 ; GFX9-NOT: m0
612 ; SICIVI: s_mov_b32 m0
613
540614 ; FIXME: Need to optimize this sequence to avoid an extra shift.
541615 ; t25: i32,ch = load t12, t10, undef:i32
542616 ; t28: i64 = any_extend t25
543617 ; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16
544618 ; SI: ds_read_i16 v[[LO:[0-9]+]],
545 ; VI: ds_read_u16 v[[ULO:[0-9]+]]
546 ; VI: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
619 ; GFX89: ds_read_u16 v[[ULO:[0-9]+]]
620 ; GFX89: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
547621 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
548622
549623 ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
564638 }
565639
566640 ; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i64:
641 ; GFX9-NOT: m0
642 ; SICIVI: s_mov_b32 m0
643
567644
568645 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
569646 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
578655 }
579656
580657 ; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i64:
658 ; GFX9-NOT: m0
659 ; SICIVI: s_mov_b32 m0
660
581661
582662 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
583663 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
595675 }
596676
597677 ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64:
678 ; GFX9-NOT: m0
679 ; SICIVI: s_mov_b32 m0
680
598681
599682 ; EG: LDS_READ_RET
600683 define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
605688 }
606689
607690 ; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i64:
691 ; GFX9-NOT: m0
692 ; SICIVI: s_mov_b32 m0
693
608694
609695 ; EG: LDS_READ_RET
610696 ; EG-DAG: BFE_INT
617703 }
618704
619705 ; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i64:
706 ; GFX9-NOT: m0
707 ; SICIVI: s_mov_b32 m0
708
620709
621710 ; EG: LDS_READ_RET
622711 ; EG: LDS_READ_RET
628717 }
629718
630719 ; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i64:
720 ; GFX9-NOT: m0
721 ; SICIVI: s_mov_b32 m0
722
631723
632724 ; EG: LDS_READ_RET
633725 ; EG: LDS_READ_RET
643735 }
644736
645737 ; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i64:
738 ; GFX9-NOT: m0
739 ; SICIVI: s_mov_b32 m0
740
646741
647742 ; EG: LDS_READ_RET
648743 ; EG: LDS_READ_RET
656751 }
657752
658753 ; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i64:
754 ; GFX9-NOT: m0
755 ; SICIVI: s_mov_b32 m0
756
659757
660758 ; EG: LDS_READ_RET
661759 ; EG: LDS_READ_RET
677775 }
678776
679777 ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i64:
778 ; GFX9-NOT: m0
779 ; SICIVI: s_mov_b32 m0
780
680781
681782 ; EG: LDS_READ_RET
682783 ; EG: LDS_READ_RET
694795 }
695796
696797 ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i64:
798 ; GFX9-NOT: m0
799 ; SICIVI: s_mov_b32 m0
800
697801
698802 ; EG: LDS_READ_RET
699803 ; EG: LDS_READ_RET
727831 }
728832
729833 ; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i64:
834 ; GFX9-NOT: m0
835 ; SICIVI: s_mov_b32 m0
836
730837
731838 ; EG: LDS_READ_RET
732839 ; EG: LDS_READ_RET
752859 }
753860
754861 ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i64:
862 ; GFX9-NOT: m0
863 ; SICIVI: s_mov_b32 m0
864
755865
756866 ; EG: LDS_READ_RET
757867 ; EG: LDS_READ_RET
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
23 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
3
44
55 ; FUNC-LABEL: {{^}}local_load_i32:
66 ; GCN-NOT: s_wqm_b64
7 ; GCN: s_mov_b32 m0, -1
7 ; SICIVI: s_mov_b32 m0, -1
8 ; GFX9-NOT: m0
89 ; GCN: ds_read_b32
910
1011 ; EG: LDS_READ_RET
1617 }
1718
1819 ; FUNC-LABEL: {{^}}local_load_v2i32:
20 ; SICIVI: s_mov_b32 m0, -1
21 ; GFX9-NOT: m0
22
1923 ; GCN: ds_read_b64
2024 define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
2125 entry:
2529 }
2630
2731 ; FUNC-LABEL: {{^}}local_load_v3i32:
32 ; SICIVI: s_mov_b32 m0, -1
33 ; GFX9-NOT: m0
34
2835 ; GCN-DAG: ds_read_b64
2936 ; GCN-DAG: ds_read_b32
3037 define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
3542 }
3643
3744 ; FUNC-LABEL: {{^}}local_load_v4i32:
45 ; SICIVI: s_mov_b32 m0, -1
46 ; GFX9-NOT: m0
47
3848 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
3949
4050 define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
4555 }
4656
4757 ; FUNC-LABEL: {{^}}local_load_v8i32:
58 ; SICIVI: s_mov_b32 m0, -1
59 ; GFX9-NOT: m0
60
4861 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
4962 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
5063 define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
5568 }
5669
5770 ; FUNC-LABEL: {{^}}local_load_v16i32:
71 ; SICIVI: s_mov_b32 m0, -1
72 ; GFX9-NOT: m0
73
5874 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}}
5975 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}}
6076 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
7187 }
7288
7389 ; FUNC-LABEL: {{^}}local_zextload_i32_to_i64:
90 ; SICIVI: s_mov_b32 m0, -1
91 ; GFX9-NOT: m0
92
7493 define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
7594 %ld = load i32, i32 addrspace(3)* %in
7695 %ext = zext i32 %ld to i64
7998 }
8099
81100 ; FUNC-LABEL: {{^}}local_sextload_i32_to_i64:
101 ; SICIVI: s_mov_b32 m0, -1
102 ; GFX9-NOT: m0
103
82104 define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
83105 %ld = load i32, i32 addrspace(3)* %in
84106 %ext = sext i32 %ld to i64
87109 }
88110
89111 ; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64:
112 ; SICIVI: s_mov_b32 m0, -1
113 ; GFX9-NOT: m0
114
90115 define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
91116 %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
92117 %ext = zext <1 x i32> %ld to <1 x i64>
95120 }
96121
97122 ; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64:
123 ; SICIVI: s_mov_b32 m0, -1
124 ; GFX9-NOT: m0
125
98126 define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
99127 %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
100128 %ext = sext <1 x i32> %ld to <1 x i64>
103131 }
104132
105133 ; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64:
134 ; SICIVI: s_mov_b32 m0, -1
135 ; GFX9-NOT: m0
136
106137 define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
107138 %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
108139 %ext = zext <2 x i32> %ld to <2 x i64>
111142 }
112143
113144 ; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64:
145 ; SICIVI: s_mov_b32 m0, -1
146 ; GFX9-NOT: m0
147
114148 define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
115149 %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
116150 %ext = sext <2 x i32> %ld to <2 x i64>
119153 }
120154
121155 ; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64:
156 ; SICIVI: s_mov_b32 m0, -1
157 ; GFX9-NOT: m0
158
122159 define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
123160 %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
124161 %ext = zext <4 x i32> %ld to <4 x i64>
127164 }
128165
129166 ; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64:
167 ; SICIVI: s_mov_b32 m0, -1
168 ; GFX9-NOT: m0
169
130170 define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
131171 %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
132172 %ext = sext <4 x i32> %ld to <4 x i64>
135175 }
136176
137177 ; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64:
178 ; SICIVI: s_mov_b32 m0, -1
179 ; GFX9-NOT: m0
180
138181 define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
139182 %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
140183 %ext = zext <8 x i32> %ld to <8 x i64>
143186 }
144187
145188 ; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64:
189 ; SICIVI: s_mov_b32 m0, -1
190 ; GFX9-NOT: m0
191
146192 define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
147193 %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
148194 %ext = sext <8 x i32> %ld to <8 x i64>
151197 }
152198
153199 ; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64:
200 ; SICIVI: s_mov_b32 m0, -1
201 ; GFX9-NOT: m0
202
154203 define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
155204 %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
156205 %ext = sext <16 x i32> %ld to <16 x i64>
159208 }
160209
161210 ; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64:
211 ; SICIVI: s_mov_b32 m0, -1
212 ; GFX9-NOT: m0
213
162214 define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
163215 %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
164216 %ext = zext <16 x i32> %ld to <16 x i64>
167219 }
168220
169221 ; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64:
222 ; SICIVI: s_mov_b32 m0, -1
223 ; GFX9-NOT: m0
224
170225 define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
171226 %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
172227 %ext = sext <32 x i32> %ld to <32 x i64>
175230 }
176231
177232 ; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64:
233 ; SICIVI: s_mov_b32 m0, -1
234 ; GFX9-NOT: m0
235
178236 define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
179237 %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
180238 %ext = zext <32 x i32> %ld to <32 x i64>
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
3 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
4 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
45
56 ; FUNC-LABEL: {{^}}local_load_i64:
7 ; SICIVI: s_mov_b32 m0
8 ; GFX9-NOT: m0
9
610 ; GCN: ds_read_b64 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}}
711 ; GCN: ds_write_b64 v{{[0-9]+}}, [[VAL]]
812
1519 }
1620
1721 ; FUNC-LABEL: {{^}}local_load_v2i64:
22 ; SICIVI: s_mov_b32 m0
23 ; GFX9-NOT: m0
24
1825 ; GCN: ds_read2_b64
1926
2027 ; EG: LDS_READ_RET
2936 }
3037
3138 ; FUNC-LABEL: {{^}}local_load_v3i64:
39 ; SICIVI: s_mov_b32 m0
40 ; GFX9-NOT: m0
41
3242 ; GCN-DAG: ds_read2_b64
3343 ; GCN-DAG: ds_read_b64
3444
4656 }
4757
4858 ; FUNC-LABEL: {{^}}local_load_v4i64:
59 ; SICIVI: s_mov_b32 m0
60 ; GFX9-NOT: m0
61
4962 ; GCN: ds_read2_b64
5063 ; GCN: ds_read2_b64
5164
6679 }
6780
6881 ; FUNC-LABEL: {{^}}local_load_v8i64:
82 ; SICIVI: s_mov_b32 m0
83 ; GFX9-NOT: m0
84
6985 ; GCN: ds_read2_b64
7086 ; GCN: ds_read2_b64
7187 ; GCN: ds_read2_b64
95111 }
96112
97113 ; FUNC-LABEL: {{^}}local_load_v16i64:
114 ; SICIVI: s_mov_b32 m0
115 ; GFX9-NOT: m0
116
98117 ; GCN: ds_read2_b64
99118 ; GCN: ds_read2_b64
100119 ; GCN: ds_read2_b64
0 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
1 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
0 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s
1 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SICIVI,FUNC %s
2 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
23 ; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
34
45
56 ; FUNC-LABEL: {{^}}local_load_i8:
67 ; GCN-NOT: s_wqm_b64
7 ; GCN: s_mov_b32 m0
8 ; SICIVI: s_mov_b32 m0
9 ; GFX9-NOT: m0
810 ; GCN: ds_read_u8
911
1012 ; EG: LDS_UBYTE_READ_RET
1719
1820 ; FUNC-LABEL: {{^}}local_load_v2i8:
1921 ; GCN-NOT: s_wqm_b64
20 ; GCN: s_mov_b32 m0
22 ; SICIVI: s_mov_b32 m0
23 ; GFX9-NOT: m0
2124 ; GCN: ds_read_u16
2225
2326 ; EG: LDS_USHORT_READ_RET
2932 }
3033
3134 ; FUNC-LABEL: {{^}}local_load_v3i8:
35 ; GFX9-NOT: m0
3236 ; GCN: ds_read_b32
3337
3438 ; EG: DS_READ_RET
4044 }
4145
4246 ; FUNC-LABEL: {{^}}local_load_v4i8:
47 ; GFX9-NOT: m0
4348 ; GCN: ds_read_b32
4449
4550 ; EG: LDS_READ_RET
5156 }
5257
5358 ; FUNC-LABEL: {{^}}local_load_v8i8:
59 ; GFX9-NOT: m0
5460 ; GCN: ds_read_b64
5561
5662 ; EG: LDS_READ_RET
6369 }
6470
6571 ; FUNC-LABEL: {{^}}local_load_v16i8:
72 ; GFX9-NOT: m0
6673 ; GCN: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
6774 ; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset1:1{{$}}
6875
7885 }
7986
8087 ; FUNC-LABEL: {{^}}local_zextload_i8_to_i32:
88 ; GFX9-NOT: m0
8189 ; GCN-NOT: s_wqm_b64
82 ; GCN: s_mov_b32 m0
90 ; SICIVI: s_mov_b32 m0
8391 ; GCN: ds_read_u8
8492
8593 ; EG: LDS_UBYTE_READ_RET
92100
93101 ; FUNC-LABEL: {{^}}local_sextload_i8_to_i32:
94102