llvm.org GIT mirror llvm / 7dd37ae
R600/SI: Add support for i8 and i16 private loads/stores git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@199823 91177308-0d34-0410-b5e6-96231b3b80d8 Tom Stellard 6 years ago
6 changed file(s) with 202 addition(s) and 27 deletion(s). Raw diff Collapse all Expand all
588588 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
589589 }
590590
591 SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
592 SDLoc DL(Op);
593 LoadSDNode *Load = cast(Op);
594 ISD::LoadExtType ExtType = Load->getExtensionType();
595
596 if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
597 ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
598 return SDValue();
599
600
601 EVT VT = Op.getValueType();
602 EVT MemVT = Load->getMemoryVT();
603 unsigned Mask = 0;
604 if (Load->getMemoryVT() == MVT::i8) {
605 Mask = 0xff;
606 } else if (Load->getMemoryVT() == MVT::i16) {
607 Mask = 0xffff;
608 }
609 SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
610 DAG.getConstant(2, MVT::i32));
611 SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
612 Load->getChain(), Ptr,
613 DAG.getTargetConstant(0, MVT::i32),
614 Op.getOperand(2));
615 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
616 Load->getBasePtr(),
617 DAG.getConstant(0x3, MVT::i32));
618 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
619 DAG.getConstant(3, MVT::i32));
620 Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
621 Ret = DAG.getNode(ISD::AND, DL, MVT::i32, Ret,
622 DAG.getConstant(Mask, MVT::i32));
623 if (ExtType == ISD::SEXTLOAD) {
624 SDValue SExtShift = DAG.getConstant(
625 VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
626 Ret = DAG.getNode(ISD::SHL, DL, MVT::i32, Ret, SExtShift);
627 Ret = DAG.getNode(ISD::SRA, DL, MVT::i32, Ret, SExtShift);
628 }
629
630 return Ret;
631 }
632
591633 SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
634 SDLoc DL(Op);
592635 SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
593636 if (Result.getNode()) {
594637 return Result;
595638 }
596639
597640 StoreSDNode *Store = cast(Op);
641 SDValue Chain = Store->getChain();
598642 if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
599643 Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
600644 Store->getValue().getValueType().isVector()) {
601645 return SplitVectorStore(Op, DAG);
646 }
647
648 if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
649 Store->getMemoryVT().bitsLT(MVT::i32)) {
650 unsigned Mask = 0;
651 if (Store->getMemoryVT() == MVT::i8) {
652 Mask = 0xff;
653 } else if (Store->getMemoryVT() == MVT::i16) {
654 Mask = 0xffff;
655 }
656 SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32);
657 SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
658 DAG.getConstant(2, MVT::i32));
659 SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
660 Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));
661 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, TruncPtr,
662 DAG.getConstant(0x3, MVT::i32));
663 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
664 DAG.getConstant(3, MVT::i32));
665 SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
666 Store->getValue());
667 SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, SExtValue,
668 DAG.getConstant(Mask, MVT::i32));
669 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
670 MaskedValue, ShiftAmt);
671 SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32),
672 ShiftAmt);
673 DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
674 DAG.getConstant(0xffffffff, MVT::i32));
675 Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
676
677 SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
678 return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
679 Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
602680 }
603681 return SDValue();
604682 }
5353 /// \brief Split a vector load into multiple scalar loads.
5454 SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const;
5555 SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
56 SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
5657 SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
5758 bool isHWTrueValue(SDValue Op) const;
5859 bool isHWFalseValue(SDValue Op) const;
11121112 return SDValue();
11131113 }
11141114
1115 SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1116 if (Ret.getNode()) {
1117 return Ret;
1118 }
11151119 // Lowering for indirect addressing
11161120
11171121 const MachineFunction &MF = DAG.getMachineFunction();
12021206 SDValue Chain = Op.getOperand(0);
12031207 SDValue Ptr = Op.getOperand(1);
12041208 SDValue LoweredLoad;
1209
1210 SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1211 if (Ret.getNode()) {
1212 SDValue Ops[2];
1213 Ops[0] = Ret;
1214 Ops[1] = Chain;
1215 return DAG.getMergeValues(Ops, 2, DL);
1216 }
1217
12051218
12061219 if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
12071220 SDValue MergedValues[2] = {
124124 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
125125
126126 setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
127 setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
127 setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
128 setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
128129 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
129130 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
130131
132 setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
133 setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
134 setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
131135 setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
136 setTruncStoreAction(MVT::i32, MVT::i8, Custom);
137 setTruncStoreAction(MVT::i32, MVT::i16, Custom);
132138 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
133139 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
134140 setTruncStoreAction(MVT::i128, MVT::i64, Expand);
699705 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
700706 SDLoc DL(Op);
701707 LoadSDNode *Load = cast(Op);
702
703 if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
708 SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
709 SDValue MergedValues[2];
710 MergedValues[1] = Load->getChain();
711 if (Ret.getNode()) {
712 MergedValues[0] = Ret;
713 return DAG.getMergeValues(MergedValues, 2, DL);
714 }
715
716 if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
704717 return SDValue();
718 }
705719
706720 SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
707721 DAG.getConstant(2, MVT::i32));
708
709 SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
710 Load->getChain(), Ptr,
711 DAG.getTargetConstant(0, MVT::i32),
712 Op.getOperand(2));
713 SDValue MergedValues[2] = {
714 Ret,
715 Load->getChain()
716 };
722 Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
723 Load->getChain(), Ptr,
724 DAG.getTargetConstant(0, MVT::i32),
725 Op.getOperand(2));
726
727 MergedValues[0] = Ret;
717728 return DAG.getMergeValues(MergedValues, 2, DL);
718729
719730 }
795806 SDValue Chain = Store->getChain();
796807 SmallVector Values;
797808
798 if (VT == MVT::i64) {
809 if (Store->isTruncatingStore()) {
810 unsigned Mask = 0;
811 if (Store->getMemoryVT() == MVT::i8) {
812 Mask = 0xff;
813 } else if (Store->getMemoryVT() == MVT::i16) {
814 Mask = 0xffff;
815 }
816 SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
817 Chain, Store->getBasePtr(),
818 DAG.getConstant(0, MVT::i32));
819 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getBasePtr(),
820 DAG.getConstant(0x3, MVT::i32));
821 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
822 DAG.getConstant(3, MVT::i32));
823 SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getValue(),
824 DAG.getConstant(Mask, MVT::i32));
825 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
826 MaskedValue, ShiftAmt);
827 SDValue RotrAmt = DAG.getNode(ISD::SUB, DL, MVT::i32,
828 DAG.getConstant(32, MVT::i32), ShiftAmt);
829 SDValue DstMask = DAG.getNode(ISD::ROTR, DL, MVT::i32,
830 DAG.getConstant(Mask, MVT::i32),
831 RotrAmt);
832 Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
833 Dst = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
834
835 Values.push_back(Dst);
836 } else if (VT == MVT::i64) {
799837 for (unsigned i = 0; i < 2; ++i) {
800838 Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
801839 Store->getValue(), DAG.getConstant(i, MVT::i32)));
11
22 ; EG-LABEL: @anyext_load_i8:
33 ; EG: AND_INT
4 ; EG-NEXT: 255
4 ; EG: 255
55 define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
66 %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
77 %load = load i32 addrspace(1)* %cast, align 1
1313
1414 ; EG-LABEL: @anyext_load_i16:
1515 ; EG: AND_INT
16 ; EG: LSHL
17 ; EG: 65535
16 ; EG: AND_INT
17 ; EG-DAG: 65535
18 ; EG-DAG: -65536
1819 define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
1920 %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
2021 %load = load i32 addrspace(1)* %cast, align 1
2627
2728 ; EG-LABEL: @anyext_load_lds_i8:
2829 ; EG: AND_INT
29 ; EG-NEXT: 255
30 ; EG: 255
3031 define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
3132 %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
3233 %load = load i32 addrspace(3)* %cast, align 1
3839
3940 ; EG-LABEL: @anyext_load_lds_i16:
4041 ; EG: AND_INT
41 ; EG: LSHL
42 ; EG: 65535
42 ; EG: AND_INT
43 ; EG-DAG: 65535
44 ; EG-DAG: -65536
4345 define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
4446 %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
4547 %load = load i32 addrspace(3)* %cast, align 1
None ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
1 ; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
0 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
1 ; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
22
33 ; This test checks that uses and defs of the AR register happen in the same
44 ; instruction clause.
55
6 ; R600-CHECK-LABEL: @mova_same_clause
6 ; FUNC-LABEL: @mova_same_clause
7
78 ; R600-CHECK: MOVA_INT
89 ; R600-CHECK-NOT: ALU clause
910 ; R600-CHECK: 0 + AR.x
1112 ; R600-CHECK-NOT: ALU clause
1213 ; R600-CHECK: 0 + AR.x
1314
14 ; SI-CHECK-LABEL: @mova_same_clause
1515 ; SI-CHECK: V_READFIRSTLANE
1616 ; SI-CHECK: V_MOVRELD
1717 ; SI-CHECK: S_CBRANCH
4545 ; XXX: This generated code has unnecessary MOVs, we should be able to optimize
4646 ; this.
4747
48 ; R600-CHECK-LABEL: @multiple_structs
48 ; FUNC-LABEL: @multiple_structs
4949 ; R600-CHECK-NOT: MOVA_INT
50 ; SI-CHECK-LABEL: @multiple_structs
5150 ; SI-CHECK-NOT: V_MOVREL
5251 %struct.point = type { i32, i32 }
5352
7675 ; loads and stores should be lowered to copies, so there shouldn't be any
7776 ; MOVA instructions.
7877
79 ; R600-CHECK-LABEL: @direct_loop
78 ; FUNC-LABEL: @direct_loop
8079 ; R600-CHECK-NOT: MOVA_INT
81 ; SI-CHECK-LABEL: @direct_loop
8280 ; SI-CHECK-NOT: V_MOVREL
8381
8482 define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
112110 store i32 %value, i32 addrspace(1)* %out
113111 ret void
114112 }
113
114 ; FUNC-LABEL: @short_array
115
116 ; R600-CHECK: MOV {{\** *}}T{{[0-9]\.[XYZW]}}, literal
117 ; R600-CHECK: 65536
118 ; R600-CHECK: MOVA_INT
119
120 ; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 65536
121 ; SI-CHECK: V_MOVRELS_B32_e32
; Stack-allocates a 2 x i16 array (private memory), stores 0 and 1 into it,
; then loads the element selected by the runtime %index and sign-extends it
; to i32 before storing to global memory. Exercises the new i16 private
; load lowering combined with dynamic (MOVA/MOVREL) indexing.
122 define void @short_array(i32 addrspace(1)* %out, i32 %index) {
123 entry:
124 %0 = alloca [2 x i16]
125 %1 = getelementptr [2 x i16]* %0, i32 0, i32 0
126 %2 = getelementptr [2 x i16]* %0, i32 0, i32 1
127 store i16 0, i16* %1
128 store i16 1, i16* %2
129 %3 = getelementptr [2 x i16]* %0, i32 0, i32 %index
130 %4 = load i16* %3
131 %5 = sext i16 %4 to i32
132 store i32 %5, i32 addrspace(1)* %out
133 ret void
134 }
135
136 ; FUNC-LABEL: @char_array
137
138 ; R600-CHECK: OR_INT {{\** *}}T{{[0-9]\.[XYZW]}}, {{[PVT0-9]+\.[XYZW]}}, literal
139 ; R600-CHECK: 256
140 ; R600-CHECK: MOVA_INT
141
142 ; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 256
143 ; SI-CHECK: V_MOVRELS_B32_e32
; Same shape as @short_array but with i8 elements: stack-allocates a
; 2 x i8 array, stores 0 and 1, then loads the dynamically-indexed element
; and sign-extends it to i32 before storing to global memory. Exercises the
; new i8 private load lowering with dynamic indexing.
144 define void @char_array(i32 addrspace(1)* %out, i32 %index) {
145 entry:
146 %0 = alloca [2 x i8]
147 %1 = getelementptr [2 x i8]* %0, i32 0, i32 0
148 %2 = getelementptr [2 x i8]* %0, i32 0, i32 1
149 store i8 0, i8* %1
150 store i8 1, i8* %2
151 %3 = getelementptr [2 x i8]* %0, i32 0, i32 %index
152 %4 = load i8* %3
153 %5 = sext i8 %4 to i32
154 store i32 %5, i32 addrspace(1)* %out
155 ret void
156
157 }