llvm.org GIT mirror llvm / c8a344e
Merging r167948, r168198 into the 3.2 release branch: r168198 [NVPTX] Order global variables in def-use order before emitting them in the final assembly. r167948 [NVPTX] Implement custom lowering of loads/stores for i1: loads from i1 become loads from i8 followed by trunc; stores to i1 become zext to i8 followed by store to i8. git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_32@168335 91177308-0d34-0410-b5e6-96231b3b80d8 Pawel Wodnicki 6 years ago
5 changed file(s) with 175 addition(s) and 6 deletion(s). Raw diff Collapse all Expand all
6767 cl::location(llvm::InterleaveSrcInPtx));
6868
6969
70
70 namespace {
71 /// DiscoverDependentGlobals - Collect into \p Globals every GlobalVariable
72 /// that \p V references, either directly or through the operands of nested Users.
73 void DiscoverDependentGlobals(Value *V,
74 DenseSet<GlobalVariable*> &Globals) {
75 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
76 Globals.insert(GV); // Record the global itself; its own operands are not walked here.
77 else {
78 if (User *U = dyn_cast<User>(V)) { // Values that are not Users contribute nothing.
79 for (unsigned i = 0, e = U->getNumOperands(); i != e; ++i) {
80 DiscoverDependentGlobals(U->getOperand(i), Globals); // Recurse into each operand.
81 }
82 }
83 }
84 }
85
86 /// VisitGlobalVariableForEmission - Append \p GV to \p Order, but only after
87 /// every global referenced by its operands (its dependencies) has been appended
88 /// first. \p Visited and \p Visiting track finished and in-progress globals.
89 void VisitGlobalVariableForEmission(GlobalVariable *GV,
90 SmallVectorImpl<GlobalVariable*> &Order,
91 DenseSet<GlobalVariable*> &Visited,
92 DenseSet<GlobalVariable*> &Visiting) {
93 // Have we already visited this one?
94 if (Visited.count(GV)) return;
95
96 // Do we have a circular dependency? (Reached again while still in progress.)
97 if (Visiting.count(GV))
98 report_fatal_error("Circular dependency found in global variable set");
99
100 // Start visiting this global
101 Visiting.insert(GV);
102
103 // Make sure we visit all dependencies first
104 DenseSet<GlobalVariable*> Others;
105 for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i)
106 DiscoverDependentGlobals(GV->getOperand(i), Others);
107
108 for (DenseSet<GlobalVariable*>::iterator I = Others.begin(), // NOTE(review): set iteration order presumably
109 E = Others.end(); I != E; ++I) // depends on pointer values - confirm output order is stable.
110 VisitGlobalVariableForEmission(*I, Order, Visited, Visiting);
111
112 // Now we can visit ourself
113 Order.push_back(GV);
114 Visited.insert(GV);
115 Visiting.erase(GV);
116 }
117 }
71118
72119 // @TODO: This is a copy from AsmPrinter.cpp. The function is static, so we
73120 // cannot just link to the existing version.
892939
893940 emitDeclarations(M, OS2);
894941
895 // Print out module-level global variables here.
942 // As ptxas does not support forward references of globals, we need to first
943 // sort the list of module-level globals in def-use order. We visit each
944 // global variable in order, and ensure that we emit it *after* its dependent
945 // globals. We use a little extra memory maintaining both a set and a list to
946 // have fast searches while maintaining a strict ordering.
947 SmallVector Globals;
948 DenseSet GVVisited;
949 DenseSet GVVisiting;
950
951 // Visit each global variable, in order
896952 for (Module::global_iterator I = M.global_begin(), E = M.global_end();
897 I != E; ++I)
898 printModuleLevelGV(I, OS2);
953 I != E; ++I)
954 VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting);
955
956 assert(GVVisited.size() == M.getGlobalList().size() &&
957 "Missed a global variable");
958 assert(GVVisiting.size() == 0 && "Did not fully process a global variable");
959
960 // Print out module-level global variables in proper order
961 for (unsigned i = 0, e = Globals.size(); i != e; ++i)
962 printModuleLevelGV(Globals[i], OS2);
899963
900964 OS2 << '\n';
901965
173173 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
174174
175175 // PTX does not support load / store predicate registers
176 setOperationAction(ISD::LOAD, MVT::i1, Expand);
176 setOperationAction(ISD::LOAD, MVT::i1, Custom);
177 setOperationAction(ISD::STORE, MVT::i1, Custom);
178
177179 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
178180 setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
179 setOperationAction(ISD::STORE, MVT::i1, Expand);
180181 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
181182 setTruncStoreAction(MVT::i32, MVT::i1, Expand);
182183 setTruncStoreAction(MVT::i16, MVT::i1, Expand);
855856 case ISD::EXTRACT_SUBVECTOR:
856857 return Op;
857858 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
859 case ISD::STORE: return LowerSTORE(Op, DAG);
860 case ISD::LOAD: return LowerLOAD(Op, DAG);
858861 default:
859862 llvm_unreachable("Custom lowering not defined for operation");
860863 }
861864 }
865
866
867 // LowerLOAD - Custom-lower a non-extending i1 load: v = ld i1* addr
868 // =>
869 // v1 = ld i8* addr
870 // v = trunc v1 to i1
871 SDValue NVPTXTargetLowering::
872 LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
873 SDNode *Node = Op.getNode();
874 LoadSDNode *LD = cast<LoadSDNode>(Node);
875 DebugLoc dl = Node->getDebugLoc();
876 ISD::LoadExtType ExtType = LD->getExtensionType();
877 assert(ExtType == ISD::NON_EXTLOAD);
878 EVT VT = Node->getValueType(0);
879 assert(VT == MVT::i1 && "Custom lowering for i1 load only");
880 SDValue newLD = DAG.getLoad(MVT::i8, dl, LD->getChain(), LD->getBasePtr(), // Same chain, address and
881 LD->getPointerInfo(), // memory attributes as the
882 LD->isVolatile(), LD->isNonTemporal(), // original load; only the
883 LD->isInvariant(), // loaded type changes to i8.
884 LD->getAlignment());
885 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
886 // The legalizer (the caller) is expecting two values from the legalized
887 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
888 // in LegalizeDAG.cpp which also uses MergeValues.
889 SDValue Ops[] = {result, LD->getChain()}; // NOTE(review): this is the original load's input chain, not newLD's output chain - confirm intended.
890 return DAG.getMergeValues(Ops, 2, dl);
891 }
892
893 // LowerSTORE - Custom-lower an i1 store: st i1 v, addr
894 // =>
895 // v1 = zext v to i8
896 // st i8 v1, addr
897 SDValue NVPTXTargetLowering::
898 LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
899 SDNode *Node = Op.getNode();
900 DebugLoc dl = Node->getDebugLoc();
901 StoreSDNode *ST = cast<StoreSDNode>(Node);
902 SDValue Tmp1 = ST->getChain(); // Incoming chain.
903 SDValue Tmp2 = ST->getBasePtr(); // Destination address.
904 SDValue Tmp3 = ST->getValue(); // Value being stored (must be i1).
905 EVT VT = Tmp3.getValueType();
906 assert(VT == MVT::i1 && "Custom lowering for i1 store only");
907 unsigned Alignment = ST->getAlignment();
908 bool isVolatile = ST->isVolatile();
909 bool isNonTemporal = ST->isNonTemporal();
910 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, // Widen the i1 value to i8 before storing.
911 MVT::i8, Tmp3);
912 SDValue Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2, // Same chain, address and memory attributes
913 ST->getPointerInfo(), isVolatile, // as the original store.
914 isNonTemporal, Alignment);
915 return Result;
916 }
917
862918
863919 SDValue
864920 NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname, int idx,
137137 SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx);
138138
139139 SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
140
141 SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; // Custom lowering for i1 stores (zext to i8, then i8 store).
142 SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; // Custom lowering for i1 loads (i8 load, then trunc to i1).
140143 };
141144 } // namespace llvm
142145
0 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
1 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
2
3 ; Make sure we emit these globals in def-use order: a global referenced by
4 ; another global's initializer (@a, @b) must be printed before its user
5 ; (@a2, @b2), even though the IR below defines the user first.
6 ; PTX32: .visible .global .align 1 .u8 a = 2;
7 ; PTX32-NEXT: .visible .global .align 4 .u32 a2 = a;
8 ; PTX64: .visible .global .align 1 .u8 a = 2;
9 ; PTX64-NEXT: .visible .global .align 8 .u64 a2 = a;
10 @a2 = addrspace(1) global i8 addrspace(1)* @a
11 @a = addrspace(1) global i8 2
12
13
14 ; PTX32: .visible .global .align 1 .u8 b = 1;
15 ; PTX32-NEXT: .visible .global .align 4 .u32 b2[2] = {b, b};
16 ; PTX64: .visible .global .align 1 .u8 b = 1;
17 ; PTX64-NEXT: .visible .global .align 8 .u64 b2[2] = {b, b};
18 @b2 = addrspace(1) global [2 x i8 addrspace(1)*] [i8 addrspace(1)* @b, i8 addrspace(1)* @b]
19 @b = addrspace(1) global i8 1
0 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
1 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
2
3 define ptx_kernel void @t1(i1* %a) { ; i1 store: the checks expect the constant in a register, then an 8-bit st.u8
4 ; PTX32: mov.u16 %rc{{[0-9]+}}, 0;
5 ; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}};
6 ; PTX64: mov.u16 %rc{{[0-9]+}}, 0;
7 ; PTX64-NEXT: st.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}};
8 store i1 false, i1* %a
9 ret void
10 }
11
12
13 define ptx_kernel void @t2(i1* %a, i8* %b) { ; i1 load: the checks expect an 8-bit ld.u8, then and/setp to form the predicate
14 ; PTX32: ld.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}]
15 ; PTX32: and.b16 temp, %rc{{[0-9]+}}, 1;
16 ; PTX32: setp.b16.eq %p{{[0-9]+}}, temp, 1;
17 ; PTX64: ld.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}]
18 ; PTX64: and.b16 temp, %rc{{[0-9]+}}, 1;
19 ; PTX64: setp.b16.eq %p{{[0-9]+}}, temp, 1;
20
21 %t1 = load i1* %a
22 %t2 = select i1 %t1, i8 1, i8 2
23 store i8 %t2, i8* %b
24 ret void
25 }