llvm.org GIT mirror llvm / 6ecd744
R600/SI: Remove explicit m0 operand from DS instructions. Instead, add m0 as an implicit operand. This helps avoid spills of the m0 register in some cases. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@237141 91177308-0d34-0410-b5e6-96231b3b80d8 Tom Stellard 4 years ago
8 changed file(s) with 274 addition(s) and 133 deletion(s). Raw diff Collapse all Expand all
7676 bool isPrivateLoad(const LoadSDNode *N) const;
7777 bool isLocalLoad(const LoadSDNode *N) const;
7878 bool isRegionLoad(const LoadSDNode *N) const;
79
80 SDNode *glueCopyToM0(SDNode *N) const;
7981
8082 const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
8183 bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
241243 return true;
242244 }
243245
// Attaches a glued copy of the constant -1 into M0 to node N and returns N.
//
// N is returned untouched when the subtarget generation is older than
// SOUTHERN_ISLANDS, or when checkType() on the value behind N's memory operand
// does not match AMDGPUAS::LOCAL_ADDRESS. Otherwise N is morphed in place: its
// opcode, value types and existing operands are kept, and the glue result of
// the M0 copy is appended as one extra, final operand.
//
// NOTE(review): the template arguments of cast, static_cast and SmallVector
// are missing in this listing (likely lost when the diff was rendered);
// confirm them against the upstream source.
246 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
247 if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
248 !checkType(cast(N)->getMemOperand()->getValue(),
249 AMDGPUAS::LOCAL_ADDRESS))
250 return N;
251
252 const SITargetLowering& Lowering =
253 *static_cast(getTargetLowering());
254
255 // Write max value to m0 before each load operation
256
// The copy hangs off the DAG entry node and carries the constant -1.
257 SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
258 CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
259
// Result 1 of the copy is used as the glue value.
260 SDValue Glue = M0.getValue(1);
261
// Keep every existing operand of N, in order, and append the glue last.
262 SmallVector Ops;
263 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
264 Ops.push_back(N->getOperand(i));
265 }
266 Ops.push_back(Glue);
// Rebuild N in place with the same opcode and value types plus the glue.
267 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
268
269 return N;
270 }
271
244272 SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
245273 unsigned int Opc = N->getOpcode();
246274 if (N->isMachineOpcode()) {
247275 N->setNodeId(-1);
248276 return nullptr; // Already selected.
249277 }
278
279 if (isa(N))
280 N = glueCopyToM0(N);
250281
251282 switch (Opc) {
252283 default: break;
422453 }
423454
424455 case ISD::LOAD: {
456 LoadSDNode *LD = cast(N);
457 SDLoc SL(N);
458 EVT VT = N->getValueType(0);
459
460 if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) {
461 N = glueCopyToM0(N);
462 break;
463 }
464
425465 // To simplify the TableGen patterns, we replace all i64 loads with
426466 // v2i32 loads. Alternatively, we could promote i64 loads to v2i32
427467 // during DAG legalization; however, some places (ExpandUnalignedLoad)
428468 // in the DAG legalizer make assumptions when i64 is legal, so doing this
429469 // promotion early can cause problems.
430 EVT VT = N->getValueType(0);
431 LoadSDNode *LD = cast(N);
432 if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
433 break;
434470
435471 SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
436 LD->getBasePtr(), LD->getMemOperand());
437 SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
472 LD->getBasePtr(), LD->getMemOperand());
473 SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
438474 MVT::i64, NewLoad);
439475 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
440476 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
441 SelectCode(NewLoad.getNode());
477 SDNode *Load = glueCopyToM0(NewLoad.getNode());
478 SelectCode(Load);
442479 N = BitCast.getNode();
443480 break;
444481 }
447484 // Handle i64 stores here for the same reason mentioned above for loads.
448485 StoreSDNode *ST = cast(N);
449486 SDValue Value = ST->getValue();
450 if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
451 break;
452
453 SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
454 MVT::v2i32, Value);
455 SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
456 ST->getBasePtr(), ST->getMemOperand());
457
458 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
459
460 if (NewValue.getOpcode() == ISD::BITCAST) {
461 Select(NewStore.getNode());
462 return SelectCode(NewValue.getNode());
463 }
464
465 // getNode() may fold the bitcast if its input was another bitcast. If that
466 // happens we should only select the new store.
467 N = NewStore.getNode();
487 if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) {
488
489 SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
490 MVT::v2i32, Value);
491 SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
492 ST->getBasePtr(), ST->getMemOperand());
493
494 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
495
496 if (NewValue.getOpcode() == ISD::BITCAST) {
497 Select(NewStore.getNode());
498 return SelectCode(NewValue.getNode());
499 }
500
501 // getNode() may fold the bitcast if its input was another bitcast. If that
502 // happens we should only select the new store.
503 N = NewStore.getNode();
504 }
505
506 N = glueCopyToM0(N);
468507 break;
469508 }
470509
182182 return isConstantLoad(dyn_cast(N), -1);
183183 }]>;
184184
185 def az_extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
185 class AZExtLoadBase : PatFrag<(ops node:$ptr),
186 (ld_node node:$ptr), [{
186187 LoadSDNode *L = cast(N);
187188 return L->getExtensionType() == ISD::ZEXTLOAD ||
188189 L->getExtensionType() == ISD::EXTLOAD;
189190 }]>;
191
192 def az_extload : AZExtLoadBase ;
190193
191194 def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
192195 return cast(N)->getMemoryVT() == MVT::i8;
360363 return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
361364 }]>;
362365
363
364 def atomic_cmp_swap_32_local :
365 PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
366 (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
367 AtomicSDNode *AN = cast(N);
368 return AN->getMemoryVT() == MVT::i32 &&
369 AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
370 }]>;
371
372 def atomic_cmp_swap_64_local :
373 PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
374 (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
375 AtomicSDNode *AN = cast(N);
376 return AN->getMemoryVT() == MVT::i64 &&
377 AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
378 }]>;
366 multiclass AtomicCmpSwapLocal {
367
368 def _32_local : PatFrag <
369 (ops node:$ptr, node:$cmp, node:$swap),
370 (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
371 AtomicSDNode *AN = cast(N);
372 return AN->getMemoryVT() == MVT::i32 &&
373 AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
374 }]>;
375
376 def _64_local : PatFrag<
377 (ops node:$ptr, node:$cmp, node:$swap),
378 (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
379 AtomicSDNode *AN = cast(N);
380 return AN->getMemoryVT() == MVT::i64 &&
381 AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
382 }]>;
383 }
384
385 defm atomic_cmp_swap : AtomicCmpSwapLocal ;
379386
380387 def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
381388 return isFlatLoad(dyn_cast(N));
603603 let LGKM_CNT = 1;
604604 let DS = 1;
605605 let UseNamedOperandTable = 1;
606 let DisableEncoding = "$m0";
606 let Uses = [M0];
607607
608608 // Most instruction load and store data, so set this as the default.
609609 let mayLoad = 1;
122122 def SIconstdata_ptr : SDNode<
123123 "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]>
124124 >;
125
126 //===----------------------------------------------------------------------===//
127 // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
128 // to be glued to the memory instructions.
129 //===----------------------------------------------------------------------===//
130
131 def SIld_local : SDNode <"ISD::LOAD", SDTLoad,
132 [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
133 >;
134
135 def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{
136 return isLocalLoad(cast(N));
137 }]>;
138
139 def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
140 return cast(N)->getAddressingMode() == ISD::UNINDEXED &&
141 cast(N)->getExtensionType() == ISD::NON_EXTLOAD;
142 }]>;
143
144 def si_load_local_align8 : Aligned8Bytes <
145 (ops node:$ptr), (si_load_local node:$ptr)
146 >;
147
148 def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
149 return cast(N)->getExtensionType() == ISD::SEXTLOAD;
150 }]>;
151 def si_az_extload_local : AZExtLoadBase ;
152
153 multiclass SIExtLoadLocal {
154
155 def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr),
156 [{return cast(N)->getMemoryVT() == MVT::i8;}]
157 >;
158
159 def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr),
160 [{return cast(N)->getMemoryVT() == MVT::i16;}]
161 >;
162 }
163
164 defm si_sextload_local : SIExtLoadLocal ;
165 defm si_az_extload_local : SIExtLoadLocal ;
166
167 def SIst_local : SDNode <"ISD::STORE", SDTStore,
168 [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
169 >;
170
171 def si_st_local : PatFrag <
172 (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{
173 return isLocalStore(cast(N));
174 }]>;
175
176 def si_store_local : PatFrag <
177 (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{
178 return cast(N)->getAddressingMode() == ISD::UNINDEXED &&
179 !cast(N)->isTruncatingStore();
180 }]>;
181
182 def si_store_local_align8 : Aligned8Bytes <
183 (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr)
184 >;
185
186 def si_truncstore_local : PatFrag <
187 (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{
188 return cast(N)->isTruncatingStore();
189 }]>;
190
191 def si_truncstore_local_i8 : PatFrag <
192 (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{
193 return cast(N)->getMemoryVT() == MVT::i8;
194 }]>;
195
196 def si_truncstore_local_i16 : PatFrag <
197 (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{
198 return cast(N)->getMemoryVT() == MVT::i16;
199 }]>;
200
201 multiclass SIAtomicM0Glue2 {
202
203 def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2,
204 [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
205 >;
206
207 def _local : local_binary_atomic_op (NAME#"_glue")>;
208 }
209
210 defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
211 defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
212 defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
213 defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
214 defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">;
215 defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
216 defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
217 defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
218 defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
219 defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">;
220
221 def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
222 [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
223 >;
224
225 defm si_atomic_cmp_swap : AtomicCmpSwapLocal ;
125226
126227 // Transformation function, extract the lower 32bit of a 64bit immediate
127228 def LO32 : SDNodeXForm
17251826
17261827 multiclass DS_1A_RET op, string opName, RegisterClass rc,
17271828 dag outs = (outs rc:$vdst),
1728 dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds, M0Reg:$m0),
1829 dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
17291830 string asm = opName#" $vdst, $addr"#"$offset$gds"> {
17301831
17311832 def "" : DS_Pseudo ;
17391840 multiclass DS_1A_Off8_RET op, string opName, RegisterClass rc,
17401841 dag outs = (outs rc:$vdst),
17411842 dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1,
1742 gds01:$gds, M0Reg:$m0),
1843 gds01:$gds),
17431844 string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> {
17441845
17451846 def "" : DS_Pseudo ;
17521853
17531854 multiclass DS_1A1D_NORET op, string opName, RegisterClass rc,
17541855 dag outs = (outs),
1755 dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds,
1756 M0Reg:$m0),
1856 dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
17571857 string asm = opName#" $addr, $data0"#"$offset$gds"> {
17581858
17591859 def "" : DS_Pseudo ,
17681868 multiclass DS_1A1D_Off8_NORET op, string opName, RegisterClass rc,
17691869 dag outs = (outs),
17701870 dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
1771 ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds, M0Reg:$m0),
1871 ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds),
17721872 string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> {
17731873
17741874 def "" : DS_Pseudo ;
17821882 multiclass DS_1A1D_RET op, string opName, RegisterClass rc,
17831883 string noRetOp = "",
17841884 dag outs = (outs rc:$vdst),
1785 dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds,
1786 M0Reg:$m0),
1885 dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
17871886 string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> {
17881887
17891888 def "" : DS_Pseudo ,
18111910 string noRetOp = "", RegisterClass src = rc> :
18121911 DS_1A2D_RET_m
18131912 (ins VGPR_32:$addr, src:$data0, src:$data1,
1814 ds_offset:$offset, gds:$gds, M0Reg:$m0)
1913 ds_offset:$offset, gds:$gds)
18151914 >;
18161915
18171916 multiclass DS_1A2D_NORET op, string opName, RegisterClass rc,
18181917 string noRetOp = opName,
18191918 dag outs = (outs),
18201919 dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
1821 ds_offset:$offset, gds:$gds, M0Reg:$m0),
1920 ds_offset:$offset, gds:$gds),
18221921 string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> {
18231922
18241923 def "" : DS_Pseudo ,
18321931
18331932 multiclass DS_0A_RET op, string opName,
18341933 dag outs = (outs VGPR_32:$vdst),
1835 dag ins = (ins ds_offset:$offset, gds:$gds, M0Reg:$m0),
1934 dag ins = (ins ds_offset:$offset, gds:$gds),
18361935 string asm = opName#" $vdst"#"$offset"#"$gds"> {
18371936
18381937 let mayLoad = 1, mayStore = 1 in {
18471946
18481947 multiclass DS_1A_RET_GDS op, string opName,
18491948 dag outs = (outs VGPR_32:$vdst),
1850 dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset, M0Reg:$m0),
1949 dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset),
18511950 string asm = opName#" $vdst, $addr"#"$offset gds"> {
18521951
18531952 def "" : DS_Pseudo ;
18601959
18611960 multiclass DS_1A_GDS op, string opName,
18621961 dag outs = (outs),
1863 dag ins = (ins VGPR_32:$addr, M0Reg:$m0),
1962 dag ins = (ins VGPR_32:$addr),
18641963 string asm = opName#" $addr gds"> {
18651964
18661965 def "" : DS_Pseudo ;
18731972
18741973 multiclass DS_1A op, string opName,
18751974 dag outs = (outs),
1876 dag ins = (ins VGPR_32:$addr, ds_offset:$offset, M0Reg:$m0, gds:$gds),
1975 dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
18771976 string asm = opName#" $addr"#"$offset"#"$gds"> {
18781977
18791978 let mayLoad = 1, mayStore = 1 in {
28232823
28242824 class DSReadPat : Pat <
28252825 (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
2826 (inst $ptr, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
2827 >;
2828
2829 def : DSReadPat ;
2830 def : DSReadPat ;
2831 def : DSReadPat ;
2832 def : DSReadPat ;
2833 def : DSReadPat ;
2826 (inst $ptr, (as_i16imm $offset), (i1 0))
2827 >;
2828
2829 def : DSReadPat ;
2830 def : DSReadPat ;
2831 def : DSReadPat ;
2832 def : DSReadPat ;
2833 def : DSReadPat ;
28342834
28352835 let AddedComplexity = 100 in {
28362836
2837 def : DSReadPat local_load_aligned8bytes>;
2837 def : DSReadPat si_load_local_align8>;
28382838
28392839 } // End AddedComplexity = 100
28402840
28412841 def : Pat <
2842 (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
2842 (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
28432843 i8:$offset1))),
2844 (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0), (S_MOV_B32 -1))
2844 (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
28452845 >;
28462846
28472847 class DSWritePat : Pat <
28482848 (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
2849 (inst $ptr, $value, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
2850 >;
2851
2852 def : DSWritePat ;
2853 def : DSWritePat ;
2854 def : DSWritePat ;
2849 (inst $ptr, $value, (as_i16imm $offset), (i1 0))
2850 >;
2851
2852 def : DSWritePat ;
2853 def : DSWritePat ;
2854 def : DSWritePat ;
28552855
28562856 let AddedComplexity = 100 in {
28572857
2858 def : DSWritePat local_store_aligned8bytes>;
2858 def : DSWritePat si_store_local_align8>;
28592859 } // End AddedComplexity = 100
28602860
28612861 def : Pat <
2862 (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
2863 i8:$offset1)),
2862 (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
2863 i8:$offset1)),
28642864 (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0),
28652865 (EXTRACT_SUBREG $value, sub1), $offset0, $offset1,
2866 (i1 0), (S_MOV_B32 -1))
2866 (i1 0))
28672867 >;
28682868
28692869 class DSAtomicRetPat : Pat <
28702870 (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
2871 (inst $ptr, $value, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
2871 (inst $ptr, $value, (as_i16imm $offset), (i1 0))
28722872 >;
28732873
28742874 // Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
28842884 class DSAtomicIncRetPat
28852885 Instruction LoadImm, PatFrag frag> : Pat <
28862886 (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
2887 (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
2887 (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0))
28882888 >;
28892889
28902890
28912891 class DSAtomicCmpXChg : Pat <
28922892 (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
2893 (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
2893 (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
28942894 >;
28952895
28962896
28972897 // 32-bit atomics.
28982898 def : DSAtomicIncRetPat
2899 S_MOV_B32, atomic_load_add_local>;
2899 S_MOV_B32, si_atomic_load_add_local>;
29002900 def : DSAtomicIncRetPat
2901 S_MOV_B32, atomic_load_sub_local>;
2902
2903 def : DSAtomicRetPat;
2904 def : DSAtomicRetPat;
2905 def : DSAtomicRetPat;
2906 def : DSAtomicRetPat;
2907 def : DSAtomicRetPat;
2908 def : DSAtomicRetPat;
2909 def : DSAtomicRetPat;
2910 def : DSAtomicRetPat;
2911 def : DSAtomicRetPat;
2912 def : DSAtomicRetPat;
2913
2914 def : DSAtomicCmpXChg_local>;
2901 S_MOV_B32, si_atomic_load_sub_local>;
2902
2903 def : DSAtomicRetPat;
2904 def : DSAtomicRetPat;
2905 def : DSAtomicRetPat;
2906 def : DSAtomicRetPat;
2907 def : DSAtomicRetPat;
2908 def : DSAtomicRetPat;
2909 def : DSAtomicRetPat;
2910 def : DSAtomicRetPat;
2911 def : DSAtomicRetPat;
2912 def : DSAtomicRetPat;
2913
2914 def : DSAtomicCmpXChg;
29152915
29162916 // 64-bit atomics.
29172917 def : DSAtomicIncRetPat
2918 S_MOV_B64, atomic_load_add_local>;
2918 S_MOV_B64, si_atomic_load_add_local>;
29192919 def : DSAtomicIncRetPat
2920 S_MOV_B64, atomic_load_sub_local>;
2921
2922 def : DSAtomicRetPat;
2923 def : DSAtomicRetPat;
2924 def : DSAtomicRetPat;
2925 def : DSAtomicRetPat;
2926 def : DSAtomicRetPat;
2927 def : DSAtomicRetPat;
2928 def : DSAtomicRetPat;
2929 def : DSAtomicRetPat;
2930 def : DSAtomicRetPat;
2931 def : DSAtomicRetPat;
2932
2933 def : DSAtomicCmpXChg_local>;
2920 S_MOV_B64, si_atomic_load_sub_local>;
2921
2922 def : DSAtomicRetPat;
2923 def : DSAtomicRetPat;
2924 def : DSAtomicRetPat;
2925 def : DSAtomicRetPat;
2926 def : DSAtomicRetPat;
2927 def : DSAtomicRetPat;
2928 def : DSAtomicRetPat;
2929 def : DSAtomicRetPat;
2930 def : DSAtomicRetPat;
2931 def : DSAtomicRetPat;
2932
2933 def : DSAtomicCmpXChg;
29342934
29352935
29362936 //===----------------------------------------------------------------------===//
212212 // Be careful, since the addresses could be subregisters themselves in weird
213213 // cases, like vectors of pointers.
214214 const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
215 const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
216215
217216 unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
218217 unsigned DestReg1
253252 .addImm(NewOffset0) // offset0
254253 .addImm(NewOffset1) // offset1
255254 .addImm(0) // gds
256 .addOperand(*M0Reg) // M0
257255 .addMemOperand(*I->memoperands_begin())
258256 .addMemOperand(*Paired->memoperands_begin());
259
260 LIS->InsertMachineInstrInMaps(Read2);
261257
262258 unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
263259 unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
265261 updateRegDefsUses(DestReg1, DestReg, SubRegIdx1);
266262
267263 LIS->RemoveMachineInstrFromMaps(I);
268 LIS->RemoveMachineInstrFromMaps(Paired);
264 // Replacing Paired in the maps with Read2 allows us to avoid updating the
265 // live range for the m0 register.
266 LIS->ReplaceMachineInstrInMaps(Paired, Read2);
269267 I->eraseFromParent();
270268 Paired->eraseFromParent();
271269
272270 LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
273271 LIS->shrinkToUses(&AddrRegLI);
274
275 LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg());
276 LIS->shrinkToUses(&M0RegLI);
277
278 // Currently m0 is treated as a register class with one member instead of an
279 // implicit physical register. We are using the virtual register for the first
280 // one, but we still need to update the live range of the now unused second m0
281 // virtual register to avoid verifier errors.
282 const MachineOperand *PairedM0Reg
283 = TII->getNamedOperand(*Paired, AMDGPU::OpName::m0);
284 LiveInterval &PairedM0RegLI = LIS->getInterval(PairedM0Reg->getReg());
285 LIS->shrinkToUses(&PairedM0RegLI);
286272
287273 LIS->getInterval(DestReg); // Create new LI
288274
299285 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
300286 // sure we preserve the subregister index and any register flags set on them.
301287 const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
302 const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
303288 const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
304289 const MachineOperand *Data1
305290 = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
329314
330315 const MCInstrDesc &Write2Desc = TII->get(Opc);
331316 DebugLoc DL = I->getDebugLoc();
317
318 // repairIntervalsInRange() doesn't handle physical registers, so we have
319 // to update the M0 range manually.
320 SlotIndex PairedIndex = LIS->getInstructionIndex(Paired);
321 LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
322 LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
323 bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
332324
333325 MachineInstrBuilder Write2
334326 = BuildMI(*MBB, I, DL, Write2Desc)
338330 .addImm(NewOffset0) // offset0
339331 .addImm(NewOffset1) // offset1
340332 .addImm(0) // gds
341 .addOperand(*M0Reg) // m0
342333 .addMemOperand(*I->memoperands_begin())
343334 .addMemOperand(*Paired->memoperands_begin());
344335
345336 // XXX - How do we express subregisters here?
346 unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(),
347 M0Reg->getReg()};
337 unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
348338
349339 LIS->RemoveMachineInstrFromMaps(I);
350340 LIS->RemoveMachineInstrFromMaps(Paired);
351341 I->eraseFromParent();
352342 Paired->eraseFromParent();
353343
344 // This doesn't handle physical registers like M0
354345 LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
346
347 if (UpdateM0Range) {
348 SlotIndex Write2Index = LIS->getInstructionIndex(Write2);
349 M0Segment->end = Write2Index.getRegSlot();
350 }
355351
356352 DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
357353 return Write2.getInstr();
6464
6565 ; SI-LABEL: @simple_read2st64_f32_over_max_offset
6666 ; SI-NOT: ds_read2st64_b32
67 ; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
6768 ; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
68 ; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
6969 ; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
7070 ; SI: s_endpgm
7171 define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
6868 ; pointer can be used with an offset into the second one.
6969
7070 ; SI-LABEL: {{^}}load_shl_base_lds_2:
71 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
7172 ; SI: s_mov_b32 m0, -1
72 ; SI-NEXT: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
7373 ; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
7474 ; SI: s_endpgm
7575 define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {