[AMDGPU] New buffer intrinsics

Summary:
This commit adds new intrinsics
  llvm.amdgcn.raw.buffer.load
  llvm.amdgcn.raw.buffer.load.format
  llvm.amdgcn.raw.buffer.load.format.d16
  llvm.amdgcn.struct.buffer.load
  llvm.amdgcn.struct.buffer.load.format
  llvm.amdgcn.struct.buffer.load.format.d16
  llvm.amdgcn.raw.buffer.store
  llvm.amdgcn.raw.buffer.store.format
  llvm.amdgcn.raw.buffer.store.format.d16
  llvm.amdgcn.struct.buffer.store
  llvm.amdgcn.struct.buffer.store.format
  llvm.amdgcn.struct.buffer.store.format.d16
  llvm.amdgcn.raw.buffer.atomic.*
  llvm.amdgcn.struct.buffer.atomic.*
with the following changes from the llvm.amdgcn.buffer.* intrinsics:

* there are separate raw and struct versions: raw does not have an index
  arg and sets idxen=0 in the instruction, and struct always sets idxen=1
  in the instruction even if the index is 0, to allow for the fact that
  gfx9 does bounds checking differently depending on whether idxen is set;

* there is a combined cachepolicy arg (glc+slc);

* there are now only two offset args: one for the offset that is included
  in bounds checking and swizzling, to be split between the instruction's
  voffset and immoffset fields, and one for the offset that is excluded
  from bounds checking and swizzling, to go into the instruction's
  soffset field.

The AMDGPUISD::BUFFER_* SD nodes always have an index operand, all three
offset operands, a combined cachepolicy operand, and an extra idxen
operand.

The obsolescent llvm.amdgcn.buffer.* intrinsics continue to work.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye, jfb, llvm-commits

Differential Revision: https://reviews.llvm.org/D50306

Change-Id: If897ea7dc34fcbf4d5496e98cc99a934f62fc205
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@340269 91177308-0d34-0410-b5e6-96231b3b80d8

Tim Renouf, 1 year, 6 months ago
22 changed files with 2038 additions and 184 deletions.
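For orientation before the diff, here is a minimal IR sketch (assembled from the intrinsic definitions and tests in this commit; the function and value names are illustrative, not part of the change) contrasting the obsolescent combined-offset signature with the new raw and struct forms:

declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1)          ; rsrc, vindex, combined offset, glc, slc
declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32)         ; rsrc, offset, soffset, cachepolicy
declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) ; rsrc, vindex, offset, soffset, cachepolicy

define amdgpu_ps <4 x float> @example(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
main_body:
  ; raw form: no index, idxen=0; %ofs is bounds-checked and swizzled, soffset 0 is not
  %raw = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
  ; struct form: always carries an index and sets idxen=1, even for a constant 0
  %str = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
  %sum = fadd <4 x float> %raw, %str
  ret <4 x float> %sum
}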
813813 AMDGPURsrcIntrinsic<1>;
814814 def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
815815 def int_amdgcn_buffer_store : AMDGPUBufferStore;
816
817 // New buffer intrinsics with separate raw and struct variants. The raw
818 // variant never has an index. The struct variant always has an index, even if
819 // it is const 0. A struct intrinsic with constant 0 index is different from the
820 // corresponding raw intrinsic on gfx9+ because the behavior of bounds checking
821 // and swizzling changes depending on whether idxen is set in the instruction.
822 // These new intrinsics also keep the offset and soffset arguments separate as
823 // they behave differently in bounds checking and swizzling.
824 class AMDGPURawBufferLoad : Intrinsic <
825 [llvm_anyfloat_ty],
826 [llvm_v4i32_ty, // rsrc(SGPR)
827 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
828 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
829 llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
830 [IntrReadMem], "", [SDNPMemOperand]>,
831 AMDGPURsrcIntrinsic<0>;
832 def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad;
833 def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
834
835 class AMDGPUStructBufferLoad : Intrinsic <
836 [llvm_anyfloat_ty],
837 [llvm_v4i32_ty, // rsrc(SGPR)
838 llvm_i32_ty, // vindex(VGPR)
839 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
840 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
841 llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
842 [IntrReadMem], "", [SDNPMemOperand]>,
843 AMDGPURsrcIntrinsic<0>;
844 def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
845 def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;
846
847 class AMDGPURawBufferStore : Intrinsic <
848 [],
849 [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
850 llvm_v4i32_ty, // rsrc(SGPR)
851 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
852 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
853 llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
854 [IntrWriteMem], "", [SDNPMemOperand]>,
855 AMDGPURsrcIntrinsic<1>;
856 def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore;
857 def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;
858
859 class AMDGPUStructBufferStore : Intrinsic <
860 [],
861 [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
862 llvm_v4i32_ty, // rsrc(SGPR)
863 llvm_i32_ty, // vindex(VGPR)
864 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
865 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
866 llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
867 [IntrWriteMem], "", [SDNPMemOperand]>,
868 AMDGPURsrcIntrinsic<1>;
869 def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
870 def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;
871
872 class AMDGPURawBufferAtomic : Intrinsic <
873 [llvm_i32_ty],
874 [llvm_i32_ty, // vdata(VGPR)
875 llvm_v4i32_ty, // rsrc(SGPR)
876 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
877 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
878 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc)
879 [], "", [SDNPMemOperand]>,
880 AMDGPURsrcIntrinsic<1, 0>;
881 def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
882 def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
883 def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
884 def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
885 def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
886 def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
887 def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
888 def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
889 def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
890 def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
891 def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
892 [llvm_i32_ty],
893 [llvm_i32_ty, // src(VGPR)
894 llvm_i32_ty, // cmp(VGPR)
895 llvm_v4i32_ty, // rsrc(SGPR)
896 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
897 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
898 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc)
899 [], "", [SDNPMemOperand]>,
900 AMDGPURsrcIntrinsic<2, 0>;
901
902 class AMDGPUStructBufferAtomic : Intrinsic <
903 [llvm_i32_ty],
904 [llvm_i32_ty, // vdata(VGPR)
905 llvm_v4i32_ty, // rsrc(SGPR)
906 llvm_i32_ty, // vindex(VGPR)
907 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
908 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
909 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc)
910 [], "", [SDNPMemOperand]>,
911 AMDGPURsrcIntrinsic<1, 0>;
912 def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
913 def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
914 def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
915 def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
916 def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
917 def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
918 def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
919 def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
920 def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
921 def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
922 def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
923 [llvm_i32_ty],
924 [llvm_i32_ty, // src(VGPR)
925 llvm_i32_ty, // cmp(VGPR)
926 llvm_v4i32_ty, // rsrc(SGPR)
927 llvm_i32_ty, // vindex(VGPR)
928 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
929 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
930 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc)
931 [], "", [SDNPMemOperand]>,
932 AMDGPURsrcIntrinsic<2, 0>;
816933
817934 // Obsolescent tbuffer intrinsics.
818935 def int_amdgcn_tbuffer_load : Intrinsic <
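A note on the combined cachepolicy immediate defined above: bit 0 is glc and bit 1 is slc for the loads and stores, and the atomic definitions document only bit 1 (slc). A small sketch of the three load settings, mirroring the buffer_load test later in this commit:

declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32)

define amdgpu_ps float @cachepolicy_bits(<4 x i32> inreg %rsrc) {
main_body:
  %glc  = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 1) ; glc
  %slc  = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 2) ; slc
  %both = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 3) ; glc+slc
  %t = fadd float %glc, %slc
  %r = fadd float %t, %both
  ret float %r
}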
144144 bool SelectMUBUFConstant(SDValue Constant,
145145 SDValue &SOffset,
146146 SDValue &ImmOffset) const;
147 bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset,
148 SDValue &ImmOffset) const;
149 bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset,
150 SDValue &ImmOffset, SDValue &VOffset) const;
151147
152148 bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr,
153149 SDValue &Offset, SDValue &SLC) const;
12921288 SDValue &SOffset,
12931289 SDValue &ImmOffset) const {
12941290 SDLoc DL(Constant);
1295 const uint32_t Align = 4;
1296 const uint32_t MaxImm = alignDown(4095, Align);
12971291 uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue();
1298 uint32_t Overflow = 0;
1299
1300 if (Imm > MaxImm) {
1301 if (Imm <= MaxImm + 64) {
1302 // Use an SOffset inline constant for 4..64
1303 Overflow = Imm - MaxImm;
1304 Imm = MaxImm;
1305 } else {
1306 // Try to keep the same value in SOffset for adjacent loads, so that
1307 // the corresponding register contents can be re-used.
1308 //
1309 // Load values with all low-bits (except for alignment bits) set into
1310 // SOffset, so that a larger range of values can be covered using
1311 // s_movk_i32.
1312 //
1313 // Atomic operations fail to work correctly when individual address
1314 // components are unaligned, even if their sum is aligned.
1315 uint32_t High = (Imm + Align) & ~4095;
1316 uint32_t Low = (Imm + Align) & 4095;
1317 Imm = Low;
1318 Overflow = High - Align;
1319 }
1320 }
1321
1322 // There is a hardware bug in SI and CI which prevents address clamping in
1323 // MUBUF instructions from working correctly with SOffsets. The immediate
1324 // offset is unaffected.
1325 if (Overflow > 0 &&
1326 Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
1292 uint32_t Overflow;
1293 if (!AMDGPU::splitMUBUFOffset(Imm, Overflow, Imm, Subtarget))
13271294 return false;
1328
13291295 ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16);
1330
13311296 if (Overflow <= 64)
13321297 SOffset = CurDAG->getTargetConstant(Overflow, DL, MVT::i32);
13331298 else
13341299 SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
13351300 CurDAG->getTargetConstant(Overflow, DL, MVT::i32)),
13361301 0);
1337
1338 return true;
1339 }
1340
1341 bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset,
1342 SDValue &SOffset,
1343 SDValue &ImmOffset) const {
1344 SDLoc DL(Offset);
1345
1346 if (!isa<ConstantSDNode>(Offset))
1347 return false;
1348
1349 return SelectMUBUFConstant(Offset, SOffset, ImmOffset);
1350 }
1351
1352 bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
1353 SDValue &SOffset,
1354 SDValue &ImmOffset,
1355 SDValue &VOffset) const {
1356 SDLoc DL(Offset);
1357
1358 // Don't generate an unnecessary voffset for constant offsets.
1359 if (isa<ConstantSDNode>(Offset)) {
1360 SDValue Tmp1, Tmp2;
1361
1362 // When necessary, use a voffset in <= CI anyway to work around a hardware
1363 // bug.
1364 if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS ||
1365 SelectMUBUFConstant(Offset, Tmp1, Tmp2))
1366 return false;
1367 }
1368
1369 if (CurDAG->isBaseWithConstantOffset(Offset)) {
1370 SDValue N0 = Offset.getOperand(0);
1371 SDValue N1 = Offset.getOperand(1);
1372 if (cast<ConstantSDNode>(N1)->getSExtValue() >= 0 &&
1373 SelectMUBUFConstant(N1, SOffset, ImmOffset)) {
1374 VOffset = N0;
1375 return true;
1376 }
1377 }
1378
1379 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1380 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1381 VOffset = Offset;
13821302
13831303 return true;
13841304 }
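With SelectMUBUFIntrinsicOffset and SelectMUBUFIntrinsicVOffset gone, offset splitting for the new intrinsics happens during lowering (see splitBufferOffsets/setBufferOffsets below) rather than in the DAG selector. A variable offset plus a small non-negative constant is still expected to fold into the immediate field, as in this sketch mirroring the buffer_load_ofs_imm test below:

declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32)

define amdgpu_ps <4 x float> @fold_const_into_imm(<4 x i32> inreg %rsrc, i32 %base) {
main_body:
  ; expected selection: buffer_load_dwordx4 ... offen offset:60, with %base in voffset
  %ofs = add i32 %base, 60
  %d = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
  ret <4 x float> %d
}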
10421042 multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
10431043                                   string opcode> {
10441044 def : GCNPat<
1045 (vt (name v4i32:$rsrc, 0,
1046 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
1047 imm:$glc, imm:$slc)),
1045 (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
1046 imm:$cachepolicy, 0)),
10481047 (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
1049 (as_i1imm $glc), (as_i1imm $slc), 0)
1048 (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
10501049 >;
10511050
10521051 def : GCNPat<
1053 (vt (name v4i32:$rsrc, i32:$vindex,
1054 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
1055 imm:$glc, imm:$slc)),
1052 (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
1053 imm:$cachepolicy, 0)),
1054 (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
1055 (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
1056 >;
1057
1058 def : GCNPat<
1059 (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
1060 imm:$cachepolicy, imm)),
10561061 (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
1057 (as_i1imm $glc), (as_i1imm $slc), 0)
1062 (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
10581063 >;
10591064
10601065 def : GCNPat<
1061 (vt (name v4i32:$rsrc, 0,
1062 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
1063 imm:$glc, imm:$slc)),
1064 (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
1065 (as_i1imm $glc), (as_i1imm $slc), 0)
1066 >;
1067
1068 def : GCNPat<
1069 (vt (name v4i32:$rsrc, i32:$vindex,
1070 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
1071 imm:$glc, imm:$slc)),
1066 (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
1067 imm:$cachepolicy, imm)),
10721068 (!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
10731069 (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
10741070 $rsrc, $soffset, (as_i16imm $offset),
1075 (as_i1imm $glc), (as_i1imm $slc), 0)
1071 (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
10761072 >;
10771073 }
10781074
10991095 multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
11001096                                    string opcode> {
11011097 def : GCNPat<
1102 (name vt:$vdata, v4i32:$rsrc, 0,
1103 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
1104 imm:$glc, imm:$slc),
1098 (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
1099 imm:$cachepolicy, 0),
11051100 (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
1106 (as_i1imm $glc), (as_i1imm $slc), 0)
1101 (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
11071102 >;
11081103
11091104 def : GCNPat<
1110 (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
1111 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
1112 imm:$glc, imm:$slc),
1105 (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
1106 imm:$cachepolicy, 0),
1107 (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
1108 (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
1109 >;
1110
1111 def : GCNPat<
1112 (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
1113 imm:$cachepolicy, imm),
11131114 (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
1114 (as_i16imm $offset), (as_i1imm $glc),
1115 (as_i1imm $slc), 0)
1115 (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
11161116 >;
11171117
11181118 def : GCNPat<
1119 (name vt:$vdata, v4i32:$rsrc, 0,
1120 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
1121 imm:$glc, imm:$slc),
1122 (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
1123 (as_i16imm $offset), (as_i1imm $glc),
1124 (as_i1imm $slc), 0)
1125 >;
1126
1127 def : GCNPat<
1128 (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
1129 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
1130 imm:$glc, imm:$slc),
1119 (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
1120 imm:$cachepolicy, imm),
11311121 (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
11321122 $vdata,
11331123 (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
11341124 $rsrc, $soffset, (as_i16imm $offset),
1135 (as_i1imm $glc), (as_i1imm $slc), 0)
1125 (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
11361126 >;
11371127 }
11381128
11631153 multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
11641154 def : GCNPat<
11651155 (name i32:$vdata_in, v4i32:$rsrc, 0,
1166 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
1167 imm:$slc),
1156 0, i32:$soffset, imm:$offset,
1157 imm:$cachepolicy, 0),
11681158 (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
1169 (as_i16imm $offset), (as_i1imm $slc))
1159 (as_i16imm $offset), (extract_slc $cachepolicy))
11701160 >;
11711161
11721162 def : GCNPat<
11731163 (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
1174 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
1175 imm:$slc),
1164 0, i32:$soffset, imm:$offset,
1165 imm:$cachepolicy, imm),
11761166 (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
1177 (as_i16imm $offset), (as_i1imm $slc))
1167 (as_i16imm $offset), (extract_slc $cachepolicy))
11781168 >;
11791169
11801170 def : GCNPat<
11811171 (name i32:$vdata_in, v4i32:$rsrc, 0,
1182 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
1183 imm:$slc),
1172 i32:$voffset, i32:$soffset, imm:$offset,
1173 imm:$cachepolicy, 0),
11841174 (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
1185 (as_i16imm $offset), (as_i1imm $slc))
1175 (as_i16imm $offset), (extract_slc $cachepolicy))
11861176 >;
11871177
11881178 def : GCNPat<
11891179 (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
1190 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
1191 imm:$slc),
1180 i32:$voffset, i32:$soffset, imm:$offset,
1181 imm:$cachepolicy, imm),
11921182 (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
11931183 $vdata_in,
11941184 (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
1195 $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc))
1185 $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy))
11961186 >;
11971187 }
11981188
12101200 def : GCNPat<
12111201 (SIbuffer_atomic_cmpswap
12121202 i32:$data, i32:$cmp, v4i32:$rsrc, 0,
1213 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
1214 imm:$slc),
1203 0, i32:$soffset, imm:$offset,
1204 imm:$cachepolicy, 0),
12151205 (EXTRACT_SUBREG
12161206 (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
12171207 (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
1218 $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
1208 $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
12191209 sub0)
12201210 >;
12211211
12221212 def : GCNPat<
12231213 (SIbuffer_atomic_cmpswap
12241214 i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
1225 (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
1226 imm:$slc),
1215 0, i32:$soffset, imm:$offset,
1216 imm:$cachepolicy, imm),
12271217 (EXTRACT_SUBREG
12281218 (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
12291219 (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
1230 $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
1220 $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
12311221 sub0)
12321222 >;
12331223
12341224 def : GCNPat<
12351225 (SIbuffer_atomic_cmpswap
12361226 i32:$data, i32:$cmp, v4i32:$rsrc, 0,
1237 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
1238 imm:$slc),
1227 i32:$voffset, i32:$soffset, imm:$offset,
1228 imm:$cachepolicy, 0),
12391229 (EXTRACT_SUBREG
12401230 (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
12411231 (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
1242 $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
1232 $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
12431233 sub0)
12441234 >;
12451235
12461236 def : GCNPat<
12471237 (SIbuffer_atomic_cmpswap
12481238 i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
1249 (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
1250 imm:$slc),
1239 i32:$voffset, i32:$soffset, imm:$offset,
1240 imm:$cachepolicy, imm),
12511241 (EXTRACT_SUBREG
12521242 (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
12531243 (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
12541244 (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
1255 $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
1245 $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
12561246 sub0)
12571247 >;
12581248
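Across these pattern multiclasses the addressing variant now falls out of the explicit operands: vindex and voffset both constant 0 select _OFFSET, a variable voffset alone selects _OFFEN, idxen with a zero voffset selects _IDXEN, and both variable select _BOTHEN. The raw forms are exercised by the tests below; a hypothetical struct sketch of the BOTHEN case:

declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32)

define amdgpu_ps float @struct_atomic_add_bothen(<4 x i32> inreg %rsrc, i32 %data, i32 %idx, i32 %vofs) {
main_body:
  ; variable vindex and voffset: expected to select BUFFER_ATOMIC_ADD_BOTHEN_RTN
  %r = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %idx, i32 %vofs, i32 0, i32 0)
  %f = bitcast i32 %r to float
  ret float %f
}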
51315131 }
51325132 case Intrinsic::amdgcn_buffer_load:
51335133 case Intrinsic::amdgcn_buffer_load_format: {
5134 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
5135 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5136 unsigned IdxEn = 1;
5137 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5138 IdxEn = Idx->getZExtValue() != 0;
51345139 SDValue Ops[] = {
51355140 Op.getOperand(0), // Chain
51365141 Op.getOperand(2), // rsrc
51375142 Op.getOperand(3), // vindex
5138 Op.getOperand(4), // offset
5139 Op.getOperand(5), // glc
5140 Op.getOperand(6) // slc
5143 SDValue(), // voffset -- will be set by setBufferOffsets
5144 SDValue(), // soffset -- will be set by setBufferOffsets
5145 SDValue(), // offset -- will be set by setBufferOffsets
5146 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5147 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
51415148 };
51425149
5150 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
51435151 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
51445152 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5153
5154 EVT VT = Op.getValueType();
5155 EVT IntVT = VT.changeTypeToInteger();
5156 auto *M = cast<MemSDNode>(Op);
5157 EVT LoadVT = Op.getValueType();
5158
5159 if (LoadVT.getScalarType() == MVT::f16)
5160 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5161 M, DAG, Ops);
5162 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5163 M->getMemOperand());
5164 }
5165 case Intrinsic::amdgcn_raw_buffer_load:
5166 case Intrinsic::amdgcn_raw_buffer_load_format: {
5167 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5168 SDValue Ops[] = {
5169 Op.getOperand(0), // Chain
5170 Op.getOperand(2), // rsrc
5171 DAG.getConstant(0, DL, MVT::i32), // vindex
5172 Offsets.first, // voffset
5173 Op.getOperand(4), // soffset
5174 Offsets.second, // offset
5175 Op.getOperand(5), // cachepolicy
5176 DAG.getConstant(0, DL, MVT::i1), // idxen
5177 };
5178
5179 unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
5180 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5181
5182 EVT VT = Op.getValueType();
5183 EVT IntVT = VT.changeTypeToInteger();
5184 auto *M = cast<MemSDNode>(Op);
5185 EVT LoadVT = Op.getValueType();
5186
5187 if (LoadVT.getScalarType() == MVT::f16)
5188 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5189 M, DAG, Ops);
5190 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5191 M->getMemOperand());
5192 }
5193 case Intrinsic::amdgcn_struct_buffer_load:
5194 case Intrinsic::amdgcn_struct_buffer_load_format: {
5195 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5196 SDValue Ops[] = {
5197 Op.getOperand(0), // Chain
5198 Op.getOperand(2), // rsrc
5199 Op.getOperand(3), // vindex
5200 Offsets.first, // voffset
5201 Op.getOperand(5), // soffset
5202 Offsets.second, // offset
5203 Op.getOperand(6), // cachepolicy
5204 DAG.getConstant(1, DL, MVT::i1), // idxen
5205 };
5206
5207 unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
5208 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5209
51455210 EVT VT = Op.getValueType();
51465211 EVT IntVT = VT.changeTypeToInteger();
51475212 auto *M = cast<MemSDNode>(Op);
52415306 case Intrinsic::amdgcn_buffer_atomic_and:
52425307 case Intrinsic::amdgcn_buffer_atomic_or:
52435308 case Intrinsic::amdgcn_buffer_atomic_xor: {
5309 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5310 unsigned IdxEn = 1;
5311 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5312 IdxEn = Idx->getZExtValue() != 0;
52445313 SDValue Ops[] = {
52455314 Op.getOperand(0), // Chain
52465315 Op.getOperand(2), // vdata
52475316 Op.getOperand(3), // rsrc
52485317 Op.getOperand(4), // vindex
5249 Op.getOperand(5), // offset
5250 Op.getOperand(6) // slc
5318 SDValue(), // voffset -- will be set by setBufferOffsets
5319 SDValue(), // soffset -- will be set by setBufferOffsets
5320 SDValue(), // offset -- will be set by setBufferOffsets
5321 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5322 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
52515323 };
5324 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
52525325 EVT VT = Op.getValueType();
52535326
52545327 auto *M = cast<MemSDNode>(Op);
52925365 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
52935366 M->getMemOperand());
52945367 }
5295
5368 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5369 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5370 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5371 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5372 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5373 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5374 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5375 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5376 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5377 case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
5378 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5379 SDValue Ops[] = {
5380 Op.getOperand(0), // Chain
5381 Op.getOperand(2), // vdata
5382 Op.getOperand(3), // rsrc
5383 DAG.getConstant(0, DL, MVT::i32), // vindex
5384 Offsets.first, // voffset
5385 Op.getOperand(5), // soffset
5386 Offsets.second, // offset
5387 Op.getOperand(6), // cachepolicy
5388 DAG.getConstant(0, DL, MVT::i1), // idxen
5389 };
5390 EVT VT = Op.getValueType();
5391
5392 auto *M = cast<MemSDNode>(Op);
5393 unsigned Opcode = 0;
5394
5395 switch (IntrID) {
5396 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5397 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5398 break;
5399 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5400 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5401 break;
5402 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5403 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5404 break;
5405 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5406 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5407 break;
5408 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5409 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5410 break;
5411 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5412 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5413 break;
5414 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5415 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5416 break;
5417 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5418 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5419 break;
5420 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5421 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5422 break;
5423 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5424 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5425 break;
5426 default:
5427 llvm_unreachable("unhandled atomic opcode");
5428 }
5429
5430 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5431 M->getMemOperand());
5432 }
5433 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5434 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5435 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5436 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5437 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5438 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5439 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5440 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5441 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5442 case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
5443 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5444 SDValue Ops[] = {
5445 Op.getOperand(0), // Chain
5446 Op.getOperand(2), // vdata
5447 Op.getOperand(3), // rsrc
5448 Op.getOperand(4), // vindex
5449 Offsets.first, // voffset
5450 Op.getOperand(6), // soffset
5451 Offsets.second, // offset
5452 Op.getOperand(7), // cachepolicy
5453 DAG.getConstant(1, DL, MVT::i1), // idxen
5454 };
5455 EVT VT = Op.getValueType();
5456
5457 auto *M = cast<MemSDNode>(Op);
5458 unsigned Opcode = 0;
5459
5460 switch (IntrID) {
5461 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5462 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5463 break;
5464 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5465 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5466 break;
5467 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5468 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5469 break;
5470 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5471 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5472 break;
5473 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5474 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5475 break;
5476 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5477 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5478 break;
5479 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5480 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5481 break;
5482 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5483 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5484 break;
5485 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5486 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5487 break;
5488 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5489 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5490 break;
5491 default:
5492 llvm_unreachable("unhandled atomic opcode");
5493 }
5494
5495 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5496 M->getMemOperand());
5497 }
52965498 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
5499 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5500 unsigned IdxEn = 1;
5501 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
5502 IdxEn = Idx->getZExtValue() != 0;
52975503 SDValue Ops[] = {
52985504 Op.getOperand(0), // Chain
52995505 Op.getOperand(2), // src
53005506 Op.getOperand(3), // cmp
53015507 Op.getOperand(4), // rsrc
53025508 Op.getOperand(5), // vindex
5303 Op.getOperand(6), // offset
5304 Op.getOperand(7) // slc
5509 SDValue(), // voffset -- will be set by setBufferOffsets
5510 SDValue(), // soffset -- will be set by setBufferOffsets
5511 SDValue(), // offset -- will be set by setBufferOffsets
5512 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5513 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5514 };
5515 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
5516 EVT VT = Op.getValueType();
5517 auto *M = cast<MemSDNode>(Op);
5518
5519 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5520 Op->getVTList(), Ops, VT, M->getMemOperand());
5521 }
5522 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
5523 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5524 SDValue Ops[] = {
5525 Op.getOperand(0), // Chain
5526 Op.getOperand(2), // src
5527 Op.getOperand(3), // cmp
5528 Op.getOperand(4), // rsrc
5529 DAG.getConstant(0, DL, MVT::i32), // vindex
5530 Offsets.first, // voffset
5531 Op.getOperand(6), // soffset
5532 Offsets.second, // offset
5533 Op.getOperand(7), // cachepolicy
5534 DAG.getConstant(0, DL, MVT::i1), // idxen
5535 };
5536 EVT VT = Op.getValueType();
5537 auto *M = cast<MemSDNode>(Op);
5538
5539 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5540 Op->getVTList(), Ops, VT, M->getMemOperand());
5541 }
5542 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
5543 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
5544 SDValue Ops[] = {
5545 Op.getOperand(0), // Chain
5546 Op.getOperand(2), // src
5547 Op.getOperand(3), // cmp
5548 Op.getOperand(4), // rsrc
5549 Op.getOperand(5), // vindex
5550 Offsets.first, // voffset
5551 Op.getOperand(7), // soffset
5552 Offsets.second, // offset
5553 Op.getOperand(8), // cachepolicy
5554 DAG.getConstant(1, DL, MVT::i1), // idxen
53055555 };
53065556 EVT VT = Op.getValueType();
53075557 auto *M = cast<MemSDNode>(Op);
55765826 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
55775827 if (IsD16)
55785828 VData = handleD16VData(VData, DAG);
5829 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5830 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5831 unsigned IdxEn = 1;
5832 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5833 IdxEn = Idx->getZExtValue() != 0;
55795834 SDValue Ops[] = {
55805835 Chain,
5581 VData, // vdata
5836 VData,
55825837 Op.getOperand(3), // rsrc
55835838 Op.getOperand(4), // vindex
5584 Op.getOperand(5), // offset
5585 Op.getOperand(6), // glc
5586 Op.getOperand(7) // slc
5839 SDValue(), // voffset -- will be set by setBufferOffsets
5840 SDValue(), // soffset -- will be set by setBufferOffsets
5841 SDValue(), // offset -- will be set by setBufferOffsets
5842 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5843 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
55875844 };
5845 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
55885846 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
55895847 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
55905848 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
55915849 MemSDNode *M = cast<MemSDNode>(Op);
55925850 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
55935851 M->getMemoryVT(), M->getMemOperand());
55945852 }
5853
5854 case Intrinsic::amdgcn_raw_buffer_store:
5855 case Intrinsic::amdgcn_raw_buffer_store_format: {
5856 SDValue VData = Op.getOperand(2);
5857 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5858 if (IsD16)
5859 VData = handleD16VData(VData, DAG);
5860 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5861 SDValue Ops[] = {
5862 Chain,
5863 VData,
5864 Op.getOperand(3), // rsrc
5865 DAG.getConstant(0, DL, MVT::i32), // vindex
5866 Offsets.first, // voffset
5867 Op.getOperand(5), // soffset
5868 Offsets.second, // offset
5869 Op.getOperand(6), // cachepolicy
5870 DAG.getConstant(0, DL, MVT::i1), // idxen
5871 };
5872 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
5873 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
5874 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
5875 MemSDNode *M = cast(Op);
5876 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5877 M->getMemoryVT(), M->getMemOperand());
5878 }
5879
5880 case Intrinsic::amdgcn_struct_buffer_store:
5881 case Intrinsic::amdgcn_struct_buffer_store_format: {
5882 SDValue VData = Op.getOperand(2);
5883 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5884 if (IsD16)
5885 VData = handleD16VData(VData, DAG);
5886 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5887 SDValue Ops[] = {
5888 Chain,
5889 VData,
5890 Op.getOperand(3), // rsrc
5891 Op.getOperand(4), // vindex
5892 Offsets.first, // voffset
5893 Op.getOperand(6), // soffset
5894 Offsets.second, // offset
5895 Op.getOperand(7), // cachepolicy
5896 DAG.getConstant(1, DL, MVT::i1), // idxen
5897 };
5898 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
5899 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
5900 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
5901 MemSDNode *M = cast(Op);
5902 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5903 M->getMemoryVT(), M->getMemOperand());
5904 }
5905
55955906 default: {
55965907 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
55975908 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
56025913 }
56035914 }
56045915
5605 // The raw.tbuffer and struct.tbuffer intrinsics have two offset args: offset
5606 // (the offset that is included in bounds checking and swizzling, to be split
5607 // between the instruction's voffset and immoffset fields) and soffset (the
5608 // offset that is excluded from bounds checking and swizzling, to go in the
5609 // instruction's soffset field). This function takes the first kind of offset
5610 // and figures out how to split it between voffset and immoffset.
5916 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5917 // offset (the offset that is included in bounds checking and swizzling, to be
5918 // split between the instruction's voffset and immoffset fields) and soffset
5919 // (the offset that is excluded from bounds checking and swizzling, to go in
5920 // the instruction's soffset field). This function takes the first kind of
5921 // offset and figures out how to split it between voffset and immoffset.
56115922 std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
56125923 SDValue Offset, SelectionDAG &DAG) const {
56135924 SDLoc DL(Offset);
56445955 if (!C1)
56455956 C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
56465957 return {N0, SDValue(C1, 0)};
5958 }
5959
5960 // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
5961 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
5962 // pointed to by Offsets.
5963 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
5964 SelectionDAG &DAG,
5965 SDValue *Offsets) const {
5966 SDLoc DL(CombinedOffset);
5967 if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
5968 uint32_t Imm = C->getZExtValue();
5969 uint32_t SOffset, ImmOffset;
5970 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) {
5971 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
5972 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
5973 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
5974 return;
5975 }
5976 }
5977 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
5978 SDValue N0 = CombinedOffset.getOperand(0);
5979 SDValue N1 = CombinedOffset.getOperand(1);
5980 uint32_t SOffset, ImmOffset;
5981 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
5982 if (Offset >= 0
5983 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) {
5984 Offsets[0] = N0;
5985 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
5986 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
5987 return;
5988 }
5989 }
5990 Offsets[0] = CombinedOffset;
5991 Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
5992 Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
56475993 }
56485994
56495995 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
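setBufferOffsets above keeps the legacy combined-offset intrinsics working on top of the same splitMUBUFOffset helper: a splittable constant becomes soffset plus instoffset with a zero voffset, a base plus a non-negative constant keeps the base in voffset, and anything else goes wholly into voffset. Following that arithmetic, a combined constant offset of 5000 should split into soffset 4092 plus immediate 908; a sketch with the obsolescent intrinsic (exact register choices are illustrative):

declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1)

define amdgpu_ps <4 x float> @legacy_large_offset(<4 x i32> inreg %rsrc) {
main_body:
  ; 5000 = 4092 + 908; expected on VI+: soffset register holding 4092, offset:908
  %d = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 0, i32 5000, i1 0, i1 0)
  ret <4 x float> %d
}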
185185 /// global value \p GV, false otherwise.
186186 bool shouldEmitPCReloc(const GlobalValue *GV) const;
187187
188 // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
189 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
190 // pointed to by Offsets.
191 void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
192 SDValue *Offsets) const;
193
188194 public:
189195 SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
190196
107107 SDTtbuffer_store,
108108 [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
109109
110 def SDTBufferLoad : SDTypeProfile<1, 5,
110 def SDTBufferLoad : SDTypeProfile<1, 7,
111111 [ // vdata
112112 SDTCisVT<1, v4i32>, // rsrc
113 SDTCisVT<2, i32>, // vindex
114 SDTCisVT<3, i32>, // offset
115 SDTCisVT<4, i1>, // glc
116 SDTCisVT<5, i1>]>; // slc
113 SDTCisVT<2, i32>, // vindex(VGPR)
114 SDTCisVT<3, i32>, // voffset(VGPR)
115 SDTCisVT<4, i32>, // soffset(SGPR)
116 SDTCisVT<5, i32>, // offset(imm)
117 SDTCisVT<6, i32>, // cachepolicy(imm)
118 SDTCisVT<7, i1>]>; // idxen(imm)
117119
118120 def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
119121 [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
123125 SDTBufferLoad,
124126 [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
125127
126 def SDTBufferStore : SDTypeProfile<0, 6,
128 def SDTBufferStore : SDTypeProfile<0, 8,
127129 [ // vdata
128130 SDTCisVT<1, v4i32>, // rsrc
129 SDTCisVT<2, i32>, // vindex
130 SDTCisVT<3, i32>, // offset
131 SDTCisVT<4, i1>, // glc
132 SDTCisVT<5, i1>]>; // slc
131 SDTCisVT<2, i32>, // vindex(VGPR)
132 SDTCisVT<3, i32>, // voffset(VGPR)
133 SDTCisVT<4, i32>, // soffset(SGPR)
134 SDTCisVT<5, i32>, // offset(imm)
135 SDTCisVT<6, i32>, // cachepolicy(imm)
136 SDTCisVT<7, i1>]>; // idxen(imm)
133137
134138 def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
135139 [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
141145 [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
142146
143147 class SDBufferAtomic<string opcode> : SDNode <opcode,
144 SDTypeProfile<1, 5,
148 SDTypeProfile<1, 8,
145149 [SDTCisVT<0, i32>, // dst
146150 SDTCisVT<1, i32>, // vdata
147151 SDTCisVT<2, v4i32>, // rsrc
148 SDTCisVT<3, i32>, // vindex
149 SDTCisVT<4, i32>, // offset
150 SDTCisVT<5, i1>]>, // slc
152 SDTCisVT<3, i32>, // vindex(VGPR)
153 SDTCisVT<4, i32>, // voffset(VGPR)
154 SDTCisVT<5, i32>, // soffset(SGPR)
155 SDTCisVT<6, i32>, // offset(imm)
156 SDTCisVT<7, i32>, // cachepolicy(imm)
157 SDTCisVT<8, i1>]>, // idxen(imm)
151158 [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
152159 >;
153160
163170 def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
164171
165172 def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
166 SDTypeProfile<1, 6,
173 SDTypeProfile<1, 9,
167174 [SDTCisVT<0, i32>, // dst
168175 SDTCisVT<1, i32>, // src
169176 SDTCisVT<2, i32>, // cmp
170177 SDTCisVT<3, v4i32>, // rsrc
171 SDTCisVT<4, i32>, // vindex
172 SDTCisVT<5, i32>, // offset
173 SDTCisVT<6, i1>]>, // slc
178 SDTCisVT<4, i32>, // vindex(VGPR)
179 SDTCisVT<5, i32>, // voffset(VGPR)
180 SDTCisVT<6, i32>, // soffset(SGPR)
181 SDTCisVT<7, i32>, // offset(imm)
182 SDTCisVT<8, i32>, // cachepolicy(imm)
183 SDTCisVT<9, i1>]>, // idxen(imm)
174184 [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
175185 >;
176186
934934 isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset);
935935 }
936936
937 // Given Imm, split it into the values to put into the SOffset and ImmOffset
938 // fields in an MUBUF instruction. Return false if it is not possible (due to a
939 // hardware bug needing a workaround).
940 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
941 const GCNSubtarget *Subtarget) {
942 const uint32_t Align = 4;
943 const uint32_t MaxImm = alignDown(4095, Align);
944 uint32_t Overflow = 0;
945
946 if (Imm > MaxImm) {
947 if (Imm <= MaxImm + 64) {
948 // Use an SOffset inline constant for 4..64
949 Overflow = Imm - MaxImm;
950 Imm = MaxImm;
951 } else {
952 // Try to keep the same value in SOffset for adjacent loads, so that
953 // the corresponding register contents can be re-used.
954 //
955 // Load values with all low-bits (except for alignment bits) set into
956 // SOffset, so that a larger range of values can be covered using
957 // s_movk_i32.
958 //
959 // Atomic operations fail to work correctly when individual address
960 // components are unaligned, even if their sum is aligned.
961 uint32_t High = (Imm + Align) & ~4095;
962 uint32_t Low = (Imm + Align) & 4095;
963 Imm = Low;
964 Overflow = High - Align;
965 }
966 }
967
968 // There is a hardware bug in SI and CI which prevents address clamping in
969 // MUBUF instructions from working correctly with SOffsets. The immediate
970 // offset is unaffected.
971 if (Overflow > 0 &&
972 Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
973 return false;
974
975 ImmOffset = Imm;
976 SOffset = Overflow;
977 return true;
978 }
979
937980 } // end namespace AMDGPU
938981
939982 } // end namespace llvm
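Two worked cases of splitMUBUFOffset (Align = 4, so MaxImm = 4092), written as test-style comments:

; Imm = 4100: 4092 < 4100 <= 4092 + 64, so use an SOffset inline constant:
;   SOffset = 4100 - 4092 = 8, ImmOffset = 4092        (8 + 4092 = 4100)
; Imm = 5000: beyond the inline-constant range, so
;   High = (5000 + 4) & ~4095 = 4096, Low = (5000 + 4) & 4095 = 908,
;   ImmOffset = 908, SOffset = 4096 - 4 = 4092         (908 + 4092 = 5000)
; On SI/CI any split that needs a nonzero SOffset trips the address-clamping
; bug, so the function returns false and the caller falls back to a voffset.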
2525 namespace llvm {
2626
2727 class Argument;
28 class AMDGPUSubtarget;
2829 class FeatureBitset;
2930 class Function;
31 class GCNSubtarget;
3032 class GlobalValue;
3133 class MCContext;
3234 class MCRegisterClass;
446448 /// not the encoded offset.
447449 bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
448450
451 // Given Imm, split it into the values to put into the SOffset and ImmOffset
452 // fields in an MUBUF instruction. Return false if it is not possible (due to a
453 // hardware bug needing a workaround).
454 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
455 const GCNSubtarget *Subtarget);
456
449457 /// \returns true if the intrinsic is divergent
450458 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
451459
0 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
1 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
2
3 ;CHECK-LABEL: {{^}}test1:
4 ;CHECK-NOT: s_waitcnt
5 ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
6 ;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
7 ;CHECK: s_waitcnt vmcnt(0)
8 ;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc
9 ;CHECK: s_waitcnt vmcnt(0)
10 ;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 offen offset:42 glc
11 ;CHECK-DAG: s_waitcnt vmcnt(0)
12 ;CHECK: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:4 glc
13 ;CHECK: s_waitcnt vmcnt(0)
14 ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}}
15 define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %voffset) {
16 main_body:
17 %o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
18 %o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o1, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
19 %off5 = add i32 %voffset, 42
20 %o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o3, <4 x i32> %rsrc, i32 %off5, i32 0, i32 0)
21 %o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o5, <4 x i32> %rsrc, i32 4, i32 8188, i32 0)
22 %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
23 %out = bitcast i32 %o6 to float
24 ret float %out
25 }
26
27 ;CHECK-LABEL: {{^}}test2:
28 ;CHECK-NOT: s_waitcnt
29 ;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 offen glc{{$}}
30 ;CHECK: s_waitcnt vmcnt(0)
31 ;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 offen glc slc
32 ;CHECK: s_waitcnt vmcnt(0)
33 ;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 offen glc{{$}}
34 ;CHECK: s_waitcnt vmcnt(0)
35 ;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 offen glc slc
36 ;CHECK: s_waitcnt vmcnt(0)
37 ;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 offen glc{{$}}
38 ;CHECK: s_waitcnt vmcnt(0)
39 ;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 offen glc slc
40 ;CHECK: s_waitcnt vmcnt(0)
41 ;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 offen glc{{$}}
42 ;CHECK: s_waitcnt vmcnt(0)
43 ;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 offen glc slc
44 ;CHECK: s_waitcnt vmcnt(0)
45 ;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 offen glc
46 define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %voffset) {
47 main_body:
48 %t1 = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
49 %t2 = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
50 %t3 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin(i32 %t2, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
51 %t4 = call i32 @llvm.amdgcn.raw.buffer.atomic.umin(i32 %t3, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
52 %t5 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax(i32 %t4, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
53 %t6 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax(i32 %t5, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
54 %t7 = call i32 @llvm.amdgcn.raw.buffer.atomic.and(i32 %t6, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
55 %t8 = call i32 @llvm.amdgcn.raw.buffer.atomic.or(i32 %t7, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
56 %t9 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor(i32 %t8, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
57 %out = bitcast i32 %t9 to float
58 ret float %out
59 }
60
61 ; Ideally, we would teach tablegen & friends that cmpswap only modifies the
62 ; first vgpr. Since we don't do that yet, the register allocator will have to
63 ; create copies which we don't bother to track here.
64 ;
65 ;CHECK-LABEL: {{^}}test3:
66 ;CHECK-NOT: s_waitcnt
67 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
68 ;CHECK: s_waitcnt vmcnt(0)
69 ;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
70 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 offen glc
71 ;CHECK: s_waitcnt vmcnt(0)
72 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 offen offset:44 glc
73 ;CHECK-DAG: s_waitcnt vmcnt(0)
74 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc
75 define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
76 main_body:
77 %o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
78 %o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
79 %ofs.5 = add i32 %voffset, 44
80 %o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %ofs.5, i32 0, i32 0)
81 %o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 4, i32 8188, i32 0)
82
83 ; Detecting the no-return variant doesn't work right now because of how the
84 ; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG.
85 ; Since there probably isn't a reasonable use-case of cmpswap that discards
86 ; the return value, that seems okay.
87 ;
88 ; %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
89 %out = bitcast i32 %o6 to float
90 ret float %out
91 }
92
93 ;CHECK-LABEL: {{^}}test4:
94 ;CHECK: buffer_atomic_add v0,
95 define amdgpu_ps float @test4() {
96 main_body:
97 %v = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> undef, i32 4, i32 0, i32 0)
98 %v.float = bitcast i32 %v to float
99 ret float %v.float
100 }
101
102 declare i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i32) #0
103 declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32) #0
104 declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32) #0
105 declare i32 @llvm.amdgcn.raw.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i32) #0
106 declare i32 @llvm.amdgcn.raw.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i32) #0
107 declare i32 @llvm.amdgcn.raw.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i32) #0
108 declare i32 @llvm.amdgcn.raw.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i32) #0
109 declare i32 @llvm.amdgcn.raw.buffer.atomic.and(i32, <4 x i32>, i32, i32, i32) #0
110 declare i32 @llvm.amdgcn.raw.buffer.atomic.or(i32, <4 x i32>, i32, i32, i32) #0
111 declare i32 @llvm.amdgcn.raw.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i32) #0
112 declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i32) #0
113
114 attributes #0 = { nounwind }
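The tests in this file cover only the raw forms. For completeness, a hypothetical struct sketch showing that a constant 0 index still keeps idxen set, which is the gfx9 bounds-checking distinction motivating the separate variants:

declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i32, i32)

define amdgpu_ps float @struct_cmpswap_idx0(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp) {
main_body:
  ; the index is constant 0, but the struct form still selects an idxen variant
  %r = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
  %f = bitcast i32 %r to float
  ret float %f
}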
0 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
1 ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
2 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
3
4 ; GCN-LABEL: {{^}}buffer_load_format_d16_x:
5 ; GCN: buffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0
6 define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) {
7 main_body:
8 %data = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
9 ret half %data
10 }
11
12 ; GCN-LABEL: {{^}}buffer_load_format_d16_xy:
13 ; UNPACKED: buffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0
14 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
15
16 ; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0
17 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]]
18 define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) {
19 main_body:
20 %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
21 %elt = extractelement <2 x half> %data, i32 1
22 ret half %elt
23 }
24
25 ; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw:
26 ; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0
27 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
28
29 ; PACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0
30 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
31 define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) {
32 main_body:
33 %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
34 %elt = extractelement <4 x half> %data, i32 3
35 ret half %elt
36 }
37
38 declare half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32>, i32, i32, i32)
39 declare <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32)
40 declare <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32)
0 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
1 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
2
3 ;CHECK-LABEL: {{^}}buffer_load:
4 ;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0
5 ;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc
6 ;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc
7 ;CHECK: s_waitcnt
8 define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
9 main_body:
10 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0)
11 %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 1)
12 %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 2)
13 %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
14 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
15 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
16 ret {<4 x float>, <4 x float>, <4 x float>} %r2
17 }
18
19 ;CHECK-LABEL: {{^}}buffer_load_immoffs:
20 ;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
21 ;CHECK: s_waitcnt
22 define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
23 main_body:
24 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 42, i32 0, i32 0)
25 ret <4 x float> %data
26 }
27
28 ;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
29 ;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092
30 ;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc
31 ;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4092
32 ;CHECK-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc
33 ;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4
34 ;CHECK: s_waitcnt
35 define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
36 main_body:
37 %d.0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 4092, i32 60, i32 0)
38 %d.1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 4092, i32 32764, i32 0)
39 %d.2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 4, i32 36860, i32 0)
40 %d.3 = fadd <4 x float> %d.0, %d.1
41 %data = fadd <4 x float> %d.2, %d.3
42 ret <4 x float> %data
43 }
44
45 ;CHECK-LABEL: {{^}}buffer_load_ofs:
46 ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen
47 ;CHECK: s_waitcnt
48 define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
49 main_body:
50 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0)
51 ret <4 x float> %data
52 }
53
54 ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
55 ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60
56 ;CHECK: s_waitcnt
57 define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
58 main_body:
59 %ofs = add i32 %1, 60
60 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 %ofs, i32 0, i32 0)
61 ret <4 x float> %data
62 }
63
64 ;CHECK-LABEL: {{^}}buffer_load_x:
65 ;CHECK: buffer_load_format_x v0, off, s[0:3], 0
66 ;CHECK: s_waitcnt
67 define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) {
68 main_body:
69 %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
70 ret float %data
71 }
72
73 ;CHECK-LABEL: {{^}}buffer_load_xy:
74 ;CHECK: buffer_load_format_xy v[0:1], off, s[0:3], 0
75 ;CHECK: s_waitcnt
76 define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
77 main_body:
78 %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
79 ret <2 x float> %data
80 }
81
82 declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32) #0
83 declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32) #0
84 declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0
85
86 attributes #0 = { nounwind readonly }
0 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
1 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
2
3 ;CHECK-LABEL: {{^}}buffer_load:
4 ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
5 ;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
6 ;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
7 ;CHECK: s_waitcnt
8 define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
9 main_body:
10 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0)
11 %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 1)
12 %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 2)
13 %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
14 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
15 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
16 ret {<4 x float>, <4 x float>, <4 x float>} %r2
17 }
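As the three calls above and their CHECK lines show, the final i32 operand is a cache-policy bitfield: 0 gives a plain load, 1 adds glc, and 2 adds slc. The bits combine, as the glc/slc merge test later in this file confirms; a minimal illustrative sketch requesting both on one load:

define amdgpu_ps <4 x float> @load_glc_slc_sketch(<4 x i32> inreg %rsrc) {
main_body:
  ; cachepolicy = 3 sets bit 0 (glc) and bit 1 (slc), so the selected
  ; buffer_load_dwordx4 carries both modifiers
  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 0, i32 0, i32 3)
  ret <4 x float> %data
}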
18
19 ;CHECK-LABEL: {{^}}buffer_load_immoffs:
20 ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
21 ;CHECK: s_waitcnt
22 define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
23 main_body:
24 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 40, i32 0, i32 0)
25 ret <4 x float> %data
26 }
27
28 ;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
29 ;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc
30 ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4
31 ;CHECK: s_waitcnt
32 define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
33 main_body:
34 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4, i32 8188, i32 0)
35 ret <4 x float> %data
36 }
37
38 ;CHECK-LABEL: {{^}}buffer_load_ofs:
39 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
40 ;CHECK: s_waitcnt
41 define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
42 main_body:
43 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0)
44 ret <4 x float> %data
45 }
46
47 ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
48 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60
49 ;CHECK: s_waitcnt
50 define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
51 main_body:
52 %ofs = add i32 %1, 60
53 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs, i32 0, i32 0)
54 ret <4 x float> %data
55 }
56
57 ;CHECK-LABEL: {{^}}buffer_load_x1:
58 ;CHECK: buffer_load_dword v0, v0, s[0:3], 0 offen
59 ;CHECK: s_waitcnt
60 define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %ofs) {
61 main_body:
62 %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
63 ret float %data
64 }
65
66 ;CHECK-LABEL: {{^}}buffer_load_x2:
67 ;CHECK: buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 offen
68 ;CHECK: s_waitcnt
69 define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %ofs) {
70 main_body:
71 %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
72 ret <2 x float> %data
73 }
74
75 ;CHECK-LABEL: {{^}}buffer_load_negative_offset:
76 ;CHECK: v_add_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], vcc, 0xfffff000, v0
77 ;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen offset:4080
78 define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
79 main_body:
80 %ofs.1 = add i32 %ofs, -16
81 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs.1, i32 0, i32 0)
82 ret <4 x float> %data
83 }
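The arithmetic behind these checks: the immediate offset field is unsigned, so a relative offset of -16 cannot be encoded directly. It is instead split as -16 = -4096 + 4080; the v_add folds 0xfffff000 (-4096) into the VGPR offset, and the remaining 4080 fits in the offset: field.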
84
85 ; SI won't merge ds memory operations because of the signed offset bug, so
86 ; we only have check lines for VI.
87 ; CHECK-LABEL: buffer_load_mmo:
88 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
89 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
90 define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) {
91 entry:
92 store float 0.0, float addrspace(3)* %lds
93 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
94 %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
95 store float 0.0, float addrspace(3)* %tmp2
96 ret float %val
97 }
98
99 ;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged:
100 ;CHECK-NEXT: %bb.
101 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
102 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
103 ;CHECK: s_waitcnt
104 define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
105 main_body:
106 %a1 = add i32 %a, 4
107 %a2 = add i32 %a, 8
108 %a3 = add i32 %a, 12
109 %a4 = add i32 %a, 16
110 %a5 = add i32 %a, 28
111 %a6 = add i32 %a, 32
112 %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
113 %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
114 %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a3, i32 0, i32 0)
115 %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a4, i32 0, i32 0)
116 %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a5, i32 0, i32 0)
117 %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a6, i32 0, i32 0)
118 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
119 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
120 ret void
121 }
122
123 ;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_glc_slc:
124 ;CHECK-NEXT: %bb.
125 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
126 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
127 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
128 ;CHECK: s_waitcnt
129 define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) {
130 main_body:
131 %a1 = add i32 %a, 4
132 %a2 = add i32 %a, 8
133 %a3 = add i32 %a, 12
134 %a4 = add i32 %a, 16
135 %a5 = add i32 %a, 28
136 %a6 = add i32 %a, 32
137 %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
138 %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
139 %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a3, i32 0, i32 1)
140 %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a4, i32 0, i32 1)
141 %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a5, i32 0, i32 3)
142 %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a6, i32 0, i32 3)
143 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
144 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
145 ret void
146 }
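Comparing the two merge tests: in the first, the four dword loads at offsets 4, 8, 12 and 16 cover contiguous bytes and collapse into a single dwordx4 at offset 4, with the pair at 28 and 32 becoming a dwordx2. In the glc/slc variant the same six loads only form three dwordx2 groups, since loads are merged solely with neighbours that share the same cachepolicy value.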
147
148 ;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged:
149 ;CHECK-NEXT: %bb.
150 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
151 ;CHECK: s_waitcnt
152 define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
153 main_body:
154 %a1 = add i32 %a, 4
155 %a2 = add i32 %a, 12
156 %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
157 %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
158 %r1 = extractelement <2 x float> %vr1, i32 0
159 %r2 = extractelement <2 x float> %vr1, i32 1
160 %r3 = extractelement <2 x float> %vr2, i32 0
161 %r4 = extractelement <2 x float> %vr2, i32 1
162 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
163 ret void
164 }
165
166 ;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged:
167 ;CHECK-NEXT: %bb.
168 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
169 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
170 ;CHECK: s_waitcnt
171 define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
172 main_body:
173 %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0)
174 %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0)
175 %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0)
176 %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 0)
177 %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 0)
178 %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 0)
179 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
180 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
181 ret void
182 }
183
184 ;CHECK-LABEL: {{^}}buffer_load_x2_offset_merged:
185 ;CHECK-NEXT: %bb.
186 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
187 ;CHECK: s_waitcnt
188 define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) {
189 main_body:
190 %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0)
191 %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0)
192 %r1 = extractelement <2 x float> %vr1, i32 0
193 %r2 = extractelement <2 x float> %vr1, i32 1
194 %r3 = extractelement <2 x float> %vr2, i32 0
195 %r4 = extractelement <2 x float> %vr2, i32 1
196 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
197 ret void
198 }
199
200 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0
201 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0
202 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
203 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
204
205 attributes #0 = { nounwind readonly }
0 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s
1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX81 %s
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX9 %s
3
4 ; GCN-LABEL: {{^}}buffer_store_format_d16_x:
5 ; GCN: s_load_dword s[[LO:[0-9]+]]
6 ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]]
7 ; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
8 define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %voffset) {
9 main_body:
10 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
11 ret void
12 }
13
14 ; GCN-LABEL: {{^}}buffer_store_format_d16_xy:
15
16 ; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
17 ; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16
18 ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}}
19 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
20 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
21 ; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
22
23 ; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
24 define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %voffset) {
25 main_body:
26 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
27 ret void
28 }
29
30 ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
31 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
32
33 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
34 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
35 ; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
36 ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
37 ; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
38
39 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
40 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
41
42 ; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
43
44 ; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
45 ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
46
47 ; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
48 define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %voffset) {
49 main_body:
50 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
51 ret void
52 }
53
54 declare void @llvm.amdgcn.raw.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32)
55 declare void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32)
56 declare void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32)
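In these d16 store tests the half data arrives in scalar registers, so on the unpacked subtarget (tonga) the backend splits each 32-bit scalar into two VGPRs, computing hi = data >> 16 and lo = data & 0xffff, before issuing the store. The packed subtargets (gfx810, gfx900) keep two halves per register and can move the scalars into VGPRs unchanged.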
0 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
1 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
2
3 ;CHECK-LABEL: {{^}}buffer_store:
4 ;CHECK-NOT: s_waitcnt
5 ;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0
6 ;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc
7 ;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc
8 define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
9 main_body:
10 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
11 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1)
12 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 2)
13 ret void
14 }
15
16 ;CHECK-LABEL: {{^}}buffer_store_immoffs:
17 ;CHECK-NOT: s_waitcnt
18 ;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42
19 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
20 main_body:
21 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0)
22 ret void
23 }
24
25 ;CHECK-LABEL: {{^}}buffer_store_ofs:
26 ;CHECK-NOT: s_waitcnt
27 ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen
28 define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
29 main_body:
30 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
31 ret void
32 }
33
34 ; Ideally, the register allocator would avoid the wait here.
35 ;
36 ;CHECK-LABEL: {{^}}buffer_store_wait:
37 ;CHECK-NOT: s_waitcnt
38 ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen
39 ;VERDE: s_waitcnt expcnt(0)
40 ;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 offen
41 ;CHECK: s_waitcnt vmcnt(0)
42 ;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 offen
43 define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
44 main_body:
45 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
46 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0)
47 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0)
48 ret void
49 }
50
51 ;CHECK-LABEL: {{^}}buffer_store_x1:
52 ;CHECK-NOT: s_waitcnt
53 ;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 offen
54 define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) {
55 main_body:
56 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
57 ret void
58 }
59
60 ;CHECK-LABEL: {{^}}buffer_store_x2:
61 ;CHECK-NOT: s_waitcnt
62 ;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 offen
63 define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %offset) {
64 main_body:
65 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
66 ret void
67 }
68
69 declare void @llvm.amdgcn.raw.buffer.store.format.f32(float, <4 x i32>, i32, i32, i32) #0
70 declare void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
71 declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0
72 declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #1
73
74 attributes #0 = { nounwind }
75 attributes #1 = { nounwind readonly }
0 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
1 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
2
3 ;CHECK-LABEL: {{^}}buffer_store:
4 ;CHECK-NOT: s_waitcnt
5 ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
6 ;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
7 ;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
8 define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
9 main_body:
10 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
11 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1)
12 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 2)
13 ret void
14 }
15
16 ;CHECK-LABEL: {{^}}buffer_store_immoffs:
17 ;CHECK-NOT: s_waitcnt
18 ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
19 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
20 main_body:
21 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0)
22 ret void
23 }
24
25 ;CHECK-LABEL: {{^}}buffer_store_ofs:
26 ;CHECK-NOT: s_waitcnt
27 ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
28 define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
29 main_body:
30 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
31 ret void
32 }
33
34 ; Ideally, the register allocator would avoid the wait here.
35 ;
36 ;CHECK-LABEL: {{^}}buffer_store_wait:
37 ;CHECK-NOT: s_waitcnt
38 ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
39 ;VERDE: s_waitcnt expcnt(0)
40 ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen
41 ;CHECK: s_waitcnt vmcnt(0)
42 ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen
43 define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
44 main_body:
45 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
46 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0)
47 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0)
48 ret void
49 }
50
51 ;CHECK-LABEL: {{^}}buffer_store_x1:
52 ;CHECK-NOT: s_waitcnt
53 ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
54 define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) {
55 main_body:
56 call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
57 ret void
58 }
59
60 ;CHECK-LABEL: {{^}}buffer_store_x2:
61 ;CHECK-NOT: s_waitcnt
62 ;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
63 define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %offset) #0 {
64 main_body:
65 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
66 ret void
67 }
68
69 ;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged:
70 ;CHECK-NOT: s_waitcnt
71 ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
72 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
73 define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
74 %a1 = add i32 %a, 4
75 %a2 = add i32 %a, 8
76 %a3 = add i32 %a, 12
77 %a4 = add i32 %a, 16
78 %a5 = add i32 %a, 28
79 %a6 = add i32 %a, 32
80 call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
81 call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
82 call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 0)
83 call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 0)
84 call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 0)
85 call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 0)
86 ret void
87 }
88
89 ;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc:
90 ;CHECK-NOT: s_waitcnt
91 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
92 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
93 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
94 define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
95 %a1 = add i32 %a, 4
96 %a2 = add i32 %a, 8
97 %a3 = add i32 %a, 12
98 %a4 = add i32 %a, 16
99 %a5 = add i32 %a, 28
100 %a6 = add i32 %a, 32
101 call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
102 call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
103 call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 1)
104 call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 1)
105 call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 3)
106 call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 3)
107 ret void
108 }
109
110 ;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged:
111 ;CHECK-NOT: s_waitcnt
112 ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
113 define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) {
114 %a1 = add i32 %a, 4
115 %a2 = add i32 %a, 12
116 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
117 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
118 ret void
119 }
120
121 ;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged:
122 ;CHECK-NOT: s_waitcnt
123 ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
124 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
125 define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
126 call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
127 call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
128 call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
129 call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 0)
130 call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 0)
131 call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 0)
132 ret void
133 }
134
135 ;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged:
136 ;CHECK-NOT: s_waitcnt
137 ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
138 define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) {
139 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
140 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
141 ret void
142 }
143
144 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0
145 declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
146 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0
147 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1
148
149 attributes #0 = { nounwind }
150 attributes #1 = { nounwind readonly }
0 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
1 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
2
3 ;CHECK-LABEL: {{^}}test1:
4 ;CHECK-NOT: s_waitcnt
5 ;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen glc
6 ;CHECK: s_waitcnt vmcnt(0)
7 ;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen glc
8 ;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
9 ;CHECK: s_waitcnt vmcnt(0)
10 ;CHECK: buffer_atomic_swap v0, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc
11 ;CHECK: s_waitcnt vmcnt(0)
12 ;CHECK: buffer_atomic_swap v0, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc
13 ;CHECK: s_waitcnt vmcnt(0)
14 ;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen offset:42 glc
15 ;CHECK-DAG: s_waitcnt vmcnt(0)
16 ;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], [[SOFS]] idxen offset:4 glc
17 ;CHECK: s_waitcnt vmcnt(0)
18 ;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen{{$}}
19 define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
20 main_body:
21 %o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
22 %o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
23 %o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i32 0, i32 0)
24 %o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0)
25 %ofs.5 = add i32 %voffset, 42
26 %o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i32 0, i32 0)
27 %o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o5, <4 x i32> %rsrc, i32 0, i32 4, i32 8188, i32 0)
28 %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
29 %out = bitcast i32 %o6 to float
30 ret float %out
31 }
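Note how the return and no-return forms are selected: every swap whose result feeds the next call is emitted with glc so the previous value is returned, while the final %unused swap, whose result is discarded, drops glc (the idxen{{$}} check above).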
32
33 ;CHECK-LABEL: {{^}}test2:
34 ;CHECK-NOT: s_waitcnt
35 ;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc{{$}}
36 ;CHECK: s_waitcnt vmcnt(0)
37 ;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc slc
38 ;CHECK: s_waitcnt vmcnt(0)
39 ;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 idxen glc{{$}}
40 ;CHECK: s_waitcnt vmcnt(0)
41 ;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 idxen glc slc
42 ;CHECK: s_waitcnt vmcnt(0)
43 ;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 idxen glc{{$}}
44 ;CHECK: s_waitcnt vmcnt(0)
45 ;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 idxen glc slc
46 ;CHECK: s_waitcnt vmcnt(0)
47 ;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 idxen glc{{$}}
48 ;CHECK: s_waitcnt vmcnt(0)
49 ;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc slc
50 ;CHECK: s_waitcnt vmcnt(0)
51 ;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc
52 define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
53 main_body:
54 %t1 = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
55 %t2 = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2)
56 %t3 = call i32 @llvm.amdgcn.struct.buffer.atomic.smin(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
57 %t4 = call i32 @llvm.amdgcn.struct.buffer.atomic.umin(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2)
58 %t5 = call i32 @llvm.amdgcn.struct.buffer.atomic.smax(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
59 %t6 = call i32 @llvm.amdgcn.struct.buffer.atomic.umax(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2)
60 %t7 = call i32 @llvm.amdgcn.struct.buffer.atomic.and(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
61 %t8 = call i32 @llvm.amdgcn.struct.buffer.atomic.or(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2)
62 %t9 = call i32 @llvm.amdgcn.struct.buffer.atomic.xor(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
63 %out = bitcast i32 %t9 to float
64 ret float %out
65 }
66
67 ; Ideally, we would teach tablegen & friends that cmpswap only modifies the
68 ; first vgpr. Since we don't do that yet, the register allocator will have to
69 ; create copies, which we don't bother to track here.
70 ;
71 ;CHECK-LABEL: {{^}}test3:
72 ;CHECK-NOT: s_waitcnt
73 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 idxen glc
74 ;CHECK: s_waitcnt vmcnt(0)
75 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
76 ;CHECK: s_waitcnt vmcnt(0)
77 ;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
78 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc
79 ;CHECK: s_waitcnt vmcnt(0)
80 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc
81 ;CHECK: s_waitcnt vmcnt(0)
82 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen offset:44 glc
83 ;CHECK-DAG: s_waitcnt vmcnt(0)
84 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], [[SOFS]] idxen offset:4 glc
85 define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
86 main_body:
87 %o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
88 %o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
89 %o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i32 0, i32 0)
90 %o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0)
91 %offs.5 = add i32 %voffset, 44
92 %o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %offs.5, i32 0, i32 0)
93 %o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 4, i32 8188, i32 0)
94
95 ; Detecting the no-return variant doesn't work right now because of how the
96 ; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG.
97 ; Since there probably isn't a reasonable use case of cmpswap that discards
98 ; the return value, that seems okay.
99 ;
100 ; %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
101 %out = bitcast i32 %o6 to float
102 ret float %out
103 }
104
105 ;CHECK-LABEL: {{^}}test4:
106 ;CHECK: buffer_atomic_add v0,
107 define amdgpu_ps float @test4() {
108 main_body:
109 %v = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> undef, i32 0, i32 4, i32 0, i32 0)
110 %v.float = bitcast i32 %v to float
111 ret float %v.float
112 }
113
114 declare i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i32, i32) #0
115 declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32) #0
116 declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32) #0
117 declare i32 @llvm.amdgcn.struct.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i32, i32) #0
118 declare i32 @llvm.amdgcn.struct.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i32, i32) #0
119 declare i32 @llvm.amdgcn.struct.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i32, i32) #0
120 declare i32 @llvm.amdgcn.struct.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i32, i32) #0
121 declare i32 @llvm.amdgcn.struct.buffer.atomic.and(i32, <4 x i32>, i32, i32, i32, i32) #0
122 declare i32 @llvm.amdgcn.struct.buffer.atomic.or(i32, <4 x i32>, i32, i32, i32, i32) #0
123 declare i32 @llvm.amdgcn.struct.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i32, i32) #0
124 declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i32, i32) #0
125
126 attributes #0 = { nounwind }
0 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
1 ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
2 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
3
4 ; GCN-LABEL: {{^}}buffer_load_format_d16_x:
5 ; GCN: buffer_load_format_d16_x v{{[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
6 define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) {
7 main_body:
8 %data = call half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
9 ret half %data
10 }
11
12 ; GCN-LABEL: {{^}}buffer_load_format_d16_xy:
13 ; UNPACKED: buffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
14 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
15
16 ; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
17 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]]
18 define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) {
19 main_body:
20 %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
21 %elt = extractelement <2 x half> %data, i32 1
22 ret half %elt
23 }
24
25 ; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw:
26 ; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
27 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
28
29 ; PACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
30 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
31 define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) {
32 main_body:
33 %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
34 %elt = extractelement <4 x half> %data, i32 3
35 ret half %elt
36 }
37
38 declare half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32>, i32, i32, i32, i32)
39 declare <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32, i32)
40 declare <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32, i32)
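The load direction mirrors the d16 stores: the unpacked subtarget returns one half per VGPR, so element 1 of a <2 x half> result is just a v_mov of the high register, whereas the packed subtargets return two halves per VGPR and must shift the packed register right by 16 to extract the odd element.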
0 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
1 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
2
3 ;CHECK-LABEL: {{^}}buffer_load:
4 ;CHECK: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen
5 ;CHECK: buffer_load_format_xyzw v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc
6 ;CHECK: buffer_load_format_xyzw v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc
7 ;CHECK: s_waitcnt
8 define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
9 main_body:
10 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
11 %data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
12 %data_slc = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 2)
13 %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
14 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
15 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
16 ret {<4 x float>, <4 x float>, <4 x float>} %r2
17 }
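Even with a constant-zero index, the struct forms still emit idxen with a zeroed VGPR in the index operand (the {{v[0-9]+}} above); compare the raw tests earlier, where that operand is off.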
18
19 ;CHECK-LABEL: {{^}}buffer_load_immoffs:
20 ;CHECK: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42
21 ;CHECK: s_waitcnt
22 define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
23 main_body:
24 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 0)
25 ret <4 x float> %data
26 }
27
28 ;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
29 ;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 60 idxen offset:4092
30 ;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc
31 ;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], [[OFS1]] idxen offset:4092
32 ;CHECK-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc
33 ;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], [[OFS2]] idxen offset:4
34 ;CHECK: s_waitcnt
35 define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
36 main_body:
37 %d.0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 60, i32 0)
38 %d.1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 32764, i32 0)
39 %d.2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4, i32 36860, i32 0)
40 %d.3 = fadd <4 x float> %d.0, %d.1
41 %data = fadd <4 x float> %d.2, %d.3
42 ret <4 x float> %data
43 }
44
45 ;CHECK-LABEL: {{^}}buffer_load_idx:
46 ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen
47 ;CHECK: s_waitcnt
48 define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
49 main_body:
50 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0)
51 ret <4 x float> %data
52 }
53
54 ;CHECK-LABEL: {{^}}buffer_load_ofs:
55 ;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
56 ;CHECK: s_waitcnt
57 define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
58 main_body:
59 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0)
60 ret <4 x float> %data
61 }
62
63 ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
64 ;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
65 ;CHECK: s_waitcnt
66 define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
67 main_body:
68 %ofs = add i32 %1, 60
69 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 0)
70 ret <4 x float> %data
71 }
72
73 ;CHECK-LABEL: {{^}}buffer_load_both:
74 ;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
75 ;CHECK: s_waitcnt
76 define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
77 main_body:
78 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0)
79 ret <4 x float> %data
80 }
81
82 ;CHECK-LABEL: {{^}}buffer_load_both_reversed:
83 ;CHECK: v_mov_b32_e32 v2, v0
84 ;CHECK: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen
85 ;CHECK: s_waitcnt
86 define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
87 main_body:
88 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0)
89 ret <4 x float> %data
90 }
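When idxen and offen are both set, vindex and voffset must sit in consecutive VGPRs. With the incoming arguments reversed, the backend copies v0 into v2 so that v[1:2] forms the required (vindex, voffset) pair, as the preceding v_mov check shows.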
91
92 ;CHECK-LABEL: {{^}}buffer_load_x:
93 ;CHECK: buffer_load_format_x v0, {{v[0-9]+}}, s[0:3], 0 idxen
94 ;CHECK: s_waitcnt
95 define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) {
96 main_body:
97 %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
98 ret float %data
99 }
100
101 ;CHECK-LABEL: {{^}}buffer_load_xy:
102 ;CHECK: buffer_load_format_xy v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen
103 ;CHECK: s_waitcnt
104 define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
105 main_body:
106 %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
107 ret <2 x float> %data
108 }
109
110 declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #0
111 declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32) #0
112 declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #0
113
114 attributes #0 = { nounwind readonly }
0 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
1 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
2
3 ;CHECK-LABEL: {{^}}buffer_load:
4 ;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen
5 ;CHECK: buffer_load_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc
6 ;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc
7 ;CHECK: s_waitcnt
8 define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
9 main_body:
10 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
11 %data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
12 %data_slc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 2)
13 %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
14 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
15 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
16 ret {<4 x float>, <4 x float>, <4 x float>} %r2
17 }
18
19 ;CHECK-LABEL: {{^}}buffer_load_immoffs:
20 ;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40
21 ;CHECK: s_waitcnt
22 define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
23 main_body:
24 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i32 0, i32 0)
25 ret <4 x float> %data
26 }
27
28 ;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
29 ;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc
30 ;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4
31 ;CHECK: s_waitcnt
32 define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
33 main_body:
34 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 4, i32 8188, i32 0)
35 ret <4 x float> %data
36 }
37
38 ;CHECK-LABEL: {{^}}buffer_load_idx:
39 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
40 ;CHECK: s_waitcnt
41 define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
42 main_body:
43 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0)
44 ret <4 x float> %data
45 }
46
47 ;CHECK-LABEL: {{^}}buffer_load_ofs:
48 ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
49 ;CHECK: s_waitcnt
50 define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
51 main_body:
52 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0)
53 ret <4 x float> %data
54 }
55
56 ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
57 ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
58 ;CHECK: s_waitcnt
59 define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
60 main_body:
61 %ofs = add i32 %1, 60
62 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 0)
63 ret <4 x float> %data
64 }
65
66 ;CHECK-LABEL: {{^}}buffer_load_both:
67 ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
68 ;CHECK: s_waitcnt
69 define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
70 main_body:
71 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0)
72 ret <4 x float> %data
73 }
74
75 ;CHECK-LABEL: {{^}}buffer_load_both_reversed:
76 ;CHECK: v_mov_b32_e32 v2, v0
77 ;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
78 ;CHECK: s_waitcnt
79 define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
80 main_body:
81 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0)
82 ret <4 x float> %data
83 }
84
85 ;CHECK-LABEL: {{^}}buffer_load_x1:
86 ;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
87 ;CHECK: s_waitcnt
88 define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
89 main_body:
90 %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
91 ret float %data
92 }
93
94 ;CHECK-LABEL: {{^}}buffer_load_x2:
95 ;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
96 ;CHECK: s_waitcnt
97 define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
98 main_body:
99 %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
100 ret <2 x float> %data
101 }
102
103 ;CHECK-LABEL: {{^}}buffer_load_negative_offset:
104 ;CHECK: v_add_{{[iu]}}32_e32 {{v[0-9]+}}, vcc, 0xfffff000, v0
105 ;CHECK: buffer_load_dwordx4 v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen offset:4080
106 define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
107 main_body:
108 %ofs.1 = add i32 %ofs, -16
109 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i32 0, i32 0)
110 ret <4 x float> %data
111 }
112
113 ; SI won't merge ds memory operations because of the signed offset bug, so
114 ; we only have check lines for VI.
115 ; CHECK-LABEL: buffer_load_mmo:
116 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
117 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
118 define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) {
119 entry:
120 store float 0.0, float addrspace(3)* %lds
121 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
122 %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
123 store float 0.0, float addrspace(3)* %tmp2
124 ret float %val
125 }
126
127 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #0
128 declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #0
129 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0
130 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
131
132 attributes #0 = { nounwind readonly }
0 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s
1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX81 %s
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX9 %s
3
4 ; GCN-LABEL: {{^}}buffer_store_format_d16_x:
5 ; GCN: s_load_dword s[[LO:[0-9]+]]
6 ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]]
7 ; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
8 define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) {
9 main_body:
10 call void @llvm.amdgcn.struct.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
11 ret void
12 }
13
14 ; GCN-LABEL: {{^}}buffer_store_format_d16_xy:
15
16 ; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
17 ; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16
18 ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}}
19 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
20 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
21 ; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
22
23 ; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
24 define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) {
25 main_body:
26 call void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
27 ret void
28 }
29
30 ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
31 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
32
33 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
34 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
35 ; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
36 ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
37 ; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
38
39 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
40 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
41
42 ; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
43
44 ; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
45 ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
46
47 ; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
48 define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) {
49 main_body:
50 call void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
51 ret void
52 }
53
54 declare void @llvm.amdgcn.struct.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32, i32)
55 declare void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32)
56 declare void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32)
0 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
1 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
2
3 ;CHECK-LABEL: {{^}}buffer_store:
4 ;CHECK-NOT: s_waitcnt
5 ;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen
6 ;CHECK: buffer_store_format_xyzw v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc
7 ;CHECK: buffer_store_format_xyzw v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc
8 define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
9 main_body:
10 call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
11 call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
12 call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2)
13 ret void
14 }
15
16 ;CHECK-LABEL: {{^}}buffer_store_immoffs:
17 ;CHECK-NOT: s_waitcnt
18 ;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42
19 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
20 main_body:
21 call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0)
22 ret void
23 }
24
25 ;CHECK-LABEL: {{^}}buffer_store_idx:
26 ;CHECK-NOT: s_waitcnt
27 ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
28 define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
29 main_body:
30 call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
31 ret void
32 }
33
34 ;CHECK-LABEL: {{^}}buffer_store_ofs:
35 ;CHECK-NOT: s_waitcnt
36 ;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen
37 define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
38 main_body:
39 call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0)
40 ret void
41 }
42
43 ;CHECK-LABEL: {{^}}buffer_store_both:
44 ;CHECK-NOT: s_waitcnt
45 ;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen
46 define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
47 main_body:
48 call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0)
49 ret void
50 }
51
52 ;CHECK-LABEL: {{^}}buffer_store_both_reversed:
53 ;CHECK: v_mov_b32_e32 v6, v4
54 ;CHECK-NOT: s_waitcnt
55 ;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen
56 define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
57 main_body:
58 call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0)
59 ret void
60 }
61
62 ; Ideally, the register allocator would avoid the wait here.
63 ;
64 ;CHECK-LABEL: {{^}}buffer_store_wait:
65 ;CHECK-NOT: s_waitcnt
66 ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
67 ;VERDE: s_waitcnt expcnt(0)
68 ;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
69 ;CHECK: s_waitcnt vmcnt(0)
70 ;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen
71 define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
72 main_body:
73 call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
74 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0)
75 call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0, i32 0)
76 ret void
77 }
78
79 ;CHECK-LABEL: {{^}}buffer_store_x1:
80 ;CHECK-NOT: s_waitcnt
81 ;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen
82 define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
83 main_body:
84 call void @llvm.amdgcn.struct.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
85 ret void
86 }
87
88 ;CHECK-LABEL: {{^}}buffer_store_x2:
89 ;CHECK-NOT: s_waitcnt
90 ;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen
91 define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) {
92 main_body:
93 call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
94 ret void
95 }
96
97 declare void @llvm.amdgcn.struct.buffer.store.format.f32(float, <4 x i32>, i32, i32, i32, i32) #0
98 declare void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0
99 declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
100 declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #1
101
102 attributes #0 = { nounwind }
103 attributes #1 = { nounwind readonly }
0 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
1 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
2
;CHECK-LABEL: {{^}}buffer_store:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen
;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc
;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc
define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2)
  ret void
}
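
; Hedged addition, not in the original test: cachepolicy bit 0 is glc and bit
; 1 is slc, so i32 3 requests both, and the store would presumably be emitted
; with both modifiers, e.g. buffer_store_dwordx4 ... idxen glc slc. The
; function name is illustrative.
define amdgpu_ps void @buffer_store_glc_slc(<4 x i32> inreg, <4 x float>) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 3)
  ret void
}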

;CHECK-LABEL: {{^}}buffer_store_immoffs:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42
define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0)
  ret void
}
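
; Hedged addition: the MUBUF immediate offset field is 12 bits, so 42 fits in
; offset:42 above, but a constant such as 4096 does not; the backend would
; presumably have to move it into a VGPR and set offen instead (exact lowering
; not asserted here). Illustrative only.
define amdgpu_ps void @buffer_store_large_immoffs(<4 x i32> inreg, <4 x float>) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 4096, i32 0, i32 0)
  ret void
}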

;CHECK-LABEL: {{^}}buffer_store_idx:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
  ret void
}

;CHECK-LABEL: {{^}}buffer_store_ofs:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0)
  ret void
}
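
; Hedged addition: a non-zero scalar offset lands in the instruction's soffset
; slot, replacing the literal 0 seen above; with an i32 inreg argument it
; would presumably show up as an SGPR (e.g. s4). Illustrative only.
define amdgpu_ps void @buffer_store_soffset(<4 x i32> inreg, <4 x float>, i32 inreg) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 %2, i32 0)
  ret void
}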

;CHECK-LABEL: {{^}}buffer_store_both:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0)
  ret void
}

;CHECK-LABEL: {{^}}buffer_store_both_reversed:
;CHECK: v_mov_b32_e32 v6, v4
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0)
  ret void
}

; As in the format test above, the load clobbers v[0:3] while they still hold
; the first store's data, so VERDE needs an s_waitcnt expcnt(0) before they
; can be overwritten; ideally the register allocator would avoid that wait.
; The vmcnt(0) before the final store is a real data dependency.
;CHECK-LABEL: {{^}}buffer_store_wait:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
;VERDE: s_waitcnt expcnt(0)
;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0, i32 0)
  ret void
}

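; The store width follows the element count of the data type: f32 uses
; buffer_store_dword, v2f32 buffer_store_dwordx2, and v4f32
; buffer_store_dwordx4, as the next two tests check.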
;CHECK-LABEL: {{^}}buffer_store_x1:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
  ret void
}

;CHECK-LABEL: {{^}}buffer_store_x2:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
  ret void
}

declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
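
; Hedged addition contrasting the raw variant: the raw intrinsics drop the
; vindex argument and leave idxen clear, so with a zero offset the vaddr field
; would presumably print as "off". The function and declaration below are
; illustrative, not part of the original test.
define amdgpu_ps void @raw_buffer_store_example(<4 x i32> inreg, <4 x float>) {
main_body:
  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
  ret void
}

declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0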