llvm.org GIT mirror — llvm / commit 79b91c2
AMDGPU: Merge BUFFER_LOAD_DWORD_OFFEN into x2, x4

Summary:
-9.9% code size decrease in affected shaders.

Totals (changed stats only):
SGPRS: 2151462 -> 2170646 (0.89 %)
VGPRS: 1634612 -> 1640288 (0.35 %)
Spilled SGPRs: 8942 -> 8940 (-0.02 %)
Code Size: 52940672 -> 51727288 (-2.29 %) bytes
Max Waves: 373066 -> 371718 (-0.36 %)

Totals from affected shaders:
SGPRS: 283520 -> 302704 (6.77 %)
VGPRS: 227632 -> 233308 (2.49 %)
Spilled SGPRs: 3966 -> 3964 (-0.05 %)
Code Size: 12203080 -> 10989696 (-9.94 %) bytes
Max Waves: 44070 -> 42722 (-3.06 %)

Reviewers: arsenm, nhaehnle

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye

Differential Revision: https://reviews.llvm.org/D38950

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317752 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Marek Olsak
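A minimal sketch of the transformation, for illustration only (the registers, offsets, and surrounding shader are hypothetical, not taken from the commit): given two BUFFER_LOAD_DWORD_OFFEN instructions whose vaddr, srsrc, and soffset operands match, whose glc/slc bits agree, and whose offsets are consecutive dwords, the pass rewrites them as a single x2 load at the smaller offset; the same rule merges two x2 loads into an x4.

    ; before (hypothetical input)
    buffer_load_dword   v1, v0, s[0:3], 0 offen offset:4
    buffer_load_dword   v2, v0, s[0:3], 0 offen offset:8
    ; after SILoadStoreOptimizer (merged at the minimum offset)
    buffer_load_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4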
3 changed files with 235 additions and 29 deletions.
7474 namespace {
7575
7676 class SILoadStoreOptimizer : public MachineFunctionPass {
77 enum InstClassEnum {
78 DS_READ_WRITE,
79 S_BUFFER_LOAD_IMM,
80 BUFFER_LOAD_OFFEN,
81 };
82
7783 struct CombineInfo {
7884 MachineBasicBlock::iterator I;
7985 MachineBasicBlock::iterator Paired;
8187 unsigned Offset0;
8288 unsigned Offset1;
8389 unsigned BaseOff;
90 InstClassEnum InstClass;
8491 bool GLC0;
8592 bool GLC1;
93 bool SLC0;
94 bool SLC1;
8695 bool UseST64;
87 bool IsSBufferLoadImm;
8896 bool IsX2;
8997 SmallVector<MachineInstr *, 8> InstsToMove;
9098 };
103111 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
104112 MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
105113 MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
114 MachineBasicBlock::iterator mergeBufferLoadOffenPair(CombineInfo &CI);
106115
107116 public:
108117 static char ID;
221230 CI.BaseOff = 0;
222231
223232 // SMEM offsets must be consecutive.
224 if (CI.IsSBufferLoadImm) {
233 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
234 CI.InstClass == BUFFER_LOAD_OFFEN) {
225235 unsigned Diff = CI.IsX2 ? 2 : 1;
226236 return (EltOffset0 + Diff == EltOffset1 ||
227237 EltOffset1 + Diff == EltOffset0) &&
228 CI.GLC0 == CI.GLC1;
238 CI.GLC0 == CI.GLC1 &&
239 (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
229240 }
230241
231242 // If the offset in elements doesn't fit in 8-bits, we might be able to use
270281 MachineBasicBlock::iterator E = MBB->end();
271282 MachineBasicBlock::iterator MBBI = CI.I;
272283
273 unsigned AddrOpName;
274 if (CI.IsSBufferLoadImm)
275 AddrOpName = AMDGPU::OpName::sbase;
276 else
277 AddrOpName = AMDGPU::OpName::addr;
278
279 int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName);
280 const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx);
281
282 // We only ever merge operations with the same base address register, so don't
283 // bother scanning forward if there are no other uses.
284 if (TargetRegisterInfo::isPhysicalRegister(AddrReg0.getReg()) ||
285 MRI->hasOneNonDBGUse(AddrReg0.getReg()))
286 return false;
284 unsigned AddrOpName[3] = {0};
285 int AddrIdx[3];
286 const MachineOperand *AddrReg[3];
287 unsigned NumAddresses = 0;
288
289 switch (CI.InstClass) {
290 case DS_READ_WRITE:
291 AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
292 break;
293 case S_BUFFER_LOAD_IMM:
294 AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
295 break;
296 case BUFFER_LOAD_OFFEN:
297 AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
298 AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
299 AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
300 break;
301 default:
302 llvm_unreachable("invalid InstClass");
303 }
304
305 for (unsigned i = 0; i < NumAddresses; i++) {
306 AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
307 AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
308
309 // We only ever merge operations with the same base address register, so don't
310 // bother scanning forward if there are no other uses.
311 if (AddrReg[i]->isReg() &&
312 (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
313 MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
314 return false;
315 }
287316
288317 ++MBBI;
289318
334363 if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
335364 continue;
336365
337 const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
338
339 // Check same base pointer. Be careful of subregisters, which can occur with
340 // vectors of pointers.
341 if (AddrReg0.getReg() == AddrReg1.getReg() &&
342 AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
366 bool Match = true;
367 for (unsigned i = 0; i < NumAddresses; i++) {
368 const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
369
370 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
371 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
372 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
373 Match = false;
374 break;
375 }
376 continue;
377 }
378
379 // Check same base pointer. Be careful of subregisters, which can occur with
380 // vectors of pointers.
381 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
382 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
383 Match = false;
384 break;
385 }
386 }
387
388 if (Match) {
343389 int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
344390 AMDGPU::OpName::offset);
345391 CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
346392 CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
347393 CI.Paired = MBBI;
348394
349 if (CI.IsSBufferLoadImm) {
395 if (CI.InstClass == DS_READ_WRITE) {
396 CI.Offset0 &= 0xffff;
397 CI.Offset1 &= 0xffff;
398 } else {
350399 CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
351400 CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
352 } else {
353 CI.Offset0 &= 0xffff;
354 CI.Offset1 &= 0xffff;
401 if (CI.InstClass == BUFFER_LOAD_OFFEN) {
402 CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
403 CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
404 }
355405 }
356406
357407 // Check both offsets fit in the reduced range.
564614 return Next;
565615 }
566616
617 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadOffenPair(
618 CombineInfo &CI) {
619 MachineBasicBlock *MBB = CI.I->getParent();
620 DebugLoc DL = CI.I->getDebugLoc();
621 unsigned Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
622 AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
623
624 const TargetRegisterClass *SuperRC =
625 CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
626 unsigned DestReg = MRI->createVirtualRegister(SuperRC);
627 unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
628
629 BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
630 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
631 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
632 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
633 .addImm(MergedOffset) // offset
634 .addImm(CI.GLC0) // glc
635 .addImm(CI.SLC0) // slc
636 .addImm(0) // tfe
637 .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
638
639 unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
640 unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
641
642 // Handle descending offsets
643 if (CI.Offset0 > CI.Offset1)
644 std::swap(SubRegIdx0, SubRegIdx1);
645
646 // Copy to the old destination registers.
647 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
648 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
649 const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
650
651 BuildMI(*MBB, CI.Paired, DL, CopyDesc)
652 .add(*Dest0) // Copy to same destination including flags and sub reg.
653 .addReg(DestReg, 0, SubRegIdx0);
654 MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
655 .add(*Dest1)
656 .addReg(DestReg, RegState::Kill, SubRegIdx1);
657
658 moveInstsAfter(Copy1, CI.InstsToMove);
659
660 MachineBasicBlock::iterator Next = std::next(CI.I);
661 CI.I->eraseFromParent();
662 CI.Paired->eraseFromParent();
663 return Next;
664 }
665
567666 // Scan through looking for adjacent LDS operations with constant offsets from
568667 // the same base register. We rely on the scheduler to do the hard work of
569668 // clustering nearby loads, and assume these are all adjacent.
581680
582681 CombineInfo CI;
583682 CI.I = I;
584 CI.IsSBufferLoadImm = false;
585683 unsigned Opc = MI.getOpcode();
586684 if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
685 CI.InstClass = DS_READ_WRITE;
587686 CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
588687 if (findMatchingInst(CI)) {
589688 Modified = true;
595694 continue;
596695 }
597696 if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
697 CI.InstClass = DS_READ_WRITE;
598698 CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
599699 if (findMatchingInst(CI)) {
600700 Modified = true;
609709 (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
610710 Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
611711 // EltSize is in units of the offset encoding.
712 CI.InstClass = S_BUFFER_LOAD_IMM;
612713 CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
613 CI.IsSBufferLoadImm = true;
614714 CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
615715 if (findMatchingInst(CI)) {
616716 Modified = true;
622722 }
623723 continue;
624724 }
725 if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
726 Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN) {
727 CI.InstClass = BUFFER_LOAD_OFFEN;
728 CI.EltSize = 4;
729 CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
730 if (findMatchingInst(CI)) {
731 Modified = true;
732 I = mergeBufferLoadOffenPair(CI);
733 if (!CI.IsX2)
734 CreatedX2++;
735 } else {
736 ++I;
737 }
738 continue;
739 }
625740
626741 ++I;
627742 }
125125 ret float %val
126126 }
127127
128 ;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged:
129 ;CHECK-NEXT: BB#
130 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
131 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
132 ;CHECK: s_waitcnt
133 define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
134 main_body:
135 %a1 = add i32 %a, 4
136 %a2 = add i32 %a, 8
137 %a3 = add i32 %a, 12
138 %a4 = add i32 %a, 16
139 %a5 = add i32 %a, 28
140 %a6 = add i32 %a, 32
141 %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
142 %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
143 %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0)
144 %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0)
145 %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0)
146 %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0)
147 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
148 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
149 ret void
150 }
151
152 ;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_glc_slc:
153 ;CHECK-NEXT: BB#
154 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
155 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
156 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
157 ;CHECK: s_waitcnt
158 define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) {
159 main_body:
160 %a1 = add i32 %a, 4
161 %a2 = add i32 %a, 8
162 %a3 = add i32 %a, 12
163 %a4 = add i32 %a, 16
164 %a5 = add i32 %a, 28
165 %a6 = add i32 %a, 32
166 %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
167 %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
168 %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0)
169 %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0)
170 %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1)
171 %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1)
172 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
173 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
174 ret void
175 }
176
177 ;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged:
178 ;CHECK-NEXT: BB#
179 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
180 ;CHECK: s_waitcnt
181 define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
182 main_body:
183 %a1 = add i32 %a, 4
184 %a2 = add i32 %a, 12
185 %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
186 %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
187 %r1 = extractelement <2 x float> %vr1, i32 0
188 %r2 = extractelement <2 x float> %vr1, i32 1
189 %r3 = extractelement <2 x float> %vr2, i32 0
190 %r4 = extractelement <2 x float> %vr2, i32 1
191 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
192 ret void
193 }
194
128195 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
129196 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
130197 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
198 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
131199
132200 attributes #0 = { nounwind readonly }
237237 ret void
238238 }
239239
240 ; GCN-LABEL: {{^}}smrd_vgpr_merged:
241 ; GCN-NEXT: BB#
242 ; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
243 ; GCN-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
244 define amdgpu_ps void @smrd_vgpr_merged(<4 x i32> inreg %desc, i32 %a) #0 {
245 main_body:
246 %a1 = add i32 %a, 4
247 %a2 = add i32 %a, 8
248 %a3 = add i32 %a, 12
249 %a4 = add i32 %a, 16
250 %a5 = add i32 %a, 28
251 %a6 = add i32 %a, 32
252 %r1 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a1)
253 %r2 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a2)
254 %r3 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a3)
255 %r4 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a4)
256 %r5 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a5)
257 %r6 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a6)
258 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
259 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
260 ret void
261 }
262
240263 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
241264 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
242265