llvm.org GIT mirror llvm / 4a5c408
AMDGPU/SI: Implement a custom MachineSchedStrategy Summary: GCNSchedStrategy re-uses most of GenericScheduler; it just uses a different method to compute the excess and critical register pressure limits. It's not enabled by default; to enable it you need to pass -misched=gcn to llc. Shader DB stats: 32464 shaders in 17874 tests Totals: SGPRS: 1542846 -> 1643125 (6.50 %) VGPRS: 1005595 -> 904653 (-10.04 %) Spilled SGPRs: 29929 -> 27745 (-7.30 %) Spilled VGPRs: 334 -> 352 (5.39 %) Scratch VGPRs: 1612 -> 1624 (0.74 %) dwords per thread Code Size: 36688188 -> 37034900 (0.95 %) bytes LDS: 1913 -> 1913 (0.00 %) blocks Max Waves: 254101 -> 265125 (4.34 %) Wait states: 0 -> 0 (0.00 %) Totals from affected shaders: SGPRS: 1338220 -> 1438499 (7.49 %) VGPRS: 886221 -> 785279 (-11.39 %) Spilled SGPRs: 29869 -> 27685 (-7.31 %) Spilled VGPRs: 334 -> 352 (5.39 %) Scratch VGPRs: 1612 -> 1624 (0.74 %) dwords per thread Code Size: 34315716 -> 34662428 (1.01 %) bytes LDS: 1551 -> 1551 (0.00 %) blocks Max Waves: 188127 -> 199151 (5.86 %) Wait states: 0 -> 0 (0.00 %) Reviewers: arsenm, mareko, nhaehnle, MatzeB, atrick Subscribers: arsenm, kzhuravl, llvm-commits Differential Revision: https://reviews.llvm.org/D23688 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@279995 91177308-0d34-0410-b5e6-96231b3b80d8 Tom Stellard 3 years ago
32 changed file(s) with 515 addition(s) and 65 deletion(s). Raw diff Collapse all Expand all
213213 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
// VGPR spilling is always permitted for non-shader calling conventions;
// graphics shaders additionally require the EnableVGPRSpilling flag.
214214 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
215215 }
216
217 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
218 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
219 if (SGPRs <= 80)
220 return 10;
221 if (SGPRs <= 88)
222 return 9;
223 if (SGPRs <= 100)
224 return 8;
225 return 7;
226 }
227 if (SGPRs <= 48)
228 return 10;
229 if (SGPRs <= 56)
230 return 9;
231 if (SGPRs <= 64)
232 return 8;
233 if (SGPRs <= 72)
234 return 7;
235 if (SGPRs <= 80)
236 return 6;
237 return 5;
238 }
239
240 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
241 if (VGPRs <= 24)
242 return 10;
243 if (VGPRs <= 28)
244 return 9;
245 if (VGPRs <= 32)
246 return 8;
247 if (VGPRs <= 36)
248 return 7;
249 if (VGPRs <= 40)
250 return 6;
251 if (VGPRs <= 48)
252 return 5;
253 if (VGPRs <= 64)
254 return 4;
255 if (VGPRs <= 84)
256 return 3;
257 if (VGPRs <= 128)
258 return 2;
259 return 1;
260 }
// Whether the SGPRInitBug subtarget feature flag is set for this target.
428428 bool hasSGPRInitBug() const {
429429 return SGPRInitBug;
430430 }
431
432 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
433 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
434
435 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs
436 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
431437 };
432438
433439 } // End namespace llvm
1717 #include "AMDGPUCallLowering.h"
1818 #include "AMDGPUTargetObjectFile.h"
1919 #include "AMDGPUTargetTransformInfo.h"
20 #include "GCNSchedStrategy.h"
2021 #include "R600ISelLowering.h"
2122 #include "R600InstrInfo.h"
2223 #include "R600MachineScheduler.h"
9596 return new SIScheduleDAGMI(C);
9697 }
9798
99 static ScheduleDAGInstrs *
100 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
101 ScheduleDAGMILive *DAG =
102 new ScheduleDAGMILive(C, make_unique(C));
103 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
104 return DAG;
105 }
106
// Named scheduler registrations, selectable on the llc command line via
// -misched=<name>.
98107 static MachineSchedRegistry
99108 R600SchedRegistry("r600", "Run R600's custom scheduler",
100109 createR600MachineScheduler);
102111 static MachineSchedRegistry
103112 SISchedRegistry("si", "Run SI's custom scheduler",
104113 createSIMachineScheduler);
114
// New in this change: registers the occupancy-maximizing strategy under
// -misched=gcn-max-occupancy.
115 static MachineSchedRegistry
116 GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
117 "Run GCN scheduler to maximize occupancy",
118 createGCNMaxOccupancyMachineScheduler);
105119
106120 static StringRef computeDataLayout(const Triple &TT) {
107121 if (TT.getArch() == Triple::r600) {
466480 const SISubtarget &ST = C->MF->getSubtarget();
467481 if (ST.enableSIScheduler())
468482 return createSIMachineScheduler(C);
469 return nullptr;
483 return createGCNMaxOccupancyMachineScheduler(C);
470484 }
471485
472486 bool GCNPassConfig::addPreISel() {
4848 AMDGPUPromoteAlloca.cpp
4949 AMDGPURegisterInfo.cpp
5050 GCNHazardRecognizer.cpp
51 GCNSchedStrategy.cpp
5152 R600ClauseMergePass.cpp
5253 R600ControlFlowFinalizer.cpp
5354 R600EmitClauseMarkers.cpp
0 //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This contains a MachineSchedStrategy implementation for maximizing wave
11 /// occupancy on GCN hardware.
12 //===----------------------------------------------------------------------===//
13
14 #include "GCNSchedStrategy.h"
15 #include "AMDGPUSubtarget.h"
16 #include "SIInstrInfo.h"
17 #include "SIMachineFunctionInfo.h"
18 #include "SIRegisterInfo.h"
19 #include "llvm/CodeGen/RegisterClassInfo.h"
20
21 #define DEBUG_TYPE "misched"
22
23 using namespace llvm;
24
25 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
26 const MachineSchedContext *C) :
27 GenericScheduler(C) { }
28
29 static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
30 const MachineFunction &MF) {
31
32 const SISubtarget &ST = MF.getSubtarget();
33 const SIMachineFunctionInfo *MFI = MF.getInfo();
34 unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
35 ST.getOccupancyWithNumVGPRs(VGPRs));
36 return std::min(MinRegOccupancy,
37 ST.getOccupancyWithLocalMemSize(MFI->getLDSSize()));
38 }
39
40 void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
41 bool AtTop, const RegPressureTracker &RPTracker,
42 const SIRegisterInfo *SRI,
43 int SGPRPressure,
44 int VGPRPressure,
45 int SGPRExcessLimit,
46 int VGPRExcessLimit,
47 int SGPRCriticalLimit,
48 int VGPRCriticalLimit) {
49
50 Cand.SU = SU;
51 Cand.AtTop = AtTop;
52
53 // getDownwardPressure() and getUpwardPressure() make temporary changes to
54 // the the tracker, so we need to pass those function a non-const copy.
55 RegPressureTracker &TempTracker = const_cast(RPTracker);
56
57 std::vector Pressure;
58 std::vector MaxPressure;
59
60 if (AtTop)
61 TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
62 else {
63 // FIXME: I think for bottom up scheduling, the register pressure is cached
64 // and can be retrieved by DAG->getPressureDif(SU).
65 TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
66 }
67
68 int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
69 int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
70
71 // If two instructions increase the pressure of different register sets
72 // by the same amount, the generic scheduler will prefer to schedule the
73 // instruction that increases the set with the least amount of registers,
74 // which in our case would be SGPRs. This is rarely what we want, so
75 // when we report excess/critical register pressure, we do it either
76 // only for VGPRs or only for SGPRs.
77
78 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
79 const int MaxVGPRPressureInc = 16;
80 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
81 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
82
83
84 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
85 // to increase the likelihood we don't go over the limits. We should improve
86 // the analysis to look through dependencies to find the path with the least
87 // register pressure.
88 // FIXME: This is also necessary, because some passes that run after
89 // scheduling and before regalloc increase register pressure.
90 const int ErrorMargin = 3;
91 VGPRExcessLimit -= ErrorMargin;
92 SGPRExcessLimit -= ErrorMargin;
93
94 // We only need to update the RPDelata for instructions that increase
95 // register pressure. Instructions that decrease or keep reg pressure
96 // the same will be marked as RegExcess in tryCandidate() when they
97 // are compared with instructions that increase the register pressure.
98 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
99 Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet());
100 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
101 }
102
103 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
104 Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet());
105 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure = SGPRExcessLimit);
106 }
107
108 // Register pressure is considered 'CRITICAL' if it is approaching a value
109 // that would reduce the wave occupancy for the execution unit. When
110 // register pressure is 'CRITICAL', increading SGPR and VGPR pressure both
111 // has the same cost, so we don't need to prefer one over the other.
112
113 VGPRCriticalLimit -= ErrorMargin;
114 SGPRCriticalLimit -= ErrorMargin;
115
116 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
117 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
118
119 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
120 if (SGPRDelta > VGPRDelta) {
121 Cand.RPDelta.CriticalMax = PressureChange(SRI->getSGPRPressureSet());
122 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
123 } else {
124 Cand.RPDelta.CriticalMax = PressureChange(SRI->getVGPRPressureSet());
125 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
126 }
127 }
128 }
129
130 // This function is mostly cut and pasted from
131 // GenericScheduler::pickNodeFromQueue()
132 void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
133 const CandPolicy &ZonePolicy,
134 const RegPressureTracker &RPTracker,
135 SchedCandidate &Cand) {
136 const SISubtarget &ST = DAG->MF.getSubtarget();
137 const SIRegisterInfo *SRI = static_cast(TRI);
138 ArrayRef Pressure = RPTracker.getRegSetPressureAtPos();
139 unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()];
140 unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()];
141 unsigned SGPRExcessLimit =
142 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
143 unsigned VGPRExcessLimit =
144 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
145 unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF);
146 unsigned SGPRCriticalLimit = SRI->getNumSGPRsAllowed(ST, MaxWaves);
147 unsigned VGPRCriticalLimit = SRI->getNumVGPRsAllowed(MaxWaves);
148
149 ReadyQueue &Q = Zone.Available;
150 for (SUnit *SU : Q) {
151
152 SchedCandidate TryCand(ZonePolicy);
153 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
154 SGPRPressure, VGPRPressure,
155 SGPRExcessLimit, VGPRExcessLimit,
156 SGPRCriticalLimit, VGPRCriticalLimit);
157 // Pass SchedBoundary only when comparing nodes from the same boundary.
158 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
159 GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
160 if (TryCand.Reason != NoCand) {
161 // Initialize resource delta if needed in case future heuristics query it.
162 if (TryCand.ResDelta == SchedResourceDelta())
163 TryCand.initResourceDelta(Zone.DAG, SchedModel);
164 Cand.setBest(TryCand);
165 }
166 }
167 }
168
169 static int getBidirectionalReasonRank(GenericSchedulerBase::CandReason Reason) {
170 switch (Reason) {
171 default:
172 return Reason;
173 case GenericSchedulerBase::RegCritical:
174 case GenericSchedulerBase::RegExcess:
175 return -Reason;
176 }
177 }
178
179 // This function is mostly cut and pasted from
180 // GenericScheduler::pickNodeBidirectional()
181 SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
182 // Schedule as far as possible in the direction of no choice. This is most
183 // efficient, but also provides the best heuristics for CriticalPSets.
184 if (SUnit *SU = Bot.pickOnlyChoice()) {
185 IsTopNode = false;
186 return SU;
187 }
188 if (SUnit *SU = Top.pickOnlyChoice()) {
189 IsTopNode = true;
190 return SU;
191 }
192 // Set the bottom-up policy based on the state of the current bottom zone and
193 // the instructions outside the zone, including the top zone.
194 CandPolicy BotPolicy;
195 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
196 // Set the top-down policy based on the state of the current top zone and
197 // the instructions outside the zone, including the bottom zone.
198 CandPolicy TopPolicy;
199 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
200
201 // See if BotCand is still valid (because we previously scheduled from Top).
202 DEBUG(dbgs() << "Picking from Bot:\n");
203 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
204 BotCand.Policy != BotPolicy) {
205 BotCand.reset(CandPolicy());
206 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
207 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
208 } else {
209 DEBUG(traceCandidate(BotCand));
210 }
211
212 // Check if the top Q has a better candidate.
213 DEBUG(dbgs() << "Picking from Top:\n");
214 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
215 TopCand.Policy != TopPolicy) {
216 TopCand.reset(CandPolicy());
217 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
218 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
219 } else {
220 DEBUG(traceCandidate(TopCand));
221 }
222
223 // Pick best from BotCand and TopCand.
224 DEBUG(
225 dbgs() << "Top Cand: ";
226 traceCandidate(BotCand);
227 dbgs() << "Bot Cand: ";
228 traceCandidate(TopCand);
229 );
230 SchedCandidate Cand;
231 if (TopCand.Reason == BotCand.Reason) {
232 Cand = BotCand;
233 GenericSchedulerBase::CandReason TopReason = TopCand.Reason;
234 TopCand.Reason = NoCand;
235 GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
236 if (TopCand.Reason != NoCand) {
237 Cand.setBest(TopCand);
238 } else {
239 TopCand.Reason = TopReason;
240 }
241 } else {
242 if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) {
243 Cand = TopCand;
244 } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) {
245 Cand = BotCand;
246 } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
247 Cand = TopCand;
248 } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
249 Cand = BotCand;
250 } else {
251 int TopRank = getBidirectionalReasonRank(TopCand.Reason);
252 int BotRank = getBidirectionalReasonRank(BotCand.Reason);
253 if (TopRank > BotRank) {
254 Cand = TopCand;
255 } else {
256 Cand = BotCand;
257 }
258 }
259 }
260 DEBUG(
261 dbgs() << "Picking: ";
262 traceCandidate(Cand);
263 );
264
265 IsTopNode = Cand.AtTop;
266 return Cand.SU;
267 }
268
269 // This function is mostly cut and pasted from
270 // GenericScheduler::pickNode()
271 SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
272 if (DAG->top() == DAG->bottom()) {
273 assert(Top.Available.empty() && Top.Pending.empty() &&
274 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
275 return nullptr;
276 }
277 SUnit *SU;
278 do {
279 if (RegionPolicy.OnlyTopDown) {
280 SU = Top.pickOnlyChoice();
281 if (!SU) {
282 CandPolicy NoPolicy;
283 TopCand.reset(NoPolicy);
284 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
285 assert(TopCand.Reason != NoCand && "failed to find a candidate");
286 SU = TopCand.SU;
287 }
288 IsTopNode = true;
289 } else if (RegionPolicy.OnlyBottomUp) {
290 SU = Bot.pickOnlyChoice();
291 if (!SU) {
292 CandPolicy NoPolicy;
293 BotCand.reset(NoPolicy);
294 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
295 assert(BotCand.Reason != NoCand && "failed to find a candidate");
296 SU = BotCand.SU;
297 }
298 IsTopNode = false;
299 } else {
300 SU = pickNodeBidirectional(IsTopNode);
301 }
302 } while (SU->isScheduled);
303
304 if (SU->isTopReady())
305 Top.removeReady(SU);
306 if (SU->isBottomReady())
307 Bot.removeReady(SU);
308
309 DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
310 return SU;
311 }
0 //===-- GCNSchedStrategy.h - GCN Scheduler Strategy -*- C++ -*-------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 //
11 //===----------------------------------------------------------------------===//
12
13 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
14 #define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
15
16 #include "llvm/CodeGen/MachineScheduler.h"
17
18 namespace llvm {
19
20 class SIRegisterInfo;
21
22 /// This is a minimal scheduler strategy. The main difference between this
23 /// and the GenericScheduler is that GCNSchedStrategy uses different
24 /// heuristics to determine excess/critical pressure sets. Its goal is to
25 /// maximize kernel occupancy (i.e. maximum number of waves per simd).
26 class GCNMaxOccupancySchedStrategy : public GenericScheduler {
27
28 SUnit *pickNodeBidirectional(bool &IsTopNode);
29
30 void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
31 const RegPressureTracker &RPTracker,
32 SchedCandidate &Cand);
33
34 void initCandidate(SchedCandidate &Cand, SUnit *SU,
35 bool AtTop, const RegPressureTracker &RPTracker,
36 const SIRegisterInfo *SRI,
37 int SGPRPressure, int VGPRPressure,
38 int SGPRExcessLimit, int VGPRExcessLimit,
39 int SGPRCriticalLimit, int VGPRCriticalLimit);
40
41 void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
42 SchedBoundary *Zone, const SIRegisterInfo *SRI,
43 unsigned SGPRPressure, unsigned VGPRPressure);
44
45 public:
46 GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
47
48 SUnit *pickNode(bool &IsTopNode) override;
49 };
50
51 } // End namespace llvm
52
53 #endif // GCNSCHEDSTRATEGY_H
246246 return SGPRLimit;
247247
248248 return VGPRLimit;
249 }
250
251 unsigned
252 SIRegisterInfo::getDefaultRegPressureSetLimit(const MachineFunction &MF,
253 unsigned Idx) const {
// Expose the target-independent limit computed by AMDGPURegisterInfo;
// getRegPressureSetLimit() (above) applies SI-specific SGPR/VGPR limits
// on top of this default.
254 return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
249255 }
250256
251257 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
5050 unsigned getRegPressureSetLimit(const MachineFunction &MF,
5151 unsigned Idx) const override;
5252
53 unsigned getDefaultRegPressureSetLimit(const MachineFunction &MF,
54 unsigned Idx) const;
5355
5456 bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
5557
4646
4747 class SISchedMachineModel : SchedMachineModel {
4848 let CompleteModel = 1;
49 // MicroOpBufferSize = 1 means that instructions will always be added to
50 // the ready queue when they become available. This exposes them
51 // to the register pressure analysis.
52 let MicroOpBufferSize = 1;
4953 let IssueWidth = 1;
5054 let PostRAScheduler = 1;
5155 }
257257 }
258258
259259 ; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64:
260 ; SI: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}}
261 ; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
260 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}}
261 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
262 ; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}}
262263 ; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}}
263 ; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}}
264264 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]]
265265 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI0]]
266266 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO1]]
283283 ; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
284284 ; SI-NOT: and
285285 ; SI: v_and_b32_e32 v[[RESLO0:[0-9]+]], 63, v[[LO0]]
286 ; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]]
286287 ; SI-NOT: and
287288 ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
288 ; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]]
289 ; SI-NOT: and
290289 ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
291290 define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
292291 %a = load volatile i64, i64 addrspace(1)* %aptr
485484 ; low 32-bits, which is not a valid 64-bit inline immmediate.
486485
487486 ; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64:
487 ; SI: s_load_dwordx2
488488 ; SI: s_load_dword s
489 ; SI: s_load_dwordx2
490489 ; SI-NOT: and
491490 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
492491 ; SI-NOT: and
154154 }
155155
156156 ; FUNC-LABEL: {{^}}s_ctpop_i65:
157 ; GCN: s_bcnt1_i32_b64 [[REG0:s[0-9]+]],
157158 ; GCN: s_and_b32
158 ; GCN: s_bcnt1_i32_b64 [[REG0:s[0-9]+]],
159159 ; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]],
160160 ; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]]
161161 ; GCN: s_endpgm
77
88 ; SI-LABEL: {{^}}offset_order:
99
10 ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
10 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
1111 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
1212 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
1313
1212 ; CI: v_ceil_f64_e32
1313 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
1414 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
15 ; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01
16 ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]]
15 ; FIXME: We should be using s_addk_i32 here, but the reg allocation hints
16 ; are not always followed.
17 ; SI-DAG: s_add_i32 [[SEXP0:s[0-9]+]], [[SEXP]], 0xfffffc01
18 ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP0]]
1719 ; SI-DAG: s_not_b64
1820 ; SI-DAG: s_and_b64
1921 ; SI-DAG: cmp_gt_i32
547547
548548 ; FUNC-LABEL: {{^}}test_f64_interp:
549549 ; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
550 ; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
550 ; SI: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
551551 define void @test_f64_interp(double addrspace(1)* %out,
552552 double addrspace(1)* %in1,
553553 double addrspace(1)* %in2,
55 ; SI-LABEL: {{^}}test_fmax3_f64:
66 ; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
77 ; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
8 ; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
98 ; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]]
9 ; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
1010 ; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
1111 ; SI: buffer_store_dwordx2 [[RESULT]],
1212 ; SI: s_endpgm
2121
2222 ; XXX: Could do v_or_b32 directly
2323 ; CHECK-LABEL: {{^}}extract_w_offset_salu_use_vector:
24 ; CHECK: s_mov_b32 m0
2425 ; CHECK-DAG: s_or_b32
2526 ; CHECK-DAG: s_or_b32
2627 ; CHECK-DAG: s_or_b32
2930 ; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
3031 ; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
3132 ; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
32 ; CHECK: s_mov_b32 m0
33 ; CHECK-NEXT: v_movrels_b32_e32
33 ; CHECK: v_movrels_b32_e32
3434 define void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
3535 entry:
3636 %idx = add i32 %in, 1
241241 ; FIXME: Why is vector copied in between?
242242
243243 ; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
244 ; CHECK-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
244245 ; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
245 ; CHECK-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
246246 ; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
247247 ; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
248248
249249 ; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
250 ; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0)
250 ; CHECK: s_waitcnt vmcnt(0)
251251
252252 ; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
253253 ; CHECK-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
302302 ; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
303303 ; CHECK-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
304304
305 ; CHECK-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
306305 ; CHECK-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
306 ; CHECK: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
307 ; CHECK: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
308 ; CHECK: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
307309
308310 ; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
309311 ; CHECK-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
323325 ; CHECK: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
324326 ; CHECK: s_mov_b32 m0, [[READLANE]]
325327 ; CHECK: s_and_saveexec_b64 vcc, vcc
326 ; CHECK-NEXT: v_movreld_b32_e32 [[VEC_ELT1]], 63
328 ; CHECK-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
327329 ; CHECK-NEXT: s_xor_b64 exec, exec, vcc
328330 ; CHECK: s_cbranch_execnz [[LOOP1]]
329331
343343 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
344344 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
345345
346 ; GCN: s_mov_b32 m0, [[SCALEDIDX]]
346 ; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
347347 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, 0
348348
349349 ; Increment to next element folded into base register, but FileCheck
342342 ; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32:
343343 ; GCN-DAG: s_load_dwordx16
344344 ; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
345 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
346 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
345 ; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
346 ; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
347347
348348 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
349349 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
359359 ; GCN-NOHSA: buffer_load_dwordx4
360360 ; GCN-NOHSA: buffer_load_dwordx4
361361 ; GCN-NOHSA: buffer_load_dwordx4
362 ; GCN-NOHSA: buffer_load_dwordx4
362 ; GCN-NOHSA-DAG: buffer_load_dwordx4
363363
364364 ; GCN-HSA: flat_load_dwordx4
365365 ; GCN-HSA: flat_load_dwordx4
55 ; resulting in losing the store to gptr
66
77 ; FUNC-LABEL: {{^}}missing_store_reduced:
8 ; SI: s_load_dwordx2
89 ; SI: ds_read_b64
910 ; SI-DAG: buffer_store_dword
1011 ; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
1112 ; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
12 ; SI: s_load_dword
13 ; SI: s_nop 2
13 ; SI: s_nop 3
1414 ; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
1515 ; SI: buffer_store_dword
1616 ; SI: s_endpgm
99 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
1010
1111 ; GCN-NOT: v_mov_b32
12 ; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
13 ; GCN-NOT: v_mov_b32
1214 ; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
13 ; GCN-NOT: v_mov_b32
14 ; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
1515 ; GCN-NOT: v_mov_b32
1616
1717 ; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
102102 ; FUNC-LABEL: {{^}}rcp_fabs_fneg_pat_multi_use_f32:
103103 ; GCN: s_load_dword [[SRC:s[0-9]+]]
104104 ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -|[[SRC]]|
105 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[SRC]], -|[[SRC]]|
105106 ; GCN: buffer_store_dword [[RCP]]
106
107 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[SRC]], -|[[SRC]]|
108107 ; GCN: buffer_store_dword [[MUL]]
109108 define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 {
110109 %src.fabs = call float @llvm.fabs.f32(float %src)
1717 }
1818
1919 ; GCN-LABEL: {{^}}vgpr_literal:
20 ; GCN: exp 15, 0, 1, 1, 1, v0, v0, v0, v0
21 ; GCN: s_waitcnt expcnt(0)
20 ; GCN: v_mov_b32_e32 v4, v0
21 ; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4
2222 ; GCN-DAG: v_mov_b32_e32 v0, 1.0
2323 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
2424 ; GCN-DAG: v_mov_b32_e32 v2, 4.0
2525 ; GCN-DAG: v_mov_b32_e32 v3, -1.0
26 ; GCN: s_waitcnt expcnt(0)
2627 ; GCN-NOT: s_endpgm
2728 define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
2829 call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
228229
229230
230231 ; GCN-LABEL: {{^}}structure_literal:
231 ; GCN: exp 15, 0, 1, 1, 1, v0, v0, v0, v0
232 ; GCN: s_waitcnt expcnt(0)
232 ; GCN: v_mov_b32_e32 v3, v0
233 ; GCN: exp 15, 0, 1, 1, 1, v3, v3, v3, v3
233234 ; GCN-DAG: v_mov_b32_e32 v0, 1.0
234235 ; GCN-DAG: s_mov_b32 s0, 2
235236 ; GCN-DAG: s_mov_b32 s1, 3
236237 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
237238 ; GCN-DAG: v_mov_b32_e32 v2, 4.0
239 ; GCN: s_waitcnt expcnt(0)
238240 define amdgpu_vs {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
239241 call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
240242 ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> }}
133133
134134 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2:
135135 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
136 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
137136 ; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
137 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
138138 define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
139139 %icmp0 = icmp ugt i32 %a, %b
140140 %sub0 = sub i32 %a, %b
169169 ; CI.
170170
171171 ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
172 ; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
172173 ; GCN-NOHSA-NOT: v_add
173174 ; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
174175 ; GCN-NOHSA-NOT: v_add
176 ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
175177 ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
176 ; GCN-NOHSA-NOT: v_add
177 ; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
178 ; GCN-NOHSA-NOT: v_add
179 ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
180178
181179 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
182180 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
9292 ; SI-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
9393 ; SI-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
9494
95 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
96 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
9597 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
96 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
97 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
98 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
9998 ; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}}
10099
101100 ; SI: v_cndmask_b32_e32
101 ; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
102102 ; SI: v_cndmask_b32_e32
103103 ; SI: buffer_store_dwordx2
104104 define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
340340
341341 ; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64:
342342 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
343 ; GCN: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}}
344 ; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]]
343345 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]]
344346 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
345 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
346 ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO]]{{\]}}
347 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
347 ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}}
348 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}}
348349 define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
349350 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
350351 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
6969 }
7070
7171 ; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
72 ; CI-DAG: buffer_store_dword
7372 ; CI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
7473 ; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
74 ; CI: buffer_store_dword
7575 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
7676 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
7777 ; CI: buffer_store_dword
135135 }
136136
137137 ; FUNC-LABEL: @reorder_global_load_local_store_global_load
138 ; CI: ds_write_b32
138139 ; CI: buffer_load_dword
139140 ; CI: buffer_load_dword
140 ; CI: ds_write_b32
141141 ; CI: buffer_store_dword
142142 define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
143143 %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
180180
181181 ; FUNC-LABEL: @reorder_global_offsets
182182 ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
183 ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
183184 ; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
184 ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
185 ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
186185 ; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
187186 ; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
187 ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
188188 ; CI: s_endpgm
189189 define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
190190 %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
3434 ; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
3535 ; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
3636 ; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
37 ; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
3738 ; SI: s_addc_u32
39 ; SI: buffer_store_dword v[[LO_VREG]],
3840 ; SI: v_mov_b32_e32
39 ; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
4041 ; SI: v_mov_b32_e32
41 ; SI: buffer_store_dword v[[LO_VREG]],
4242 define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
4343 %aa = add i64 %a, 234 ; Prevent shrinking store.
4444 %b = shl i64 %aa, 2
3737 ; SI: v_cndmask_b32_e64
3838 ; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]]
3939 ; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]]
40 ; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]]
4041 ; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], vcc, {{[vs][0-9]+}}, [[Num_S_Remainder]]
4142 ; SI-DAG: v_cndmask_b32_e64
4243 ; SI-DAG: v_cndmask_b32_e64
44 ; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]],
45 ; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]],
4346 ; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]]
44 ; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]]
45 ; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]],
4647 ; SI-DAG: v_cndmask_b32_e64
4748 ; SI-DAG: v_cndmask_b32_e64
4849 ; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]],
49 ; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]],
5050 ; SI-DAG: v_cndmask_b32_e64
5151 ; SI-DAG: v_cndmask_b32_e64
5252 ; SI: s_endpgm
4141 }
4242
4343 ; GCN-LABEL: {{^}}test_use_s_v_s:
44 ; GCN: buffer_load_dword [[VA0:v[0-9]+]]
45 ; GCN: buffer_load_dword [[VA1:v[0-9]+]]
4644 ; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
4745 ; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
46 ; SI: buffer_load_dword [[VA0:v[0-9]+]]
47 ; SI: buffer_load_dword [[VA1:v[0-9]+]]
4848
4949 ; GCN-NOT: v_mov_b32
5050 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
5151 ; GCN-NOT: v_mov_b32
52
53 ; VI: buffer_load_dword [[VA0:v[0-9]+]]
54 ; VI: buffer_load_dword [[VA1:v[0-9]+]]
5255
5356 ; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VA0]], [[SA]], [[VB]]
5457 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]]
1111
1212 ; GCN-LABEL: {{^}}main:
1313
14 ; GCN-DAG: s_mov_b32 s13, s12
15 ; GCN-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
16 ; GCN-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
17 ; GCN-DAG: s_mov_b32 s18, -1
18 ; SI-DAG: s_mov_b32 s19, 0xe8f000
19 ; VI-DAG: s_mov_b32 s19, 0xe80000
20
21 ; s13 is offset system SGPR
22 ; GCN: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Spill
23 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Reload
14 ; GCN-DAG: s_mov_b32 s11, s12
15 ; GCN-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
16 ; GCN-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
17 ; GCN-DAG: s_mov_b32 s14, -1
18 ; SI-DAG: s_mov_b32 s15, 0xe8f000
19 ; VI-DAG: s_mov_b32 s15, 0xe80000
20
21 ; s11 is offset system SGPR
22 ; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill
23 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload
2424
2525 ; GCN: NumVgprs: 256
2626 ; GCN: ScratchSize: 1024