llvm.org GIT mirror llvm / c0a189c
ScheduleDAGInstrs: Rework schedule graph builder. The new algorithm remembers the uses encountered while walking backwards until a matching def is found. Unlike the previous version, this: - Works without LiveIntervals being available - Allows increasing the precision to subregisters/lanemasks (not used for now) The changes in the AMDGPU tests are necessary because the R600 scheduler is not stable with respect to the order of nodes in the ready queues. Differential Revision: http://reviews.llvm.org/D9068 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254577 91177308-0d34-0410-b5e6-96231b3b80d8 Matthias Braun 4 years ago
16 changed file(s) with 281 addition(s) and 160 deletion(s). Raw diff Collapse all Expand all
3232 /// An individual mapping from virtual register number to SUnit.
/// Entries of this type are stored in sparse sets keyed by virtual register;
/// getSparseSetIndex() supplies that key.
3333 struct VReg2SUnit {
// Virtual register number this entry is about.
3434 unsigned VirtReg;
// Lanes of VirtReg this entry refers to. Code visible in this change stores
// ~0u when lane masks are not tracked and 0 for the VRegUses bookkeeping.
35 LaneBitmask LaneMask;
// Scheduling unit that accesses VirtReg.
3536 SUnit *SU;
3637
37 VReg2SUnit(unsigned reg, SUnit *su): VirtReg(reg), SU(su) {}
38 VReg2SUnit(unsigned VReg, LaneBitmask LaneMask, SUnit *SU)
39 : VirtReg(VReg), LaneMask(LaneMask), SU(SU) {}
3840
// Sparse-set key: the dense index derived from the virtual register number.
3941 unsigned getSparseSetIndex() const {
4042 return TargetRegisterInfo::virtReg2Index(VirtReg);
4143 }
44 };
45
46 /// Mapping from virtual register to SUnit including an operand index.
/// The DAG builder records one of these per vreg use so that, once the
/// matching def is found, the def->use latency can be computed for the exact
/// use operand.
47 struct VReg2SUnitOperIdx : public VReg2SUnit {
// Index of the operand, within SU's instruction, that this entry describes.
48 unsigned OperandIndex;
49
50 VReg2SUnitOperIdx(unsigned VReg, LaneBitmask LaneMask,
51 unsigned OperandIndex, SUnit *SU)
52 : VReg2SUnit(VReg, LaneMask, SU), OperandIndex(OperandIndex) {}
4253 };
4354
4455 /// Record a physical register access.
6879 /// Track local uses of virtual registers. These uses are gathered by the DAG
6980 /// builder and may be consulted by the scheduler to avoid iterating an entire
7081 /// vreg use list.
71 typedef SparseMultiSet<VReg2SUnit, VirtReg2IndexFunctor> VReg2UseMap;
/// Multimap from a virtual register to the VReg2SUnit entries recorded for it.
82 typedef SparseMultiSet<VReg2SUnit, VirtReg2IndexFunctor> VReg2SUnitMultiMap;
83
/// Same as VReg2SUnitMultiMap, but each entry also carries the operand index
/// of the access (see VReg2SUnitOperIdx).
84 typedef SparseMultiSet<VReg2SUnitOperIdx, VirtReg2IndexFunctor>
85 VReg2SUnitOperIdxMultiMap;
7286
7387 /// ScheduleDAGInstrs - A ScheduleDAG subclass for scheduling lists of
7488 /// MachineInstrs.
94108 /// it has taken responsibility for scheduling the terminator correctly.
95109 bool CanHandleTerminators;
96110
111 /// Whether lane masks should get tracked.
112 bool TrackLaneMasks;
113
97114 /// State specific to the current scheduling region.
98115 /// ------------------------------------------------
99116
116133 /// After calling BuildSchedGraph, each vreg used in the scheduling region
117134 /// is mapped to a set of SUnits. These include all local vreg uses, not
118135 /// just the uses for a singly defined vreg.
119 VReg2UseMap VRegUses;
136 VReg2SUnitMultiMap VRegUses;
120137
121138 /// State internal to DAG building.
122139 /// -------------------------------
128145 Reg2SUnitsMap Defs;
129146 Reg2SUnitsMap Uses;
130147
131 /// Track the last instruction in this region defining each virtual register.
132 VReg2SUnitMap VRegDefs;
148 /// Tracks the last instruction(s) in this region defining each virtual
149 /// register. There may be multiple current definitions for a register with
150 /// disjunct lanemasks.
151 VReg2SUnitMultiMap CurrentVRegDefs;
152 /// Tracks the last instructions in this region using each virtual register.
153 VReg2SUnitOperIdxMultiMap CurrentVRegUses;
133154
134155 /// PendingLoads - Remember where unknown loads are after the most recent
135156 /// unknown store, as we iterate. As with Defs and Uses, this is here
199220 /// input.
200221 void buildSchedGraph(AliasAnalysis *AA,
201222 RegPressureTracker *RPTracker = nullptr,
202 PressureDiffs *PDiffs = nullptr);
223 PressureDiffs *PDiffs = nullptr,
224 bool TrackLaneMasks = false);
203225
204226 /// addSchedBarrierDeps - Add dependencies from instructions in the current
205227 /// list of instructions being scheduled to scheduling barrier. We want to
246268 /// Other adjustments may be made to the instruction if necessary. Return
247269 /// true if the operand has been deleted, false if not.
248270 bool toggleKillFlag(MachineInstr *MI, MachineOperand &MO);
271
272 /// Returns a mask for which lanes get read/written by the given (register)
273 /// machine operand.
274 LaneBitmask getLaneMaskForMO(const MachineOperand &MO) const;
275
276 void collectVRegUses(SUnit *SU);
249277 };
250278
251279 /// newSUnit - Creates a new SUnit and return a ptr to it.
1212 //===----------------------------------------------------------------------===//
1313
1414 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
15 #include "llvm/ADT/IntEqClasses.h"
1516 #include "llvm/ADT/MapVector.h"
1617 #include "llvm/ADT/SmallPtrSet.h"
1718 #include "llvm/ADT/SmallSet.h"
1819 #include "llvm/Analysis/AliasAnalysis.h"
1920 #include "llvm/Analysis/ValueTracking.h"
20 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
2121 #include "llvm/CodeGen/MachineFunctionPass.h"
2222 #include "llvm/CodeGen/MachineFrameInfo.h"
2323 #include "llvm/CodeGen/MachineInstrBuilder.h"
5454 bool RemoveKillFlags)
5555 : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(LIS),
5656 RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false),
57 FirstDbgValue(nullptr) {
57 TrackLaneMasks(false), FirstDbgValue(nullptr) {
5858 DbgValues.clear();
5959
6060 const TargetSubtargetInfo &ST = mf.getSubtarget();
362362 }
363363 }
364364
/// Compute the lane mask accessed by register operand \p MO.
///
/// Returns ~0u (all lanes) when the operand's register class has no disjunct
/// subregisters, the whole-class lane mask when the operand has no
/// subregister index, and otherwise the lane mask of that subregister index.
/// NOTE(review): the register class is looked up through MRI, so this
/// presumably expects a virtual register operand -- confirm against callers.
365 LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
366 {
367 unsigned Reg = MO.getReg();
368 // No point in tracking lanemasks if we don't have interesting subregisters.
369 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
370 if (!RC.HasDisjunctSubRegs)
371 return ~0u;
372
373 unsigned SubReg = MO.getSubReg();
// A full-register access touches every lane of the register class.
374 if (SubReg == 0)
375 return RC.getLaneMask();
376 return TRI->getSubRegIndexLaneMask(SubReg);
377 }
378
365379 /// addVRegDefDeps - Add register output and data dependencies from this SUnit
366380 /// to instructions that occur later in the same scheduling region if they read
367381 /// from or write to the virtual register defined at OperIdx.
369383 /// TODO: Hoist loop induction variable increments. This has to be
370384 /// reevaluated. Generally, IV scheduling should be done before coalescing.
371385 void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
372 const MachineInstr *MI = SU->getInstr();
373 unsigned Reg = MI->getOperand(OperIdx).getReg();
374
375 // Singly defined vregs do not have output/anti dependencies.
376 // The current operand is a def, so we have at least one.
377 // Check here if there are any others...
386 MachineInstr *MI = SU->getInstr();
387 MachineOperand &MO = MI->getOperand(OperIdx);
388 unsigned Reg = MO.getReg();
389
// DefLaneMask: lanes written by this operand.
// KillLaneMask: lanes whose previous value cannot reach past this def.
390 LaneBitmask DefLaneMask;
391 LaneBitmask KillLaneMask;
392 if (TrackLaneMasks) {
393 bool IsKill = MO.getSubReg() == 0 || MO.isUndef();
394 DefLaneMask = getLaneMaskForMO(MO);
395 // If we have a <read-undef> flag, none of the lane values comes from an
396 // earlier instruction.
397 KillLaneMask = IsKill ? ~0u : DefLaneMask;
398
399 // Clear undef flag, we'll re-add it later once we know which subregister
400 // Def is first.
401 MO.setIsUndef(false);
402 } else {
403 DefLaneMask = ~0u;
404 KillLaneMask = ~0u;
405 }
406
407 if (MO.isDead()) {
408 assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() &&
409 "Dead defs should have no uses");
410 } else {
411 // Add data dependence to all uses we found so far.
412 const TargetSubtargetInfo &ST = MF.getSubtarget();
413 for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg),
414 E = CurrentVRegUses.end(); I != E; /*empty*/) {
415 LaneBitmask LaneMask = I->LaneMask;
416 // Ignore uses of other lanes.
417 if ((LaneMask & KillLaneMask) == 0) {
418 ++I;
419 continue;
420 }
421
// Only uses that read a lane written here get a data edge; uses that are
// merely killed (e.g. by a full-register def) are just retired below.
422 if ((LaneMask & DefLaneMask) != 0) {
423 SUnit *UseSU = I->SU;
424 MachineInstr *Use = UseSU->getInstr();
425 SDep Dep(SU, SDep::Data, Reg);
426 Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
427 I->OperandIndex));
428 ST.adjustSchedDependency(SU, UseSU, Dep);
429 UseSU->addPred(Dep);
430 }
431
// Lanes killed by this def no longer wait for an earlier def.
432 LaneMask &= ~KillLaneMask;
433 // If we found a Def for all lanes of this use, remove it from the list.
434 if (LaneMask != 0) {
435 I->LaneMask = LaneMask;
436 ++I;
437 } else
438 I = CurrentVRegUses.erase(I);
439 }
440 }
441
442 // Shortcut: Singly defined vregs do not have output/anti dependencies.
378443 if (MRI.hasOneDef(Reg))
379444 return;
380445
381 // Add output dependence to the next nearest def of this vreg.
446 // Add output dependence to the next nearest defs of this vreg.
382447 //
383448 // Unless this definition is dead, the output dependence should be
384449 // transitively redundant with antidependencies from this definition's
385450 // uses. We're conservative for now until we have a way to guarantee the uses
386451 // are not eliminated sometime during scheduling. The output dependence edge
387452 // is also useful if output latency exceeds def-use latency.
388 VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
389 if (DefI == VRegDefs.end())
390 VRegDefs.insert(VReg2SUnit(Reg, SU));
391 else {
392 SUnit *DefSU = DefI->SU;
393 if (DefSU != SU && DefSU != &ExitSU) {
394 SDep Dep(SU, SDep::Output, Reg);
395 Dep.setLatency(
396 SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
397 DefSU->addPred(Dep);
398 }
399 DefI->SU = SU;
400 }
453 LaneBitmask LaneMask = DefLaneMask;
454 for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
455 CurrentVRegDefs.end())) {
456 // Ignore defs for other lanes.
457 if ((V2SU.LaneMask & LaneMask) == 0)
458 continue;
459 // Add an output dependence.
460 SUnit *DefSU = V2SU.SU;
461 // Ignore additional defs of the same lanes in one instruction. This can
462 // happen because lanemasks are shared for targets with too many
463 // subregisters. We also use some representation tricks/hacks where we
464 // add super-register defs/uses, to imply that although we only access parts
465 // of the reg we care about the full one.
466 if (DefSU == SU)
467 continue;
468 SDep Dep(SU, SDep::Output, Reg);
469 Dep.setLatency(
470 SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
471 DefSU->addPred(Dep);
472
473 // Update current definition. This can get tricky if the def was about a
474 // bigger lanemask before. We then have to shrink it and create a new
475 // VReg2SUnit for the non-overlapping part.
// Finish all writes through V2SU before inserting: insert() may grow the
// set's underlying storage, which would leave the V2SU reference dangling.
// The previous owner was saved in DefSU above.
476 LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
477 LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
478 V2SU.SU = SU;
479 V2SU.LaneMask = OverlapMask;
480 if (NonOverlapMask != 0)
481 CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, DefSU));
482 }
483 // If there was no CurrentVRegDefs entry for some lanes yet, create one.
// NOTE(review): LaneMask is not narrowed inside the loop above, so this
// insert runs whenever DefLaneMask is non-zero -- confirm this is intended.
484 if (LaneMask != 0)
485 CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
401486 }
402487
403488 /// addVRegUseDeps - Add a register data dependency if the instruction that
407492 ///
408493 /// TODO: Handle ExitSU "uses" properly.
409494 void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
410 MachineInstr *MI = SU->getInstr();
411 unsigned Reg = MI->getOperand(OperIdx).getReg();
412
413 // Record this local VReg use.
414 VReg2UseMap::iterator UI = VRegUses.find(Reg);
415 for (; UI != VRegUses.end(); ++UI) {
416 if (UI->SU == SU)
417 break;
418 }
419 if (UI == VRegUses.end())
420 VRegUses.insert(VReg2SUnit(Reg, SU));
421
422 // Lookup this operand's reaching definition.
423 assert(LIS && "vreg dependencies requires LiveIntervals");
424 LiveQueryResult LRQ
425 = LIS->getInterval(Reg).Query(LIS->getInstructionIndex(MI));
426 VNInfo *VNI = LRQ.valueIn();
427
428 // VNI will be valid because MachineOperand::readsReg() is checked by caller.
429 assert(VNI && "No value to read by operand");
430 MachineInstr *Def = LIS->getInstructionFromIndex(VNI->def);
431 // Phis and other noninstructions (after coalescing) have a NULL Def.
432 if (Def) {
433 SUnit *DefSU = getSUnit(Def);
434 if (DefSU) {
435 // The reaching Def lives within this scheduling region.
436 // Create a data dependence.
437 SDep dep(DefSU, SDep::Data, Reg);
438 // Adjust the dependence latency using operand def/use information, then
439 // allow the target to perform its own adjustments.
440 int DefOp = Def->findRegisterDefOperandIdx(Reg);
441 dep.setLatency(SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx));
442
443 const TargetSubtargetInfo &ST = MF.getSubtarget();
444 ST.adjustSchedDependency(DefSU, SU, const_cast(dep));
445 SU->addPred(dep);
446 }
447 }
448
449 // Add antidependence to the following def of the vreg it uses.
450 VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
451 if (DefI != VRegDefs.end() && DefI->SU != SU)
452 DefI->SU->addPred(SDep(SU, SDep::Anti, Reg));
495 const MachineInstr *MI = SU->getInstr();
496 const MachineOperand &MO = MI->getOperand(OperIdx);
497 unsigned Reg = MO.getReg();
498
499 // Remember the use. Data dependencies will be added when we find the def.
// The operand index is stored with the use so addVRegDefDeps can compute the
// def->use operand latency later. All lanes (~0u) are recorded when lane
// masks are not tracked.
500 LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO) : ~0u;
501 CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU));
502
503 // Add antidependences to the following defs of the vreg.
504 for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
505 CurrentVRegDefs.end())) {
506 // Ignore defs for unrelated lanes.
507 LaneBitmask PrevDefLaneMask = V2SU.LaneMask;
508 if ((PrevDefLaneMask & LaneMask) == 0)
509 continue;
// A def made by this same instruction gets no edge to itself.
510 if (V2SU.SU == SU)
511 continue;
512
513 V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg));
514 }
453515 }
454516
455517 /// Return true if MI is an instruction we are unable to reason about
732794 }
733795 }
734796
/// Record in VRegUses every virtual register that \p SU's instruction uses.
///
/// An operand is considered when it is a register use, or -- only while
/// TrackLaneMasks is set -- a def that carries a subregister index. Each
/// (register, SU) pair is recorded at most once; the stored lane mask is 0.
797 void ScheduleDAGInstrs::collectVRegUses(SUnit *SU) {
798 const MachineInstr *MI = SU->getInstr();
799 for (const MachineOperand &MO : MI->operands()) {
800 if (!MO.isReg())
801 continue;
// Skip defs, except subregister defs when lane masks are being tracked.
802 if (!MO.isUse() && (MO.getSubReg() == 0 || !TrackLaneMasks))
803 continue;
804
805 unsigned Reg = MO.getReg();
806 if (!TargetRegisterInfo::isVirtualRegister(Reg))
807 continue;
808
809 // Record this local VReg use.
// Linear scan over the entries for Reg so the same SU is not added twice.
810 VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg);
811 for (; UI != VRegUses.end(); ++UI) {
812 if (UI->SU == SU)
813 break;
814 }
815 if (UI == VRegUses.end())
816 VRegUses.insert(VReg2SUnit(Reg, 0, SU));
817 }
818 }
819
735820 /// If RegPressure is non-null, compute register pressure as a side effect. The
736821 /// DAG builder is an efficient place to do it because it already visits
737822 /// operands.
738823 void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
739824 RegPressureTracker *RPTracker,
740 PressureDiffs *PDiffs) {
825 PressureDiffs *PDiffs,
826 bool TrackLaneMasks) {
741827 const TargetSubtargetInfo &ST = MF.getSubtarget();
742828 bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
743829 : ST.useAA();
744830 AliasAnalysis *AAForDep = UseAA ? AA : nullptr;
745831
832 this->TrackLaneMasks = TrackLaneMasks;
746833 MISUnitMap.clear();
747834 ScheduleDAG::clearDAG();
748835
776863 Defs.setUniverse(TRI->getNumRegs());
777864 Uses.setUniverse(TRI->getNumRegs());
778865
779 assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs");
866 assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs");
867 assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses");
868 unsigned NumVirtRegs = MRI.getNumVirtRegs();
869 CurrentVRegDefs.setUniverse(NumVirtRegs);
870 CurrentVRegUses.setUniverse(NumVirtRegs);
871
780872 VRegUses.clear();
781 VRegDefs.setUniverse(MRI.getNumVirtRegs());
782 VRegUses.setUniverse(MRI.getNumVirtRegs());
873 VRegUses.setUniverse(NumVirtRegs);
783874
784875 // Model data dependencies between instructions being scheduled and the
785876 // ExitSU.
807898 RPTracker->recede(/*LiveUses=*/nullptr, PDiff);
808899 assert(RPTracker->getPos() == std::prev(MII) &&
809900 "RPTracker can't find MI");
901 collectVRegUses(SU);
810902 }
811903
812904 assert(
10561148
10571149 Defs.clear();
10581150 Uses.clear();
1059 VRegDefs.clear();
1151 CurrentVRegDefs.clear();
1152 CurrentVRegUses.clear();
10601153 PendingLoads.clear();
10611154 }
10621155
55
66 ; FUNC-LABEL: {{^}}width_2d:
77 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
8 ; EG: MOV [[VAL]], KC0[2].Z
8 ; EG: MOV * [[VAL]], KC0[2].Z
99 define void @width_2d (%opencl.image2d_t addrspace(1)* %in,
1010 i32 addrspace(1)* %out) {
1111 entry:
1818
1919 ; FUNC-LABEL: {{^}}width_3d:
2020 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
21 ; EG: MOV [[VAL]], KC0[2].Z
21 ; EG: MOV * [[VAL]], KC0[2].Z
2222 define void @width_3d (%opencl.image3d_t addrspace(1)* %in,
2323 i32 addrspace(1)* %out) {
2424 entry:
3535
3636 ; FUNC-LABEL: {{^}}height_2d:
3737 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
38 ; EG: MOV [[VAL]], KC0[2].W
38 ; EG: MOV * [[VAL]], KC0[2].W
3939 define void @height_2d (%opencl.image2d_t addrspace(1)* %in,
4040 i32 addrspace(1)* %out) {
4141 entry:
4848
4949 ; FUNC-LABEL: {{^}}height_3d:
5050 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
51 ; EG: MOV [[VAL]], KC0[2].W
51 ; EG: MOV * [[VAL]], KC0[2].W
5252 define void @height_3d (%opencl.image3d_t addrspace(1)* %in,
5353 i32 addrspace(1)* %out) {
5454 entry:
6565
6666 ; FUNC-LABEL: {{^}}depth_3d:
6767 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
68 ; EG: MOV [[VAL]], KC0[3].X
68 ; EG: MOV * [[VAL]], KC0[3].X
6969 define void @depth_3d (%opencl.image3d_t addrspace(1)* %in,
7070 i32 addrspace(1)* %out) {
7171 entry:
8282
8383 ; FUNC-LABEL: {{^}}data_type_2d:
8484 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
85 ; EG: MOV [[VAL]], KC0[3].Y
85 ; EG: MOV * [[VAL]], KC0[3].Y
8686 define void @data_type_2d (%opencl.image2d_t addrspace(1)* %in,
8787 i32 addrspace(1)* %out) {
8888 entry:
9595
9696 ; FUNC-LABEL: {{^}}data_type_3d:
9797 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
98 ; EG: MOV [[VAL]], KC0[3].Y
98 ; EG: MOV * [[VAL]], KC0[3].Y
9999 define void @data_type_3d (%opencl.image3d_t addrspace(1)* %in,
100100 i32 addrspace(1)* %out) {
101101 entry:
112112
113113 ; FUNC-LABEL: {{^}}channel_order_2d:
114114 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
115 ; EG: MOV [[VAL]], KC0[3].Z
115 ; EG: MOV * [[VAL]], KC0[3].Z
116116 define void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in,
117117 i32 addrspace(1)* %out) {
118118 entry:
125125
126126 ; FUNC-LABEL: {{^}}channel_order_3d:
127127 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
128 ; EG: MOV [[VAL]], KC0[3].Z
128 ; EG: MOV * [[VAL]], KC0[3].Z
129129 define void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in,
130130 i32 addrspace(1)* %out) {
131131 entry:
144144 ;
145145 ; FUNC-LABEL: {{^}}image_arg_2nd:
146146 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
147 ; EG: MOV [[VAL]], KC0[4].Z
147 ; EG: MOV * [[VAL]], KC0[4].Z
148148 define void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1,
149149 i32 %x,
150150 %opencl.image2d_t addrspace(1)* %in2,
66 ; ADD_INT literal.x KC0[2].Z, 5
77
88 ; CHECK: {{^}}i32_literal:
9 ; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
10 ; CHECK-NEXT: LSHR
9 ; CHECK: LSHR
10 ; CHECK-NEXT: ADD_INT * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
1111 ; CHECK-NEXT: 5
1212 define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
1313 entry:
2323 ; ADD literal.x KC0[2].Z, 5.0
2424
2525 ; CHECK: {{^}}float_literal:
26 ; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
27 ; CHECK-NEXT: LSHR
26 ; CHECK: LSHR
27 ; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
2828 ; CHECK-NEXT: 1084227584(5.0
2929 define void @float_literal(float addrspace(1)* %out, float %in) {
3030 entry:
33
44 ; FUNC-LABEL: {{^}}read_workdim:
55 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
6 ; EG: MOV [[VAL]], KC0[2].Z
6 ; EG: MOV * [[VAL]], KC0[2].Z
77
88 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
99 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
22 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
33
44 ; R600: {{^}}amdgpu_trunc:
5 ; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z
5 ; R600: TRUNC {{\*? *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
66 ; SI: {{^}}amdgpu_trunc:
77 ; SI: v_trunc_f32
88
44
55 ; FUNC-LABEL: {{^}}local_size_x:
66 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
7 ; EG: MOV [[VAL]], KC0[1].Z
7 ; EG: MOV * [[VAL]], KC0[1].Z
88
99 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
1010 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
2222
2323 ; FUNC-LABEL: {{^}}local_size_y:
2424 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
25 ; EG: MOV [[VAL]], KC0[1].W
25 ; EG: MOV * [[VAL]], KC0[1].W
2626
2727 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
2828 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
3737
3838 ; FUNC-LABEL: {{^}}local_size_z:
3939 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
40 ; EG: MOV [[VAL]], KC0[2].X
40 ; EG: MOV * [[VAL]], KC0[2].X
4141
4242 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
4343 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
152152 }
153153
154154 ; FUNC-LABEL: {{^}}or_i1:
155 ; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
155 ; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}}
156156
157157 ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
158158 define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
44 ; SET*DX10 instructions.
55
66 ; CHECK: {{^}}fcmp_une_select_fptosi:
7 ; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
8 ; CHECK-NEXT: LSHR
7 ; CHECK: LSHR
8 ; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
99 ; CHECK-NEXT: 1084227584(5.000000e+00)
1010 define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
1111 entry:
1818 }
1919
2020 ; CHECK: {{^}}fcmp_une_select_i32:
21 ; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
22 ; CHECK-NEXT: LSHR
21 ; CHECK: LSHR
22 ; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
2323 ; CHECK-NEXT: 1084227584(5.000000e+00)
2424 define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
2525 entry:
3030 }
3131
3232 ; CHECK: {{^}}fcmp_oeq_select_fptosi:
33 ; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
34 ; CHECK-NEXT: LSHR
33 ; CHECK: LSHR
34 ; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
3535 ; CHECK-NEXT: 1084227584(5.000000e+00)
3636 define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) {
3737 entry:
4444 }
4545
4646 ; CHECK: {{^}}fcmp_oeq_select_i32:
47 ; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
48 ; CHECK-NEXT: LSHR
47 ; CHECK: LSHR
48 ; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
4949 ; CHECK-NEXT: 1084227584(5.000000e+00)
5050 define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) {
5151 entry:
5656 }
5757
5858 ; CHECK: {{^}}fcmp_ogt_select_fptosi:
59 ; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
60 ; CHECK-NEXT: LSHR
59 ; CHECK: LSHR
60 ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
6161 ; CHECK-NEXT: 1084227584(5.000000e+00)
6262 define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) {
6363 entry:
7070 }
7171
7272 ; CHECK: {{^}}fcmp_ogt_select_i32:
73 ; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
74 ; CHECK-NEXT: LSHR
73 ; CHECK: LSHR
74 ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
7575 ; CHECK-NEXT: 1084227584(5.000000e+00)
7676 define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) {
7777 entry:
8282 }
8383
8484 ; CHECK: {{^}}fcmp_oge_select_fptosi:
85 ; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
86 ; CHECK-NEXT: LSHR
85 ; CHECK: LSHR
86 ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
8787 ; CHECK-NEXT: 1084227584(5.000000e+00)
8888 define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) {
8989 entry:
9696 }
9797
9898 ; CHECK: {{^}}fcmp_oge_select_i32:
99 ; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
100 ; CHECK-NEXT: LSHR
99 ; CHECK: LSHR
100 ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
101101 ; CHECK-NEXT: 1084227584(5.000000e+00)
102102 define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) {
103103 entry:
108108 }
109109
110110 ; CHECK: {{^}}fcmp_ole_select_fptosi:
111 ; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
112 ; CHECK-NEXT: LSHR
111 ; CHECK: LSHR
112 ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
113113 ; CHECK-NEXT: 1084227584(5.000000e+00)
114114 define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) {
115115 entry:
122122 }
123123
124124 ; CHECK: {{^}}fcmp_ole_select_i32:
125 ; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
126 ; CHECK-NEXT: LSHR
125 ; CHECK: LSHR
126 ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
127127 ; CHECK-NEXT: 1084227584(5.000000e+00)
128128 define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) {
129129 entry:
134134 }
135135
136136 ; CHECK: {{^}}fcmp_olt_select_fptosi:
137 ; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
138 ; CHECK-NEXT: LSHR
137 ; CHECK: LSHR
138 ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
139139 ; CHECK-NEXT: 1084227584(5.000000e+00)
140140 define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) {
141141 entry:
148148 }
149149
150150 ; CHECK: {{^}}fcmp_olt_select_i32:
151 ; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
152 ; CHECK-NEXT: LSHR
151 ; CHECK: LSHR
152 ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
153153 ; CHECK-NEXT: 1084227584(5.000000e+00)
154154 define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) {
155155 entry:
1111 ; SI: buffer_store_dword [[EXTRACT]],
1212
1313 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
14 ; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1
15 ; EG-NEXT: LSHR * [[ADDR]]
14 ; EG: LSHR * [[ADDR]]
15 ; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1
1616 define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) {
1717 %shl = shl i32 %in, 31
1818 %sext = ashr i32 %shl, 31
5252 ret void
5353 }
5454
55 ;EG: {{^}}shl_i64:
55 ;EG-LABEL: {{^}}shl_i64:
5656 ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
5757 ;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
58 ;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
59 ;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
58 ;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
59 ;EG-DAG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
6060 ;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]]
61 ;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
62 ;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}}
61 ;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}}
62 ;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]|PV.[XYZW]}}
6363 ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
6464 ;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
6565 ;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
7979 ret void
8080 }
8181
82 ;EG: {{^}}shl_v2i64:
82 ;EG-LABEL: {{^}}shl_v2i64:
8383 ;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
8484 ;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
8585 ;EG-DAG: LSHR {{\*? *}}[[COMPSHA]]
6969 ;EG-LABEL: {{^}}ashr_i64_2:
7070 ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
7171 ;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
72 ;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
73 ;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
72 ;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
73 ;EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
7474 ;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
75 ;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
76 ;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
75 ;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}}
76 ;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|PV.[XYZW]|[[SHIFT]]}}
7777 ;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
7878 ;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
7979 ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
6464
6565 ; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
6666 ; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
67 ; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
6867 ; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
68 ; EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
6969 ; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
70 ; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
71 ; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
72 ; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
70 ; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}}
71 ; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]|PV\.[XYZW]}}
7372 ; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
74 ; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
73 ; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]|PS}}
74 ; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], [[SHIFT]]
7575 ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
7676 define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
7777 %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
22 ; These tests are for condition codes that are not supported by the hardware
33
44 ; CHECK-LABEL: {{^}}slt:
5 ; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
6 ; CHECK-NEXT: LSHR
5 ; CHECK: LSHR
6 ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
77 ; CHECK-NEXT: 5(7.006492e-45)
88 define void @slt(i32 addrspace(1)* %out, i32 %in) {
99 entry:
1414 }
1515
1616 ; CHECK-LABEL: {{^}}ult_i32:
17 ; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
18 ; CHECK-NEXT: LSHR
17 ; CHECK: LSHR
18 ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
1919 ; CHECK-NEXT: 5(7.006492e-45)
2020 define void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
2121 entry:
3939 }
4040
4141 ; CHECK-LABEL: {{^}}ult_float_native:
42 ; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
43 ; CHECK-NEXT: LSHR *
42 ; CHECK: LSHR
43 ; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
4444 ; CHECK-NEXT: 1084227584(5.000000e+00)
4545 define void @ult_float_native(float addrspace(1)* %out, float %in) {
4646 entry:
5151 }
5252
5353 ; CHECK-LABEL: {{^}}olt:
54 ; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
55 ; CHECK-NEXT: LSHR *
54 ; CHECK: LSHR
55 ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
5656 ; CHECK-NEXT: 1084227584(5.000000e+00)
5757 define void @olt(float addrspace(1)* %out, float %in) {
5858 entry:
6363 }
6464
6565 ; CHECK-LABEL: {{^}}sle:
66 ; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
67 ; CHECK-NEXT: LSHR
66 ; CHECK: LSHR
67 ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
6868 ; CHECK-NEXT: 6(8.407791e-45)
6969 define void @sle(i32 addrspace(1)* %out, i32 %in) {
7070 entry:
7575 }
7676
7777 ; CHECK-LABEL: {{^}}ule_i32:
78 ; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
79 ; CHECK-NEXT: LSHR
78 ; CHECK: LSHR
79 ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
8080 ; CHECK-NEXT: 6(8.407791e-45)
8181 define void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
8282 entry:
100100 }
101101
102102 ; CHECK-LABEL: {{^}}ule_float_native:
103 ; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
104 ; CHECK-NEXT: LSHR *
103 ; CHECK: LSHR
104 ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
105105 ; CHECK-NEXT: 1084227584(5.000000e+00)
106106 define void @ule_float_native(float addrspace(1)* %out, float %in) {
107107 entry:
112112 }
113113
114114 ; CHECK-LABEL: {{^}}ole:
115 ; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z
116 ; CHECK-NEXT: LSHR *
115 ; CHECK: LSHR
116 ; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
117117 ; CHECK-NEXT:1084227584(5.000000e+00)
118118 define void @ole(float addrspace(1)* %out, float %in) {
119119 entry:
66
77 ; FUNC-LABEL: {{^}}ngroups_x:
88 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
9 ; EG: MOV [[VAL]], KC0[0].X
9 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
1010
1111 ; HSA: .amd_kernel_code_t
1212
3737
3838 ; FUNC-LABEL: {{^}}ngroups_y:
3939 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
40 ; EG: MOV [[VAL]], KC0[0].Y
40 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
4141
4242 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
4343 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
5252
5353 ; FUNC-LABEL: {{^}}ngroups_z:
5454 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
55 ; EG: MOV [[VAL]], KC0[0].Z
55 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
5656
5757 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
5858 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
6767
6868 ; FUNC-LABEL: {{^}}global_size_x:
6969 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
70 ; EG: MOV [[VAL]], KC0[0].W
70 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
7171
7272 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
7373 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
8282
8383 ; FUNC-LABEL: {{^}}global_size_y:
8484 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
85 ; EG: MOV [[VAL]], KC0[1].X
85 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
8686
8787 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
8888 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
9797
9898 ; FUNC-LABEL: {{^}}global_size_z:
9999 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
100 ; EG: MOV [[VAL]], KC0[1].Y
100 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
101101
102102 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
103103 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
3737 }
3838
3939 ; FUNC-LABEL: {{^}}xor_i1:
40 ; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
40 ; EG: XOR_INT {{\** *}}{{T[0-9]+\.[XYZW]}}, {{PS|PV\.[XYZW]}}, {{PS|PV\.[XYZW]}}
4141
4242 ; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}}
4343 ; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}}