llvm.org GIT mirror llvm / dbda053

[AMDGPU] gfx1010 wait count insertion

Differential Revision: https://reviews.llvm.org/D61534

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359938 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Stanislav Mekhanoshin, 1 year, 5 months ago

2 changed files with 408 additions and 60 deletions.
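On gfx1010 the legacy vmcnt counter is effectively split: VMEM reads still retire against vmcnt, while VMEM writes retire against a new vscnt counter that is waited on with a separate s_waitcnt_vscnt instruction. The pass therefore gains a fourth counter kind (VS_CNT) and has to merge a required vscnt wait with the three legacy counters. Below is a minimal standalone sketch of that merge rule; the names are illustrative, not the pass's own AMDGPU::Waitcnt type, and it only mirrors the std::min/combined()/hasWait() logic visible in the diff that follows, where ~0u means "no wait needed" and combining two requirements keeps the stricter (smaller) count.

#include <algorithm>

// Illustrative only: a stand-in for the real AMDGPU::Waitcnt.
struct WaitSketch {
  unsigned VmCnt = ~0u;   // outstanding VMEM reads (gfx10: loads only)
  unsigned ExpCnt = ~0u;  // outstanding exports / GPR locks
  unsigned LgkmCnt = ~0u; // LDS/GDS/SMEM/message traffic
  unsigned VsCnt = ~0u;   // gfx10 only: outstanding VMEM writes

  // Combining two requirements keeps the stricter (smaller) count.
  WaitSketch combined(const WaitSketch &Other) const {
    WaitSketch W;
    W.VmCnt = std::min(VmCnt, Other.VmCnt);
    W.ExpCnt = std::min(ExpCnt, Other.ExpCnt);
    W.LgkmCnt = std::min(LgkmCnt, Other.LgkmCnt);
    W.VsCnt = std::min(VsCnt, Other.VsCnt);
    return W;
  }

  // True if any counter still requires an explicit wait instruction.
  bool hasWait() const {
    return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
  }
};

Because vscnt has its own wait instruction, a point that needs both kinds of waits now gets two instructions, for example an s_waitcnt vmcnt(0) followed by s_waitcnt_vscnt null, 0x0, which is exactly what the new test at the end of this commit checks for gfx1010.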
(changed file 1 of 2: the AMDGPU wait-count insertion pass)

@@ -99 +99 @@
 
 #define CNT_MASK(t) (1u << (t))
 
-enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
+enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
 
 iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
   return make_range(enum_iterator(VM_CNT),
@@ -112 +112 @@
   uint32_t VmcntMax;
   uint32_t ExpcntMax;
   uint32_t LgkmcntMax;
+  uint32_t VscntMax;
   int32_t NumVGPRsMax;
   int32_t NumSGPRsMax;
 } HardwareLimits;
@@ -125 +126 @@
 
 enum WaitEventType {
   VMEM_ACCESS,       // vector-memory read & write
+  VMEM_READ_ACCESS,  // vector-memory read
+  VMEM_WRITE_ACCESS, // vector-memory write
   LDS_ACCESS,        // lds read & write
   GDS_ACCESS,        // gds read & write
   SQ_MESSAGE,        // send message
@@ -138 +141 @@
 };
 
 static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
-    (1 << VMEM_ACCESS),
+    (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
     (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
         (1 << SQ_MESSAGE),
     (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
        (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
+    (1 << VMEM_WRITE_ACCESS)
 };
 
 // The mapping is:
@@ -170 +174 @@
   case LGKM_CNT:
     Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
     break;
+  case VS_CNT:
+    Wait.VsCnt = std::min(Wait.VsCnt, Count);
+    break;
   default:
     llvm_unreachable("bad InstCounterType");
   }
@@ -198 +205 @@
     return HardwareLimits.LgkmcntMax;
   case EXP_CNT:
     return HardwareLimits.ExpcntMax;
+  case VS_CNT:
+    return HardwareLimits.VscntMax;
   default:
     break;
   }
@@ -220 +229 @@
 
 // Mapping from event to counter.
 InstCounterType eventCounter(WaitEventType E) {
-  if (E == VMEM_ACCESS)
+  if (WaitEventMaskForInst[VM_CNT] & (1 << E))
     return VM_CNT;
   if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
     return LGKM_CNT;
+  if (WaitEventMaskForInst[VS_CNT] & (1 << E))
+    return VS_CNT;
   assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
   return EXP_CNT;
 }
@@ -664 +675 @@
   case EXP_CNT:
     OS << " EXP_CNT(" << UB - LB << "): ";
     break;
+  case VS_CNT:
+    OS << " VS_CNT(" << UB - LB << "): ";
+    break;
   default:
     OS << " UNKNOWN(" << UB - LB << "): ";
     break;
@@ -703 +717 @@
 bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
   return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
          simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
-         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
+         simplifyWaitcnt(VS_CNT, Wait.VsCnt);
 }
 
 bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -746 +761 @@
   applyWaitcnt(VM_CNT, Wait.VmCnt);
   applyWaitcnt(EXP_CNT, Wait.ExpCnt);
   applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+  applyWaitcnt(VS_CNT, Wait.VsCnt);
 }
 
 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -816 +832 @@
   // TODO: Handle other cases of NeedsWaitcntVmBefore()
   if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
       MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
-      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
+      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
+      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
+      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
     Wait.VmCnt = 0;
   }
 
@@ -825 +843 @@
   // with knowledge of the called routines.
   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
       MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
-    Wait = AMDGPU::Waitcnt::allZero(IV);
+    Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
   }
   // Resolve vm waits before gs-done.
   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
@@ -997 +1015 @@
   // requiring a WAITCNT beforehand.
   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
       !ST->hasAutoWaitcntBeforeBarrier()) {
-    Wait = AMDGPU::Waitcnt::allZero(IV);
+    Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
   }
 
   // TODO: Remove this work-around, enable the assert for Bug 457939
@@ -1015 +1033 @@
   if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
     bool Modified = false;
     if (OldWaitcntInstr) {
-      if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
-        TrackedWaitcntSet.erase(OldWaitcntInstr);
-        OldWaitcntInstr->eraseFromParent();
-        Modified = true;
-      } else {
-        int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
-        ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
-      }
-      Modified = true;
+      for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
+           &*II != &MI; II = NextI, ++NextI) {
+        if (II->isDebugInstr())
+          continue;
+
+        if (TrackedWaitcntSet.count(&*II)) {
+          TrackedWaitcntSet.erase(&*II);
+          II->eraseFromParent();
+          Modified = true;
+        } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+          int64_t Imm = II->getOperand(0).getImm();
+          ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
+        } else {
+          assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+          assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+          ScoreBrackets.applyWaitcnt(
+              AMDGPU::Waitcnt(0, 0, 0, II->getOperand(1).getImm()));
+        }
+      }
     }
     return Modified;
   }
@@ -1037 +1065 @@
     Wait.ExpCnt = 0;
   if (ForceEmitWaitcnt[LGKM_CNT])
     Wait.LgkmCnt = 0;
+  if (ForceEmitWaitcnt[VS_CNT])
+    Wait.VsCnt = 0;
 
   ScoreBrackets.applyWaitcnt(Wait);
 
   AMDGPU::Waitcnt OldWait;
+  bool Modified = false;
+
   if (OldWaitcntInstr) {
-    OldWait =
-        AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
-  }
-  if (OldWait.dominates(Wait))
-    return false;
-
-  if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
-    Wait = Wait.combined(OldWait);
-
-  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
-  if (OldWaitcntInstr) {
-    OldWaitcntInstr->getOperand(0).setImm(Enc);
-
-    LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
-               << "Old Instr: " << MI << '\n'
-               << "New Instr: " << *OldWaitcntInstr << '\n');
-  } else {
+    for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
+         &*II != &MI; II = NextI, NextI++) {
+      if (II->isDebugInstr())
+        continue;
+
+      if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+        unsigned IEnc = II->getOperand(0).getImm();
+        AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+        OldWait = OldWait.combined(IWait);
+        if (!TrackedWaitcntSet.count(&*II))
+          Wait = Wait.combined(IWait);
+        unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
+        if (IEnc != NewEnc) {
+          II->getOperand(0).setImm(NewEnc);
+          Modified = true;
+        }
+        Wait.VmCnt = ~0u;
+        Wait.LgkmCnt = ~0u;
+        Wait.ExpCnt = ~0u;
+      } else {
+        assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+        assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+
+        unsigned ICnt = II->getOperand(1).getImm();
+        OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
+        if (!TrackedWaitcntSet.count(&*II))
+          Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
+        if (Wait.VsCnt != ICnt) {
+          II->getOperand(1).setImm(Wait.VsCnt);
+          Modified = true;
+        }
+        Wait.VsCnt = ~0u;
+      }
+
+      LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
+                 << "Old Instr: " << MI << '\n'
+                 << "New Instr: " << *II << '\n');
+
+      if (!Wait.hasWait())
+        return Modified;
+    }
+  }
+
+  if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
+    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
     auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                              MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                          .addImm(Enc);
     TrackedWaitcntSet.insert(SWaitInst);
+    Modified = true;
 
     LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                << "Old Instr: " << MI << '\n'
                << "New Instr: " << *SWaitInst << '\n');
   }
 
-  return true;
+  if (Wait.VsCnt != ~0u) {
+    assert(ST->hasVscnt());
+
+    auto SWaitInst =
+        BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+                TII->get(AMDGPU::S_WAITCNT_VSCNT))
+            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+            .addImm(Wait.VsCnt);
+    TrackedWaitcntSet.insert(SWaitInst);
+    Modified = true;
+
+    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+               << "Old Instr: " << MI << '\n'
+               << "New Instr: " << *SWaitInst << '\n');
+  }
+
+  return Modified;
 }
 
 // This is a flat memory operation. Check to see if it has memory
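The rewritten insertion path above no longer deletes or overwrites a single pre-existing s_waitcnt. It walks every s_waitcnt / s_waitcnt_vscnt found between the recorded OldWaitcntInstr and the current instruction, folds their counts into the wait that is still required, tightens them in place when needed, and then clears the components they already cover so only genuinely missing waits are emitted as new instructions. A compact, hedged sketch of that fold-and-clear step (names are illustrative, not the pass's own):

#include <algorithm>

// Illustrative: fold one pre-existing legacy s_waitcnt into the wait we still
// need; afterwards the covered components are reset to "no wait" (~0u) so no
// duplicate s_waitcnt is created later. VsCnt is handled the same way by the
// s_waitcnt_vscnt branch.
struct NeededWait { unsigned VmCnt = ~0u, ExpCnt = ~0u, LgkmCnt = ~0u, VsCnt = ~0u; };

// Returns true if the existing instruction's count had to be tightened.
static bool foldExistingWaitcnt(NeededWait &Need, unsigned &ExistingVm,
                                unsigned &ExistingExp, unsigned &ExistingLgkm) {
  bool Tightened = false;
  auto fold = [&](unsigned &Existing, unsigned &Needed) {
    unsigned Merged = std::min(Existing, Needed);
    Tightened |= (Merged != Existing);
    Existing = Merged; // rewrite the old instruction with the stricter count
    Needed = ~0u;      // this component is now covered
  };
  fold(ExistingVm, Need.VmCnt);
  fold(ExistingExp, Need.ExpCnt);
  fold(ExistingLgkm, Need.LgkmCnt);
  return Tightened;
}

The s_waitcnt_vscnt case follows the same pattern for the single VsCnt field, except that its count lives in operand 1 of the instruction rather than in the packed s_waitcnt immediate.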
@@ -1104 +1181 @@
   } else if (TII->isFLAT(Inst)) {
     assert(Inst.mayLoad() || Inst.mayStore());
 
-    if (TII->usesVM_CNT(Inst))
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+    if (TII->usesVM_CNT(Inst)) {
+      if (!ST->hasVscnt())
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+      else if (Inst.mayLoad() &&
+               AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
+      else
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
+    }
 
     if (TII->usesLGKM_CNT(Inst)) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
@@ -1120 +1204 @@
              // TODO: get a better carve out.
              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
-             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
-    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
+             Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
+             Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
+    if (!ST->hasVscnt())
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+    else if ((Inst.mayLoad() &&
+              AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
+             /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
+             (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
+    else if (Inst.mayStore())
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
+
     if (ST->vmemWriteNeedsExpWaitcnt() &&
         (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
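For VMEM instructions on a vscnt-capable target, the event chosen above decides which hardware counter the instruction will eventually be waited on: plain loads, and MIMG query opcodes such as IMAGE_GET_RESINFO / IMAGE_GET_LOD that only return data without loading or storing memory, score against the read counter, while stores and atomics (including return atomics) score against the write counter. A simplified, hedged restatement of that predicate, with parameter names of my own rather than the pass's:

// Simplified restatement of the classification above; on targets without
// vscnt everything falls back to the single VMEM_ACCESS event instead, and the
// real code only scores an event when the instruction actually touches memory.
enum class VmemEvent { Read /* retires against vmcnt */, Write /* retires against vscnt */ };

static VmemEvent classifyVmemEvent(bool MayLoad, bool MayStore,
                                    bool IsAtomicWithReturn, bool IsMimgQuery) {
  if ((MayLoad && !IsAtomicWithReturn) || IsMimgQuery)
    return VmemEvent::Read;
  (void)MayStore; // stores and atomics are writes in this sketch
  return VmemEvent::Write;
}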
@@ -1242 +1337 @@
        Iter != E;) {
     MachineInstr &Inst = *Iter;
 
-    // Remove any previously existing waitcnts.
-    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
-      if (OldWaitcntInstr) {
-        if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
-          TrackedWaitcntSet.erase(OldWaitcntInstr);
-          OldWaitcntInstr->eraseFromParent();
-          OldWaitcntInstr = nullptr;
-        } else if (!TrackedWaitcntSet.count(&Inst)) {
-          // Two successive s_waitcnt's, both of which are pre-existing and
-          // are therefore preserved.
-          int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
-          ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
-        } else {
-          ++Iter;
-          Inst.eraseFromParent();
-          Modified = true;
-          continue;
-        }
-      }
-
-      OldWaitcntInstr = &Inst;
+    // Track pre-existing waitcnts from earlier iterations.
+    if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
+        (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+         Inst.getOperand(0).isReg() &&
+         Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
+      if (!OldWaitcntInstr)
+        OldWaitcntInstr = &Inst;
       ++Iter;
       continue;
     }
@@ -1319 +1400 @@
       // Restore the vccz bit. Any time a value is written to vcc, the vcc
       // bit is updated, so we can restore the bit by reading the value of
       // vcc and then writing it back to the register.
-      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+      BuildMI(Block, Inst, Inst.getDebugLoc(),
+              TII->get(AMDGPU::S_MOV_B64),
               AMDGPU::VCC)
           .addReg(AMDGPU::VCC);
       VCCZBugHandledSet.insert(&Inst);
@@ -1347 +1429 @@
   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
   HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
   HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
+  HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
 
   HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
   HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
@@ -1482 +1565 @@
   // TODO: Could insert earlier and schedule more liberally with operations
   // that only use caller preserved registers.
   MachineBasicBlock &EntryBB = MF.front();
+  if (ST->hasVscnt())
+    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
+            TII->get(AMDGPU::S_WAITCNT_VSCNT))
+        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+        .addImm(0);
   BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
       .addImm(0);
 
(changed file 2 of 2: a new LLVM IR test; every line below is an addition)

; RUN: llc -march=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s

; GCN-LABEL: barrier_vmcnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX9_10-NEXT: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
  store i32 %tmp4, i32 addrspace(1)* %tmp7, align 4
  ret void
}

; GCN-LABEL: barrier_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX9-NEXT: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
  store i32 0, i32 addrspace(1)* %tmp5, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier() #3
  fence syncscope("singlethread") acquire
  %tmp6 = add nuw nsw i64 %tmp2, 4294967296
  %tmp7 = lshr exact i64 %tmp6, 32
  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp7
  store i32 1, i32 addrspace(1)* %tmp8, align 4
  ret void
}

; GCN-LABEL: barrier_vmcnt_vscnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX9_10-NEXT: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
  store i32 0, i32 addrspace(1)* %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp9
  store i32 %tmp7, i32 addrspace(1)* %tmp10, align 4
  ret void
}

; GCN-LABEL: barrier_vmcnt_flat:
; GCN: flat_load_dword
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp4 = load i32, i32* %tmp3, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp6
  store i32 %tmp4, i32* %tmp7, align 4
  ret void
}

; GCN-LABEL: barrier_vscnt_flat:
; GCN: flat_store_dword
; GFX8_9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
  store i32 0, i32* %tmp5, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier() #3
  fence syncscope("singlethread") acquire
  %tmp6 = add nuw nsw i64 %tmp2, 4294967296
  %tmp7 = lshr exact i64 %tmp6, 32
  %tmp8 = getelementptr inbounds i32, i32* %arg, i64 %tmp7
  store i32 1, i32* %tmp8, align 4
  ret void
}

; GCN-LABEL: barrier_vmcnt_vscnt_flat:
; GCN: flat_load_dword
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
  store i32 0, i32* %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp7 = load i32, i32* %tmp6, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, i32* %arg, i64 %tmp9
  store i32 %tmp7, i32* %tmp10, align 4
  ret void
}

; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
; GCN: flat_load_dword
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
  store i32 0, i32* %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp7 = load i32, i32* %tmp6, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, i32* %arg, i64 %tmp9
  store i32 %tmp7, i32* %tmp10, align 4
  ret void
}

; GCN-LABEL: load_vmcnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX9_10: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_dword
define amdgpu_kernel void @load_vmcnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
  store i32 %tmp4, i32 addrspace(1)* %tmp7, align 4
  ret void
}

; GCN-LABEL: load_vmcnt_flat:
; GCN: flat_load_dword
; GCN-NOT: vscnt
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_dword
define amdgpu_kernel void @load_vmcnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp4 = load i32, i32* %tmp3, align 4
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp6
  store i32 %tmp4, i32* %tmp7, align 4
  ret void
}

; GCN-LABEL: store_vscnt_private:
; GCN: buffer_store_dword
; GFX8_9-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_private(i32 addrspace(5)* %p) {
  store i32 0, i32 addrspace(5)* %p
  ret void
}

; GCN-LABEL: store_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
; GFX8_9-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_global(i32 addrspace(1)* %p) {
  store i32 0, i32 addrspace(1)* %p
  ret void
}

; GCN-LABEL: store_vscnt_flat:
; GCN: flat_store_dword
; GFX8_9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_flat(i32* %p) {
  store i32 0, i32* %p
  ret void
}

; GCN-LABEL: function_prologue:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @function_prologue() {
  ret void
}

declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.workitem.id.x()