llvm.org GIT mirror llvm / 1ad9529

Revert rL363678 : AMDGPU: Add ds_gws_init / ds_gws_barrier intrinsics

There may or may not be additional work to handle this correctly on SI/CI.
........
Breaks EXPENSIVE_CHECKS buildbots - http://lab.llvm.org:8011/builders/llvm-clang-x86_64-expensive-checks-win/builds/78/

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363797 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim, a month ago
12 changed file(s) with 22 addition(s) and 687 deletion(s).
13471347 [IntrNoMem, IntrSpeculatable]
13481348 >;
13491349
1350 // llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
1351 //
1352 // bar_val is the total number of waves that will wait on this
1353 // barrier, minus 1.
1354 def int_amdgcn_ds_gws_init :
1355 GCCBuiltin<"__builtin_amdgcn_ds_gws_init">,
1356 Intrinsic<[],
1357 [llvm_i32_ty, llvm_i32_ty],
1358 [IntrConvergent, IntrWriteMem, IntrInaccessibleMemOnly], "",
1359 [SDNPMemOperand]
1360 >;
1361
1362 // llvm.amdgcn.ds.gws.barrier(i32 bar_val, i32 resource_id)
1363 // bar_val is the total number of waves that will wait on this
1364 // barrier, minus 1.
1365 def int_amdgcn_ds_gws_barrier :
1366 GCCBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
1367 Intrinsic<[],
1368 [llvm_i32_ty, llvm_i32_ty],
1369 [IntrConvergent, IntrInaccessibleMemOnly], "",
1370 [SDNPMemOperand]
1371 >;
13721350
13731351 // Copies the source value to the destination value, with the guarantee that
13741352 // the source value is computed as if the entire program were executed in WQM.
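For reference, a minimal LLVM IR usage sketch of the two reverted intrinsics, assembled from the deleted tests further down (a bar_val of N means N+1 waves participate; attributes as declared in those tests):

declare void @llvm.amdgcn.ds.gws.init(i32, i32) #0
declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1

; Initialize GWS barrier 7 for %val+1 waves, then wait on it.
define amdgpu_kernel void @gws_init_then_barrier(i32 %val) {
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
  ret void
}

attributes #0 = { convergent inaccessiblememonly nounwind writeonly }
attributes #1 = { convergent inaccessiblememonly nounwind }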
217217 void SelectFMAD_FMA(SDNode *N);
218218 void SelectATOMIC_CMP_SWAP(SDNode *N);
219219 void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
220 void SelectDS_GWS(SDNode *N, unsigned IntrID);
221220 void SelectINTRINSIC_W_CHAIN(SDNode *N);
222 void SelectINTRINSIC_VOID(SDNode *N);
223221
224222 protected:
225223 // Include the pieces autogenerated from the target description.
833831 SelectINTRINSIC_W_CHAIN(N);
834832 return;
835833 }
836 case ISD::INTRINSIC_VOID: {
837 SelectINTRINSIC_VOID(N);
838 return;
839 }
840834 }
841835
842836 SelectCode(N);
20392033 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
20402034 }
20412035
2042 void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2043 SDLoc SL(N);
2044 SDValue VSrc0 = N->getOperand(2);
2045 SDValue BaseOffset = N->getOperand(3);
2046 int ImmOffset = 0;
2047 SDNode *CopyToM0;
2048 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2049 MachineMemOperand *MMO = M->getMemOperand();
2050
2051 // Don't worry if the offset ends up in a VGPR. Only one lane will have
2052 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2053
2054 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2055 // offset field) % 64. Some versions of the programming guide omit the m0
2056 // part, or claim it's from offset 0.
2057 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2058 // If we have a constant offset, try to use the default value for m0 as a
2059 // base to possibly avoid setting it up.
2060 CopyToM0 = glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32));
2061 ImmOffset = ConstOffset->getZExtValue() + 1;
2062 } else {
2063 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2064 ImmOffset = BaseOffset.getConstantOperandVal(1);
2065 BaseOffset = BaseOffset.getOperand(0);
2066 }
2067
2068 // Prefer to do the shift in an SGPR since it should be possible to use m0
2069 // as the result directly. If it's already an SGPR, it will be eliminated
2070 // later.
2071 SDNode *SGPROffset
2072 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2073 BaseOffset);
2074 // Shift to offset in m0
2075 SDNode *M0Base
2076 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2077 SDValue(SGPROffset, 0),
2078 CurDAG->getTargetConstant(16, SL, MVT::i32));
2079 CopyToM0 = glueCopyToM0(N, SDValue(M0Base, 0));
2080 }
2081
2082 // The manual doesn't mention this, but it seems only v0 works.
2083 SDValue V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32);
2084
2085 SDValue CopyToV0 = CurDAG->getCopyToReg(
2086 SDValue(CopyToM0, 0), SL, V0, VSrc0,
2087 N->getOperand(N->getNumOperands() - 1));
2088
2089 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2090
2091 // TODO: Can this just be removed from the instruction?
2092 SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);
2093
2094 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_gws_init ?
2095 AMDGPU::DS_GWS_INIT : AMDGPU::DS_GWS_BARRIER;
2096
2097 SDValue Ops[] = {
2098 V0,
2099 OffsetField,
2100 GDS,
2101 CopyToV0, // Chain
2102 CopyToV0.getValue(1) // Glue
2103 };
2104
2105 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2106 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2107 }
2108
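To make the deleted selection logic above concrete, here is a hedged C++ sketch of the resource-id formula from the comment in SelectDS_GWS, with the <isa opaque base> term assumed zero for illustration:

// Resource id = (<isa opaque base> + M0[21:16] + offset field) % 64.
constexpr unsigned gwsResourceId(unsigned M0, unsigned OffsetField) {
  return (((M0 >> 16) & 0x3f) + OffsetField) % 64; // opaque base assumed 0
}

// Constant path: m0 defaults to -1, so M0[21:16] == 63 and the offset
// field holds c + 1; (63 + (c + 1)) % 64 == c explains the + 1 on ImmOffset.
static_assert(gwsResourceId(0xFFFFFFFFu, 0 + 1) == 0, "offset 0");
static_assert(gwsResourceId(0xFFFFFFFFu, 63 + 1) == 63, "offset 63");

// Variable path: the base offset is shifted left by 16 into m0; only a
// folded constant addend remains in the instruction's offset field.
static_assert(gwsResourceId(5u << 16, 3) == 8, "base 5 plus addend 3");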
21092036 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
21102037 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
21112038 switch (IntrID) {
21162043 SelectDSAppendConsume(N, IntrID);
21172044 return;
21182045 }
2119 }
2120
2121 SelectCode(N);
2122 }
2123
2124 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2125 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2126 switch (IntrID) {
2127 case Intrinsic::amdgcn_ds_gws_init:
2128 case Intrinsic::amdgcn_ds_gws_barrier:
2129 SelectDS_GWS(N, IntrID);
2130 return;
21312046 default:
21322047 break;
21332048 }
466466 defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
467467 defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
468468
469 let isConvergent = 1 in {
470 def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> {
471 let mayLoad = 0;
472 }
469 def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">;
473470 def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">;
474471 def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">;
475472 def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
476473 def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
477 }
478474
479475 def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
480476 def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
958958 if (!Vol->isZero())
959959 Info.flags |= MachineMemOperand::MOVolatile;
960960
961 return true;
962 }
963 case Intrinsic::amdgcn_ds_gws_init:
964 case Intrinsic::amdgcn_ds_gws_barrier: {
965 Info.opc = ISD::INTRINSIC_VOID;
966
967 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
968 Info.ptrVal =
969 MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
970
971 // This is an abstract access, but we need to specify a type and size.
972 Info.memVT = MVT::i32;
973 Info.size = 4;
974 Info.align = 4;
975
976 Info.flags = MachineMemOperand::MOStore;
977 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
978 Info.flags = MachineMemOperand::MOLoad;
979961 return true;
980962 }
981963 default:
535535 // Put score on the source vgprs. If this is a store, just use those
536536 // specific register(s).
537537 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
538 int AddrOpIdx =
539 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
540538 // All GDS operations must protect their address register (same as
541539 // export).
542 if (AddrOpIdx != -1) {
543 setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
544 } else {
545 assert(Inst.getOpcode() == AMDGPU::DS_APPEND ||
546 Inst.getOpcode() == AMDGPU::DS_CONSUME ||
547 Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
548 Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER);
549 }
550
540 if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
541 Inst.getOpcode() != AMDGPU::DS_CONSUME) {
542 setExpScore(
543 &Inst, TII, TRI, MRI,
544 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
545 CurrScore);
546 }
551547 if (Inst.mayStore()) {
552548 if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
553549 AMDGPU::OpName::data0) != -1) {
14101406 ScoreBrackets.dump();
14111407 });
14121408
1409 // Check to see if this is a GWS instruction. If so, and if this is CI or
1410 // VI, then the generated code sequence will include an S_WAITCNT 0.
1411 // TODO: Are these the only GWS instructions?
1412 if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1413 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1414 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1415 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1416 Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1417 // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1418 ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
1419 }
1420
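A note on the block deleted above: it treats the CI/VI GWS code sequence as including an S_WAITCNT 0, so the score brackets are reset at these instructions (vscnt excepted, per allZeroExceptVsCnt). The deleted tests below exercise the surrounding waitcnt behaviour; for example, gws_barrier_lgkmcnt checks that the barrier increments lgkmcnt:

; GCN: ds_gws_barrier v0 offset:1 gds
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64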
14131421 // TODO: Remove this work-around after fixing the scheduler and enable the
14141422 // assert above.
14151423 if (VCCZBugWorkAround) {
25462546 // given the typical code patterns.
25472547 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
25482548 Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
2549 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
2550 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
2549 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP)
25512550 return true;
25522551
25532552 if (MI.isCall() || MI.isInlineAsm())
4242 public:
4343 enum AMDGPUPSVKind : unsigned {
4444 PSVBuffer = PseudoSourceValue::TargetCustom,
45 PSVImage,
46 GWSResource
45 PSVImage
4746 };
4847
4948 protected:
8483
8584 static bool classof(const PseudoSourceValue *V) {
8685 return V->kind() == PSVImage;
87 }
88 };
89
90 class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue {
91 public:
92 explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII)
93 : AMDGPUPseudoSourceValue(GWSResource, TII) {}
94
95 static bool classof(const PseudoSourceValue *V) {
96 return V->kind() == GWSResource;
97 }
98
99 // These are inaccessible memory from IR.
100 bool isAliased(const MachineFrameInfo *) const override {
101 return false;
102 }
103
104 // These are inaccessible memory from IR.
105 bool mayAlias(const MachineFrameInfo *) const override {
106 return false;
107 }
108
109 void printCustom(raw_ostream &OS) const override {
110 OS << "GWSResource";
11186 }
11287 };
11388
212187 std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs;
213188 DenseMap<const Value *,
214189 std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs;
215 std::unique_ptr GWSResourcePSV;
216190
217191 private:
218192 unsigned LDSWaveSpillSize = 0;
699673 return PSV.first->second.get();
700674 }
701675
702 const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) {
703 if (!GWSResourcePSV) {
704 GWSResourcePSV =
705 llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
706 }
707
708 return GWSResourcePSV.get();
709 }
710
711676 unsigned getOccupancy() const {
712677 return Occupancy;
713678 }
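For context, a hedged sketch (not code from this patch) of how the pseudo source value returned by getGWSPSV is meant to be consumed: getTgtMemIntrinsic above places it in Info.ptrVal, from which a MachineMemOperand over abstract memory, which IR-level aliasing can never refer to, is built. Something like:

// Illustrative only; signatures as in LLVM of this era.
MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo(MFI->getGWSPSV(TII)), // abstract GWS resource
    MachineMemOperand::MOStore, /*Size=*/4, /*Align=*/4);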
+0 -103 test/CodeGen/AMDGPU/gws-hazards.mir (file deleted)
None # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
1 # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX9 %s
2 # RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=VI %s
3 # RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=CI %s
4 # RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=SI %s
5
6 ---
7 name: m0_gws_init0
8 tracksRegLiveness: true
9 body: |
10
11 bb.0:
12 liveins: $vgpr0
13 ; GFX9-LABEL: name: m0_gws_init0
14 ; GFX9: liveins: $vgpr0
15 ; GFX9: $m0 = S_MOV_B32 -1
16 ; GFX9: S_NOP 0
17 ; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
18 ; VI-LABEL: name: m0_gws_init0
19 ; VI: liveins: $vgpr0
20 ; VI: $m0 = S_MOV_B32 -1
21 ; VI: S_NOP 0
22 ; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
23 ; CI-LABEL: name: m0_gws_init0
24 ; CI: liveins: $vgpr0
25 ; CI: $m0 = S_MOV_B32 -1
26 ; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
27 ; SI-LABEL: name: m0_gws_init0
28 ; SI: liveins: $vgpr0
29 ; SI: $m0 = S_MOV_B32 -1
30 ; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
31 $m0 = S_MOV_B32 -1
32 DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
33
34 ...
35
36 ---
37 name: m0_gws_init1
38 tracksRegLiveness: true
39 body: |
40
41 bb.0:
42 ; GFX9-LABEL: name: m0_gws_init1
43 ; GFX9: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
44 ; GFX9: $m0 = S_MOV_B32 -1
45 ; GFX9: S_NOP 0
46 ; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
47 ; VI-LABEL: name: m0_gws_init1
48 ; VI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
49 ; VI: $m0 = S_MOV_B32 -1
50 ; VI: S_NOP 0
51 ; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
52 ; CI-LABEL: name: m0_gws_init1
53 ; CI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
54 ; CI: $m0 = S_MOV_B32 -1
55 ; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
56 ; SI-LABEL: name: m0_gws_init1
57 ; SI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
58 ; SI: $m0 = S_MOV_B32 -1
59 ; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
60 $vgpr0 = V_MOV_B32_e32 0, implicit $exec
61 $m0 = S_MOV_B32 -1
62 DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
63
64 ...
65
66 # Test a typical situation where m0 needs to be set from a VGPR
67 # through readfirstlane
68 ---
69 name: m0_gws_readlane
70 tracksRegLiveness: true
71 body: |
72
73 bb.0:
74 liveins: $vgpr0, $vgpr1
75
76 ; GFX9-LABEL: name: m0_gws_readlane
77 ; GFX9: liveins: $vgpr0, $vgpr1
78 ; GFX9: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
79 ; GFX9: $m0 = S_MOV_B32 $sgpr0
80 ; GFX9: S_NOP 0
81 ; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
82 ; VI-LABEL: name: m0_gws_readlane
83 ; VI: liveins: $vgpr0, $vgpr1
84 ; VI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
85 ; VI: $m0 = S_MOV_B32 $sgpr0
86 ; VI: S_NOP 0
87 ; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
88 ; CI-LABEL: name: m0_gws_readlane
89 ; CI: liveins: $vgpr0, $vgpr1
90 ; CI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
91 ; CI: $m0 = S_MOV_B32 $sgpr0
92 ; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
93 ; SI-LABEL: name: m0_gws_readlane
94 ; SI: liveins: $vgpr0, $vgpr1
95 ; SI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
96 ; SI: $m0 = S_MOV_B32 $sgpr0
97 ; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
98 $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
99 $m0 = S_MOV_B32 $sgpr0
100 DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
101
102 ...
+0 -59 test/CodeGen/AMDGPU/insert-skips-gws.mir (file deleted)
None # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
1 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-skips -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s
2 # Make sure mandatory skips are inserted to ensure GWS ops aren't run with exec = 0
3
4 ---
5
6 name: skip_gws_init
7 body: |
8 ; CHECK-LABEL: name: skip_gws_init
9 ; CHECK: bb.0:
10 ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
11 ; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec
12 ; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
13 ; CHECK: bb.1:
14 ; CHECK: successors: %bb.2(0x80000000)
15 ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
16 ; CHECK: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
17 ; CHECK: bb.2:
18 ; CHECK: S_ENDPGM 0
19 bb.0:
20 successors: %bb.1, %bb.2
21 SI_MASK_BRANCH %bb.2, implicit $exec
22
23 bb.1:
24 successors: %bb.2
25 $vgpr0 = V_MOV_B32_e32 0, implicit $exec
26 DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
27
28 bb.2:
29 S_ENDPGM 0
30 ...
31
32 ---
33
34 name: skip_gws_barrier
35 body: |
36 ; CHECK-LABEL: name: skip_gws_barrier
37 ; CHECK: bb.0:
38 ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
39 ; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec
40 ; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
41 ; CHECK: bb.1:
42 ; CHECK: successors: %bb.2(0x80000000)
43 ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
44 ; CHECK: DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec
45 ; CHECK: bb.2:
46 ; CHECK: S_ENDPGM 0
47 bb.0:
48 successors: %bb.1, %bb.2
49 SI_MASK_BRANCH %bb.2, implicit $exec
50
51 bb.1:
52 successors: %bb.2
53 $vgpr0 = V_MOV_B32_e32 0, implicit $exec
54 DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec
55
56 bb.2:
57 S_ENDPGM 0
58 ...
+0 -179 test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll (file deleted)
None ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
1 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
2 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
3 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
4
5 ; Minimum offset
6 ; GCN-LABEL: {{^}}gws_barrier_offset0:
7 ; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
8 ; GCN-DAG: s_mov_b32 m0, -1{{$}}
9 ; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
10 ; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
11 define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
12 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
13 ret void
14 }
15
16 ; Maximum offset
17 ; GCN-LABEL: {{^}}gws_barrier_offset63:
18 ; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
19 ; GCN-DAG: s_mov_b32 m0, -1{{$}}
20 ; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
21 ; GCN: ds_gws_barrier v0 offset:64 gds{{$}}
22 define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
23 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 63)
24 ret void
25 }
26
27 ; FIXME: Should be able to shift directly into m0
28 ; GCN-LABEL: {{^}}gws_barrier_sgpr_offset:
29 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
30 ; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
31 ; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
32 ; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
33 ; GCN: ds_gws_barrier v0 gds{{$}}
34 define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 {
35 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
36 ret void
37 }
38
39 ; Variable offset in SGPR with constant add
40 ; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1:
41 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
42 ; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
43 ; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
44 ; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
45 ; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
46 define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
47 %offset = add i32 %offset.base, 1
48 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
49 ret void
50 }
51
52 ; GCN-LABEL: {{^}}gws_barrier_vgpr_offset:
53 ; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
54 ; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
55 ; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
56 ; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
57 ; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
58 ; GCN: ds_gws_barrier v0 gds{{$}}
59 define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 {
60 %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
61 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
62 ret void
63 }
64
65 ; Variable offset in VGPR with constant add
66 ; GCN-LABEL: {{^}}gws_barrier_vgpr_offset_add:
67 ; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
68 ; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
69 ; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
70 ; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
71 ; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
72 ; GCN: ds_gws_barrier v0 offset:3 gds{{$}}
73 define amdgpu_kernel void @gws_barrier_vgpr_offset_add(i32 %val) #0 {
74 %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
75 %vgpr.offset = add i32 %vgpr.offset.base, 3
76 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
77 ret void
78 }
79
80 @lds = internal unnamed_addr addrspace(3) global i32 undef
81
82 ; Check if m0 initialization is shared
83 ; GCN-LABEL: {{^}}gws_barrier_save_m0_barrier_constant_offset:
84 ; GCN: s_mov_b32 m0, -1
85 ; GCN-NOT: s_mov_b32 m0
86 define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) #0 {
87 store i32 1, i32 addrspace(3)* @lds
88 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 10)
89 store i32 2, i32 addrspace(3)* @lds
90 ret void
91 }
92
93 ; Make sure this increments lgkmcnt
94 ; GCN-LABEL: {{^}}gws_barrier_lgkmcnt:
95 ; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
96 ; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
97 ; GCN-NEXT: s_setpc_b64
98 define void @gws_barrier_lgkmcnt(i32 %val) {
99 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
100 ret void
101 }
102
103 ; Does not imply memory fence on its own
104 ; GCN-LABEL: {{^}}gws_barrier_wait_before:
105 ; GCN: store_dword
106 ; CIPLUS-NOT: s_waitcnt
107 ; GCN: ds_gws_barrier v0 offset:8 gds
108 define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
109 store i32 0, i32 addrspace(1)* %ptr
110 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
111 ret void
112 }
113
114 ; GCN-LABEL: {{^}}gws_barrier_wait_after:
115 ; GCN: ds_gws_barrier v0 offset:8 gds
116 ; GCN-NEXT: s_waitcnt expcnt(0){{$}}
117 ; GCN-NEXT: load_dword
118 define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
119 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
120 %load = load volatile i32, i32 addrspace(1)* %ptr
121 ret void
122 }
123
124 ; Does not imply memory fence on its own
125 ; GCN-LABEL: {{^}}gws_barrier_fence_before:
126 ; GCN: store_dword
127 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
128 ; GCN: ds_gws_barrier v0 offset:8 gds
129 define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
130 store i32 0, i32 addrspace(1)* %ptr
131 fence release
132 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
133 ret void
134 }
135
136 ; GCN-LABEL: {{^}}gws_barrier_fence_after:
137 ; GCN: ds_gws_barrier v0 offset:8 gds
138 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
139 ; GCN-NEXT: load_dword
140 define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
141 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
142 fence release
143 %load = load volatile i32, i32 addrspace(1)* %ptr
144 ret void
145 }
146
147 ; FIXME: Should a wait be inserted here, or is an explicit fence needed?
148 ; GCN-LABEL: {{^}}gws_init_barrier:
149 ; GCN: s_mov_b32 m0, -1
150 ; GCN: ds_gws_init v0 offset:8 gds
151 ; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
152 define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
153 call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
154 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
155 ret void
156 }
157
158 ; FIXME: Why vmcnt, not expcnt?
159 ; GCN-LABEL: {{^}}gws_init_fence_barrier:
160 ; GCN: s_mov_b32 m0, -1
161 ; GCN: ds_gws_init v0 offset:8 gds
162 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
163 ; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
164 define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 {
165 call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
166 fence release
167 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
168 ret void
169 }
170
171 declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
172 declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
173 declare i32 @llvm.amdgcn.workitem.id.x() #3
174
175 attributes #0 = { nounwind }
176 attributes #1 = { convergent inaccessiblememonly nounwind }
177 attributes #2 = { convergent inaccessiblememonly nounwind writeonly }
178 attributes #3 = { nounwind readnone speculatable }
+0 -119 test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll (file deleted)
None ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
1 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
2 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
3 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
4
5 ; Minimum offset
6 ; GCN-LABEL: {{^}}gws_init_offset0:
7 ; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
8 ; GCN-DAG: s_mov_b32 m0, -1{{$}}
9 ; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
10 ; GCN: ds_gws_init v0 offset:1 gds{{$}}
11 define amdgpu_kernel void @gws_init_offset0(i32 %val) #0 {
12 call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
13 ret void
14 }
15
16 ; Maximum offset
17 ; GCN-LABEL: {{^}}gws_init_offset63:
18 ; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
19 ; GCN-DAG: s_mov_b32 m0, -1{{$}}
20 ; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
21 ; GCN: ds_gws_init v0 offset:64 gds{{$}}
22 define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 {
23 call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 63)
24 ret void
25 }
26
27 ; FIXME: Should be able to shift directly into m0
28 ; GCN-LABEL: {{^}}gws_init_sgpr_offset:
29 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
30 ; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
31 ; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
32 ; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
33 ; GCN: ds_gws_init v0 gds{{$}}
34 define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 {
35 call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
36 ret void
37 }
38
39 ; Variable offset in SGPR with constant add
40 ; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1:
41 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
42 ; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
43 ; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
44 ; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
45 ; GCN: ds_gws_init v0 offset:1 gds{{$}}
46 define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
47 %offset = add i32 %offset.base, 1
48 call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
49 ret void
50 }
51
52 ; GCN-LABEL: {{^}}gws_init_vgpr_offset:
53 ; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
54 ; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
55 ; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
56 ; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
57 ; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
58 ; GCN: ds_gws_init v0 gds{{$}}
59 define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 {
60 %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
61 call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
62 ret void
63 }
64
65 ; Variable offset in VGPR with constant add
66 ; GCN-LABEL: {{^}}gws_init_vgpr_offset_add:
67 ; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
68 ; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
69 ; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
70 ; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
71 ; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
72 ; GCN: ds_gws_init v0 offset:3 gds{{$}}
73 define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 {
74 %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
75 %vgpr.offset = add i32 %vgpr.offset.base, 3
76 call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
77 ret void
78 }
79
80 @lds = internal unnamed_addr addrspace(3) global i32 undef
81
82 ; Check if m0 initialization is shared.
83 ; GCN-LABEL: {{^}}gws_init_save_m0_init_constant_offset:
84 ; GCN: s_mov_b32 m0, -1
85 ; GCN-NOT: s_mov_b32 m0
86 define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
87 store i32 1, i32 addrspace(3)* @lds
88 call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 10)
89 store i32 2, i32 addrspace(3)* @lds
90 ret void
91 }
92
93 ; GCN-LABEL: {{^}}gws_init_lgkmcnt:
94 ; GCN: ds_gws_init v0 offset:1 gds{{$}}
95 ; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
96 ; GCN-NEXT: s_setpc_b64
97 define void @gws_init_lgkmcnt(i32 %val) {
98 call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
99 ret void
100 }
101
102 ; Does not imply memory fence on its own
103 ; GCN-LABEL: {{^}}gws_init_wait_before:
104 ; GCN: store_dword
105 ; CIPLUS-NOT: s_waitcnt
106 ; GCN: ds_gws_init v0 offset:8 gds
107 define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
108 store i32 0, i32 addrspace(1)* %ptr
109 call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
110 ret void
111 }
112
113 declare void @llvm.amdgcn.ds.gws.init(i32, i32) #1
114 declare i32 @llvm.amdgcn.workitem.id.x() #2
115
116 attributes #0 = { nounwind }
117 attributes #1 = { convergent inaccessiblememonly nounwind writeonly }
118 attributes #2 = { nounwind readnone speculatable }
55 declare void @nonconvergent_func() #0
66 declare void @convergent_func() #1
77 declare void @llvm.amdgcn.s.barrier() #1
8 declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
9 declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #2
108
119 ; barrier shouldn't be duplicated.
1210
10199 ret void
102100 }
103101
104 ; GCN-LABEL: {{^}}taildup_gws_init:
105 ; GCN: ds_gws_init
106 ; GCN-NOT: ds_gws_init
107 define amdgpu_kernel void @taildup_gws_init(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
108 entry:
109 br i1 %cond, label %bb1, label %bb2
110
111 bb1:
112 store i32 0, i32 addrspace(1)* %a
113 br label %call
114
115 bb2:
116 store i32 1, i32 addrspace(1)* %a
117 br label %call
118
119 call:
120 call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
121 br label %ret
122
123 ret:
124 ret void
125 }
126
127 ; GCN-LABEL: {{^}}taildup_gws_barrier:
128 ; GCN: ds_gws_barrier
129 ; GCN-NOT: ds_gws_barrier
130 define amdgpu_kernel void @taildup_gws_barrier(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
131 entry:
132 br i1 %cond, label %bb1, label %bb2
133
134 bb1:
135 store i32 0, i32 addrspace(1)* %a
136 br label %call
137
138 bb2:
139 store i32 1, i32 addrspace(1)* %a
140 br label %call
141
142 call:
143 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
144 br label %ret
145
146 ret:
147 ret void
148 }
149102
150103 attributes #0 = { nounwind }
151104 attributes #1 = { nounwind convergent }
152 attributes #2 = { convergent inaccessiblememonly nounwind }