llvm.org GIT mirror llvm / 876bc45
AMDGPU: Unify divergent function exits. StructurizeCFG can't handle cases with multiple returns creating regions with multiple exits. Create a copy of UnifyFunctionExitNodes that only unifies exit nodes reached through divergent branches, skipping exit nodes whose branch sources are uniform. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@298729 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault
14 changed files with 1217 additions and 67 deletions.
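For illustration only (not part of the commit): a minimal sketch of the kind of CFG the new pass rewrites, using a hypothetical kernel @divergent_two_exits. Two returns are reached through a per-lane (divergent) branch, so after amdgpu-unify-divergent-exit-nodes both return blocks branch to a single UnifiedReturnBlock and StructurizeCFG only ever sees a single-exit region:

; Sketch only; the kernel name and stored values are made up for illustration.
; Before the pass: two exits reached through a divergent branch.
define amdgpu_kernel void @divergent_two_exits(i32 addrspace(1)* %out) {
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %cond = icmp eq i32 %id, 0          ; per-lane, i.e. divergent, condition
  br i1 %cond, label %exit0, label %exit1

exit0:
  store volatile i32 0, i32 addrspace(1)* %out
  ret void

exit1:
  store volatile i32 1, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

; After the pass, both former return blocks end in 'br label %UnifiedReturnBlock'
; and the new block carries the single return:
;
; UnifiedReturnBlock:
;   ret void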
704704
705705 def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
706706
707 // Represent unreachable in a divergent region.
708 def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent]>;
709
707710 // Emit 2.5 ulp, no denormal division. Should only be inserted by
708711 // pass based on !fpmath metadata.
709712 def int_amdgcn_fdiv_fast : Intrinsic<
122122 void initializeSIInsertWaitsPass(PassRegistry&);
123123 extern char &SIInsertWaitsID;
124124
125 void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
126 extern char &AMDGPUUnifyDivergentExitNodesID;
127
125128 ImmutablePass *createAMDGPUAAWrapperPass();
126129 void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
127130
131131 initializeSIInsertSkipsPass(*PR);
132132 initializeSIDebuggerInsertNopsPass(*PR);
133133 initializeSIOptimizeExecMaskingPass(*PR);
134 initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
134135 initializeAMDGPUAAWrapperPassPass(*PR);
135136 }
136137
672673 // supported.
673674 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
674675 addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM));
676
677 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
678 // regions formed by them.
679 addPass(&AMDGPUUnifyDivergentExitNodesID);
675680 addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
676681 addPass(createSinkingPass());
677682 addPass(createSITypeRewriter());
0 //===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is a variant of the UnifyDivergentExitNodes pass. Rather than ensuring
10 // there is at most one ret and one unreachable instruction, it ensures there is
11 // at most one divergent exiting block.
12 //
13 // StructurizeCFG can't deal with multi-exit regions formed by branches to
14 // multiple return nodes. Structurizing regions with uniform branches is
15 // undesirable, and unifying uniformly reached exits into the same return block
16 // as the divergent exits would inhibit scalar branching, so those exits are
17 // skipped. StructurizeCFG also can't handle one branch going to a return and
18 // another to an unreachable, so replace the unreachable with a return.
19 //
20 //===----------------------------------------------------------------------===//
21
22 #include "AMDGPU.h"
23 #include "llvm/ADT/DepthFirstIterator.h"
24 #include "llvm/ADT/StringExtras.h"
25 #include "llvm/Analysis/DivergenceAnalysis.h"
26 #include "llvm/Analysis/PostDominators.h"
27 #include "llvm/Analysis/TargetTransformInfo.h"
28 #include "llvm/IR/BasicBlock.h"
29 #include "llvm/IR/CFG.h"
30 #include "llvm/IR/Function.h"
31 #include "llvm/IR/Instructions.h"
32 #include "llvm/IR/Type.h"
33 #include "llvm/Transforms/Scalar.h"
34 #include "llvm/Transforms/Utils/Local.h"
35 using namespace llvm;
36
37 #define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"
38
39 namespace {
40
41 class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
42 public:
43 static char ID; // Pass identification, replacement for typeid
44 AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
45 initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
46 }
47
48 // We can preserve non-critical-edgeness when we unify function exit nodes
49 void getAnalysisUsage(AnalysisUsage &AU) const override;
50 bool runOnFunction(Function &F) override;
51 };
52
53 }
54
55 char AMDGPUUnifyDivergentExitNodes::ID = 0;
56 INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
57 "Unify divergent function exit nodes", false, false)
58 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
59 INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
60 INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
61 "Unify divergent function exit nodes", false, false)
62
63 char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
64
65 void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
66 // TODO: Preserve dominator tree.
67 AU.addRequired<PostDominatorTreeWrapperPass>();
68
69 AU.addRequired<DivergenceAnalysis>();
70
71 // No divergent values are changed, only blocks and branch edges.
72 AU.addPreserved<DivergenceAnalysis>();
73
74 // We preserve the non-critical-edgeness property
75 AU.addPreservedID(BreakCriticalEdgesID);
76
77 // This is a cluster of orthogonal Transforms
78 AU.addPreservedID(LowerSwitchID);
79 FunctionPass::getAnalysisUsage(AU);
80
81 AU.addRequired<TargetTransformInfoWrapperPass>();
82 }
83
84 /// \returns true if \p BB is reachable through only uniform branches.
85 /// XXX - Is there a more efficient way to find this?
86 static bool isUniformlyReached(const DivergenceAnalysis &DA,
87 BasicBlock &BB) {
88 SmallVector<BasicBlock *, 8> Stack;
89 SmallPtrSet<BasicBlock *, 8> Visited;
90
91 for (BasicBlock *Pred : predecessors(&BB))
92 Stack.push_back(Pred);
93
94 while (!Stack.empty()) {
95 BasicBlock *Top = Stack.pop_back_val();
96 if (!DA.isUniform(Top->getTerminator()))
97 return false;
98
99 for (BasicBlock *Pred : predecessors(Top)) {
100 if (Visited.insert(Pred).second)
101 Stack.push_back(Pred);
102 }
103 }
104
105 return true;
106 }
107
108 static BasicBlock *unifyReturnBlockSet(Function &F,
109 ArrayRef<BasicBlock *> ReturningBlocks,
110 const TargetTransformInfo &TTI,
111 StringRef Name) {
112 // Otherwise, we need to insert a new basic block into the function, add a PHI
113 // node (if the function returns values), and convert all of the return
114 // instructions into unconditional branches.
115 //
116 BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
117
118 PHINode *PN = nullptr;
119 if (F.getReturnType()->isVoidTy()) {
120 ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
121 } else {
122 // If the function doesn't return void... add a PHI node to the block...
123 PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
124 "UnifiedRetVal");
125 NewRetBlock->getInstList().push_back(PN);
126 ReturnInst::Create(F.getContext(), PN, NewRetBlock);
127 }
128
129 // Loop over all of the blocks, replacing the return instruction with an
130 // unconditional branch.
131 //
132 for (BasicBlock *BB : ReturningBlocks) {
133 // Add an incoming element to the PHI node for every return instruction that
134 // is merging into this new block...
135 if (PN)
136 PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
137
138 BB->getInstList().pop_back(); // Remove the return insn
139 BranchInst::Create(NewRetBlock, BB);
140 }
141
142 for (BasicBlock *BB : ReturningBlocks) {
143 // Cleanup possible branch to unconditional branch to the return.
144 SimplifyCFG(BB, TTI, 2);
145 }
146
147 return NewRetBlock;
148 }
149
150 bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
151 auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
152 if (PDT.getRoots().size() <= 1)
153 return false;
154
155 DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>();
156
157 // Loop over all of the blocks in a function, tracking all of the blocks that
158 // return.
159 //
160 SmallVector<BasicBlock *, 4> ReturningBlocks;
161 SmallVector<BasicBlock *, 4> UnreachableBlocks;
162
163 for (BasicBlock *BB : PDT.getRoots()) {
164 if (isa<ReturnInst>(BB->getTerminator())) {
165 if (!isUniformlyReached(DA, *BB))
166 ReturningBlocks.push_back(BB);
167 } else if (isa<UnreachableInst>(BB->getTerminator())) {
168 if (!isUniformlyReached(DA, *BB))
169 UnreachableBlocks.push_back(BB);
170 }
171 }
172
173 if (!UnreachableBlocks.empty()) {
174 BasicBlock *UnreachableBlock = nullptr;
175
176 if (UnreachableBlocks.size() == 1) {
177 UnreachableBlock = UnreachableBlocks.front();
178 } else {
179 UnreachableBlock = BasicBlock::Create(F.getContext(),
180 "UnifiedUnreachableBlock", &F);
181 new UnreachableInst(F.getContext(), UnreachableBlock);
182
183 for (BasicBlock *BB : UnreachableBlocks) {
184 BB->getInstList().pop_back(); // Remove the unreachable inst.
185 BranchInst::Create(UnreachableBlock, BB);
186 }
187 }
188
189 if (!ReturningBlocks.empty()) {
190 // Don't create a new unreachable inst if we have a return. The
191 // structurizer/annotator can't handle the multiple exits
192
193 Type *RetTy = F.getReturnType();
194 Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
195 UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst.
196
197 Function *UnreachableIntrin =
198 Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);
199
200 // Insert a call to an intrinsic tracking that this is an unreachable
201 // point, in case we want to kill the active lanes or something later.
202 CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);
203
204 // Don't create a scalar trap. We would only want to trap if this code was
205 // really reached, but a scalar trap would happen even if no lanes
206 // actually reached here.
207 ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
208 ReturningBlocks.push_back(UnreachableBlock);
209 }
210 }
211
212 // Now handle return blocks.
213 if (ReturningBlocks.empty())
214 return false; // No blocks return
215
216 if (ReturningBlocks.size() == 1)
217 return false; // Already has a single return block
218
219 const TargetTransformInfo &TTI
220 = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
221
222 unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
223 return true;
224 }
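Another sketch (again not part of the commit; block names are hypothetical) of the second transformation the pass performs, per the file header above: when a divergent region has one returning exit and one unreachable exit, the unreachable is rewritten into a call to the new llvm.amdgcn.unreachable intrinsic followed by a return, so the region is left with a single kind of divergent exit that can then be unified as in the previous sketch.

; Input: divergent branch to a returning exit and an unreachable exit.
;   exit.ret:
;     ret void
;   exit.dead:
;     unreachable
;
; After amdgpu-unify-divergent-exit-nodes, exit.dead becomes:
;   exit.dead:
;     call void @llvm.amdgcn.unreachable()
;     ret void
; and both returns are then merged into %UnifiedReturnBlock.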
5757 AMDGPUInstrInfo.cpp
5858 AMDGPUPromoteAlloca.cpp
5959 AMDGPURegisterInfo.cpp
60 AMDGPUUnifyDivergentExitNodes.cpp
6061 GCNHazardRecognizer.cpp
6162 GCNSchedStrategy.cpp
6263 R600ClauseMergePass.cpp
137137 let AsmVariantName = AMDGPUAsmVariants.Default;
138138 }
139139
140 class PseudoInstSI <dag outs, dag ins, list<dag> pattern = []>
141 : InstSI<outs, ins, "", pattern> {
140 class PseudoInstSI <dag outs, dag ins, list<dag> pattern = [], string asm = "">
141 : InstSI<outs, ins, asm, pattern> {
142142 let isPseudo = 1;
143143 let isCodeGenOnly = 1;
144144 }
145145
146 class SPseudoInstSI <dag outs, dag ins, list<dag> pattern = []>
147 : PseudoInstSI<outs, ins, pattern> {
146 class SPseudoInstSI <dag outs, dag ins, list<dag> pattern = [], string asm = "">
147 : PseudoInstSI<outs, ins, pattern, asm> {
148148 let SALU = 1;
149149 }
150150
151 class VPseudoInstSI <dag outs, dag ins, list<dag> pattern = []>
152 : PseudoInstSI<outs, ins, pattern> {
151 class VPseudoInstSI <dag outs, dag ins, list<dag> pattern = [], string asm = "">
152 : PseudoInstSI<outs, ins, pattern, asm> {
153153 let VALU = 1;
154154 let Uses = [EXEC];
155155 }
38023802 if (DescSize != 0 && DescSize != 4)
38033803 return DescSize;
38043804
3805 if (Opc == AMDGPU::WAVE_BARRIER)
3806 return 0;
3807
38083805 // 4-byte instructions may have a 32-bit literal encoded after them. Check
38093806 // operands that could ever be literals.
38103807 if (isVALU(MI) || isSALU(MI)) {
3811 if (isFixedSize(MI)) {
3812 assert(DescSize == 4);
3808 if (isFixedSize(MI))
38133809 return DescSize;
3814 }
38153810
38163811 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
38173812 if (Src0Idx == -1)
38343829 return 4;
38353830
38363831 switch (Opc) {
3837 case AMDGPU::SI_MASK_BRANCH:
38383832 case TargetOpcode::IMPLICIT_DEF:
38393833 case TargetOpcode::KILL:
38403834 case TargetOpcode::DBG_VALUE:
151151 let mayStore = 1;
152152 let isBarrier = 1;
153153 let isConvergent = 1;
154 let FixedSize = 1;
155 let Size = 0;
154156 }
155157
156158 // SI pseudo instructions. These are used by the CFG structurizer pass
158160
159161 // Dummy terminator instruction to use after control flow instructions
160162 // replaced with exec mask operations.
161 def SI_MASK_BRANCH : PseudoInstSI <
163 def SI_MASK_BRANCH : VPseudoInstSI <
162164 (outs), (ins brtarget:$target)> {
163165 let isBranch = 0;
164166 let isTerminator = 1;
165167 let isBarrier = 0;
166 let Uses = [EXEC];
167168 let SchedRW = [];
168169 let hasNoSchedulingInfo = 1;
170 let FixedSize = 1;
171 let Size = 0;
169172 }
170173
171174 let isTerminator = 1 in {
257260 (outs SReg_64:$dst), (ins),
258261 [(set i1:$dst, (int_amdgcn_ps_live))]> {
259262 let SALU = 1;
263 }
264
265 def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
266 [(int_amdgcn_unreachable)],
267 "; divergent unreachable"> {
268 let Size = 0;
269 let hasNoSchedulingInfo = 1;
270 let FixedSize = 1;
260271 }
261272
262273 // Used as an isel pseudo to directly emit initialization with an
1414 ; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
1515 ; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
1616 ; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]]
17 ;
18 ; TODO: The following sequence is a bug (missing s_endpgm)!
19 ;
20 ; GCN: s_branch [[BB:BB[0-9]+_[0-9]+]]
21 ; GCN: [[BB]]:
22 ; GCN-NEXT: .Lfunc_end0:
17 ; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]]
18
19 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4
20 ; GCN: ds_write_b32
21 ; GCN: s_waitcnt
22
23 ; GCN-NEXT: [[BB5]]
24 ; GCN: s_or_b64 exec, exec
25 ; GCN-NEXT: s_endpgm
26 ; GCN-NEXT: .Lfunc_end
2327 define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
2428 bb:
2529 %tmp = fcmp ogt float %arg, 0.000000e+00
2832 br i1 %tmp3, label %bb4, label %bb5
2933
3034 bb4: ; preds = %bb
35 store volatile i32 4, i32 addrspace(3)* undef
3136 unreachable
3237
3338 bb5: ; preds = %bb
0 ; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
2
3 ; Add extra verifier runs. There were some cases where invalid IR
4 ; was produced but happened to be fixed by the later passes.
5
6 ; Make sure divergent control flow with multiple exits from a region
7 ; is properly handled. UnifyFunctionExitNodes should be run before
8 ; StructurizeCFG.
9
10 ; IR-LABEL: @multi_divergent_region_exit_ret_ret(
11 ; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
12 ; IR: %2 = extractvalue { i1, i64 } %1, 0
13 ; IR: %3 = extractvalue { i1, i64 } %1, 1
14 ; IR: br i1 %2, label %LeafBlock1, label %Flow
15
16 ; IR: Flow:
17 ; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
18 ; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
19 ; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
20 ; IR: %7 = extractvalue { i1, i64 } %6, 0
21 ; IR: %8 = extractvalue { i1, i64 } %6, 1
22 ; IR: br i1 %7, label %LeafBlock, label %Flow1
23
24 ; IR: LeafBlock:
25 ; IR: br label %Flow1
26
27 ; IR: LeafBlock1:
28 ; IR: br label %Flow{{$}}
29
30 ; IR: Flow2:
31 ; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
32 ; IR: call void @llvm.amdgcn.end.cf(i64 %19)
33 ; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
34 ; IR: %13 = extractvalue { i1, i64 } %12, 0
35 ; IR: %14 = extractvalue { i1, i64 } %12, 1
36 ; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
37
38 ; IR: exit0:
39 ; IR: store volatile i32 9, i32 addrspace(1)* undef
40 ; IR: br label %UnifiedReturnBlock
41
42 ; IR: Flow1:
43 ; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
44 ; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
45 ; IR: call void @llvm.amdgcn.end.cf(i64 %8)
46 ; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
47 ; IR: %18 = extractvalue { i1, i64 } %17, 0
48 ; IR: %19 = extractvalue { i1, i64 } %17, 1
49 ; IR: br i1 %18, label %exit1, label %Flow2
50
51 ; IR: exit1:
52 ; IR: store volatile i32 17, i32 addrspace(3)* undef
53 ; IR: br label %Flow2
54
55 ; IR: UnifiedReturnBlock:
56 ; IR: call void @llvm.amdgcn.end.cf(i64 %14)
57 ; IR: ret void
58
59
60 ; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
61 ; GCN: v_cmp_lt_i32_e32 vcc, 1
62 ; GCN: s_and_saveexec_b64
63 ; GCN: s_xor_b64
64
65
66 ; FIXME: Why is this compare essentially repeated?
67 ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
68 ; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
69 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
70 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1
71
72 ; GCN: ; %Flow1
73 ; GCN-NEXT: s_or_b64 exec, exec
74 ; GCN: v_cmp_ne_u32_e32 vcc, 0
75
76 ; GCN: ; %exit1
77 ; GCN: ds_write_b32
78
79 ; GCN: %Flow2
80 ; GCN-NEXT: s_or_b64 exec, exec
81 ; GCN: v_cmp_ne_u32_e32 vcc, 0
82 ; GCN-NEXT: s_and_saveexec_b64
83 ; GCN-NEXT: s_xor_b64
84
85 ; GCN: ; %exit0
86 ; GCN: buffer_store_dword
87
88 ; GCN: ; %UnifiedReturnBlock
89 ; GCN-NEXT: s_or_b64 exec, exec
90 ; GCN-NEXT: s_endpgm
91 define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
92 entry:
93 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
94 %tmp1 = add i32 0, %tmp
95 %tmp2 = zext i32 %tmp1 to i64
96 %tmp3 = add i64 0, %tmp2
97 %tmp4 = shl i64 %tmp3, 32
98 %tmp5 = ashr exact i64 %tmp4, 32
99 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
100 %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
101 %tmp8 = sext i32 %tmp7 to i64
102 %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
103 %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
104 %tmp13 = zext i32 %tmp10 to i64
105 %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
106 %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
107 %Pivot = icmp slt i32 %tmp16, 2
108 br i1 %Pivot, label %LeafBlock, label %LeafBlock1
109
110 LeafBlock: ; preds = %entry
111 %SwitchLeaf = icmp eq i32 %tmp16, 1
112 br i1 %SwitchLeaf, label %exit0, label %exit1
113
114 LeafBlock1: ; preds = %entry
115 %SwitchLeaf2 = icmp eq i32 %tmp16, 2
116 br i1 %SwitchLeaf2, label %exit0, label %exit1
117
118 exit0: ; preds = %LeafBlock, %LeafBlock1
119 store volatile i32 9, i32 addrspace(1)* undef
120 ret void
121
122 exit1: ; preds = %LeafBlock, %LeafBlock1
123 store volatile i32 17, i32 addrspace(3)* undef
124 ret void
125 }
126
127 ; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
128 ; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
129
130 ; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
131
132 ; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
133 ; IR: call void @llvm.amdgcn.end.cf(i64 %19)
134 ; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
135 ; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock
136
137
138 ; IR: UnifiedUnreachableBlock:
139 ; IR-NEXT: unreachable
140
141
142 ; FIXME: Probably should insert an s_endpgm anyway.
143 ; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
144 ; GCN: ; %UnifiedUnreachableBlock
145 ; GCN-NEXT: .Lfunc_end
146 define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
147 entry:
148 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
149 %tmp1 = add i32 0, %tmp
150 %tmp2 = zext i32 %tmp1 to i64
151 %tmp3 = add i64 0, %tmp2
152 %tmp4 = shl i64 %tmp3, 32
153 %tmp5 = ashr exact i64 %tmp4, 32
154 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
155 %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
156 %tmp8 = sext i32 %tmp7 to i64
157 %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
158 %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
159 %tmp13 = zext i32 %tmp10 to i64
160 %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
161 %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
162 %Pivot = icmp slt i32 %tmp16, 2
163 br i1 %Pivot, label %LeafBlock, label %LeafBlock1
164
165 LeafBlock: ; preds = %entry
166 %SwitchLeaf = icmp eq i32 %tmp16, 1
167 br i1 %SwitchLeaf, label %exit0, label %exit1
168
169 LeafBlock1: ; preds = %entry
170 %SwitchLeaf2 = icmp eq i32 %tmp16, 2
171 br i1 %SwitchLeaf2, label %exit0, label %exit1
172
173 exit0: ; preds = %LeafBlock, %LeafBlock1
174 store volatile i32 9, i32 addrspace(1)* undef
175 unreachable
176
177 exit1: ; preds = %LeafBlock, %LeafBlock1
178 store volatile i32 17, i32 addrspace(3)* undef
179 unreachable
180 }
181
182 ; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
183 ; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
184 ; IR: llvm.amdgcn.if
185 ; IR: br i1
186
187 ; IR: {{^}}Flow:
188 ; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
189 ; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
190 ; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
191 ; IR: br i1 %7, label %LeafBlock, label %Flow1
192
193 ; IR: {{^}}LeafBlock:
194 ; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
195 ; IR: %9 = xor i1 %divergent.cond1, true
196 ; IR: br label %Flow1
197
198 ; IR: LeafBlock1:
199 ; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
200 ; IR: %10 = xor i1 %uniform.cond0, true
201 ; IR: br label %Flow
202
203 ; IR: Flow2:
204 ; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
205 ; IR: call void @llvm.amdgcn.end.cf(i64 %19)
206 ; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
207 ; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
208
209 ; IR: exit0:
210 ; IR: store volatile i32 9, i32 addrspace(1)* undef
211 ; IR: br label %UnifiedReturnBlock
212
213 ; IR: {{^}}Flow1:
214 ; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
215 ; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
216 ; IR: call void @llvm.amdgcn.end.cf(i64 %8)
217 ; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
218 ; IR: %18 = extractvalue { i1, i64 } %17, 0
219 ; IR: %19 = extractvalue { i1, i64 } %17, 1
220 ; IR: br i1 %18, label %exit1, label %Flow2
221
222 ; IR: exit1:
223 ; IR: store volatile i32 17, i32 addrspace(3)* undef
224 ; IR: br label %Flow2
225
226 ; IR: UnifiedReturnBlock:
227 ; IR: call void @llvm.amdgcn.end.cf(i64 %14)
228 ; IR: ret void
229 define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
230 entry:
231 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
232 %tmp1 = add i32 0, %tmp
233 %tmp2 = zext i32 %tmp1 to i64
234 %tmp3 = add i64 0, %tmp2
235 %tmp4 = shl i64 %tmp3, 32
236 %tmp5 = ashr exact i64 %tmp4, 32
237 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
238 %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
239 %tmp8 = sext i32 %tmp7 to i64
240 %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
241 %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
242 %tmp13 = zext i32 %tmp10 to i64
243 %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
244 %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
245 %divergent.cond0 = icmp slt i32 %tmp16, 2
246 br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1
247
248 LeafBlock: ; preds = %entry
249 %divergent.cond1 = icmp eq i32 %tmp16, 1
250 br i1 %divergent.cond1, label %exit0, label %exit1
251
252 LeafBlock1: ; preds = %entry
253 %uniform.cond0 = icmp eq i32 %arg3, 2
254 br i1 %uniform.cond0, label %exit0, label %exit1
255
256 exit0: ; preds = %LeafBlock, %LeafBlock1
257 store volatile i32 9, i32 addrspace(1)* undef
258 ret void
259
260 exit1: ; preds = %LeafBlock, %LeafBlock1
261 store volatile i32 17, i32 addrspace(3)* undef
262 ret void
263 }
264
265 ; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
266 ; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
267 ; IR: br i1 %2, label %LeafBlock1, label %Flow
268
269 ; IR: Flow:
270 ; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
271 ; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
272 ; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
273
274 ; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
275 ; IR: call void @llvm.amdgcn.end.cf(i64 %19)
276 ; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
277
278 define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
279 entry:
280 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
281 %tmp1 = add i32 0, %tmp
282 %tmp2 = zext i32 %tmp1 to i64
283 %tmp3 = add i64 0, %tmp2
284 %tmp4 = shl i64 %tmp3, 32
285 %tmp5 = ashr exact i64 %tmp4, 32
286 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
287 %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
288 %tmp8 = sext i32 %tmp7 to i64
289 %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
290 %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
291 %tmp13 = zext i32 %tmp10 to i64
292 %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
293 %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
294 %Pivot = icmp slt i32 %tmp16, 2
295 br i1 %Pivot, label %LeafBlock, label %LeafBlock1
296
297 LeafBlock: ; preds = %entry
298 %SwitchLeaf = icmp eq i32 %arg3, 1
299 br i1 %SwitchLeaf, label %exit0, label %exit1
300
301 LeafBlock1: ; preds = %entry
302 %SwitchLeaf2 = icmp eq i32 %tmp16, 2
303 br i1 %SwitchLeaf2, label %exit0, label %exit1
304
305 exit0: ; preds = %LeafBlock, %LeafBlock1
306 store volatile i32 9, i32 addrspace(1)* undef
307 ret void
308
309 exit1: ; preds = %LeafBlock, %LeafBlock1
310 store volatile i32 17, i32 addrspace(3)* undef
311 ret void
312 }
313
314 ; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
315 ; IR: Flow2:
316 ; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
317 ; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
318 ; IR: call void @llvm.amdgcn.end.cf(i64 %20)
319
320 ; IR: UnifiedReturnBlock:
321 ; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
322 ; IR: call void @llvm.amdgcn.end.cf(i64 %15)
323 ; IR: ret float %UnifiedRetVal
324 define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
325 entry:
326 %Pivot = icmp slt i32 %vgpr, 2
327 br i1 %Pivot, label %LeafBlock, label %LeafBlock1
328
329 LeafBlock: ; preds = %entry
330 %SwitchLeaf = icmp eq i32 %vgpr, 1
331 br i1 %SwitchLeaf, label %exit0, label %exit1
332
333 LeafBlock1: ; preds = %entry
334 %SwitchLeaf2 = icmp eq i32 %vgpr, 2
335 br i1 %SwitchLeaf2, label %exit0, label %exit1
336
337 exit0: ; preds = %LeafBlock, %LeafBlock1
338 store i32 9, i32 addrspace(1)* undef
339 ret float 1.0
340
341 exit1: ; preds = %LeafBlock, %LeafBlock1
342 store i32 17, i32 addrspace(3)* undef
343 ret float 2.0
344 }
345
346 ; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(
347
348 ; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
349 ; GCN: s_cmp_gt_i32 s0, 1
350 ; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]
351
352 ; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
353
354 ; GCN: {{^}}[[FLOW]]:
355 ; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
356
357 ; GCN: v_mov_b32_e32 v0, 2.0
358 ; GCN: s_or_b64 exec, exec
359 ; GCN: s_and_b64 exec, exec
360 ; GCN: v_mov_b32_e32 v0, 1.0
361
362 ; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
363 ; GCN-NEXT: s_or_b64 exec, exec
364 ; GCN-NEXT: ; return
365
366 define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
367 entry:
368 %uniform.cond = icmp slt i32 %sgpr, 2
369 br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1
370
371 LeafBlock: ; preds = %entry
372 %divergent.cond0 = icmp eq i32 %vgpr, 3
373 br i1 %divergent.cond0, label %exit0, label %exit1
374
375 LeafBlock1: ; preds = %entry
376 %divergent.cond1 = icmp eq i32 %vgpr, 7
377 br i1 %divergent.cond1, label %exit0, label %exit1
378
379 exit0: ; preds = %LeafBlock, %LeafBlock1
380 store i32 9, i32 addrspace(1)* undef
381 ret float 1.0
382
383 exit1: ; preds = %LeafBlock, %LeafBlock1
384 store i32 17, i32 addrspace(3)* undef
385 ret float 2.0
386 }
387
388 ; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
389 ; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
390
391 ; IR: Flow:
392 ; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
393 ; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
394 ; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
395
396 ; IR: Flow2:
397 ; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
398 ; IR: call void @llvm.amdgcn.end.cf(i64 %19)
399 ; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
400 ; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
401
402 ; IR: exit0:
403 ; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
404 ; IR-NEXT: br label %UnifiedReturnBlock
405
406 ; IR: Flow1:
407 ; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
408 ; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
409 ; IR: call void @llvm.amdgcn.end.cf(i64 %8)
410 ; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
411 ; IR: %18 = extractvalue { i1, i64 } %17, 0
412 ; IR: %19 = extractvalue { i1, i64 } %17, 1
413 ; IR: br i1 %18, label %exit1, label %Flow2
414
415 ; IR: exit1:
416 ; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
417 ; IR-NEXT: call void @llvm.amdgcn.unreachable()
418 ; IR-NEXT: br label %Flow2
419
420 ; IR: UnifiedReturnBlock:
421 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
422 ; IR-NEXT: ret void
423 define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
424 entry:
425 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
426 %tmp1 = add i32 0, %tmp
427 %tmp2 = zext i32 %tmp1 to i64
428 %tmp3 = add i64 0, %tmp2
429 %tmp4 = shl i64 %tmp3, 32
430 %tmp5 = ashr exact i64 %tmp4, 32
431 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
432 %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
433 %tmp8 = sext i32 %tmp7 to i64
434 %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
435 %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
436 %tmp13 = zext i32 %tmp10 to i64
437 %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
438 %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
439 %Pivot = icmp slt i32 %tmp16, 2
440 br i1 %Pivot, label %LeafBlock, label %LeafBlock1
441
442 LeafBlock: ; preds = %entry
443 %SwitchLeaf = icmp eq i32 %tmp16, 1
444 br i1 %SwitchLeaf, label %exit0, label %exit1
445
446 LeafBlock1: ; preds = %entry
447 %SwitchLeaf2 = icmp eq i32 %tmp16, 2
448 br i1 %SwitchLeaf2, label %exit0, label %exit1
449
450 exit0: ; preds = %LeafBlock, %LeafBlock1
451 store volatile i32 17, i32 addrspace(3)* undef
452 ret void
453
454 exit1: ; preds = %LeafBlock, %LeafBlock1
455 store volatile i32 9, i32 addrspace(1)* undef
456 unreachable
457 }
458
459 ; The non-uniformity of the branch to the exiting blocks requires
460 ; looking at transitive predecessors.
461
462 ; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(
463
464 ; IR: exit0: ; preds = %Flow2
465 ; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
466 ; IR-NEXT: br label %UnifiedReturnBlock
467
468
469 ; IR: indirect.exit1:
470 ; IR: %load = load volatile i32, i32 addrspace(1)* undef
471 ; IR: store volatile i32 %load, i32 addrspace(1)* undef
472 ; IR: store volatile i32 9, i32 addrspace(1)* undef
473 ; IR: call void @llvm.amdgcn.unreachable()
474 ; IR-NEXT: br label %Flow2
475
476 ; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2
477 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
478 ; IR-NEXT: ret void
479 define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
480 entry:
481 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
482 %tmp1 = add i32 0, %tmp
483 %tmp2 = zext i32 %tmp1 to i64
484 %tmp3 = add i64 0, %tmp2
485 %tmp4 = shl i64 %tmp3, 32
486 %tmp5 = ashr exact i64 %tmp4, 32
487 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
488 %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
489 %tmp8 = sext i32 %tmp7 to i64
490 %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
491 %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
492 %tmp13 = zext i32 %tmp10 to i64
493 %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
494 %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
495 %Pivot = icmp slt i32 %tmp16, 2
496 br i1 %Pivot, label %LeafBlock, label %LeafBlock1
497
498 LeafBlock: ; preds = %entry
499 %SwitchLeaf = icmp eq i32 %tmp16, 1
500 br i1 %SwitchLeaf, label %exit0, label %indirect.exit1
501
502 LeafBlock1: ; preds = %entry
503 %SwitchLeaf2 = icmp eq i32 %tmp16, 2
504 br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1
505
506 exit0: ; preds = %LeafBlock, %LeafBlock1
507 store volatile i32 17, i32 addrspace(3)* undef
508 ret void
509
510 indirect.exit1:
511 %load = load volatile i32, i32 addrspace(1)* undef
512 store volatile i32 %load, i32 addrspace(1)* undef
513 br label %exit1
514
515 exit1: ; preds = %LeafBlock, %LeafBlock1
516 store volatile i32 9, i32 addrspace(1)* undef
517 unreachable
518 }
519
520 ; IR-LABEL: @multi_divergent_region_exit_ret_switch(
521 define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
522 entry:
523 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
524 %tmp1 = add i32 0, %tmp
525 %tmp2 = zext i32 %tmp1 to i64
526 %tmp3 = add i64 0, %tmp2
527 %tmp4 = shl i64 %tmp3, 32
528 %tmp5 = ashr exact i64 %tmp4, 32
529 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
530 %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
531 %tmp8 = sext i32 %tmp7 to i64
532 %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
533 %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
534 %tmp13 = zext i32 %tmp10 to i64
535 %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
536 %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
537 switch i32 %tmp16, label %exit1
538 [ i32 1, label %LeafBlock
539 i32 2, label %LeafBlock1
540 i32 3, label %exit0 ]
541
542 LeafBlock: ; preds = %entry
543 %SwitchLeaf = icmp eq i32 %tmp16, 1
544 br i1 %SwitchLeaf, label %exit0, label %exit1
545
546 LeafBlock1: ; preds = %entry
547 %SwitchLeaf2 = icmp eq i32 %tmp16, 2
548 br i1 %SwitchLeaf2, label %exit0, label %exit1
549
550 exit0: ; preds = %LeafBlock, %LeafBlock1
551 store volatile i32 17, i32 addrspace(3)* undef
552 ret void
553
554 exit1: ; preds = %LeafBlock, %LeafBlock1
555 store volatile i32 9, i32 addrspace(1)* undef
556 unreachable
557 }
558
559 ; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
560 define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
561 entry:
562 %uniform.cond0 = icmp eq i32 %arg0, 4
563 br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
564
565 divergent.multi.exit.region:
566 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
567 %divergent.cond0 = icmp eq i32 %id.x, 0
568 br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1
569
570 divergent.ret0:
571 store volatile i32 11, i32 addrspace(3)* undef
572 ret void
573
574 divergent.ret1:
575 store volatile i32 42, i32 addrspace(3)* undef
576 ret void
577
578 uniform.ret:
579 store volatile i32 9, i32 addrspace(1)* undef
580 ret void
581 }
582
583 ; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
584 define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
585 entry:
586 %uniform.cond0 = icmp eq i32 %arg0, 4
587 br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
588
589 divergent.multi.exit.region:
590 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
591 %divergent.cond0 = icmp eq i32 %id.x, 0
592 br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1
593
594 divergent.if:
595 %vgpr0 = load volatile float, float addrspace(1)* undef
596 %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
597 br i1 %divergent.cond1, label %divergent.then, label %divergent.endif
598
599 divergent.then:
600 %vgpr1 = load volatile float, float addrspace(1)* undef
601 %divergent.cond2 = fcmp olt float %vgpr1, 4.0
602 store volatile i32 33, i32 addrspace(1)* undef
603 br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif
604
605 divergent.endif:
606 store volatile i32 38, i32 addrspace(1)* undef
607 br label %divergent.ret0
608
609 divergent.ret0:
610 store volatile i32 11, i32 addrspace(3)* undef
611 ret void
612
613 divergent.ret1:
614 store volatile i32 42, i32 addrspace(3)* undef
615 ret void
616
617 uniform.ret:
618 store volatile i32 9, i32 addrspace(1)* undef
619 ret void
620 }
621
622 ; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
623 ; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region
624 ; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
625 ; IR: br i1 %8, label %uniform.if, label %Flow2
626
627 ; IR: Flow: ; preds = %uniform.then, %uniform.if
628 ; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
629 ; IR: br i1 %11, label %uniform.endif, label %uniform.ret0
630
631 ; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2
632 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
633 ; IR-NEXT: ret void
634 define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
635 entry:
636 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
637 %divergent.cond0 = icmp eq i32 %id.x, 0
638 br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret
639
640 uniform.multi.exit.region:
641 %uniform.cond0 = icmp eq i32 %arg0, 4
642 br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1
643
644 uniform.if:
645 %sgpr0 = load volatile i32, i32 addrspace(2)* undef
646 %uniform.cond1 = icmp slt i32 %sgpr0, 1
647 br i1 %uniform.cond1, label %uniform.then, label %uniform.endif
648
649 uniform.then:
650 %sgpr1 = load volatile i32, i32 addrspace(2)* undef
651 %uniform.cond2 = icmp sge i32 %sgpr1, 4
652 store volatile i32 33, i32 addrspace(1)* undef
653 br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif
654
655 uniform.endif:
656 store volatile i32 38, i32 addrspace(1)* undef
657 br label %uniform.ret0
658
659 uniform.ret0:
660 store volatile i32 11, i32 addrspace(3)* undef
661 ret void
662
663 uniform.ret1:
664 store volatile i32 42, i32 addrspace(3)* undef
665 ret void
666
667 divergent.ret:
668 store volatile i32 9, i32 addrspace(1)* undef
669 ret void
670 }
671
672 ; IR-LABEL: @multi_divergent_unreachable_exit(
673 ; IR: UnifiedUnreachableBlock:
674 ; IR-NEXT: call void @llvm.amdgcn.unreachable()
675 ; IR-NEXT: br label %UnifiedReturnBlock
676
677 ; IR: UnifiedReturnBlock:
678 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64
679 ; IR-NEXT: ret void
680 define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
681 bb:
682 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
683 switch i32 %tmp, label %bb3 [
684 i32 2, label %bb1
685 i32 0, label %bb2
686 ]
687
688 bb1: ; preds = %bb
689 unreachable
690
691 bb2: ; preds = %bb
692 unreachable
693
694 bb3: ; preds = %bb
695 switch i32 undef, label %bb5 [
696 i32 2, label %bb4
697 ]
698
699 bb4: ; preds = %bb3
700 ret void
701
702 bb5: ; preds = %bb3
703 unreachable
704 }
705
706 declare i32 @llvm.amdgcn.workitem.id.x() #1
707
708 attributes #0 = { nounwind }
709 attributes #1 = { nounwind readnone }
33 ; This should end with a no-op sequence of exec mask manipulations
44 ; The mask should be in its original state after executing the unreachable block
55
6 ; GCN-LABEL: {{^}}main:
6
7 ; GCN-LABEL: {{^}}uniform_br_trivial_ret_divergent_br_trivial_unreachable:
78 ; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]]
9
10 ; GCN-NEXT: ; %else
811
912 ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
1013 ; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
11 ; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]]
14 ; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
1215
13 ; GCN: [[RET_BB]]:
14 ; GCN-NEXT: s_branch [[FINAL_BB:BB[0-9]+_[0-9]+]]
16 ; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb
17 ; GCN-NEXT: ; divergent unreachable
1518
16 ; GCN-NEXT: [[UNREACHABLE_BB]]:
17 ; GCN-NEXT: [[FINAL_BB]]:
19 ; GCN-NEXT: {{^}}[[FLOW]]: ; %Flow
20 ; GCN-NEXT: s_or_b64 exec, exec
21
22 ; GCN-NEXT: [[RET_BB]]:
23 ; GCN-NEXT: ; return
1824 ; GCN-NEXT: .Lfunc_end0
19 define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
25 define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
26 entry:
27 %i.i = extractelement <2 x i32> %arg7, i32 0
28 %j.i = extractelement <2 x i32> %arg7, i32 1
29 %i.f.i = bitcast i32 %i.i to float
30 %j.f.i = bitcast i32 %j.i to float
31 %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 1, i32 0, i32 %arg5) #2
32 %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 1, i32 0, i32 %arg5) #2
33 %p87 = fmul float undef, %p2.i
34 %p88 = fadd float %p87, undef
35 %p93 = fadd float %p88, undef
36 %p97 = fmul float %p93, undef
37 %p102 = fsub float %p97, undef
38 %p104 = fmul float %p102, undef
39 %p106 = fadd float 0.000000e+00, %p104
40 %p108 = fadd float undef, %p106
41 %uniform.cond = icmp slt i32 %arg17, 0
42 br i1 %uniform.cond, label %ret.bb, label %else
43
44 else: ; preds = %main_body
45 %p124 = fmul float %p108, %p108
46 %p125 = fsub float %p124, undef
47 %divergent.cond = fcmp olt float %p125, 0.000000e+00
48 br i1 %divergent.cond, label %ret.bb, label %unreachable.bb
49
50 unreachable.bb: ; preds = %else
51 unreachable
52
53 ret.bb: ; preds = %else, %main_body
54 ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
55 }
56
57 ; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable:
58 ; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
59
60 ; GCN: ; BB#{{[0-9]+}}: ; %else
61 ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
62 ; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
63 ; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]]
64
65 ; GCN-NEXT: ; %unreachable.bb
66 ; GCN: ds_write_b32
67 ; GCN: s_waitcnt
68 ; GCN: ; divergent unreachable
69
70 ; GCN: ; %ret.bb
71 ; GCN: store_dword
72
73 ; GCN: ; %UnifiedReturnBlock
74 ; GCN-NEXT: s_or_b64 exec, exec
75 ; GCN-NEXT: ; return
76 ; GCN-NEXT: .Lfunc_end
77 define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
2078 main_body:
2179 %i.i = extractelement <2 x i32> %arg7, i32 0
2280 %j.i = extractelement <2 x i32> %arg7, i32 1
3290 %p104 = fmul float %p102, undef
3391 %p106 = fadd float 0.000000e+00, %p104
3492 %p108 = fadd float undef, %p106
35 br i1 undef, label %ENDIF69, label %ELSE
93 %uniform.cond = icmp slt i32 %arg18, 0
94 br i1 %uniform.cond, label %ret.bb, label %else
3695
37 ELSE: ; preds = %main_body
96 else: ; preds = %main_body
3897 %p124 = fmul float %p108, %p108
3998 %p125 = fsub float %p124, undef
40 %p126 = fcmp olt float %p125, 0.000000e+00
41 br i1 %p126, label %ENDIF69, label %ELSE41
99 %divergent.cond = fcmp olt float %p125, 0.000000e+00
100 br i1 %divergent.cond, label %ret.bb, label %unreachable.bb
42101
43 ELSE41: ; preds = %ELSE
102 unreachable.bb: ; preds = %else
103 store volatile i32 8, i32 addrspace(3)* undef
44104 unreachable
45105
46 ENDIF69: ; preds = %ELSE, %main_body
106 ret.bb: ; preds = %else, %main_body
107 store volatile i32 11, i32 addrspace(1)* undef
47108 ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
48109 }
49110
55 ; OPT-NOT: call i1 @llvm.amdgcn.loop
66
77 ; GCN-LABEL: {{^}}annotate_unreachable_noloop:
8 ; GCN: s_cbranch_vccnz
8 ; GCN: s_cbranch_scc1
99 ; GCN-NOT: s_endpgm
1010 ; GCN: .Lfunc_end0
1111 define amdgpu_kernel void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
3636 ; OPT-NOT: call i1 @llvm.amdgcn.loop
3737
3838 ; GCN-LABEL: {{^}}annotate_ret_noloop:
39 ; GCN: s_cbranch_scc1
40 ; GCN: s_endpgm
41 ; GCN: .Lfunc_end1
39 ; GCN: load_dwordx4
40 ; GCN: v_cmp_nlt_f32
41 ; GCN: s_and_saveexec_b64
42 ; GCN: ; mask branch [[UNIFIED_RET:BB[0-9]+_[0-9]+]]
43 ; GCN-NEXT: [[UNIFIED_RET]]:
44 ; GCN-NEXT: s_or_b64 exec, exec
45 ; GCN-NEXT: s_endpgm
46 ; GCN: .Lfunc_end
4247 define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
4348 bb:
4449 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
50 br label %bb1
51
52 bb1: ; preds = %bb
53 %tmp2 = sext i32 %tmp to i64
54 %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2
55 %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16
56 %tmp5 = extractelement <4 x float> %tmp4, i32 1
57 store volatile <4 x float> %tmp4, <4 x float> addrspace(1)* undef
58 %cmp = fcmp ogt float %tmp5, 1.0
59 br i1 %cmp, label %bb5, label %bb3
60
61 bb3: ; preds = %bb1
62 %tmp6 = extractelement <4 x float> %tmp4, i32 2
63 %tmp7 = fcmp olt float %tmp6, 0.000000e+00
64 br i1 %tmp7, label %bb4, label %bb5 ; crash goes away if these are swapped
65
66 bb4: ; preds = %bb3
67 ret void
68
69 bb5: ; preds = %bb3, %bb1
70 ret void
71 }
72
73 ; OPT-LABEL: @uniform_annotate_ret_noloop(
74 ; OPT-NOT: call i1 @llvm.amdgcn.loop
75
76 ; GCN-LABEL: {{^}}uniform_annotate_ret_noloop:
77 ; GCN: s_cbranch_scc1
78 ; GCN: s_endpgm
79 ; GCN: .Lfunc_end
80 define amdgpu_kernel void @uniform_annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg, i32 %tmp) #0 {
81 bb:
4582 br label %bb1
4683
4784 bb1: ; preds = %bb
33 ; GCN: v_cmp_eq_u32
44 ; GCN: s_and_saveexec_b64
55 ; GCN: s_xor_b64
6 ; GCN: ; mask branch [[RET:BB[0-9]+]]
7 ; GCN: s_branch [[UNREACHABLE:BB[0-9]+_[0-9]+]]
6 ; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
87
9 ; GCN: [[RET]]
10 ; GCN: s_or_b64 exec, exec
8 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable
9 ; GCN: ds_write_b32
10 ; GCN: ; divergent unreachable
11 ; GCN: s_waitcnt
12
13 ; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
14 ; GCN-NEXT: s_or_b64 exec, exec
1115 ; GCN: s_endpgm
1216
13 ; GCN: [[UNREACHABLE]]:
14 ; GCN: ds_write_b32
15 ; GCN: s_waitcnt
1617 define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 {
1718 bb:
1819 %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
2829 }
2930
3031 ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
31 ; GCN: v_cmp_eq_u32
32 ; GCN: v_cmp_ne_u32
3233 ; GCN: s_and_saveexec_b64
3334 ; GCN: s_xor_b64
34 ; GCN: ; mask branch [[UNREACHABLE:BB[0-9]+_[0-9]+]]
35 ; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]]
3536
36 ; GCN-NEXT: ; %ret
37 ; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable
38 ; GCN: ds_write_b32
39 ; GCN: ; divergent unreachable
40 ; GCN: s_waitcnt
41
42 ; GCN: [[RETURN]]:
43 ; GCN-NEXT: s_or_b64 exec, exec
3744 ; GCN-NEXT: s_endpgm
38
39 ; GCN-NEXT: [[UNREACHABLE]]:
40 ; GCN-NEXT: s_or_b64 exec, exec
41 ; GCN: ds_write_b32
42 ; GCN: s_waitcnt
4345 define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
4446 bb:
4547 %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
5456 unreachable
5557 }
5658
57 ; Function Attrs: nounwind readnone
59 ; GCN-LABEL: {{^}}uniform_lower_control_flow_unreachable_terminator:
60 ; GCN: s_cmp_lg_u32
61 ; GCN: s_cbranch_scc0 [[UNREACHABLE:BB[0-9]+_[0-9]+]]
62
63 ; GCN-NEXT: BB#{{[0-9]+}}: ; %ret
64 ; GCN-NEXT: s_endpgm
65
66 ; GCN: [[UNREACHABLE]]:
67 ; GCN: ds_write_b32
68 ; GCN: s_waitcnt
69 define amdgpu_kernel void @uniform_lower_control_flow_unreachable_terminator(i32 %arg0) #0 {
70 bb:
71 %tmp63 = icmp eq i32 %arg0, 32
72 br i1 %tmp63, label %unreachable, label %ret
73
74 unreachable:
75 store volatile i32 0, i32 addrspace(3)* undef, align 4
76 unreachable
77
78 ret:
79 ret void
80 }
81
5882 declare i32 @llvm.amdgcn.workitem.id.y() #1
5983
6084 attributes #0 = { nounwind }
6363 ret void
6464 }
6565
66 ; SI-LABEL: @simple_test_v_if
66 ; SI-LABEL: {{^}}simple_test_v_if:
6767 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
6868 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
6969 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
70
71 ; SI: BB{{[0-9]+_[0-9]+}}:
72 ; SI: buffer_store_dword
73 ; SI: s_endpgm
74
75 ; SI: BB1_2:
70 ; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
71
72 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
73 ; SI: buffer_store_dword
74 ; SI-NEXT: s_waitcnt
75
76 ; SI-NEXT: {{^}}[[EXIT]]:
7677 ; SI: s_or_b64 exec, exec, [[BR_SREG]]
7778 ; SI: s_endpgm
7879 define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
7980 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
8081 %is.0 = icmp ne i32 %tid, 0
81 br i1 %is.0, label %store, label %exit
82
83 store:
82 br i1 %is.0, label %then, label %exit
83
84 then:
8485 %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
8586 store i32 999, i32 addrspace(1)* %gep
86 ret void
87
88 exit:
87 br label %exit
88
89 exit:
90 ret void
91 }
92
93 ; FIXME: It would be better to endpgm in the then block.
94
95 ; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
96 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
97 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
98 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
99 ; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
100
101 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
102 ; SI: buffer_store_dword
103 ; SI-NEXT: s_waitcnt
104
105 ; SI-NEXT: {{^}}[[EXIT]]:
106 ; SI: s_or_b64 exec, exec, [[BR_SREG]]
107 ; SI: s_endpgm
108 define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
109 %tid = call i32 @llvm.amdgcn.workitem.id.x()
110 %is.0 = icmp ne i32 %tid, 0
111 br i1 %is.0, label %then, label %exit
112
113 then:
114 %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
115 store i32 999, i32 addrspace(1)* %gep
116 ret void
117
118 exit:
119 ret void
120 }
121
122 ; Final block has more than a ret to execute. This was miscompiled
123 ; before function exit blocks were unified since the endpgm would
124 ; terminate the then wavefront before reaching the store.
125
126 ; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
127 ; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
128 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
129 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
130 ; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
131
132 ; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit
133 ; SI: ds_write_b32
134 ; SI: s_waitcnt
135
136 ; SI-NEXT: {{^}}[[FLOW]]:
137 ; SI-NEXT: s_or_saveexec_b64
138 ; SI-NEXT: s_xor_b64 exec, exec
139 ; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]
140
141 ; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
142 ; SI: buffer_store_dword
143 ; SI-NEXT: s_waitcnt
144
145 ; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
146 ; SI: s_or_b64 exec, exec
147 ; SI: s_endpgm
148 define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
149 %tid = call i32 @llvm.amdgcn.workitem.id.x()
150 %is.0 = icmp ne i32 %tid, 0
151 br i1 %is.0, label %then, label %exit
152
153 then:
154 %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
155 store i32 999, i32 addrspace(1)* %gep
156 ret void
157
158 exit:
159 store volatile i32 7, i32 addrspace(3)* undef
89160 ret void
90161 }
91162