llvm.org GIT mirror: llvm / b62d86c

Merging r360293:
------------------------------------------------------------------------
r360293 | arsenm | 2019-05-08 15:09:57 -0700 (Wed, 08 May 2019) | 21 lines

AMDGPU: Select VOP3 form of add

The VOP3 form should always be the preferred selection, to be shrunk
later. This should only be an optimization issue, but it partially works
around a problem with clobbering VCC when SIFixSGPRCopies rewrites an
SCC-defining operation directly to VCC.

3 of the testcases are regressions from failing to fold the immediate in
cases where it should. These can be avoided by improving the VCC liveness
handling in SIFoldOperands. Simply increasing the threshold passed to
computeRegisterLiveness works, although this situation is common enough
that VCC liveness should probably be tracked throughout the pass. The
hack of leaving behind an implicit_def instruction to avoid breaking the
iterator wastes instruction count, which inhibits finding the VCC def in
long chains of adds. Doing this, however, exposes different, worse-looking
regressions from poor scheduling behavior. This could probably be avoided
by forcing the shrink of the addc here, but the scheduler should probably
be fixed instead.

The r600 add test needs to be split out because it asserts on the
arguments in the new test during calling convention lowering.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_80@362658 91177308-0d34-0410-b5e6-96231b3b80d8

Matt Arsenault, 2 months ago

8 changed files with 202 additions and 74 deletions.
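Background for the change: on SI and VI, the VOP2 encoding of the add
(v_add_i32_e32 / v_add_u32_e32) implicitly writes its carry-out to VCC,
while the VOP3 encoding (v_add_i32_e64 / v_add_u32_e64) can write the carry
to an arbitrary SGPR pair, as in the test check "v_add_u32_e64 v0, s[0:1],
s0, v0" below, leaving VCC untouched. Selecting the VOP3 form first and
shrinking it to VOP2 later is therefore what avoids clobbering a live VCC.

The following C++ fragment is a hypothetical sketch, not code from this
commit, of the SIFoldOperands improvement the message suggests: widening the
computeRegisterLiveness window before shrinking back to the VCC-clobbering
VOP2 form. The threshold of 64, the foldImmediateAndShrink helper, and the
surrounding MI/TRI variables are illustrative assumptions.

  // Sketch: MI is the VOP3 add candidate, TRI the target register info.
  // Only shrink to the e32 (VOP2) encoding, which implicitly defines VCC,
  // when VCC is provably dead at this point. computeRegisterLiveness scans
  // at most `Neighborhood` instructions before giving up with LQR_Unknown,
  // so a wider window can find the VCC def in long chains of adds.
  const MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult LQR =
      MBB.computeRegisterLiveness(&TRI, AMDGPU::VCC, MI.getIterator(),
                                  /*Neighborhood=*/64);
  if (LQR == MachineBasicBlock::LQR_Dead) {
    // Safe: folding the immediate and shrinking to e32 cannot clobber a
    // live carry value held in VCC.
    foldImmediateAndShrink(MI); // hypothetical helper
  }
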
 }

 let SubtargetPredicate = HasAddNoCarryInsts in {
-  def : DivergentBinOp<add, V_ADD_U32_e32>;
-
+  def : DivergentBinOp<add, V_ADD_U32_e64>;
   def : DivergentBinOp<sub, V_SUB_U32_e32>;
 }

-
-def : DivergentBinOp<add, V_ADD_I32_e32>;
+def : DivergentBinOp<add, V_ADD_I32_e64>;
 def : DivergentBinOp<sub, V_SUB_I32_e32>;
-
-def : DivergentBinOp<...>;

 def : DivergentBinOp<...>;
 def : DivergentBinOp<...>;

+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s

 ; FUNC-LABEL: {{^}}s_add_i32:
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
 ; GCN: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}}
 ; GCN: v_mov_b32_e32 v[[V_REG:[0-9]+]], s[[REG]]
 ; GCN: buffer_store_dword v[[V_REG]],
@@ ... @@
 }

 ; FUNC-LABEL: {{^}}s_add_v2i32:
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
@@ ... @@
 }

 ; FUNC-LABEL: {{^}}s_add_v4i32:
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
@@ ... @@
 }

 ; FUNC-LABEL: {{^}}s_add_v8i32:
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-
 ; GCN: s_add_i32
 ; GCN: s_add_i32
 ; GCN: s_add_i32
@@ ... @@
 }

 ; FUNC-LABEL: {{^}}s_add_v16i32:
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-
 ; GCN: s_add_i32
 ; GCN: s_add_i32
 ; GCN: s_add_i32
@@ ... @@
 ; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, [[A]], [[B]]
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
   %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
   %a = load volatile i32, i32 addrspace(1)* %gep
@@ ... @@
 ; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, 0x7b, [[A]]
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x7b, [[A]]
 define amdgpu_kernel void @v_add_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
   %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
   %a = load volatile i32, i32 addrspace(1)* %gep
@@ ... @@
 ; FUNC-LABEL: {{^}}add64:
 ; GCN: s_add_u32
 ; GCN: s_addc_u32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-DAG: ADDC_UINT
-; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-NOT: SUB
 define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %add = add i64 %a, %b
@@ ... @@

 ; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
 ; GCN-NOT: v_addc_u32_e32 s
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-DAG: ADDC_UINT
-; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-NOT: SUB
 define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
 entry:
   %0 = load i64, i64 addrspace(1)* %in
@@ ... @@
 ; FUNC-LABEL: {{^}}add64_in_branch:
 ; GCN: s_add_u32
 ; GCN: s_addc_u32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-DAG: ADDC_UINT
-; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-NOT: SUB
 define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
@@ ... @@
   ret void
 }

-declare i32 @llvm.r600.read.tidig.x() #1
+; Make sure the VOP3 form of add is initially selected. Otherwise a pair
+; of copies from/to VCC would be necessary.
+
+; GCN-LABEL: {{^}}add_select_vop3:
+; SI: v_add_i32_e64 v0, s[0:1], s0, v0
+; VI: v_add_u32_e64 v0, s[0:1], s0, v0
+; GFX9: v_add_u32_e32 v0, s0, v0
+
+; GCN: ; def vcc
+; GCN: ds_write_b32
+; GCN: ; use vcc
+define amdgpu_ps void @add_select_vop3(i32 inreg %s, i32 %v) {
+  %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
+  %sub = add i32 %v, %s
+  store i32 %sub, i32 addrspace(3)* undef
+  call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }

 ; Function Attrs: nounwind
 ; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop:
+; SI: s_movk_i32 [[K_0X88:s[0-9]+]], 0x88
+; SI: s_movk_i32 [[K_0X100:s[0-9]+]], 0x100
 ; CHECK: BB0_1:
 ; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]],
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]]
@@ ... @@
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR8]]
 ; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], vcc, 0x80, [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]]
-; SI-DAG: v_add_i32_e32 [[VADDR0x88:v[0-9]+]], vcc, 0x88, [[VADDR]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x88:v[0-9]+]], vcc, [[K_0X88]], [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x88]]
-; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, 0x100, [[VADDR]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, [[K_0X100]], [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]]

 ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:2

 }

 ; GCN-LABEL: {{^}}test_global
-; GCN: v_add_u32_e32 v{{[0-9]+}}, vcc, 0x888, v{{[0-9]+}}
+; GCN: s_movk_i32 [[K:s[0-9]+]], 0x888
+; GCN: v_add_u32_e32 v{{[0-9]+}}, vcc, [[K]], v{{[0-9]+}}
 ; GCN: flat_store_dword
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT: s_barrier

 }

 ; VI-LABEL: {{^}}dpp_test1:
-; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; VI-NEXT: s_nop 0
 ; VI-NEXT: s_nop 0

+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}s_add_i32:
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @s_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
+  %result = add i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v2i32:
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
+  %result = add <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v4i32:
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @s_add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
+  %result = add <4 x i32> %a, %b
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v8i32:
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+define amdgpu_kernel void @s_add_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
+entry:
+  %0 = add <8 x i32> %a, %b
+  store <8 x i32> %0, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v16i32:
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+define amdgpu_kernel void @s_add_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
+entry:
+  %0 = add <16 x i32> %a, %b
+  store <16 x i32> %0, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_add_i32:
+define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
+  %a = load volatile i32, i32 addrspace(1)* %gep
+  %b = load volatile i32, i32 addrspace(1)* %b_ptr
+  %result = add i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_add_imm_i32:
+define amdgpu_kernel void @v_add_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
+  %a = load volatile i32, i32 addrspace(1)* %gep
+  %result = add i32 %a, 123
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}add64:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-NOT: SUB
+define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %add = add i64 %a, %b
+  store i64 %add, i64 addrspace(1)* %out
+  ret void
+}
+
+; The v_addc_u32 and v_add_i32 instructions can't read SGPRs, because they
+; use VCC. The test is designed so that %a will be stored in an SGPR and
+; %0 will be stored in a VGPR, so the compiler will be forced to copy %a
+; to a VGPR before doing the add.
+
+; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-NOT: SUB
+define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
+entry:
+  %0 = load i64, i64 addrspace(1)* %in
+  %1 = add i64 %a, %0
+  store i64 %1, i64 addrspace(1)* %out
+  ret void
+}
+
+; Test i64 add inside a branch.
+; FUNC-LABEL: {{^}}add64_in_branch:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-NOT: SUB
+define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64, i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = add i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }

 }

 ; GCN-LABEL: {{^}}phi_visit_order:
-; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 1, v{{[0-9]+}}
+; GCN: v_add_i32_e64 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 1, v{{[0-9]+}}
 define amdgpu_kernel void @phi_visit_order() {
 bb:
   br label %bb1

 ; LINE: v_mov_b32_e32 v{{[0-9]+}}, 0x888
 ; LINE: ; {{.*}}source-lines.cl:3
 ; LINE: ; {{.*}}source-lines.cl:4
-; LINE: v_add_u32_e32
+; LINE: v_add_u32_e64
 ; LINE: ; {{.*}}source-lines.cl:5
 ; LINE: flat_store_dword
 ; Epilogue.
@@ ... @@
 ; SOURCE: v_mov_b32_e32 v{{[0-9]+}}, 0x888
 ; SOURCE: ; int var1 = 0x888;
 ; SOURCE: ; int var2 = var0 + var1;
-; SOURCE: v_add_u32_e32
+; SOURCE: v_add_u32_e64
 ; SOURCE: ; *Out = var2;
 ; SOURCE: flat_store_dword
 ; Epilogue.