llvm.org GIT mirror llvm / 101d574
[AMDGPU] Regenerate uitofp i8 to float conversion tests. Prep work for D60462 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@358879 91177308-0d34-0410-b5e6-96231b3b80d8 Simon Pilgrim 5 months ago
1 changed file(s) with 771 addition(s) and 119 deletion(s). Raw diff Collapse all Expand all
None ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -check-prefixes=GCN,SI
2 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -check-prefixes=GCN,VI
23
34 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
45 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
56
6 ; GCN-LABEL: {{^}}load_i8_to_f32:
7 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG:v[0-9]+]],
8 ; GCN-NOT: bfe
9 ; GCN-NOT: lshr
10 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
11 ; GCN: buffer_store_dword [[CONV]],
127 define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
8 ; SI-LABEL: load_i8_to_f32:
9 ; SI: ; %bb.0:
10 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
11 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
12 ; SI-NEXT: s_mov_b32 s7, 0xf000
13 ; SI-NEXT: v_mov_b32_e32 v1, 0
14 ; SI-NEXT: s_mov_b32 s2, 0
15 ; SI-NEXT: s_mov_b32 s3, s7
16 ; SI-NEXT: s_waitcnt lgkmcnt(0)
17 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
18 ; SI-NEXT: s_mov_b32 s6, -1
19 ; SI-NEXT: s_waitcnt vmcnt(0)
20 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
21 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
22 ; SI-NEXT: s_endpgm
23 ;
24 ; VI-LABEL: load_i8_to_f32:
25 ; VI: ; %bb.0:
26 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
27 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
28 ; VI-NEXT: s_mov_b32 s7, 0xf000
29 ; VI-NEXT: s_mov_b32 s6, -1
30 ; VI-NEXT: s_waitcnt lgkmcnt(0)
31 ; VI-NEXT: v_mov_b32_e32 v1, s1
32 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
33 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
34 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
36 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
37 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
38 ; VI-NEXT: s_endpgm
1339 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1440 %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
1541 %load = load i8, i8 addrspace(1)* %gep, align 1
1844 ret void
1945 }
2046
21 ; GCN-LABEL: {{^}}load_v2i8_to_v2f32:
22 ; GCN: {{buffer|flat}}_load_ushort [[LD:v[0-9]+]]
23 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
24 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
25 ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
2647 define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
48 ; SI-LABEL: load_v2i8_to_v2f32:
49 ; SI: ; %bb.0:
50 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
51 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
52 ; SI-NEXT: s_mov_b32 s7, 0xf000
53 ; SI-NEXT: s_mov_b32 s2, 0
54 ; SI-NEXT: s_mov_b32 s3, s7
55 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
56 ; SI-NEXT: v_mov_b32_e32 v1, 0
57 ; SI-NEXT: s_waitcnt lgkmcnt(0)
58 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
59 ; SI-NEXT: s_mov_b32 s6, -1
60 ; SI-NEXT: s_waitcnt vmcnt(0)
61 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
62 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
63 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
64 ; SI-NEXT: s_endpgm
65 ;
66 ; VI-LABEL: load_v2i8_to_v2f32:
67 ; VI: ; %bb.0:
68 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
69 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
70 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
71 ; VI-NEXT: s_mov_b32 s7, 0xf000
72 ; VI-NEXT: s_mov_b32 s6, -1
73 ; VI-NEXT: s_waitcnt lgkmcnt(0)
74 ; VI-NEXT: v_mov_b32_e32 v1, s1
75 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
76 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
77 ; VI-NEXT: flat_load_ushort v0, v[0:1]
78 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
79 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
80 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
81 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
82 ; VI-NEXT: s_endpgm
2783 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2884 %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
2985 %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
3288 ret void
3389 }
3490
35 ; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
36 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
37 ; GCN-NOT: v_cvt_f32_ubyte3_e32
38 ; GCN-DAG: v_cvt_f32_ubyte2_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
39 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[MDRESULT:[0-9]+]], [[VAL]]
40 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
41 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[MDRESULT]]{{\]}},
42 ; VI: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
4391 define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
92 ; SI-LABEL: load_v3i8_to_v3f32:
93 ; SI: ; %bb.0:
94 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
95 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
96 ; SI-NEXT: s_mov_b32 s7, 0xf000
97 ; SI-NEXT: s_mov_b32 s2, 0
98 ; SI-NEXT: s_mov_b32 s3, s7
99 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
100 ; SI-NEXT: v_mov_b32_e32 v1, 0
101 ; SI-NEXT: s_waitcnt lgkmcnt(0)
102 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
103 ; SI-NEXT: s_mov_b32 s6, -1
104 ; SI-NEXT: s_waitcnt vmcnt(0)
105 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v2
106 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
107 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
108 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
109 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
110 ; SI-NEXT: s_endpgm
111 ;
112 ; VI-LABEL: load_v3i8_to_v3f32:
113 ; VI: ; %bb.0:
114 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
115 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
116 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
117 ; VI-NEXT: s_mov_b32 s7, 0xf000
118 ; VI-NEXT: s_mov_b32 s6, -1
119 ; VI-NEXT: s_waitcnt lgkmcnt(0)
120 ; VI-NEXT: v_mov_b32_e32 v1, s1
121 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
122 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
123 ; VI-NEXT: flat_load_dword v0, v[0:1]
124 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
125 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
126 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
127 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
128 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
129 ; VI-NEXT: s_endpgm
44130 %tid = call i32 @llvm.amdgcn.workitem.id.x()
45131 %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
46132 %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
49135 ret void
50136 }
51137
52 ; GCN-LABEL: {{^}}load_v4i8_to_v4f32:
53 ; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]]
54 ; GCN-NOT: bfe
55 ; GCN-NOT: lshr
56 ; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
57 ; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]]
58 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
59 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
60 ; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
61138 define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
139 ; SI-LABEL: load_v4i8_to_v4f32:
140 ; SI: ; %bb.0:
141 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
142 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
143 ; SI-NEXT: s_mov_b32 s7, 0xf000
144 ; SI-NEXT: s_mov_b32 s2, 0
145 ; SI-NEXT: s_mov_b32 s3, s7
146 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
147 ; SI-NEXT: v_mov_b32_e32 v1, 0
148 ; SI-NEXT: s_waitcnt lgkmcnt(0)
149 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
150 ; SI-NEXT: s_mov_b32 s6, -1
151 ; SI-NEXT: s_waitcnt vmcnt(0)
152 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
153 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
154 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
155 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
156 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
157 ; SI-NEXT: s_endpgm
158 ;
159 ; VI-LABEL: load_v4i8_to_v4f32:
160 ; VI: ; %bb.0:
161 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
162 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
163 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
164 ; VI-NEXT: s_mov_b32 s7, 0xf000
165 ; VI-NEXT: s_mov_b32 s6, -1
166 ; VI-NEXT: s_waitcnt lgkmcnt(0)
167 ; VI-NEXT: v_mov_b32_e32 v1, s1
168 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
169 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
170 ; VI-NEXT: flat_load_dword v0, v[0:1]
171 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
172 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
173 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
174 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
175 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
176 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
177 ; VI-NEXT: s_endpgm
62178 %tid = call i32 @llvm.amdgcn.workitem.id.x()
63179 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
64180 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
71187 ; position in the word for the component.
72188
73189 ; FIXME: The loaded bytes are packed into one dword (v_lshlrev_b32 + v_or_b32) before being converted.
74 ; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
75 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG3:v[0-9]+]]
76 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG2:v[0-9]+]]
77 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG1:v[0-9]+]]
78 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG0:v[0-9]+]]
79 ; GCN-DAG: v_lshlrev_b32
80 ; GCN-DAG: v_or_b32
81 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
82 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
83 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
84 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
85
86 ; GCN: buffer_store_dwordx4
87190 define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
191 ; SI-LABEL: load_v4i8_to_v4f32_unaligned:
192 ; SI: ; %bb.0:
193 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
194 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
195 ; SI-NEXT: s_mov_b32 s7, 0xf000
196 ; SI-NEXT: s_mov_b32 s2, 0
197 ; SI-NEXT: s_mov_b32 s3, s7
198 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
199 ; SI-NEXT: v_mov_b32_e32 v1, 0
200 ; SI-NEXT: s_waitcnt lgkmcnt(0)
201 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
202 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
203 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
204 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
205 ; SI-NEXT: s_mov_b32 s6, -1
206 ; SI-NEXT: s_waitcnt vmcnt(2)
207 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
208 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
209 ; SI-NEXT: s_waitcnt vmcnt(0)
210 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
211 ; SI-NEXT: v_or_b32_e32 v0, v0, v4
212 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
213 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
214 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
215 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
216 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
217 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
218 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
219 ; SI-NEXT: s_endpgm
220 ;
221 ; VI-LABEL: load_v4i8_to_v4f32_unaligned:
222 ; VI: ; %bb.0:
223 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
224 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
225 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
226 ; VI-NEXT: s_mov_b32 s7, 0xf000
227 ; VI-NEXT: s_mov_b32 s6, -1
228 ; VI-NEXT: s_waitcnt lgkmcnt(0)
229 ; VI-NEXT: v_mov_b32_e32 v1, s1
230 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
231 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
232 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0
233 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
234 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
235 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
236 ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
237 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
238 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
239 ; VI-NEXT: flat_load_ubyte v1, v[6:7]
240 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
241 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
242 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
243 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
244 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
245 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
246 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
247 ; VI-NEXT: v_or_b32_e32 v2, v2, v4
248 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
249 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
250 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
251 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
252 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
253 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
254 ; VI-NEXT: s_endpgm
88255 %tid = call i32 @llvm.amdgcn.workitem.id.x()
89256 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
90257 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
95262
96263 ; FIXME: Need to handle the non-uniform case for the function below (load without gep).
97264 ; Instructions are still emitted to repack the bytes for the add use.
98
99 ; GCN-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
100 ; GCN: {{buffer|flat}}_load_dword
101 ; GCN-DAG: v_cvt_f32_ubyte0_e32
102 ; GCN-DAG: v_cvt_f32_ubyte1_e32
103 ; GCN-DAG: v_cvt_f32_ubyte2_e32
104 ; GCN-DAG: v_cvt_f32_ubyte3_e32
105
106 ; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24
107
108 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
109 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8
110 ; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
111 ; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00,
112 ; SI-DAG: v_add_i32
113
114 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffffff00,
115 ; VI-DAG: v_add_u16_e32
116 ; VI-DAG: v_add_u16_e32
117
118 ; GCN-DAG: {{buffer|flat}}_store_dwordx4
119 ; GCN-DAG: {{buffer|flat}}_store_dword
120
121 ; GCN: s_endpgm
122265 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
266 ; SI-LABEL: load_v4i8_to_v4f32_2_uses:
267 ; SI: ; %bb.0:
268 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
269 ; SI-NEXT: s_mov_b32 s3, 0xf000
270 ; SI-NEXT: s_mov_b32 s6, 0
271 ; SI-NEXT: s_mov_b32 s7, s3
272 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
273 ; SI-NEXT: v_mov_b32_e32 v1, 0
274 ; SI-NEXT: s_waitcnt lgkmcnt(0)
275 ; SI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
276 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
277 ; SI-NEXT: s_mov_b32 s2, -1
278 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
279 ; SI-NEXT: s_movk_i32 s12, 0x900
280 ; SI-NEXT: s_mov_b32 s10, s2
281 ; SI-NEXT: s_mov_b32 s11, s3
282 ; SI-NEXT: s_movk_i32 s13, 0xff
283 ; SI-NEXT: s_waitcnt vmcnt(0)
284 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
285 ; SI-NEXT: v_lshrrev_b32_e32 v5, 24, v1
286 ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v1
287 ; SI-NEXT: v_add_i32_e32 v7, vcc, 9, v1
288 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
289 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1
290 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
291 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v6
292 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
293 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
294 ; SI-NEXT: s_waitcnt lgkmcnt(0)
295 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
296 ; SI-NEXT: v_add_i32_e32 v6, vcc, s12, v6
297 ; SI-NEXT: v_and_b32_e32 v7, s13, v7
298 ; SI-NEXT: s_waitcnt expcnt(0)
299 ; SI-NEXT: v_add_i32_e32 v1, vcc, s12, v5
300 ; SI-NEXT: v_and_b32_e32 v2, s13, v4
301 ; SI-NEXT: v_or_b32_e32 v0, v7, v6
302 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
303 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
304 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
305 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
306 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
307 ; SI-NEXT: s_endpgm
308 ;
309 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
310 ; VI: ; %bb.0:
311 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
312 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
313 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
314 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
315 ; VI-NEXT: s_movk_i32 s8, 0x900
316 ; VI-NEXT: s_waitcnt lgkmcnt(0)
317 ; VI-NEXT: v_mov_b32_e32 v1, s3
318 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
319 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
320 ; VI-NEXT: flat_load_dword v5, v[0:1]
321 ; VI-NEXT: s_mov_b32 s3, 0xf000
322 ; VI-NEXT: s_mov_b32 s2, -1
323 ; VI-NEXT: s_mov_b32 s6, s2
324 ; VI-NEXT: s_mov_b32 s7, s3
325 ; VI-NEXT: v_mov_b32_e32 v4, 9
326 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
327 ; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v5
328 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v5
329 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v5
330 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v5
331 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
332 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
333 ; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v5
334 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6
335 ; VI-NEXT: v_add_u16_e32 v8, 9, v5
336 ; VI-NEXT: v_add_u16_e32 v0, s8, v7
337 ; VI-NEXT: v_add_u16_e32 v1, s8, v1
338 ; VI-NEXT: v_add_u16_sdwa v2, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
339 ; VI-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
340 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
341 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
342 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
343 ; VI-NEXT: s_endpgm
123344 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
124345 %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
125346 %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
131352 }
132353
133354 ; Make sure this doesn't crash.
134 ; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
135 ; GCN: s_endpgm
136355 define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
356 ; SI-LABEL: load_v7i8_to_v7f32:
357 ; SI: ; %bb.0:
358 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
359 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
360 ; SI-NEXT: s_mov_b32 s7, 0xf000
361 ; SI-NEXT: s_mov_b32 s2, 0
362 ; SI-NEXT: s_mov_b32 s3, s7
363 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
364 ; SI-NEXT: v_mov_b32_e32 v1, 0
365 ; SI-NEXT: s_waitcnt lgkmcnt(0)
366 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
367 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
368 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
369 ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3
370 ; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:4
371 ; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:5
372 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:6
373 ; SI-NEXT: s_mov_b32 s6, -1
374 ; SI-NEXT: s_waitcnt vmcnt(5)
375 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
376 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
377 ; SI-NEXT: s_waitcnt vmcnt(3)
378 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
379 ; SI-NEXT: v_or_b32_e32 v2, v2, v4
380 ; SI-NEXT: s_waitcnt vmcnt(1)
381 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
382 ; SI-NEXT: s_waitcnt vmcnt(0)
383 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
384 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:24
385 ; SI-NEXT: s_waitcnt expcnt(0)
386 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
387 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
388 ; SI-NEXT: v_or_b32_e32 v4, v3, v6
389 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
390 ; SI-NEXT: v_or_b32_e32 v4, v4, v5
391 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
392 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
393 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
394 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
395 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
396 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
397 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
398 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
399 ; SI-NEXT: s_endpgm
400 ;
401 ; VI-LABEL: load_v7i8_to_v7f32:
402 ; VI: ; %bb.0:
403 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
404 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
405 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
406 ; VI-NEXT: s_mov_b32 s7, 0xf000
407 ; VI-NEXT: s_mov_b32 s6, -1
408 ; VI-NEXT: s_waitcnt lgkmcnt(0)
409 ; VI-NEXT: v_mov_b32_e32 v1, s1
410 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
411 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
412 ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
413 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
414 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
415 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
416 ; VI-NEXT: flat_load_ubyte v10, v[4:5]
417 ; VI-NEXT: flat_load_ubyte v11, v[2:3]
418 ; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0
419 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
420 ; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0
421 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
422 ; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
423 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
424 ; VI-NEXT: v_add_u32_e32 v8, vcc, 6, v0
425 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
426 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
427 ; VI-NEXT: flat_load_ubyte v1, v[8:9]
428 ; VI-NEXT: flat_load_ubyte v7, v[6:7]
429 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
430 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
431 ; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
432 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
433 ; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
434 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11
435 ; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
436 ; VI-NEXT: v_or_b32_e32 v0, v3, v0
437 ; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
438 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v1
439 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
440 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
441 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
442 ; VI-NEXT: v_or_b32_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
443 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
444 ; VI-NEXT: v_or_b32_e32 v4, v4, v7
445 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
446 ; VI-NEXT: v_or_b32_e32 v4, v4, v5
447 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
448 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
449 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
450 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
451 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
452 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
453 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
454 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
455 ; VI-NEXT: s_endpgm
137456 %tid = call i32 @llvm.amdgcn.workitem.id.x()
138457 %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
139458 %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
142461 ret void
143462 }
144463
145 ; GCN-LABEL: {{^}}load_v8i8_to_v8f32:
146 ; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
147 ; GCN-NOT: bfe
148 ; GCN-NOT: lshr
149 ; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
150 ; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]]
151 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]]
152 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]]
153 ; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]]
154 ; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]]
155 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]]
156 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
157 ; GCN-NOT: bfe
158 ; GCN-NOT: lshr
159 ; GCN: buffer_store_dwordx4
160 ; GCN: buffer_store_dwordx4
161464 define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
465 ; SI-LABEL: load_v8i8_to_v8f32:
466 ; SI: ; %bb.0:
467 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
468 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
469 ; SI-NEXT: s_mov_b32 s7, 0xf000
470 ; SI-NEXT: s_mov_b32 s2, 0
471 ; SI-NEXT: s_mov_b32 s3, s7
472 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
473 ; SI-NEXT: v_mov_b32_e32 v1, 0
474 ; SI-NEXT: s_waitcnt lgkmcnt(0)
475 ; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[0:3], 0 addr64
476 ; SI-NEXT: s_mov_b32 s6, -1
477 ; SI-NEXT: s_waitcnt vmcnt(0)
478 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7
479 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7
480 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
481 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
482 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8
483 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8
484 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8
485 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
486 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
487 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
488 ; SI-NEXT: s_endpgm
489 ;
490 ; VI-LABEL: load_v8i8_to_v8f32:
491 ; VI: ; %bb.0:
492 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
493 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
494 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
495 ; VI-NEXT: s_mov_b32 s7, 0xf000
496 ; VI-NEXT: s_mov_b32 s6, -1
497 ; VI-NEXT: s_waitcnt lgkmcnt(0)
498 ; VI-NEXT: v_mov_b32_e32 v1, s1
499 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
500 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
501 ; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1]
502 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
503 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7
504 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7
505 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
506 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
507 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8
508 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8
509 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8
510 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
511 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
512 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
513 ; VI-NEXT: s_endpgm
162514 %tid = call i32 @llvm.amdgcn.workitem.id.x()
163515 %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
164516 %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
167519 ret void
168520 }
169521
170 ; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
171 ; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]],
172 ; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
173 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
174 ; GCN: buffer_store_dword [[CONV]],
175522 define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
523 ; SI-LABEL: i8_zext_inreg_i32_to_f32:
524 ; SI: ; %bb.0:
525 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
526 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
527 ; SI-NEXT: s_mov_b32 s7, 0xf000
528 ; SI-NEXT: s_mov_b32 s2, 0
529 ; SI-NEXT: s_mov_b32 s3, s7
530 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
531 ; SI-NEXT: v_mov_b32_e32 v1, 0
532 ; SI-NEXT: s_waitcnt lgkmcnt(0)
533 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
534 ; SI-NEXT: s_mov_b32 s6, -1
535 ; SI-NEXT: s_waitcnt vmcnt(0)
536 ; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
537 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
538 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
539 ; SI-NEXT: s_endpgm
540 ;
541 ; VI-LABEL: i8_zext_inreg_i32_to_f32:
542 ; VI: ; %bb.0:
543 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
544 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
545 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
546 ; VI-NEXT: s_mov_b32 s7, 0xf000
547 ; VI-NEXT: s_mov_b32 s6, -1
548 ; VI-NEXT: s_waitcnt lgkmcnt(0)
549 ; VI-NEXT: v_mov_b32_e32 v1, s1
550 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
551 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
552 ; VI-NEXT: flat_load_dword v0, v[0:1]
553 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
554 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
555 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
556 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
557 ; VI-NEXT: s_endpgm
176558 %tid = call i32 @llvm.amdgcn.workitem.id.x()
177559 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
178560 %load = load i32, i32 addrspace(1)* %gep, align 4
183565 ret void
184566 }
185567
186 ; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
187568 define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
569 ; SI-LABEL: i8_zext_inreg_hi1_to_f32:
570 ; SI: ; %bb.0:
571 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
572 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
573 ; SI-NEXT: s_mov_b32 s7, 0xf000
574 ; SI-NEXT: s_mov_b32 s2, 0
575 ; SI-NEXT: s_mov_b32 s3, s7
576 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
577 ; SI-NEXT: v_mov_b32_e32 v1, 0
578 ; SI-NEXT: s_waitcnt lgkmcnt(0)
579 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
580 ; SI-NEXT: s_mov_b32 s6, -1
581 ; SI-NEXT: s_waitcnt vmcnt(0)
582 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
583 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
584 ; SI-NEXT: s_endpgm
585 ;
586 ; VI-LABEL: i8_zext_inreg_hi1_to_f32:
587 ; VI: ; %bb.0:
588 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
589 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
590 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
591 ; VI-NEXT: s_mov_b32 s7, 0xf000
592 ; VI-NEXT: s_mov_b32 s6, -1
593 ; VI-NEXT: s_waitcnt lgkmcnt(0)
594 ; VI-NEXT: v_mov_b32_e32 v1, s1
595 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
596 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
597 ; VI-NEXT: flat_load_dword v0, v[0:1]
598 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
599 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
600 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
601 ; VI-NEXT: s_endpgm
188602 %tid = call i32 @llvm.amdgcn.workitem.id.x()
189603 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
190604 %load = load i32, i32 addrspace(1)* %gep, align 4
197611
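198612 ; We don't get these ones because of the zext, but instcombine removes
199613 ; them so it shouldn't really matter. NOTE(review): if "these ones" means v_cvt_f32_ubyte*, the SI/VI checks below do contain them, so this comment may be stale -- confirm.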
200 ; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
201614 define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
615 ; SI-LABEL: i8_zext_i32_to_f32:
616 ; SI: ; %bb.0:
617 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
618 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
619 ; SI-NEXT: s_mov_b32 s7, 0xf000
620 ; SI-NEXT: v_mov_b32_e32 v1, 0
621 ; SI-NEXT: s_mov_b32 s2, 0
622 ; SI-NEXT: s_mov_b32 s3, s7
623 ; SI-NEXT: s_waitcnt lgkmcnt(0)
624 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
625 ; SI-NEXT: s_mov_b32 s6, -1
626 ; SI-NEXT: s_waitcnt vmcnt(0)
627 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
628 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
629 ; SI-NEXT: s_endpgm
630 ;
631 ; VI-LABEL: i8_zext_i32_to_f32:
632 ; VI: ; %bb.0:
633 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
634 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
635 ; VI-NEXT: s_mov_b32 s7, 0xf000
636 ; VI-NEXT: s_mov_b32 s6, -1
637 ; VI-NEXT: s_waitcnt lgkmcnt(0)
638 ; VI-NEXT: v_mov_b32_e32 v1, s1
639 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
640 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
641 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
642 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
643 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
644 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
645 ; VI-NEXT: s_endpgm
202646 %tid = call i32 @llvm.amdgcn.workitem.id.x()
203647 %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
204648 %load = load i8, i8 addrspace(1)* %gep, align 1
208652 ret void
209653 }
210654
211 ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
; v4i8_zext_v4i32_to_v4f32: the <4 x i8> load below is only `align 1`, and the
; SI and VI checks expect it to be performed as four single-byte loads whose
; results are shifted/or'd back into one dword, followed by the four
; v_cvt_f32_ubyte0..3 conversions and a single buffer_store_dwordx4.
; NOTE(review): the IR between the load and `ret void` is not visible in this
; diff view; the name suggests a zext to <4 x i32> followed by uitofp -- confirm
; against the full test file.
212655 define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
656 ; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
657 ; SI: ; %bb.0:
658 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
659 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
660 ; SI-NEXT: s_mov_b32 s7, 0xf000
661 ; SI-NEXT: s_mov_b32 s2, 0
662 ; SI-NEXT: s_mov_b32 s3, s7
663 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
664 ; SI-NEXT: v_mov_b32_e32 v1, 0
665 ; SI-NEXT: s_waitcnt lgkmcnt(0)
666 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
667 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
668 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
669 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
670 ; SI-NEXT: s_mov_b32 s6, -1
671 ; SI-NEXT: s_waitcnt vmcnt(2)
672 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
673 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
674 ; SI-NEXT: s_waitcnt vmcnt(0)
675 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
676 ; SI-NEXT: v_or_b32_e32 v0, v0, v4
677 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
678 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
679 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
680 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
681 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
682 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
683 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
684 ; SI-NEXT: s_endpgm
685 ;
686 ; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
687 ; VI: ; %bb.0:
688 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
689 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
690 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
691 ; VI-NEXT: s_mov_b32 s7, 0xf000
692 ; VI-NEXT: s_mov_b32 s6, -1
693 ; VI-NEXT: s_waitcnt lgkmcnt(0)
694 ; VI-NEXT: v_mov_b32_e32 v1, s1
695 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
696 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
697 ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
698 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
699 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
700 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
701 ; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0
702 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
703 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
704 ; VI-NEXT: flat_load_ubyte v1, v[6:7]
705 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
706 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
707 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
708 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
709 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
710 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
711 ; VI-NEXT: v_or_b32_e32 v4, v2, v0
712 ; VI-NEXT: v_or_b32_sdwa v0, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
713 ; VI-NEXT: v_or_b32_e32 v0, v0, v4
714 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
715 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
716 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
717 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
718 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
719 ; VI-NEXT: s_endpgm
213720 %tid = call i32 @llvm.amdgcn.workitem.id.x()
214721 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
215722 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
219726 ret void
220727 }
221728
222 ; GCN-LABEL: {{^}}extract_byte0_to_f32:
223 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
224 ; GCN-NOT: [[VAL]]
225 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
226 ; GCN: buffer_store_dword [[CONV]]
; extract_byte0_to_f32: loads one i32 per work-item and, per the SI and VI
; checks, expects the loaded register to feed v_cvt_f32_ubyte0_e32 with no
; other VALU instruction in between, then a single buffer_store_dword.
; NOTE(review): the IR between the load and `ret void` is not visible in this
; diff view; presumably it selects byte 0 of %val and applies uitofp -- confirm
; against the full test file.
227729 define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
730 ; SI-LABEL: extract_byte0_to_f32:
731 ; SI: ; %bb.0:
732 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
733 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
734 ; SI-NEXT: s_mov_b32 s7, 0xf000
735 ; SI-NEXT: s_mov_b32 s2, 0
736 ; SI-NEXT: s_mov_b32 s3, s7
737 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
738 ; SI-NEXT: v_mov_b32_e32 v1, 0
739 ; SI-NEXT: s_waitcnt lgkmcnt(0)
740 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
741 ; SI-NEXT: s_mov_b32 s6, -1
742 ; SI-NEXT: s_waitcnt vmcnt(0)
743 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
744 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
745 ; SI-NEXT: s_endpgm
746 ;
747 ; VI-LABEL: extract_byte0_to_f32:
748 ; VI: ; %bb.0:
749 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
750 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
751 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
752 ; VI-NEXT: s_mov_b32 s7, 0xf000
753 ; VI-NEXT: s_mov_b32 s6, -1
754 ; VI-NEXT: s_waitcnt lgkmcnt(0)
755 ; VI-NEXT: v_mov_b32_e32 v1, s1
756 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
757 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
758 ; VI-NEXT: flat_load_dword v0, v[0:1]
759 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
760 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
761 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
762 ; VI-NEXT: s_endpgm
228763 %tid = call i32 @llvm.amdgcn.workitem.id.x()
229764 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
230765 %val = load i32, i32 addrspace(1)* %gep
234769 ret void
235770 }
236771
237 ; GCN-LABEL: {{^}}extract_byte1_to_f32:
238 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
239 ; GCN-NOT: [[VAL]]
240 ; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
241 ; GCN: buffer_store_dword [[CONV]]
; extract_byte1_to_f32: loads one i32 per work-item and, per the SI and VI
; checks, expects the loaded register to feed v_cvt_f32_ubyte1_e32 with no
; separate shift or mask instruction, then a single buffer_store_dword.
; NOTE(review): the IR between the load and `ret void` is not visible in this
; diff view; presumably it extracts bits 8..15 of %val and applies uitofp --
; confirm against the full test file.
242772 define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
773 ; SI-LABEL: extract_byte1_to_f32:
774 ; SI: ; %bb.0:
775 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
776 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
777 ; SI-NEXT: s_mov_b32 s7, 0xf000
778 ; SI-NEXT: s_mov_b32 s2, 0
779 ; SI-NEXT: s_mov_b32 s3, s7
780 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
781 ; SI-NEXT: v_mov_b32_e32 v1, 0
782 ; SI-NEXT: s_waitcnt lgkmcnt(0)
783 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
784 ; SI-NEXT: s_mov_b32 s6, -1
785 ; SI-NEXT: s_waitcnt vmcnt(0)
786 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
787 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
788 ; SI-NEXT: s_endpgm
789 ;
790 ; VI-LABEL: extract_byte1_to_f32:
791 ; VI: ; %bb.0:
792 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
793 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
794 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
795 ; VI-NEXT: s_mov_b32 s7, 0xf000
796 ; VI-NEXT: s_mov_b32 s6, -1
797 ; VI-NEXT: s_waitcnt lgkmcnt(0)
798 ; VI-NEXT: v_mov_b32_e32 v1, s1
799 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
800 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
801 ; VI-NEXT: flat_load_dword v0, v[0:1]
802 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
803 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
804 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
805 ; VI-NEXT: s_endpgm
243806 %tid = call i32 @llvm.amdgcn.workitem.id.x()
244807 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
245808 %val = load i32, i32 addrspace(1)* %gep
250813 ret void
251814 }
252815
253 ; GCN-LABEL: {{^}}extract_byte2_to_f32:
254 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
255 ; GCN-NOT: [[VAL]]
256 ; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
257 ; GCN: buffer_store_dword [[CONV]]
; extract_byte2_to_f32: loads one i32 per work-item and, per the SI and VI
; checks, expects the loaded register to feed v_cvt_f32_ubyte2_e32 with no
; separate shift or mask instruction, then a single buffer_store_dword.
; NOTE(review): the IR between the load and `ret void` is not visible in this
; diff view; presumably it extracts bits 16..23 of %val and applies uitofp --
; confirm against the full test file.
258816 define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
817 ; SI-LABEL: extract_byte2_to_f32:
818 ; SI: ; %bb.0:
819 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
820 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
821 ; SI-NEXT: s_mov_b32 s7, 0xf000
822 ; SI-NEXT: s_mov_b32 s2, 0
823 ; SI-NEXT: s_mov_b32 s3, s7
824 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
825 ; SI-NEXT: v_mov_b32_e32 v1, 0
826 ; SI-NEXT: s_waitcnt lgkmcnt(0)
827 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
828 ; SI-NEXT: s_mov_b32 s6, -1
829 ; SI-NEXT: s_waitcnt vmcnt(0)
830 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
831 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
832 ; SI-NEXT: s_endpgm
833 ;
834 ; VI-LABEL: extract_byte2_to_f32:
835 ; VI: ; %bb.0:
836 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
837 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
838 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
839 ; VI-NEXT: s_mov_b32 s7, 0xf000
840 ; VI-NEXT: s_mov_b32 s6, -1
841 ; VI-NEXT: s_waitcnt lgkmcnt(0)
842 ; VI-NEXT: v_mov_b32_e32 v1, s1
843 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
844 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
845 ; VI-NEXT: flat_load_dword v0, v[0:1]
846 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
847 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
848 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
849 ; VI-NEXT: s_endpgm
259850 %tid = call i32 @llvm.amdgcn.workitem.id.x()
260851 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
261852 %val = load i32, i32 addrspace(1)* %gep
266857 ret void
267858 }
268859
269 ; GCN-LABEL: {{^}}extract_byte3_to_f32:
270 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
271 ; GCN-NOT: [[VAL]]
272 ; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
273 ; GCN: buffer_store_dword [[CONV]]
; extract_byte3_to_f32: loads one i32 per work-item and, per the SI and VI
; checks, expects the loaded register to feed v_cvt_f32_ubyte3_e32 with no
; separate shift or mask instruction, then a single buffer_store_dword.
; NOTE(review): the IR between the load and `ret void` is not visible in this
; diff view; presumably it extracts bits 24..31 of %val and applies uitofp --
; confirm against the full test file.
274860 define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
861 ; SI-LABEL: extract_byte3_to_f32:
862 ; SI: ; %bb.0:
863 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
864 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
865 ; SI-NEXT: s_mov_b32 s7, 0xf000
866 ; SI-NEXT: s_mov_b32 s2, 0
867 ; SI-NEXT: s_mov_b32 s3, s7
868 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
869 ; SI-NEXT: v_mov_b32_e32 v1, 0
870 ; SI-NEXT: s_waitcnt lgkmcnt(0)
871 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
872 ; SI-NEXT: s_mov_b32 s6, -1
873 ; SI-NEXT: s_waitcnt vmcnt(0)
874 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
875 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
876 ; SI-NEXT: s_endpgm
877 ;
878 ; VI-LABEL: extract_byte3_to_f32:
879 ; VI: ; %bb.0:
880 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
881 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
882 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
883 ; VI-NEXT: s_mov_b32 s7, 0xf000
884 ; VI-NEXT: s_mov_b32 s6, -1
885 ; VI-NEXT: s_waitcnt lgkmcnt(0)
886 ; VI-NEXT: v_mov_b32_e32 v1, s1
887 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
888 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
889 ; VI-NEXT: flat_load_dword v0, v[0:1]
890 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
891 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
892 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
893 ; VI-NEXT: s_endpgm
275894 %tid = call i32 @llvm.amdgcn.workitem.id.x()
276895 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
277896 %val = load i32, i32 addrspace(1)* %gep
282901 ret void
283902 }
284903
285 ; GCN-LABEL: {{^}}cvt_ubyte0_or_multiuse:
286 ; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]],
287 ; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], 0x80000001, [[LOADREG]]
288 ; GCN-DAG: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[OR]]
289 ; GCN: v_add_f32_e32 [[RES:v[0-9]+]], [[OR]], [[CONV]]
290 ; GCN: buffer_store_dword [[RES]],
291904 define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
905 ; SI-LABEL: cvt_ubyte0_or_multiuse:
906 ; SI: ; %bb.0: ; %bb
907 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
908 ; SI-NEXT: s_mov_b32 s7, 0xf000
909 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
910 ; SI-NEXT: v_mov_b32_e32 v1, 0
911 ; SI-NEXT: s_mov_b32 s6, -1
912 ; SI-NEXT: s_waitcnt lgkmcnt(0)
913 ; SI-NEXT: s_mov_b32 s4, s2
914 ; SI-NEXT: s_mov_b32 s5, s3
915 ; SI-NEXT: s_mov_b32 s2, 0
916 ; SI-NEXT: s_mov_b32 s3, s7
917 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
918 ; SI-NEXT: s_waitcnt vmcnt(0)
919 ; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
920 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
921 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
922 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
923 ; SI-NEXT: s_endpgm
924 ;
925 ; VI-LABEL: cvt_ubyte0_or_multiuse:
926 ; VI: ; %bb.0: ; %bb
927 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
928 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
929 ; VI-NEXT: s_mov_b32 s7, 0xf000
930 ; VI-NEXT: s_mov_b32 s6, -1
931 ; VI-NEXT: s_waitcnt lgkmcnt(0)
932 ; VI-NEXT: v_mov_b32_e32 v1, s1
933 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
934 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
935 ; VI-NEXT: flat_load_dword v0, v[0:1]
936 ; VI-NEXT: s_mov_b32 s4, s2
937 ; VI-NEXT: s_mov_b32 s5, s3
938 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
939 ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
940 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
941 ; VI-NEXT: v_add_f32_e32 v0, v0, v1
942 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
943 ; VI-NEXT: s_endpgm
292944 bb:
293945 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
294946 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid