llvm.org GIT mirror llvm / 4402e81
[AMDGPU] Convert insert_vector_elt into set of selects This allows to avoid scratch use or indirect VGPR addressing for small vectors. Differential Revision: https://reviews.llvm.org/D54606 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@347231 91177308-0d34-0410-b5e6-96231b3b80d8 Stanislav Mekhanoshin 1 year, 4 months ago
11 changed file(s) with 560 addition(s) and 144 deletion(s). Raw diff Collapse all Expand all
678678 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
679679 setTargetDAGCombine(ISD::ZERO_EXTEND);
680680 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
681 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
681682
682683 // All memory operations. Some folding on the pointer operand is done to help
683684 // matching the constant offsets in the addressing modes.
81138114 return SDValue();
81148115 }
81158116
8117 SDValue
8118 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
8119 DAGCombinerInfo &DCI) const {
8120 SDValue Vec = N->getOperand(0);
8121 SDValue Idx = N->getOperand(2);
8122 EVT VecVT = Vec.getValueType();
8123 EVT EltVT = VecVT.getVectorElementType();
8124 unsigned VecSize = VecVT.getSizeInBits();
8125 unsigned EltSize = EltVT.getSizeInBits();
8126
8127 // INSERT_VECTOR_ELT (, var-idx)
8128 // => BUILD_VECTOR n x select (e, const-idx)
8129 // This elminates non-constant index and subsequent movrel or scratch access.
8130 // Sub-dword vectors of size 2 dword or less have better implementation.
8131 // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
8132 // instructions.
8133 if (isa(Idx) ||
8134 VecSize > 256 || (VecSize <= 64 && EltSize < 32))
8135 return SDValue();
8136
8137 SelectionDAG &DAG = DCI.DAG;
8138 SDLoc SL(N);
8139 SDValue Ins = N->getOperand(1);
8140 EVT IdxVT = Idx.getValueType();
8141
8142 SDValue V;
8143 SmallVector Ops;
8144 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8145 SDValue IC = DAG.getConstant(I, SL, IdxVT);
8146 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8147 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
8148 Ops.push_back(V);
8149 }
8150
8151 return DAG.getBuildVector(VecVT, SL, Ops);
8152 }
8153
81168154 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
81178155 const SDNode *N0,
81188156 const SDNode *N1) const {
87218759 }
87228760 case ISD::EXTRACT_VECTOR_ELT:
87238761 return performExtractVectorEltCombine(N, DCI);
8762 case ISD::INSERT_VECTOR_ELT:
8763 return performInsertVectorEltCombine(N, DCI);
87248764 }
87258765 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
87268766 }
151151 SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
152152 SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
153153 SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
154 SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
154155
155156 unsigned getFusedOpcode(const SelectionDAG &DAG,
156157 const SDNode *N0, const SDNode *N1) const;
66
77
88 ; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
9 ; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
9 ; GCN-DAG: s_load_dwordx16 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]{{\]}}
1010 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
1111 ; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
1212
13 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
14 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
15 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
16 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:3]], s[[S_ELT0]]
13 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
14 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
1715
1816 ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
1917 ; GCN-NEXT: s_waitcnt vmcnt(0)
4240 ; GCN: s_and_saveexec_b64 vcc, vcc
4341
4442 ; MOVREL: s_mov_b32 m0, [[READLANE]]
45 ; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
43 ; MOVREL-NEXT: v_movreld_b32_e32 v{{[0-9]+}}, 63
4644
4745 ; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
48 ; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
46 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 63
4947 ; IDXMODE: s_set_gpr_idx_off
5048
5149 ; GCN-NEXT: s_xor_b64 exec, exec, vcc
5452 ; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
5553
5654 ; GCN: buffer_store_dword [[INS0]]
57 define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
55 define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 {
5856 entry:
5957 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
6058 %id.ext = zext i32 %id to i64
6260 %idx0 = load volatile i32, i32 addrspace(1)* %gep
6361 %idx1 = add i32 %idx0, 1
6462 %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
65 %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
66 %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
67 store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
63 %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
64 %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
65 store volatile <16 x i32> %vec2, <16 x i32> addrspace(1)* %out0
6866 %cmp = icmp eq i32 %id, 0
6967 br i1 %cmp, label %bb1, label %bb2
7068
88 ; CHECK: s_load_dword [[IN:s[0-9]+]]
99 ; CHECK: s_mov_b32 m0, [[IN]]
1010 ; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
11 ; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
12 define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
11 ; CHECK: buffer_store_dwordx4
12 ; CHECK: buffer_store_dwordx4
13 ; CHECK: buffer_store_dwordx4
14 ; CHECK: buffer_store_dwordx4
15 define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) {
1316 entry:
14 %ins = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
15 store <4 x float> %ins, <4 x float> addrspace(1)* %out
17 %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
18 store <16 x float> %ins, <16 x float> addrspace(1)* %out
1619 ret void
1720 }
1821
99
1010
1111 ; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
12 ; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
12 ; GCN-DAG: s_load_dwordx16 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]{{\]}}
1313 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
1414 ; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
1515
16 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
17 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
18 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:3]], s{{[0-9]+}}
16 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
1917 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
2018
2119 ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
4543 ; GCN: s_and_saveexec_b64 vcc, vcc
4644
4745 ; MOVREL: s_mov_b32 m0, [[READLANE]]
48 ; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
46 ; MOVREL-NEXT: v_movreld_b32_e32 v{{[0-9]+}}, 63
4947
5048 ; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
51 ; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
49 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 63
5250 ; IDXMODE: s_set_gpr_idx_off
5351
5452 ; GCN-NEXT: s_xor_b64 exec, exec, vcc
5755 ; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
5856
5957 ; GCN: buffer_store_dword [[INS0]]
60 define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
58 define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 {
6159 entry:
6260 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
6361 %id.ext = zext i32 %id to i64
6563 %idx0 = load volatile i32, i32 addrspace(1)* %gep
6664 %idx1 = add i32 %idx0, 1
6765 %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
68 %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
69 %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
70 store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
66 %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
67 %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
68 store volatile <16 x i32> %vec2, <16 x i32> addrspace(1)* %out0
7169 %cmp = icmp eq i32 %id, 0
7270 br i1 %cmp, label %bb1, label %bb2
7371
181181 ; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
182182 ; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
183183 ; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
184 ; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x40a00000
184 ; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000
185 ; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000
185186
186187 ; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]]
187188 ; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
188 define amdgpu_kernel void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
189 entry:
190 %0 = add i32 %in, 1
191 %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
192 store <4 x float> %1, <4 x float> addrspace(1)* %out
189 define amdgpu_kernel void @insert_w_offset(<16 x float> addrspace(1)* %out, i32 %in) {
190 entry:
191 %add = add i32 %in, 1
192 %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
193 store <16 x float> %ins, <16 x float> addrspace(1)* %out
193194 ret void
194195 }
195196
204205 ; IDXMODE-NEXT: s_set_gpr_idx_off
205206
206207 ; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
207 define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
208 entry:
209 %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
210 store <4 x float> %0, <4 x float> addrspace(1)* %out
208 define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) {
209 entry:
210 %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
211 store <16 x float> %ins, <16 x float> addrspace(1)* %out
211212 ret void
212213 }
213214
214215 ; GCN-LABEL: {{^}}insert_neg_offset_sgpr:
215216 ; The offset depends on the register that holds the first element of the vector.
216217 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
217 ; MOVREL: v_movreld_b32_e32 v0, 5
218 ; MOVREL: v_movreld_b32_e32 v0, 16
218219
219220 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
220221 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
221 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
222 ; IDXMODE-NEXT: s_set_gpr_idx_off
223 define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
222 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 16
223 ; IDXMODE-NEXT: s_set_gpr_idx_off
224 define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, i32 %offset) {
224225 entry:
225226 %index = add i32 %offset, -512
226 %value = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 5, i32 %index
227 store <4 x i32> %value, <4 x i32> addrspace(1)* %out
227 %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 16, i32 %index
228 store <16 x i32> %value, <16 x i32> addrspace(1)* %out
228229 ret void
229230 }
230231
240241 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
241242 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
242243 ; IDXMODE-NEXT: s_set_gpr_idx_off
243 define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) {
244 define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, <16 x i32> %vec, i32 %offset) {
244245 entry:
245246 %index = add i32 %offset, -512
246 %value = insertelement <4 x i32> %vec, i32 5, i32 %index
247 store <4 x i32> %value, <4 x i32> addrspace(1)* %out
247 %value = insertelement <16 x i32> %vec, i32 5, i32 %index
248 store <16 x i32> %value, <16 x i32> addrspace(1)* %out
248249 ret void
249250 }
250251
255256 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
256257 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
257258 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
259 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 5{{$}}
260 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 6{{$}}
261 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 7{{$}}
262 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 8{{$}}
263 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 9{{$}}
264 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 10{{$}}
265 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 11{{$}}
266 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 12{{$}}
267 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 13{{$}}
268 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 14{{$}}
269 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 15{{$}}
270 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 16{{$}}
258271
259272 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
260273 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
262275 ; GCN: s_and_saveexec_b64 vcc, vcc
263276
264277 ; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00
265 ; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 5
278 ; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 33
266279
267280 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
268281 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
269 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 5
282 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 33
270283 ; IDXMODE: s_set_gpr_idx_off
271284
272285 ; GCN: s_cbranch_execnz [[LOOPBB]]
273286 ; GCN: s_mov_b64 exec, [[SAVEEXEC]]
274287
275288 ; GCN: buffer_store_dword
276 define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
289 define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
277290 entry:
278291 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
279292 %index = add i32 %id, -512
280 %value = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 5, i32 %index
281 store <4 x i32> %value, <4 x i32> addrspace(1)* %out
293 %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 33, i32 %index
294 store <16 x i32> %value, <16 x i32> addrspace(1)* %out
282295 ret void
283296 }
284297
288301 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
289302 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
290303 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
304 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 5{{$}}
305 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 6{{$}}
306 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 7{{$}}
307 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 8{{$}}
308 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 9{{$}}
309 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 10{{$}}
310 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 11{{$}}
311 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 12{{$}}
312 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 13{{$}}
313 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 14{{$}}
314 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 15{{$}}
315 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 16{{$}}
291316 ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}}
292317
293318 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
304329 ; IDXMODE: s_set_gpr_idx_off
305330
306331 ; GCN: s_cbranch_execnz
307 define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
332 define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
308333 entry:
309334 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
310335 %index = add i32 %id, -16
311 %value = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 500, i32 %index
312 store <4 x i32> %value, <4 x i32> addrspace(1)* %out
336 %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 500, i32 %index
337 store <16 x i32> %value, <16 x i32> addrspace(1)* %out
313338 ret void
314339 }
315340
427452 ; GCN: s_load_dword [[ARG:s[0-9]+]]
428453
429454 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
430 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
431455 ; MOVREL: s_waitcnt
432456 ; MOVREL: s_add_i32 m0, [[ARG]], -16
433457 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
458 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
434459 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
435460 ; MOVREL: s_mov_b32 m0, -1
436461
452477 define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
453478 bb:
454479 %tmp1 = add i32 %arg, -16
455 %tmp2 = insertelement <6 x float> 00000e+01>, float 4.000000e+00, i32 %tmp1
480 %tmp2 = insertelement <9 x float> 00000e+01>, float 4.000000e+00, i32 %tmp1
456481 %tmp3 = add i32 %arg, -16
457 %tmp4 = insertelement <6 x float> , float -4.0, i32 %tmp3
458 %tmp5 = bitcast <6 x float> %tmp2 to <6 x i32>
459 %tmp6 = extractelement <6 x i32> %tmp5, i32 1
460 %tmp7 = bitcast <6 x float> %tmp4 to <6 x i32>
461 %tmp8 = extractelement <6 x i32> %tmp7, i32 5
482 %tmp4 = insertelement <9 x float> , float -4.0, i32 %tmp3
483 %tmp5 = bitcast <9 x float> %tmp2 to <9 x i32>
484 %tmp6 = extractelement <9 x i32> %tmp5, i32 1
485 %tmp7 = bitcast <9 x float> %tmp4 to <9 x i32>
486 %tmp8 = extractelement <9 x i32> %tmp7, i32 5
462487 store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
463488 store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
464489 ret void
530555 ret void
531556 }
532557
533 ; GCN-LABEL: {{^}}insertelement_v4f32_or_index:
558 ; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
534559 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
535560 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
536561 ; GCN-NOT: [[IDX_SHL]]
541566 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst
542567 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
543568 ; IDXMODE: s_set_gpr_idx_off
544 define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
569 define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {
545570 %idx.shl = shl i32 %idx.in, 2
546571 %idx = or i32 %idx.shl, 1
547 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
548 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
572 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx
573 store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
549574 ret void
550575 }
551576
580605
581606 bb4: ; preds = %bb2
582607 %vgpr = load volatile i32, i32 addrspace(1)* undef
583 %tmp5 = insertelement <8 x i32> undef, i32 undef, i32 %vgpr
584 %tmp6 = insertelement <8 x i32> %tmp5, i32 %arg1, i32 %vgpr
585 %tmp7 = extractelement <8 x i32> %tmp6, i32 0
608 %tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr
609 %tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr
610 %tmp7 = extractelement <16 x i32> %tmp6, i32 0
586611 br label %bb2
587612
588613 bb8: ; preds = %bb2
0 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
1
2 ; GCN-LABEL: {{^}}float4_inselt:
3 ; GCN-NOT: v_movrel
4 ; GCN-NOT: buffer_
5 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
6 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
7 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
8 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
9 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
10 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
11 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
12 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
13 ; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
14 define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
15 entry:
16 %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel
17 store <4 x float> %v, <4 x float> addrspace(1)* %out
18 ret void
19 }
20
21 ; GCN-LABEL: {{^}}float4_inselt_undef:
22 ; GCN-NOT: v_movrel
23 ; GCN-NOT: buffer_
24 ; GCN-NOT: v_cmp_
25 ; GCN-NOT: v_cndmask_
26 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
27 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
28 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
29 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
30 define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i32 %sel) {
31 entry:
32 %v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel
33 store <4 x float> %v, <4 x float> addrspace(1)* %out
34 ret void
35 }
36
37 ; GCN-LABEL: {{^}}int4_inselt:
38 ; GCN-NOT: v_movrel
39 ; GCN-NOT: buffer_
40 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
41 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1, v{{[0-9]+}}, [[CC1]]
42 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
43 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC2]]
44 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
45 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC3]]
46 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
47 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1, v{{[0-9]+}}, [[CC4]]
48 ; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
49 define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) {
50 entry:
51 %v = insertelement <4 x i32> %vec, i32 1, i32 %sel
52 store <4 x i32> %v, <4 x i32> addrspace(1)* %out
53 ret void
54 }
55
56 ; GCN-LABEL: {{^}}float2_inselt:
57 ; GCN-NOT: v_movrel
58 ; GCN-NOT: buffer_
59 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
60 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
61 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
62 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]]
63 ; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
64 define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
65 entry:
66 %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel
67 store <2 x float> %v, <2 x float> addrspace(1)* %out
68 ret void
69 }
70
71 ; GCN-LABEL: {{^}}float8_inselt:
72 ; GCN-NOT: v_movrel
73 ; GCN-NOT: buffer_
74 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
75 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
76 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
77 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
78 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
79 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
80 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
81 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
82 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC5:[^,]+]], [[IDX:s[0-9]+]], 7
83 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC5]]
84 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC6:[^,]+]], [[IDX]], 6
85 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC6]]
86 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC7:[^,]+]], [[IDX]], 5
87 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC7]]
88 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC8:[^,]+]], [[IDX]], 4
89 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]]
90 ; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]]
91 ; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]]
92 define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) {
93 entry:
94 %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel
95 store <8 x float> %v, <8 x float> addrspace(1)* %out
96 ret void
97 }
98
99 ; GCN-LABEL: {{^}}float16_inselt:
100 ; GCN: v_movreld_b32
101 define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) {
102 entry:
103 %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
104 store <16 x float> %v, <16 x float> addrspace(1)* %out
105 ret void
106 }
107
108 ; GCN-LABEL: {{^}}half4_inselt:
109 ; GCN-NOT: v_cndmask_b32
110 ; GCN-NOT: v_movrel
111 ; GCN-NOT: buffer_
112 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
113 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
114 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c00
115 define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
116 entry:
117 %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
118 store <4 x half> %v, <4 x half> addrspace(1)* %out
119 ret void
120 }
121
122 ; GCN-LABEL: {{^}}half2_inselt:
123 ; GCN-NOT: v_cndmask_b32
124 ; GCN-NOT: v_movrel
125 ; GCN-NOT: buffer_
126 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
127 ; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
128 ; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], v{{[0-9]+}}, v{{[0-9]+}}
129 define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
130 entry:
131 %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
132 store <2 x half> %v, <2 x half> addrspace(1)* %out
133 ret void
134 }
135
136 ; GCN-LABEL: {{^}}half8_inselt:
137 ; GCN-NOT: v_movrel
138 ; GCN-NOT: buffer_
139 ; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
140 ; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 1
141 ; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 2
142 ; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 3
143 ; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 4
144 ; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 5
145 ; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 6
146 ; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 7
147 ; GCN-DAG: v_cndmask_b32_e32
148 ; GCN-DAG: v_cndmask_b32_e32
149 ; GCN-DAG: v_cndmask_b32_e32
150 ; GCN-DAG: v_cndmask_b32_e32
151 ; GCN-DAG: v_cndmask_b32_e32
152 ; GCN-DAG: v_cndmask_b32_e32
153 ; GCN-DAG: v_cndmask_b32_e32
154 ; GCN-DAG: v_cndmask_b32_e32
155 ; GCN-DAG: v_or_b32_sdwa
156 ; GCN-DAG: v_or_b32_sdwa
157 ; GCN-DAG: v_or_b32_sdwa
158 ; GCN-DAG: v_or_b32_sdwa
159 define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half> %vec, i32 %sel) {
160 entry:
161 %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel
162 store <8 x half> %v, <8 x half> addrspace(1)* %out
163 ret void
164 }
165
166 ; GCN-LABEL: {{^}}short2_inselt:
167 ; GCN-NOT: v_cndmask_b32
168 ; GCN-NOT: v_movrel
169 ; GCN-NOT: buffer_
170 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
171 ; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
172 ; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], 1, v{{[0-9]+}}
173 define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
174 entry:
175 %v = insertelement <2 x i16> %vec, i16 1, i32 %sel
176 store <2 x i16> %v, <2 x i16> addrspace(1)* %out
177 ret void
178 }
179
180 ; GCN-LABEL: {{^}}short4_inselt:
181 ; GCN-NOT: v_cndmask_b32
182 ; GCN-NOT: v_movrel
183 ; GCN-NOT: buffer_
184 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
185 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
186 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
187 define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
188 entry:
189 %v = insertelement <4 x i16> %vec, i16 1, i32 %sel
190 store <4 x i16> %v, <4 x i16> addrspace(1)* %out
191 ret void
192 }
193
194 ; GCN-LABEL: {{^}}byte8_inselt:
195 ; GCN-NOT: v_movrel
196 ; GCN-NOT: buffer_
197 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
198 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
199 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
200 define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
201 entry:
202 %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
203 store <8 x i8> %v, <8 x i8> addrspace(1)* %out
204 ret void
205 }
206
207 ; GCN-LABEL: {{^}}byte16_inselt:
208 ; GCN-NOT: v_movrel
209 ; GCN-NOT: buffer_
210 ; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
211 ; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 15
212 ; GCN-DAG: v_cndmask_b32_e32
213 ; GCN-DAG: v_cndmask_b32_e32
214 ; GCN-DAG: v_cndmask_b32_e32
215 ; GCN-DAG: v_cndmask_b32_e32
216 ; GCN-DAG: v_cndmask_b32_e32
217 ; GCN-DAG: v_cndmask_b32_e32
218 ; GCN-DAG: v_cndmask_b32_e32
219 ; GCN-DAG: v_cndmask_b32_e32
220 ; GCN-DAG: v_cndmask_b32_e32
221 ; GCN-DAG: v_cndmask_b32_e32
222 ; GCN-DAG: v_cndmask_b32_e32
223 ; GCN-DAG: v_cndmask_b32_e32
224 ; GCN-DAG: v_cndmask_b32_e32
225 ; GCN-DAG: v_cndmask_b32_e32
226 ; GCN-DAG: v_cndmask_b32_e32
227 ; GCN-DAG: v_cndmask_b32_e32
228 ; GCN-DAG: v_or_b32_sdwa
229 ; GCN-DAG: v_or_b32_sdwa
230 ; GCN-DAG: v_or_b32_sdwa
231 ; GCN-DAG: v_or_b32_sdwa
232 ; GCN-DAG: v_or_b32_sdwa
233 ; GCN-DAG: v_or_b32_sdwa
234 ; GCN-DAG: v_or_b32_sdwa
235 ; GCN-DAG: v_or_b32_sdwa
236 define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8> %vec, i32 %sel) {
237 entry:
238 %v = insertelement <16 x i8> %vec, i8 1, i32 %sel
239 store <16 x i8> %v, <16 x i8> addrspace(1)* %out
240 ret void
241 }
242
243 ; GCN-LABEL: {{^}}double2_inselt:
244 ; GCN-NOT: v_movrel
245 ; GCN-NOT: buffer_
246 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
247 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
248 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
249 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
250 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]]
251 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
252 define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
253 entry:
254 %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
255 store <2 x double> %v, <2 x double> addrspace(1)* %out
256 ret void
257 }
258
259 ; GCN-LABEL: {{^}}double8_inselt:
260 ; GCN-NOT: v_cndmask
261 ; GCN: buffer_store_dword
262 ; GCN: buffer_store_dword
263 ; GCN: buffer_load_dword
264 ; GCN: buffer_load_dword
265 ; GCN: buffer_load_dword
266 ; GCN: buffer_load_dword
267 ; GCN: buffer_load_dword
268 ; GCN: buffer_load_dword
269 ; GCN: buffer_load_dword
270 ; GCN: buffer_load_dword
271 ; GCN: buffer_load_dword
272 ; GCN: buffer_load_dword
273 ; GCN: buffer_load_dword
274 ; GCN: buffer_load_dword
275 ; GCN: buffer_load_dword
276 ; GCN: buffer_load_dword
277 ; GCN: buffer_load_dword
278 ; GCN: buffer_load_dword
279 define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
280 entry:
281 %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
282 store <8 x double> %v, <8 x double> addrspace(1)* %out
283 ret void
284 }
285
286 ; GCN-LABEL: {{^}}bit4_inselt:
287 ; GCN: buffer_store_byte
288 ; GCN: buffer_load_ubyte
289 ; GCN: buffer_load_ubyte
290 ; GCN: buffer_load_ubyte
291 ; GCN: buffer_load_ubyte
292 define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %vec, i32 %sel) {
293 entry:
294 %v = insertelement <4 x i1> %vec, i1 1, i32 %sel
295 store <4 x i1> %v, <4 x i1> addrspace(1)* %out
296 ret void
297 }
298
299 ; GCN-LABEL: {{^}}bit128_inselt:
300 ; GCN-NOT: buffer_
301 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 0
302 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC1]]
303 ; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f
304 ; GCN-DAG: v_cmp_ne_u32_e32 [[CCL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]]
305 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CCL]]
306 define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
307 entry:
308 %v = insertelement <128 x i1> %vec, i1 1, i32 %sel
309 store <128 x i1> %v, <128 x i1> addrspace(1)* %out
310 ret void
311 }
8282 }
8383
8484 ; GCN-LABEL: {{^}}dynamic_insertelement_v2f32:
85 ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
86 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
85 ; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
86 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
87 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
88 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
89 ; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]]
8790 ; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
8891 define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
8992 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
9295 }
9396
9497 ; GCN-LABEL: {{^}}dynamic_insertelement_v3f32:
95 ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
96 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
97 ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
98 ; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
99 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
100 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]]
101 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
102 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
103 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
104 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]]
105 ; GCN-DAG: buffer_store_dwordx2 v
98106 ; GCN-DAG: buffer_store_dword v
99107 define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
100108 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
103111 }
104112
105113 ; GCN-LABEL: {{^}}dynamic_insertelement_v4f32:
106 ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
107 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
114 ; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
115 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
116 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC4]]
117 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
118 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]]
119 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
120 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
121 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
122 ; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]]
108123 ; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
109124 define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
110125 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
113128 }
114129
115130 ; GCN-LABEL: {{^}}dynamic_insertelement_v8f32:
116 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
131 ; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
132 ; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7
133 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CCL]]
134 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
135 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]]
117136 ; GCN: buffer_store_dwordx4
118137 ; GCN: buffer_store_dwordx4
119138 define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
135154 }
136155
137156 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i32:
138 ; GCN: v_movreld_b32
139 ; GCN: buffer_store_dwordx2
157 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
158 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]]
159 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
160 ; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5, v{{[0-9]+}}, [[CC1]]
161 ; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
140162 define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
141163 %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
142164 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
144166 }
145167
146168 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i32:
147 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5
148 ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
169 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
170 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC3]]
171 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
172 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]]
173 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
174 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
175 ; GCN-DAG: buffer_store_dwordx2 v
149176 ; GCN-DAG: buffer_store_dword v
150177 define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
151178 %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
155182
156183 ; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
157184 ; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}
158 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
159 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]]
185 ; GCN-DAG: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
186 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
187 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC4]]
188 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
189 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC3]]
190 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
191 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC2]]
192 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
193 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC1]]
160194 ; GCN: buffer_store_dwordx4
161195 define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
162196 %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
165199 }
166200
167201 ; GCN-LABEL: {{^}}dynamic_insertelement_v8i32:
168 ; GCN: v_movreld_b32
202 ; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7
203 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]]
204 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
205 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
169206 ; GCN: buffer_store_dwordx4
170207 ; GCN: buffer_store_dwordx4
171208 define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
287324 ; GCN: s_load_dwordx4
288325 ; GCN: s_load_dword s
289326
290 ; GCN: buffer_store_byte
291 ; GCN: buffer_store_byte
292 ; GCN: buffer_store_byte
293 ; GCN: buffer_store_byte
294 ; GCN: buffer_store_byte
295 ; GCN: buffer_store_byte
296 ; GCN: buffer_store_byte
297 ; GCN: buffer_store_byte
298 ; GCN: buffer_store_byte
299 ; GCN: buffer_store_byte
300 ; GCN: buffer_store_byte
301 ; GCN: buffer_store_byte
302 ; GCN: buffer_store_byte
303 ; GCN: buffer_store_byte
304 ; GCN: buffer_store_byte
305 ; GCN: buffer_store_byte
306
307 ; GCN: buffer_store_byte
327 ; GCN-NOT: buffer_store_byte
328
329 ; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 15
330 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]]
331 ; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
332 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
333
308334 ; GCN: buffer_store_dwordx4
309335 define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
310336 %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
342368 ; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}}
343369 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x18|0x60}}{{$}}
344370
345 ; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
346
347371 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
348372 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
349373 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
350374 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
351375 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
352376
353 ; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
354 ; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 0
355
356 ; Increment to next element folded into base register, but FileCheck
357 ; can't do math expressions
358
359 ; FIXME: Should be able to manipulate m0 directly instead of s_lshl_b32 + copy to m0
360
361 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
377 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
378 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC2]]
379 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
380 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
381 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC1]]
382 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
362383
363384 ; GCN: buffer_store_dwordx4
364385 ; GCN: s_endpgm
370391
371392 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i64:
372393
373 ; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 5
374 ; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 0
394 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
395 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]]
396 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
397 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
398 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]]
399 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
375400
376401 ; GCN: buffer_store_dwordx4
377402 ; GCN: s_endpgm
382407 }
383408
384409 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i64:
410 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
411 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC3]]
412 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC3]]
413 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
414 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]]
415 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
416 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
417 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]]
418 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
; <3 x i64> is 192 bits (<= 256), so the combine applies. Each 64-bit
; element needs a pair of 32-bit selects — matched above as two
; v_cndmask_b32_e64 (value 5 for the low half, 0 for the high half)
; per v_cmp_eq result CC1..CC3, one compare per element index 0..2.
385419 define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
; Insert the constant i64 5 at the non-constant index %b.
386420 %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
387421 store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
388422 ret void
389423 }
390424
391 ; FIXME: Should be able to do without stack access. The used stack
392 ; space is also 2x what should be required.
393
394425 ; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:
395426
396 ; Stack store
397
398 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
399 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
400
401 ; Write element
402 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}
403
404 ; Stack reload
405 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
406 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
407
408 ; Store result
427 ; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40200000
428 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
429 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC4]]
430 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC4]]
431 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
432 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC3]]
433 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC3]]
434 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
435 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC2]]
436 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
437 ; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
438 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC1]]
439 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
440
409441 ; GCN: buffer_store_dwordx4
410442 ; GCN: buffer_store_dwordx4
411443 ; GCN: s_endpgm
412 ; GCN: ScratchSize: 64
444 ; GCN: ScratchSize: 0
413445
414446 define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
415447 %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
66 ; GCN: ; return
77 define amdgpu_ps float @main(i32 inreg %arg) #0 {
88 main_body:
9 %tmp24 = insertelement <2 x float> undef, float 0.000000e+00, i32 %arg
10 %tmp25 = extractelement <2 x float> %tmp24, i32 1
9 %tmp24 = insertelement <16 x float> undef, float 0.000000e+00, i32 %arg
10 %tmp25 = extractelement <16 x float> %tmp24, i32 1
1111 ret float %tmp25
1212 }
1313
None ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
0 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
22 ; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
33
44 ; GCN-LABEL: {{^}}float4_alloca_store4:
55 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
66
7 ; GFX-NOT: buffer_
7 ; GCN-NOT: buffer_
88 ; GCN: v_cndmask_b32
99 ; GCN: v_cndmask_b32
1010 ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
3535 ; GCN-LABEL: {{^}}float4_alloca_load4:
3636 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_load4
3737
38 ; GFX-NOT: buffer_
39 ; GCN: v_readfirstlane_b32
40 ; GFX8: v_movreld_b32
41 ; GFX9: s_set_gpr_idx_on
42 ; GFX9: s_set_gpr_idx_off
38 ; GCN-NOT: v_movrel
39 ; GCN-NOT: buffer_
40 ; GCN-NOT: v_cmp_
41 ; GCN-NOT: v_cndmask_
42 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
43 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
44 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
45 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
46 ; GCN: store_dwordx4 v[{{[0-9:]+}}],
4347
4448 ; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
4549 ; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
6771 ; GCN-LABEL: {{^}}half4_alloca_store4:
6872 ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_store4
6973
70 ; GFX-NOT: buffer_
74 ; GCN-NOT: buffer_
7175 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
7276 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
7377 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
97101 ; GCN-LABEL: {{^}}half4_alloca_load4:
98102 ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4
99103
100 ; GFX-NOT: buffer_
104 ; GCN-NOT: buffer_
101105 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
102106 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
103107
127131 ; GCN-LABEL: {{^}}short4_alloca_store4:
128132 ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_store4
129133
130 ; GFX-NOT: buffer_
134 ; GCN-NOT: buffer_
131135 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x40003
132136 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
133137 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
157161 ; GCN-LABEL: {{^}}short4_alloca_load4:
158162 ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4
159163
160 ; GFX-NOT: buffer_
164 ; GCN-NOT: buffer_
161165 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
162166 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
163167
2626
2727 ; GCN-LABEL: {{^}}extract_insert_different_dynelt_v4i32:
2828 ; GCN: buffer_load_dwordx4
29 ; GCN: v_movreld_b32
29 ; GCN: v_cndmask_b32
30 ; GCN: v_cndmask_b32
31 ; GCN: v_cndmask_b32
32 ; GCN: v_cndmask_b32
3033 ; GCN: v_cndmask_b32
3134 ; GCN: v_cndmask_b32
3235 ; GCN: v_cndmask_b32