llvm.org GIT mirror llvm / 3e0069d
[AMDGPU] Fix for vector element insertion. Summary: Incorrect code was generated when lowering insertelement operations for vectors with 8- or 16-bit elements. The value being inserted was not adjusted for the position of the element within the 32-bit word, so only the low element within each 32-bit word could receive the intended value. Fixed by replicating the value into each element of a congruent vector before the mask-and-OR operation used to update the intended element. A number of affected LIT tests have been updated accordingly. Reviewers: arsenm, nhaehnle Reviewed By: arsenm Subscribers: llvm-commits, arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Tags: #llvm Differential Revision: https://reviews.llvm.org/D57588 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@352885 91177308-0d34-0410-b5e6-96231b3b80d8 Tim Corringham 1 year, 20 days ago
7 changed file(s) with 49 addition(s) and 41 deletion(s). Raw diff Collapse all Expand all
43684368 MVT IntVT = MVT::getIntegerVT(VecSize);
43694369
43704370 // Avoid stack access for dynamic indexing.
4371 SDValue Val = InsVal;
4372 if (InsVal.getValueType() == MVT::f16)
4373 Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
4374
43754371 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4376 SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
4372
4373 // Create a congruent vector with the target value in each element so that
4374 // the required element can be masked and ORed into the target vector.
4375 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
4376 DAG.getSplatBuildVector(VecVT, SL, InsVal));
43774377
43784378 assert(isPowerOf2_32(EltSize));
43794379 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
813813 }
814814
815815 ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16:
816 ; GFX9: v_mul_f16_e32
816817 ; GFX9: v_pk_mul_f16
817 ; GFX9: v_mul_f16_e32
818818 ; GFX9-NOT: v_max
819819 ; GFX9-NOT: v_pk_max
820820 define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
111111 ; GCN-NOT: buffer_
112112 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
113113 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
114 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c00
114 ; GCN: s_mov_b32 [[K:s[0-9]+]], 0x3c003c00
115 ; GCN: v_mov_b32_e32 [[V:v[0-9]+]], [[K]]
116 ; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
117 ; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
115118 define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
116119 entry:
117120 %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
167170 ; GCN-NOT: v_cndmask_b32
168171 ; GCN-NOT: v_movrel
169172 ; GCN-NOT: buffer_
173 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x10001
170174 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
171175 ; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
172 ; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], 1, v{{[0-9]+}}
176 ; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], [[K]], v{{[0-9]+}}
173177 define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
174178 entry:
175179 %v = insertelement <2 x i16> %vec, i16 1, i32 %sel
183187 ; GCN-NOT: buffer_
184188 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
185189 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
186 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
190 ; GCN: s_mov_b32 [[K:s[0-9]+]], 0x10001
191 ; GCN: v_mov_b32_e32 [[V:v[0-9]+]], [[K]]
192 ; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
193 ; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
187194 define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
188195 entry:
189196 %v = insertelement <4 x i16> %vec, i16 1, i32 %sel
196203 ; GCN-NOT: buffer_
197204 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
198205 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
199 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
206 ; GCN: s_mov_b32 [[K:s[0-9]+]], 0x1010101
207 ; GCN: s_and_b32 s3, s1, [[K]]
208 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
209 ; GCN: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
210 ; GCN: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
200211 define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
201212 entry:
202213 %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
241241 ; VI-NOT: _load
242242 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
243243 ; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1
244 ; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
244 ; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 0x505, [[MASK]]
245245 ; VI: v_xor_b32_e32 [[NOT_MASK:v[0-9]+]], -1, [[MASK]]
246246 ; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[LOAD]], [[NOT_MASK]]
247247 ; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[AND_NOT_MASK]]
260260 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
261261 ; VI-NOT: _load
262262
263 ; VI: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x5050505
263264 ; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
264265 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
265266 ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
266 ; VI: s_andn2_b32 [[AND_NOT_MASK:s[0-9]+]], [[LOAD]], [[SHIFTED_MASK]]
267 ; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
268 ; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16
269
270 ; VI-DAG: buffer_store_short [[BFI]]
271 ; VI-DAG: v_mov_b32_e32 [[V_HI2:v[0-9]+]], [[HI2]]
267 ; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], [[VAL]], [[V_LOAD]]
268 ; VI: v_lshrrev_b32_e32 [[V_HI2:v[0-9]+]], 16, [[BFI]]
269
270 ; VI: buffer_store_short [[BFI]]
272271 ; VI: buffer_store_byte [[V_HI2]]
273272 define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
274273 %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
281280 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
282281 ; VI-NOT: _load
283282
283 ; VI: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x5050505
284284 ; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
285285 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
286286 ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
287 ; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
287 ; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], [[VAL]], [[V_LOAD]]
288288 ; VI: buffer_store_dword [[BFI]]
289289 define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
290290 %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
302302 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
303303 ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
304304 ; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
305 ; VI: s_mov_b32 [[VAL:s[0-9]+]], 0x5050505
306 ; VI: s_and_b32 s[[INS_HI:[0-9]+]], s[[MASK_SHIFT_HI]], [[VAL]]
307 ; VI: s_and_b32 s[[INS_LO:[0-9]+]], s[[MASK_SHIFT_LO]], [[VAL]]
305308 ; VI: s_andn2_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[VEC]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
306 ; VI: s_and_b32 s[[INS:[0-9]+]], s[[MASK_SHIFT_LO]], 5
307 ; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS]]:[[MASK_HI]]{{\]}}, [[AND]]
309 ; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS_LO]]:[[INS_HI]]{{\]}}, [[AND]]
308310 ; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]]
309311 ; VI: v_mov_b32_e32 v[[V_RESULT1:[0-9]+]], s[[RESULT1]]
310312 ; VI: buffer_store_dwordx2 v{{\[}}[[V_RESULT0]]:[[V_RESULT1]]{{\]}}
445445
446446 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
447447 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
448 ; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234
448 ; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x12341234
449449
450450 ; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
451451 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
610610 ; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
611611 ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
612612
613 ; GCN-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
614 ; GCN-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
615 ; CIVI-DAG: s_and_b32 [[MASKED_VAL:s[0-9]+]], [[VAL]], s[[MASK_LO]]
616 ; VI-DAG: s_lshl_b32 [[SHIFTED_VAL:s[0-9]+]], [[MASKED_VAL]], 16
617 ; CI-DAG: s_lshl_b32 [[SHIFTED_VAL:s[0-9]+]], [[VAL]], 16
618 ; CIVI: s_or_b32 [[DUP_VAL:s[0-9]+]], [[MASKED_VAL]], [[SHIFTED_VAL]]
613619 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
614 ; GCN-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
615 ; GCN-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff{{$}}
616
617 ; GFX89: v_lshlrev_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, [[SCALED_IDX]], s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}
618 ; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_LO:[0-9+]]], v[[SHIFT_LO]]
619 ; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_HI:[0-9+]]], v[[SHIFT_HI]]
620 ; GFX89-DAG: v_and_b32_e32 v[[MASK:[0-9]+]], [[VAL]], v[[SHIFT_LO]]
621
622 ; GFX89-DAG: v_and_b32_e32 v[[AND0:[0-9]+]], v[[NOT_SHIFT_LO]], v[[LO]]
623 ; GFX89-DAG: v_and_b32_e32 v[[AND1:[0-9]+]], v[[NOT_SHIFT_HI]], v[[HI]]
624 ; GFX89: v_or_b32_sdwa v[[OR_SDWA:[0-9]+]], v[[MASK]], v[[AND0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
625
626
627 ; CI: v_lshl_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
628 ; CI-DAG: v_bfi_b32 v[[OR_SDWA:[0-9]+]], v[[SHIFT_LO]],
629 ; CI-DAG: v_bfi_b32 v[[AND1:[0-9]+]], v[[SHIFT_HI]], 0,
630
631 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[OR_SDWA]]:[[AND1]]{{\]}}
620 ; GFX9-DAG: s_pack_ll_b32_b16 [[DUP_VAL:s[0-9]+]], [[VAL]], [[VAL]]
621 ; GFX89: v_lshlrev_b64 v[{{[0-9:]+}}], [[SCALED_IDX]], s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}
622 ; CI: v_lshl_b64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SCALED_IDX]]
623 ; GCN: v_bfi_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[DUP_VAL]], v{{[0-9]+}}
624 ; GCN: v_bfi_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[DUP_VAL]], v{{[0-9]+}}
625
626 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
632627 define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
633628 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
634629 %tid.ext = sext i32 %tid to i64
22
33 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
44 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
5 ; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
5 ; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e703e7
66
77 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
88 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
55 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
66
77 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
8 ; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
8 ; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e703e7
99
1010 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
1111 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]