llvm.org GIT mirror llvm / 781f7ee
R600/SI: Fix bad code with unaligned byte vector loads
Don't do the v4i8 -> v4f32 combine if the load will need to be expanded due to alignment. This stops adding instructions to repack into a single register that the v_cvt_ubyteN_f32 instructions read.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225926 91177308-0d34-0410-b5e6-96231b3b80d8
Matt Arsenault 5 years ago
3 changed file(s) with 25 addition(s) and 22 deletion(s). Raw diff Collapse all Expand all
301301 return true;
302302 }
303303
304 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
304 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
305305 unsigned AddrSpace,
306306 unsigned Align,
307307 bool *IsFast) const {
11661166 //===----------------------------------------------------------------------===//
11671167
11681168 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
1169 DAGCombinerInfo &DCI) {
1169 DAGCombinerInfo &DCI) const {
11701170 EVT VT = N->getValueType(0);
11711171 EVT ScalarVT = VT.getScalarType();
11721172 if (ScalarVT != MVT::f32)
12141214 EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
12151215 EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
12161216 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
1217
12181217 LoadSDNode *Load = cast<LoadSDNode>(Src);
1218
1219 unsigned AS = Load->getAddressSpace();
1220 unsigned Align = Load->getAlignment();
1221 Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
1222 unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
1223
1224 // Don't try to replace the load if we have to expand it due to alignment
1225 // problems. Otherwise we will end up scalarizing the load, and trying to
1226 // repack into the vector for no real reason.
1227 if (Align < ABIAlignment &&
1228 !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
1229 return SDValue();
1230 }
1231
12191232 SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
12201233 Load->getChain(),
12211234 Load->getBasePtr(),
4949 void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
5050 MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const;
5151
52 static SDValue performUCharToFloatCombine(SDNode *N,
53 DAGCombinerInfo &DCI);
52 SDValue performUCharToFloatCombine(SDNode *N,
53 DAGCombinerInfo &DCI) const;
5454 SDValue performSHLPtrCombine(SDNode *N,
5555 unsigned AS,
5656 DAGCombinerInfo &DCI) const;
3535 ; SI-DAG: v_cvt_f32_ubyte0_e32
3636 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
3737 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
38 %load = load <3 x i8> addrspace(1)* %in, align 1
38 %load = load <3 x i8> addrspace(1)* %in, align 4
3939 %cvt = uitofp <3 x i8> %load to <3 x float>
4040 store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
4141 ret void
6565 ; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
6666 ; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
6767 ; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
68 ; SI-NOT: v_lshlrev_b32
69 ; SI-NOT: v_or_b32
6870
69 ; SI: v_lshlrev_b32
70 ; SI: v_or_b32
71 ; SI: v_lshlrev_b32
72 ; SI: v_or_b32
73 ; SI: v_lshlrev_b32
74 ; SI: v_or_b32
75
76 ; XSI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG0]]
77 ; XSI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
78 ; XSI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
79 ; XSI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG3]]
80
81 ; SI-DAG: v_cvt_f32_ubyte0_e32
82 ; SI-DAG: v_cvt_f32_ubyte1_e32
83 ; SI-DAG: v_cvt_f32_ubyte2_e32
84 ; SI-DAG: v_cvt_f32_ubyte3_e32
71 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
72 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
73 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
74 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
8575
8676 ; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
8777 define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {