llvm.org GIT mirror llvm / ba988eb
[PowerPC] - Legalize vector types by widening instead of integer promotion This patch corresponds to review: http://reviews.llvm.org/D20443 It changes the legalization strategy for illegal vector types from integer promotion to widening. This only applies for vectors with elements of width that is a multiple of a byte since we have hardware support for vectors with 1, 2, 4, 8 and 16 byte elements. Integer promotion for vectors is quite expensive on PPC due to the sequence of breaking apart the vector, extending the elements and reconstituting the vector. Two of these operations are expensive. This patch causes between minor and major improvements in performance on most benchmarks. There are very few benchmarks whose performance regresses. These regressions can be handled in a subsequent patch with a DAG combine (similar to how this patch handles int -> fp conversions of illegal vector types). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274535 91177308-0d34-0410-b5e6-96231b3b80d8 Nemanja Ivanovic 3 years ago
8 changed file(s) with 188 addition(s) and 53 deletion(s). Raw diff Collapse all Expand all
850850
851851 // We have target-specific dag combine patterns for the following nodes:
852852 setTargetDAGCombine(ISD::SINT_TO_FP);
853 setTargetDAGCombine(ISD::BUILD_VECTOR);
853854 if (Subtarget.hasFPCVT())
854855 setTargetDAGCombine(ISD::UINT_TO_FP);
855856 setTargetDAGCombine(ISD::LOAD);
10401041 case PPCISD::MFVSR: return "PPCISD::MFVSR";
10411042 case PPCISD::MTVSRA: return "PPCISD::MTVSRA";
10421043 case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";
1044 case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";
1045 case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";
10431046 case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT";
10441047 case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT";
10451048 case PPCISD::VCMP: return "PPCISD::VCMP";
1018710190 ShiftCst);
1018810191 }
1018910192
10193 SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
10194 DAGCombinerInfo &DCI) const {
10195 assert(N->getOpcode() == ISD::BUILD_VECTOR &&
10196 "Should be called with a BUILD_VECTOR node");
10197
10198 SelectionDAG &DAG = DCI.DAG;
10199 SDLoc dl(N);
10200 if (N->getValueType(0) != MVT::v2f64 || !Subtarget.hasVSX())
10201 return SDValue();
10202
10203 // Looking for:
10204 // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
10205 if (N->getOperand(0).getOpcode() != ISD::SINT_TO_FP &&
10206 N->getOperand(0).getOpcode() != ISD::UINT_TO_FP)
10207 return SDValue();
10208 if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP &&
10209 N->getOperand(1).getOpcode() != ISD::UINT_TO_FP)
10210 return SDValue();
10211 if (N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
10212 return SDValue();
10213
10214 SDValue Ext1 = N->getOperand(0).getOperand(0);
10215 SDValue Ext2 = N->getOperand(1).getOperand(0);
10216 if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10217 Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10218 return SDValue();
10219
10220 ConstantSDNode *Ext1Op = dyn_cast(Ext1.getOperand(1));
10221 ConstantSDNode *Ext2Op = dyn_cast(Ext2.getOperand(1));
10222 if (!Ext1Op || !Ext2Op)
10223 return SDValue();
10224 if (Ext1.getValueType() != MVT::i32 ||
10225 Ext2.getValueType() != MVT::i32)
10226 if (Ext1.getOperand(0) != Ext2.getOperand(0))
10227 return SDValue();
10228
10229 int FirstElem = Ext1Op->getZExtValue();
10230 int SecondElem = Ext2Op->getZExtValue();
10231 int SubvecIdx;
10232 if (FirstElem == 0 && SecondElem == 1)
10233 SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
10234 else if (FirstElem == 2 && SecondElem == 3)
10235 SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
10236 else
10237 return SDValue();
10238
10239 SDValue SrcVec = Ext1.getOperand(0);
10240 auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ?
10241 PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;
10242 return DAG.getNode(NodeType, dl, MVT::v2f64,
10243 SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl));
10244 }
10245
1019010246 SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
1019110247 DAGCombinerInfo &DCI) const {
1019210248 assert((N->getOpcode() == ISD::SINT_TO_FP ||
1102311079 }
1102411080 break;
1102511081 }
11082 case ISD::BUILD_VECTOR:
11083 return DAGCombineBuildVector(N, DCI);
1102611084 }
1102711085
1102811086 return SDValue();
136136 /// Direct move from a GPR to a VSX register (zero)
137137 MTVSRZ,
138138
139 /// Extract a subvector from signed integer vector and convert to FP.
140 /// It is primarily used to convert a (widened) illegal integer vector
141 /// type to a legal floating point vector type.
142 /// For example v2i32 -> widened to v4i32 -> v2f64
143 SINT_VEC_TO_FP,
144
145 /// Extract a subvector from unsigned integer vector and convert to FP.
146 /// As with SINT_VEC_TO_FP, used for converting illegal types.
147 UINT_VEC_TO_FP,
148
139149 // FIXME: Remove these once the ANDI glue bug is fixed:
140150 /// i1 = ANDIo_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the
141151 /// eq or gt bit of CR0 after executing andi. x, 1. This is used to
431441 /// DAG node.
432442 const char *getTargetNodeName(unsigned Opcode) const override;
433443
444 /// getPreferredVectorAction - The code we generate when vector types are
445 /// legalized by promoting the integer element type is often much worse
446 /// than code we generate if we widen the type for applicable vector types.
447 /// The issue with promoting is that the vector is scalaraized, individual
448 /// elements promoted and then the vector is rebuilt. So say we load a pair
449 /// of v4i8's and shuffle them. This will turn into a mess of 8 extending
450 /// loads, moves back into VSR's (or memory ops if we don't have moves) and
451 /// then the VPERM for the shuffle. All in all a very slow sequence.
452 TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
453 const override {
454 if (VT.getVectorElementType().getSizeInBits() % 8 == 0)
455 return TypeWidenVector;
456 return TargetLoweringBase::getPreferredVectorAction(VT);
457 }
434458 bool useSoftFloat() const override;
435459
436460 MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
882906 SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
883907
884908 SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const;
909 SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const;
885910 SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
886911 SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
887912
5656 def SDT_PPCxxswapd : SDTypeProfile<1, 1, [
5757 SDTCisSameAs<0, 1>
5858 ]>;
59 def SDTVecConv : SDTypeProfile<1, 2, [
60 SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>
61 ]>;
5962
6063 def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
6164 [SDNPHasChain, SDNPMayLoad]>;
6568 def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>;
6669 def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>;
6770 def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>;
71 def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>;
72 def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>;
6873
6974 multiclass XX3Form_Rcr opcode, bits<7> xo, string asmbase,
7075 string asmstr, InstrItinClass itin, Intrinsic Int,
607612 "xvcvsxwdp $XT, $XB", IIC_VecFP, []>;
608613 def XVCVSXWSP : XX2Form<60, 184,
609614 (outs vsrc:$XT), (ins vsrc:$XB),
610 "xvcvsxwsp $XT, $XB", IIC_VecFP, []>;
615 "xvcvsxwsp $XT, $XB", IIC_VecFP,
616 [(set v4f32:$XT, (sint_to_fp v4i32:$XB))]>;
611617 def XVCVUXDDP : XX2Form<60, 488,
612618 (outs vsrc:$XT), (ins vsrc:$XB),
613619 "xvcvuxddp $XT, $XB", IIC_VecFP,
926932 (XVCVDPSXDS (XVCVSXWDP (XXSLDWI $C, $C, 1)))>;
927933 def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
928934 (XVCVSXWDP (XXSLDWI $C, $C, 1))>;
935
936 def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 0)),
937 (v2f64 (XVCVSXWDP (v2i64 (XXMRGHW $C, $C))))>;
938 def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 1)),
939 (v2f64 (XVCVSXWDP (v2i64 (XXMRGLW $C, $C))))>;
940
941 def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)),
942 (v2f64 (XVCVUXWDP (v2i64 (XXMRGHW $C, $C))))>;
943 def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)),
944 (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
929945
930946 // Loads.
931947 def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
3030
3131 ; FIXME: There actually are sub-vector Altivec loads, and so we could handle
3232 ; this with a small expense, but we don't currently.
33 ; CHECK: cost of 48 {{.*}} load
33 ; CHECK: cost of 42 {{.*}} load
3434 load <4 x i16>, <4 x i16>* undef, align 2
3535
3636 ; CHECK: cost of 2 {{.*}} load
0 ; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck \
1 ; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s
2 ; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck \
3 ; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s \
4 ; RUN: --check-prefix=CHECK-BE
5
; Loads a <4 x i8> through an i32 pointer and replicates those four bytes
; across a <16 x i8> result. The CHECK lines expect a scalar word load, a move
; into a VSR and a word splat.
; NOTE(review): the shuffle mask was missing from this copy of the test; it is
; reconstructed from the xxspltw expectations below - confirm against the
; committed test file.
define <16 x i8> @test(i32* %s, i32* %t) {
entry:
  %0 = bitcast i32* %s to <4 x i8>*
  %1 = load <4 x i8>, <4 x i8>* %0, align 4
  %2 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i8> %2
; CHECK-LABEL: test
; CHECK: lwz [[GPR:[0-9]+]], 0(3)
; CHECK: mtvsrd [[VSR:[0-9]+]], [[GPR]]
; CHECK: xxswapd [[SWP:[0-9]+]], [[VSR]]
; CHECK: xxspltw 34, [[SWP]], 3
; CHECK-BE-LABEL: test
; CHECK-BE: lwz [[GPR:[0-9]+]], 0(3)
; CHECK-BE: sldi [[SHL:[0-9]+]], [[GPR]], 32
; CHECK-BE: mtvsrd [[VSR:[0-9]+]], [[SHL]]
; CHECK-BE: xxspltw 34, [[VSR]], 0
}
88 ret <2 x i32> %strided.vec
99
1010 ; CHECK-LABEL: @test1
11 ; CHECK: vsldoi 2, 2, 2, 12
11 ; CHECK: vsldoi [[TGT:[0-9]+]], 2, 2, 8
12 ; CHECK: vmrghw 2, 2, [[TGT]]
1213 ; CHECK: blr
1314 }
1415
2323 ret <4 x i8> %sext
2424 }
2525 ; CHECK-LABEL: v4si8_cmp:
26 ; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
26 ; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
2727
2828
2929 define <8 x i8> @v8si8_cmp(<8 x i8> %x, <8 x i8> %y) nounwind readnone {
3232 ret <8 x i8> %sext
3333 }
3434 ; CHECK-LABEL: v8si8_cmp:
35 ; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
35 ; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
3636
3737
3838 ; Additional tests for v16i8 since it is a altivec native type
157157 ret <4 x i16> %sext
158158 }
159159 ; CHECK-LABEL: v4si16_cmp:
160 ; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
160 ; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
161161
162162
163163 ; Additional tests for v8i16 since it is an altivec native type
11431143 ret <2 x double> %w
11441144
11451145 ; CHECK-LABEL: @test68
1146 ; CHECK: xxsldwi [[V1:[0-9]+]], 34, 34, 1
1146 ; CHECK: xxmrghw [[V1:[0-9]+]]
11471147 ; CHECK: xvcvsxwdp 34, [[V1]]
11481148 ; CHECK: blr
11491149
11501150 ; CHECK-LE-LABEL: @test68
1151 ; CHECK-LE: xxsldwi [[V1:[0-9]+]], 34, 34, 1
1151 ; CHECK-LE: xxmrglw [[V1:[0-9]+]], 34, 34
11521152 ; CHECK-LE: xvcvsxwdp 34, [[V1]]
11531153 ; CHECK-LE: blr
11541154 }
11551155
1156 ; This gets scalarized so the code isn't great
11561157 define <2 x double> @test69(<2 x i16> %a) {
11571158 %w = sitofp <2 x i16> %a to <2 x double>
11581159 ret <2 x double> %w
11591160
11601161 ; CHECK-LABEL: @test69
1161 ; CHECK: vspltisw [[V1:[0-9]+]], 8
1162 ; CHECK: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]]
1163 ; CHECK: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]]
1164 ; CHECK: vsraw {{[0-9]+}}, [[V3]], [[V2]]
1165 ; CHECK: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1
1166 ; CHECK: xvcvsxwdp 34, [[V4]]
1162 ; CHECK-DAG: lfiwax
1163 ; CHECK-DAG: lfiwax
1164 ; CHECK-DAG: xscvsxddp
1165 ; CHECK-DAG: xscvsxddp
1166 ; CHECK: xxmrghd
11671167 ; CHECK: blr
11681168
11691169 ; CHECK-LE-LABEL: @test69
1170 ; CHECK-LE: vspltisw [[V1:[0-9]+]], 8
1171 ; CHECK-LE: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]]
1172 ; CHECK-LE: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]]
1173 ; CHECK-LE: vsraw {{[0-9]+}}, [[V3]], [[V2]]
1174 ; CHECK-LE: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1
1175 ; CHECK-LE: xvcvsxwdp 34, [[V4]]
1176 ; CHECK-LE: blr
1177 }
1178
1170 ; CHECK-LE: mfvsrd
1171 ; CHECK-LE: mtvsrwa
1172 ; CHECK-LE: mtvsrwa
1173 ; CHECK-LE: xscvsxddp
1174 ; CHECK-LE: xscvsxddp
1175 ; CHECK-LE: xxspltd
1176 ; CHECK-LE: xxspltd
1177 ; CHECK-LE: xxmrgld
1178 ; CHECK-LE: blr
1179 }
1180
1181 ; This gets scalarized so the code isn't great
11791182 define <2 x double> @test70(<2 x i8> %a) {
11801183 %w = sitofp <2 x i8> %a to <2 x double>
11811184 ret <2 x double> %w
11821185
11831186 ; CHECK-LABEL: @test70
1184 ; CHECK: vspltisw [[V1:[0-9]+]], 12
1185 ; CHECK: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]]
1186 ; CHECK: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]]
1187 ; CHECK: vsraw {{[0-9]+}}, [[V3]], [[V2]]
1188 ; CHECK: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1
1189 ; CHECK: xvcvsxwdp 34, [[V4]]
1187 ; CHECK-DAG: lfiwax
1188 ; CHECK-DAG: lfiwax
1189 ; CHECK-DAG: xscvsxddp
1190 ; CHECK-DAG: xscvsxddp
1191 ; CHECK: xxmrghd
11901192 ; CHECK: blr
11911193
11921194 ; CHECK-LE-LABEL: @test70
1193 ; CHECK-LE: vspltisw [[V1:[0-9]+]], 12
1194 ; CHECK-LE: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]]
1195 ; CHECK-LE: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]]
1196 ; CHECK-LE: vsraw {{[0-9]+}}, [[V3]], [[V2]]
1197 ; CHECK-LE: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1
1198 ; CHECK-LE: xvcvsxwdp 34, [[V4]]
1199 ; CHECK-LE: blr
1200 }
1201
1195 ; CHECK-LE: mfvsrd
1196 ; CHECK-LE: mtvsrwa
1197 ; CHECK-LE: mtvsrwa
1198 ; CHECK-LE: xscvsxddp
1199 ; CHECK-LE: xscvsxddp
1200 ; CHECK-LE: xxspltd
1201 ; CHECK-LE: xxspltd
1202 ; CHECK-LE: xxmrgld
1203 ; CHECK-LE: blr
1204 }
1205
1206 ; This gets scalarized so the code isn't great
12021207 define <2 x i32> @test80(i32 %v) {
12031208 %b1 = insertelement <2 x i32> undef, i32 %v, i32 0
12041209 %b2 = shufflevector <2 x i32> %b1, <2 x i32> undef, <2 x i32> zeroinitializer
12061211 ret <2 x i32> %i
12071212
12081213 ; CHECK-REG-LABEL: @test80
1209 ; CHECK-REG-DAG: addi [[R1:[0-9]+]], 3, 3
1210 ; CHECK-REG-DAG: addi [[R2:[0-9]+]], 1, -16
1211 ; CHECK-REG-DAG: addi [[R3:[0-9]+]], 3, 2
1212 ; CHECK-REG: std [[R1]], -8(1)
1213 ; CHECK-REG: std [[R3]], -16(1)
1214 ; CHECK-REG: lxvd2x 34, 0, [[R2]]
1215 ; CHECK-REG-NOT: stxvd2x
1214 ; CHECK-REG: stw 3, -16(1)
1215 ; CHECK-REG: addi [[R1:[0-9]+]], 1, -16
1216 ; CHECK-REG: addis [[R2:[0-9]+]]
1217 ; CHECK-REG: addi [[R2]], [[R2]]
1218 ; CHECK-REG-DAG: lxvw4x [[VS1:[0-9]+]], 0, [[R1]]
1219 ; CHECK-REG-DAG: lxvw4x 35, 0, [[R2]]
1220 ; CHECK-REG: xxspltw 34, [[VS1]], 0
1221 ; CHECK-REG: vadduwm 2, 2, 3
1222 ; CHECK-REG-NOT: stxvw4x
12161223 ; CHECK-REG: blr
12171224
12181225 ; CHECK-FISL-LABEL: @test80
1219 ; CHECK-FISL-DAG: addi [[R1:[0-9]+]], 3, 3
1220 ; CHECK-FISL-DAG: addi [[R2:[0-9]+]], 1, -16
1221 ; CHECK-FISL-DAG: addi [[R3:[0-9]+]], 3, 2
1222 ; CHECK-FISL-DAG: std [[R1]], -8(1)
1223 ; CHECK-FISL-DAG: std [[R3]], -16(1)
1224 ; CHECK-FISL-DAG: lxvd2x 0, 0, [[R2]]
1226 ; CHECK-FISL: mr 4, 3
1227 ; CHECK-FISL: stw 4, -16(1)
1228 ; CHECK-FISL: addi [[R1:[0-9]+]], 1, -16
1229 ; CHECK-FISL-DAG: lxvw4x [[VS1:[0-9]+]], 0, [[R1]]
1230 ; CHECK-FISL-DAG: xxspltw {{[0-9]+}}, [[VS1]], 0
1231 ; CHECK-FISL: addis [[R2:[0-9]+]]
1232 ; CHECK-FISL: addi [[R2]], [[R2]]
1233 ; CHECK-FISL-DAG: lxvw4x {{[0-9]+}}, 0, [[R2]]
1234 ; CHECK-FISL: vadduwm
1235 ; CHECK-FISL-NOT: stxvw4x
12251236 ; CHECK-FISL: blr
12261237
12271238 ; CHECK-LE-LABEL: @test80
12281239 ; CHECK-LE-DAG: mtvsrd [[R1:[0-9]+]], 3
1240 ; CHECK-LE-DAG: xxswapd [[V1:[0-9]+]], [[R1]]
12291241 ; CHECK-LE-DAG: addi [[R2:[0-9]+]], {{[0-9]+}}, .LCPI
12301242 ; CHECK-LE-DAG: lxvd2x [[V2:[0-9]+]], 0, [[R2]]
1231 ; CHECK-LE-DAG: xxspltd 34, [[R1]]
1243 ; CHECK-LE-DAG: xxspltw 34, [[V1]]
12321244 ; CHECK-LE-DAG: xxswapd 35, [[V2]]
1233 ; CHECK-LE: vaddudm 2, 2, 3
1245 ; CHECK-LE: vadduwm 2, 2, 3
12341246 ; CHECK-LE: blr
12351247 }
12361248