llvm.org GIT mirror llvm / 3ae9815
Optimization for the "truncate" operation on AVX. Truncating v4i64 -> v4i32 and v8i32 -> v8i16 may be done with a set of shuffles. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149485 91177308-0d34-0410-b5e6-96231b3b80d8 Elena Demikhovsky 7 years ago
3 changed file(s) with 116 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
12171217 setTargetDAGCombine(ISD::LOAD);
12181218 setTargetDAGCombine(ISD::STORE);
12191219 setTargetDAGCombine(ISD::ZERO_EXTEND);
1220 setTargetDAGCombine(ISD::TRUNCATE);
12201221 setTargetDAGCombine(ISD::SINT_TO_FP);
12211222 if (Subtarget->is64Bit())
12221223 setTargetDAGCombine(ISD::MUL);
1291012911 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
1291112912 }
1291212913
12914
12915 /// PerformTruncateCombine - Converts truncate operation to
12916 /// a sequence of vector shuffle operations.
12917 /// It is possible when we truncate 256-bit vector to 128-bit vector
12918
12919 SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
12920 DAGCombinerInfo &DCI) const {
12921 if (!DCI.isBeforeLegalizeOps())
12922 return SDValue();
12923
12924 if (!Subtarget->hasAVX()) return SDValue();
12925
12926 EVT VT = N->getValueType(0);
12927 SDValue Op = N->getOperand(0);
12928 EVT OpVT = Op.getValueType();
12929 DebugLoc dl = N->getDebugLoc();
12930
12931 if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
12932
12933 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
12934 DAG.getIntPtrConstant(0));
12935
12936 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
12937 DAG.getIntPtrConstant(2));
12938
12939 OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
12940 OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
12941
12942 // PSHUFD
12943 SmallVector ShufMask1;
12944 ShufMask1.push_back(0);
12945 ShufMask1.push_back(2);
12946 ShufMask1.push_back(0);
12947 ShufMask1.push_back(0);
12948
12949 OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT),
12950 ShufMask1.data());
12951 OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT),
12952 ShufMask1.data());
12953
12954 // MOVLHPS
12955 SmallVector ShufMask2;
12956 ShufMask2.push_back(0);
12957 ShufMask2.push_back(1);
12958 ShufMask2.push_back(4);
12959 ShufMask2.push_back(5);
12960
12961 return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2.data());
12962 }
12963 if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
12964
12965 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
12966 DAG.getIntPtrConstant(0));
12967
12968 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
12969 DAG.getIntPtrConstant(4));
12970
12971 OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
12972 OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
12973
12974 // PSHUFB
12975 SmallVector ShufMask1;
12976 ShufMask1.push_back(0x0);
12977 ShufMask1.push_back(0x1);
12978 ShufMask1.push_back(0x4);
12979 ShufMask1.push_back(0x5);
12980 ShufMask1.push_back(0x8);
12981 ShufMask1.push_back(0x9);
12982 ShufMask1.push_back(0xc);
12983 ShufMask1.push_back(0xd);
12984 for (unsigned i=0; i<8; ++i)
12985 ShufMask1.push_back(-1);
12986
12987 OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo,
12988 DAG.getUNDEF(MVT::v16i8),
12989 ShufMask1.data());
12990 OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi,
12991 DAG.getUNDEF(MVT::v16i8),
12992 ShufMask1.data());
12993
12994 OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
12995 OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
12996
12997 // MOVLHPS
12998 SmallVector ShufMask2;
12999 ShufMask2.push_back(0);
13000 ShufMask2.push_back(1);
13001 ShufMask2.push_back(4);
13002 ShufMask2.push_back(5);
13003
13004 SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2.data());
13005 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
13006
13007 }
13008
13009 return SDValue();
13010 }
13011
1291313012 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
1291413013 /// generation and convert it from being a bunch of shuffles and extracts
1291513014 /// to a simple store and scalar loads to extract the elements.
1477014869 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
1477114870 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
1477214871 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, Subtarget);
14872 case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI);
1477314873 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG);
1477414874 case X86ISD::SHUFP: // Handle all target specific shuffles
1477514875 case X86ISD::PALIGN:
838838 SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const;
839839 SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
840840 SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
841 SDValue PerformTruncateCombine(SDNode* N, SelectionDAG &DAG, DAGCombinerInfo &DCI) const;
841842
842843 // Utility functions to help LowerVECTOR_SHUFFLE
843844 SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, SelectionDAG &DAG) const;
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s

; Truncating v4i64 -> v4i32 should lower to PSHUFD + MOVLHPS shuffles
; instead of scalarized code.
define <4 x i32> @trunc_64_32(<4 x i64> %A) nounwind uwtable readnone ssp{
; CHECK: trunc_64_32
; CHECK: pshufd
  %B = trunc <4 x i64> %A to <4 x i32>
  ret <4 x i32>%B
}

; Truncating v8i32 -> v8i16 should lower via PSHUFB on each 128-bit half.
define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp{
; CHECK: trunc_32_16
; CHECK: pshufb
  %B = trunc <8 x i32> %A to <8 x i16>
  ret <8 x i16>%B
}