Handle vector moves / loads which zero the top bits of the destination register (i.e. movd, movq, movss (addr), movsd (addr)) with an X86-specific dag combine.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@50838 91177308-0d34-0410-b5e6-96231b3b80d8
Evan Cheng
9 changed files with 180 additions and 152 deletions.
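The instructions named in the commit message share one property: movd/movq from a general-purpose register or memory, and the load forms of movss/movsd, write the scalar into the lowest element of the XMM register and clear every element above it. The new X86ISD::ZEXT_VMOVL node introduced below models exactly that operation. A minimal standalone sketch of the semantics (plain C++ arrays, purely illustrative, not LLVM code):

  #include <array>
  #include <cstddef>
  #include <cstdint>
  #include <cstdio>

  // Model of a "zero-extending vector move low": element 0 receives the
  // scalar, every higher element becomes zero. This mirrors what movd,
  // movq, movss (addr) and movsd (addr) do to an XMM register.
  template <typename T, std::size_t N>
  std::array<T, N> zextVMovL(T Scalar) {
    std::array<T, N> V{};   // value-initialized: all elements are zero
    V[0] = Scalar;
    return V;
  }

  int main() {
    auto V = zextVMovL<std::uint32_t, 4>(0xdeadbeefu);  // like movd gpr->xmm
    for (std::uint32_t E : V)
      std::printf("%08x ", E);  // prints: deadbeef 00000000 00000000 00000000
    std::printf("\n");
    return 0;
  }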
974974
975975 // Also handle the case where we explicitly require zeros in the top
976976 // elements. This is a vector shuffle from the zero vector.
977 if (N.getOpcode() == ISD::VECTOR_SHUFFLE && N.Val->hasOneUse() &&
977 if (N.getOpcode() == X86ISD::ZEXT_VMOVL && N.Val->hasOneUse() &&
978978 // Check to see if the top elements are all zeros (or bitcast of zeros).
979 ISD::isBuildVectorAllZeros(N.getOperand(0).Val) &&
980 N.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR &&
981 N.getOperand(1).Val->hasOneUse() &&
982 ISD::isNON_EXTLoad(N.getOperand(1).getOperand(0).Val) &&
983 N.getOperand(1).getOperand(0).hasOneUse()) {
984 // Check to see if the shuffle mask is 4/L/L/L or 2/L, where L is something
985 // from the LHS.
986 unsigned VecWidth=MVT::getVectorNumElements(N.getOperand(0).getValueType());
987 SDOperand ShufMask = N.getOperand(2);
988 assert(ShufMask.getOpcode() == ISD::BUILD_VECTOR && "Invalid shuf mask!");
989 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ShufMask.getOperand(0))) {
990 if (C->getValue() == VecWidth) {
991 for (unsigned i = 1; i != VecWidth; ++i) {
992 if (ShufMask.getOperand(i).getOpcode() == ISD::UNDEF) {
993 // ok.
994 } else {
995 ConstantSDNode *C = cast<ConstantSDNode>(ShufMask.getOperand(i));
996 if (C->getValue() >= VecWidth) return false;
997 }
998 }
999 }
1000
1001 // Okay, this is a zero extending load. Fold it.
1002 LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(1).getOperand(0));
1003 if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
1004 return false;
1005 OutChain = LD->getChain();
1006 InChain = SDOperand(LD, 1);
1007 return true;
1008 }
979 N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
980 N.getOperand(0).Val->hasOneUse() &&
981 ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).Val) &&
982 N.getOperand(0).getOperand(0).hasOneUse()) {
983 // Okay, this is a zero extending load. Fold it.
984 LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
985 if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
986 return false;
987 OutChain = LD->getChain();
988 InChain = SDOperand(LD, 1);
989 return true;
1009990 }
1010991 return false;
1011992 }
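With the new ZEXT_VMOVL node, the load-folding check above no longer has to re-derive "upper elements are zero" from a shuffle against an all-zeros vector and a 4/L/L/L or 2/L mask; it only has to recognize the shape ZEXT_VMOVL(SCALAR_TO_VECTOR(load)) with single uses along the chain. A toy sketch of that shape test, using a hypothetical Node type rather than the real SelectionDAG classes:

  #include <vector>

  // Hypothetical DAG node, only to illustrate the shape now matched:
  //   ZEXT_VMOVL (SCALAR_TO_VECTOR (LOAD addr))
  enum Opcode { LOAD, SCALAR_TO_VECTOR, ZEXT_VMOVL, OTHER };
  struct Node {
    Opcode Op;
    std::vector<const Node *> Operands;
    unsigned NumUses;
  };

  static bool isFoldableZextLoad(const Node &N) {
    return N.Op == ZEXT_VMOVL && N.NumUses == 1 &&
           N.Operands.size() == 1 &&
           N.Operands[0]->Op == SCALAR_TO_VECTOR &&
           N.Operands[0]->NumUses == 1 &&
           N.Operands[0]->Operands.size() == 1 &&
           N.Operands[0]->Operands[0]->Op == LOAD;
  }

  int main() {
    Node Ld  = {LOAD, {}, 1};
    Node S2V = {SCALAR_TO_VECTOR, {&Ld}, 1};
    Node Mov = {ZEXT_VMOVL, {&S2V}, 1};
    return isFoldableZextLoad(Mov) ? 0 : 1;
  }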
26042604 }
26052605
26062606 /// isScalarLoadToVector - Returns true if the node is a scalar load that
2607 /// is promoted to a vector.
2608 static inline bool isScalarLoadToVector(SDNode *N) {
2607 /// is promoted to a vector. It also returns the LoadSDNode by reference if
2608 /// required.
2609 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
26092610 if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) {
26102611 N = N->getOperand(0).Val;
2611 return ISD::isNON_EXTLoad(N);
2612 if (ISD::isNON_EXTLoad(N)) {
2613 if (LD)
2614 *LD = cast<LoadSDNode>(N);
2615 return true;
2616 }
26122617 }
26132618 return false;
26142619 }
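isScalarLoadToVector now optionally hands back the LoadSDNode it found, so getZextVMoveL (further down) does not have to re-walk the operands. The idiom is an out-parameter that defaults to NULL and is written only when the caller supplies it; a small standalone analogue (the Payload type and names are made up for illustration):

  // Out-parameter with a default of nullptr: callers that only want the
  // boolean answer pass nothing; callers that also want the found object
  // pass a pointer to receive it.
  struct Payload { int Value; };

  static bool findPayload(int Key, Payload **Out = nullptr) {
    static Payload Found = {42};
    if (Key != Found.Value)
      return false;
    if (Out)               // fill the extra result only when requested
      *Out = &Found;
    return true;
  }

  int main() {
    Payload *P = nullptr;
    bool JustCheck = findPayload(42);       // boolean-only use
    bool WithNode  = findPayload(42, &P);   // also retrieves the object
    return (JustCheck && WithNode && P) ? 0 : 1;
  }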
30813086 return SDOperand();
30823087
30833088 // Let legalizer expand 2-wide build_vectors.
3084 if (EVTBits == 64)
3089 if (EVTBits == 64) {
3090 if (NumNonZero == 1) {
3091 // One half is zero or undef.
3092 unsigned Idx = CountTrailingZeros_32(NonZeros);
3093 SDOperand V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT,
3094 Op.getOperand(Idx));
3095 return getShuffleVectorZeroOrUndef(V2, Idx, true, DAG);
3096 }
30853097 return SDOperand();
3098 }
30863099
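In the new 2-wide BUILD_VECTOR path above, NonZeros is a bitmask with one bit per operand that is neither zero nor undef; with exactly one bit set, CountTrailingZeros_32 yields the index of that element, which then becomes the lane kept by getShuffleVectorZeroOrUndef. A standalone illustration of that index computation (a portable loop stands in for LLVM's CountTrailingZeros_32):

  #include <cassert>
  #include <cstdint>

  // Index of the single set bit = number of trailing zero bits.
  static unsigned countTrailingZeros32(std::uint32_t X) {
    unsigned N = 0;
    while (X != 0 && (X & 1) == 0) {
      X >>= 1;
      ++N;
    }
    return N;
  }

  int main() {
    std::uint32_t NonZeros = 1u << 1;  // only element 1 is non-zero
    assert(countTrailingZeros32(NonZeros) == 1);
    return 0;
  }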
30873100 // If element VT is < 32 bits, convert it to inserts into a zero vector.
30883101 if (EVTBits == 8 && NumElems == 16) {
31303143 }
31313144 }
31323145
3133 // Take advantage of the fact GR32 to VR128 scalar_to_vector (i.e. movd)
3134 // clears the upper bits.
3135 // FIXME: we can do the same for v4f32 case when we know both parts of
3136 // the lower half come from scalar_to_vector (loadf32). We should do
3137 // that in post legalizer dag combiner with target specific hooks.
3138 if (MVT::isInteger(EVT) && (NonZeros & (0x3 << 2)) == 0)
3139 return V[0];
31403146 MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
31413147 MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
31423148 SmallVector<SDOperand, 8> MaskVec;
34743480 &MaskVec[0], MaskVec.size()));
34753481 }
34763482
3483 /// getZextVMoveL - Return a zero-extending vector move low node.
3484 ///
3485 static SDOperand getZextVMoveL(MVT::ValueType VT, MVT::ValueType OpVT,
3486 SDOperand SrcOp, SelectionDAG &DAG,
3487 const X86Subtarget *Subtarget) {
3488 if (VT == MVT::v2f64 || VT == MVT::v4f32) {
3489 LoadSDNode *LD = NULL;
3490 if (!isScalarLoadToVector(SrcOp.Val, &LD))
3491 LD = dyn_cast<LoadSDNode>(SrcOp);
3492 if (!LD) {
3493 // movssrr and movsdrr do not clear top bits. Try to use movd, movq
3494 // instead.
3495 MVT::ValueType EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
3496 if ((EVT != MVT::i64 || Subtarget->is64Bit()) &&
3497 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
3498 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
3499 SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) {
3500 // PR2108
3501 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
3502 return DAG.getNode(ISD::BIT_CONVERT, VT,
3503 DAG.getNode(X86ISD::ZEXT_VMOVL, OpVT,
3504 DAG.getNode(ISD::SCALAR_TO_VECTOR, OpVT,
3505 SrcOp.getOperand(0).getOperand(0))));
3506 }
3507 }
3508 }
3509
3510 return DAG.getNode(ISD::BIT_CONVERT, VT,
3511 DAG.getNode(X86ISD::ZEXT_VMOVL, OpVT,
3512 DAG.getNode(ISD::BIT_CONVERT, OpVT, SrcOp)));
3513 }
3514
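getZextVMoveL has to be careful about which instruction family can actually zero the upper elements: the load forms of movss/movsd do, but the register-register forms only merge, and for the PR2108 pattern (scalar_to_vector of a bitcast from a general-purpose register) the node is retyped to the integer vector type so movd/movq can be used. A simplified standalone sketch of that decision; the enums and the three-way classification are illustrative, not the real DAG queries:

  // Where the scalar being moved into the low lane comes from.
  enum class SrcKind { LoadFromMemory, BitcastFromGPR, OtherFPRegister };

  // What the lowering can do so the upper lanes are really zeroed.
  enum class Choice {
    FPZextMoveLow,     // movss (addr) / movsd (addr): load forms zero top bits
    IntZextMoveLow,    // retype to v4i32/v2i64 and use movd / movq (PR2108)
    KeepFPNodeForIsel  // leave the FP-typed ZEXT_VMOVL for isel patterns
  };

  static Choice chooseZeroExtendingMove(SrcKind Src) {
    switch (Src) {
    case SrcKind::LoadFromMemory: return Choice::FPZextMoveLow;
    case SrcKind::BitcastFromGPR: return Choice::IntZextMoveLow;
    default:                      return Choice::KeepFPNodeForIsel;
    }
  }

  int main() {
    return chooseZeroExtendingMove(SrcKind::BitcastFromGPR) ==
                   Choice::IntZextMoveLow
               ? 0
               : 1;
  }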
34773515 SDOperand
34783516 X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
34793517 SDOperand V1 = Op.getOperand(0);
35143552 // FIXME: Figure out a cleaner way to do this.
35153553 // Try to make use of movq to zero out the top part.
35163554 if (ISD::isBuildVectorAllZeros(V2.Val)) {
3517 SDOperand NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
3555 SDOperand NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
3556 DAG, *this);
35183557 if (NewOp.Val) {
35193558 SDOperand NewV1 = NewOp.getOperand(0);
35203559 SDOperand NewV2 = NewOp.getOperand(1);
35213560 SDOperand NewMask = NewOp.getOperand(2);
35223561 if (isCommutedMOVL(NewMask.Val, true, false)) {
35233562 NewOp = CommuteVectorShuffle(NewOp, NewV1, NewV2, NewMask, DAG);
3524 NewOp = DAG.getNode(ISD::VECTOR_SHUFFLE, NewOp.getValueType(),
3525 NewV1, NewV2, getMOVLMask(2, DAG));
3526 return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
3563 return getZextVMoveL(VT, NewOp.getValueType(), NewV2, DAG, Subtarget);
35273564 }
35283565 }
35293566 } else if (ISD::isBuildVectorAllZeros(V1.Val)) {
3530 SDOperand NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
3567 SDOperand NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
3568 DAG, *this);
35313569 if (NewOp.Val && X86::isMOVLMask(NewOp.getOperand(2).Val))
3532 return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
3533 }
3534 }
3535
3536 if (X86::isMOVLMask(PermMask.Val))
3537 return (V1IsUndef) ? V2 : Op;
3570 return getZextVMoveL(VT, NewOp.getValueType(), NewOp.getOperand(1),
3571 DAG, Subtarget);
3572 }
3573 }
3574
3575 if (X86::isMOVLMask(PermMask.Val)) {
3576 if (V1IsUndef)
3577 return V2;
3578 if (ISD::isBuildVectorAllZeros(V1.Val))
3579 return getZextVMoveL(VT, VT, V2, DAG, Subtarget);
3580 return Op;
3581 }
35383582
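X86::isMOVLMask accepts the shuffle whose element 0 comes from the second operand and whose remaining elements come from the first (mask <NumElems, 1, 2, ...>). When the first operand is the all-zeros vector, that shuffle is precisely a zero-extending move of the second operand's low element, which is why the block above now emits getZextVMoveL for it. A plain-array illustration of the 4-element case:

  #include <array>
  #include <cstdio>

  // MOVL-style shuffle, 4 elements: result[0] is taken from the second
  // vector, result[1..3] from the first (mask <4, 1, 2, 3>).
  static std::array<int, 4> movlShuffle(const std::array<int, 4> &V1,
                                        const std::array<int, 4> &V2) {
    return {V2[0], V1[1], V1[2], V1[3]};
  }

  int main() {
    std::array<int, 4> Zero = {0, 0, 0, 0};
    std::array<int, 4> V2   = {7, 8, 9, 10};
    // With an all-zero first operand the result is <7, 0, 0, 0>:
    // exactly a zero-extending move low of V2.
    std::array<int, 4> R = movlShuffle(Zero, V2);
    std::printf("%d %d %d %d\n", R[0], R[1], R[2], R[3]);
    return 0;
  }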
35393583 if (X86::isMOVSHDUPMask(PermMask.Val) ||
35403584 X86::isMOVSLDUPMask(PermMask.Val) ||
56285672 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
56295673 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
56305674 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
5631 case X86ISD::LCMPXCHG_DAG: return "x86ISD::LCMPXCHG_DAG";
5632 case X86ISD::LCMPXCHG8_DAG: return "x86ISD::LCMPXCHG8_DAG";
5675 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
5676 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
5677 case X86ISD::ZEXT_VMOVL: return "X86ISD::ZEXT_VMOVL";
56335678 }
56345679 }
56355680
61916236 return false;
61926237 }
61936238
6194 static bool isBaseAlignment16(SDNode *Base, MachineFrameInfo *MFI,
6195 const X86Subtarget *Subtarget) {
6239 static bool isBaseAlignmentOfN(unsigned N, SDNode *Base, MachineFrameInfo *MFI,
6240 const X86Subtarget *Subtarget) {
61966241 GlobalValue *GV;
61976242 int64_t Offset = 0;
61986243 if (isGAPlusOffset(Base, GV, Offset))
6199 return (GV->getAlignment() >= 16 && (Offset % 16) == 0);
6244 return (GV->getAlignment() >= N && (Offset % N) == 0);
62006245 // DAG combine handles the stack object case.
62016246 return false;
62026247 }
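The old isBaseAlignment16 hard-coded 16; the generalized isBaseAlignmentOfN checks that the global backing the address is aligned to at least N bytes and that the constant offset preserves that alignment. The arithmetic behind the check, stripped of the GlobalValue plumbing:

  #include <cstdint>

  // global + Offset stays N-byte aligned when the global's own alignment
  // is at least N and the offset is a multiple of N.
  static bool isBaseAlignmentOfN(unsigned N, unsigned GlobalAlign,
                                 std::int64_t Offset) {
    return GlobalAlign >= N && (Offset % static_cast<std::int64_t>(N)) == 0;
  }

  int main() {
    // A 16-byte-aligned global plus an offset of 32 is still 16-byte aligned.
    return isBaseAlignmentOfN(16, 16, 32) ? 0 : 1;
  }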
62036248
6249 static bool EltsFromConsecutiveLoads(SDNode *N, SDOperand PermMask,
6250 unsigned NumElems, MVT::ValueType EVT,
6251 MachineFrameInfo *MFI,
6252 SelectionDAG &DAG, SDNode *&Base) {
6253 Base = NULL;
6254 for (unsigned i = 0; i < NumElems; ++i) {
6255 SDOperand Idx = PermMask.getOperand(i);
6256 if (Idx.getOpcode() == ISD::UNDEF) {
6257 if (!Base)
6258 return false;
6259 continue;
6260 }
6261
6262 unsigned Index = cast<ConstantSDNode>(Idx)->getValue();
6263 SDOperand Elt = getShuffleScalarElt(N, Index, DAG);
6264 if (!Elt.Val ||
6265 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.Val)))
6266 return false;
6267 if (!Base) {
6268 Base = Elt.Val;
6269 continue;
6270 }
6271 if (Elt.getOpcode() == ISD::UNDEF)
6272 continue;
6273
6274 if (!isConsecutiveLoad(Elt.Val, Base, i, MVT::getSizeInBits(EVT)/8,MFI))
6275 return false;
6276 }
6277 return true;
6278 }
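EltsFromConsecutiveLoads factors out the loop that PerformShuffleCombine used to carry inline: walk the permutation mask, resolve each lane to the scalar that feeds it, and require every defined lane to be a plain (non-extending) load sitting i * element-size bytes after the load feeding lane 0. A standalone sketch of the address check with a made-up LaneLoad record in place of SDNodes:

  #include <cstddef>
  #include <cstdint>
  #include <vector>

  // Hypothetical description of one vector lane: undef, or a load whose
  // address is already known. The real code inspects LoadSDNode operands.
  struct LaneLoad {
    bool IsUndef;
    std::uint64_t Address;
  };

  // True if every defined lane loads from Base + i * EltSize, where Base
  // is the address feeding lane 0 (lane 0 itself must not be undef).
  static bool eltsFromConsecutiveLoads(const std::vector<LaneLoad> &Lanes,
                                       unsigned EltSize) {
    bool HaveBase = false;
    std::uint64_t Base = 0;
    for (std::size_t i = 0, e = Lanes.size(); i != e; ++i) {
      if (Lanes[i].IsUndef) {
        if (!HaveBase)
          return false;        // lane 0 must pin down the base address
        continue;
      }
      if (!HaveBase) {
        Base = Lanes[i].Address;
        HaveBase = true;
        continue;
      }
      if (Lanes[i].Address != Base + std::uint64_t(i) * EltSize)
        return false;
    }
    return HaveBase;
  }

  int main() {
    // Four i32 lanes: 0x100, 0x104, undef, 0x10c -> consecutive.
    std::vector<LaneLoad> Lanes = {
        {false, 0x100}, {false, 0x104}, {true, 0}, {false, 0x10c}};
    return eltsFromConsecutiveLoads(Lanes, 4) ? 0 : 1;
  }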
62046279
62056280 /// PerformShuffleCombine - Combine a vector_shuffle that is equal to
62066281 /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
62076282 /// if the load addresses are consecutive, non-overlapping, and in the right
62086283 /// order.
62096284 static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
62106285 const X86Subtarget *Subtarget) {
6211 MachineFunction &MF = DAG.getMachineFunction();
6212 MachineFrameInfo *MFI = MF.getFrameInfo();
6286 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
62136287 MVT::ValueType VT = N->getValueType(0);
62146288 MVT::ValueType EVT = MVT::getVectorElementType(VT);
62156289 SDOperand PermMask = N->getOperand(2);
62166290 unsigned NumElems = PermMask.getNumOperands();
62176291 SDNode *Base = NULL;
6218 for (unsigned i = 0; i < NumElems; ++i) {
6219 SDOperand Elt = PermMask.getOperand(i);
6220 if (Elt.getOpcode() == ISD::UNDEF) {
6221 if (!Base)
6222 return SDOperand();
6223 continue;
6224 }
6225
6226 unsigned Idx = cast<ConstantSDNode>(Elt)->getValue();
6227 SDOperand Arg = getShuffleScalarElt(N, Idx, DAG);
6228 if (!Arg.Val || !ISD::isNON_EXTLoad(Arg.Val))
6229 return SDOperand();
6230 if (!Base) {
6231 Base = Arg.Val;
6232 continue;
6233 }
6234
6235 if (!isConsecutiveLoad(Arg.Val, Base, i, MVT::getSizeInBits(EVT)/8,MFI))
6236 return SDOperand();
6237 }
6292 if (!EltsFromConsecutiveLoads(N, PermMask, NumElems, EVT, MFI, DAG, Base))
6293 return SDOperand();
62386294
62396295 LoadSDNode *LD = cast<LoadSDNode>(Base);
6240 if (isBaseAlignment16(Base->getOperand(1).Val, MFI, Subtarget))
6296 if (isBaseAlignmentOfN(16, Base->getOperand(1).Val, MFI, Subtarget))
62416297 return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
62426298 LD->getSrcValueOffset(), LD->isVolatile());
62436299 return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
63186374 }
63196375
63206376 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
6321 static SDOperand PerformSTORECombine(StoreSDNode *St, SelectionDAG &DAG,
6377 static SDOperand PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
63226378 const X86Subtarget *Subtarget) {
63236379 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
63246380 // the FP state in cases where an emms may be missing.
63256381 // A preferable solution to the general problem is to figure out the right
63266382 // places to insert EMMS. This qualifies as a quick hack.
6383 StoreSDNode *St = cast<StoreSDNode>(N);
63276384 if (MVT::isVector(St->getValue().getValueType()) &&
63286385 MVT::getSizeInBits(St->getValue().getValueType()) == 64 &&
63296386 isa<LoadSDNode>(St->getValue()) &&
64416498 default: break;
64426499 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, Subtarget);
64436500 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
6444 case ISD::STORE:
6445 return PerformSTORECombine(cast<StoreSDNode>(N), DAG, Subtarget);
6501 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
64466502 case X86ISD::FXOR:
64476503 case X86ISD::FOR: return PerformFORCombine(N, DAG);
64486504 case X86ISD::FAND: return PerformFANDCombine(N, DAG);
180180 /// in order to obtain suitable precision.
181181 FRSQRT, FRCP,
182182
183 // Thread Local Storage
183 // TLSADDR, THREAD_POINTER - Thread Local Storage.
184184 TLSADDR, THREAD_POINTER,
185185
186 // Exception Handling helpers
186 // EH_RETURN - Exception Handling helpers.
187187 EH_RETURN,
188188
189189 /// TC_RETURN - Tail call return.
193193 /// operand #3 optional in flag
194194 TC_RETURN,
195195
196 // compare and swap
196 // LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap.
197197 LCMPXCHG_DAG,
198198 LCMPXCHG8_DAG,
199199
200 // Store FP control word into i16 memory
201 FNSTCW16m
200 // FNSTCW16m - Store FP control word into i16 memory.
201 FNSTCW16m,
202
203 // ZEXT_VMOVL - Vector move low and zero extend.
204 ZEXT_VMOVL
202205 };
203206 }
204207
199199 // movd to MMX register zero-extends
200200 def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
201201 "movd\t{$src, $dst|$dst, $src}",
202 [(set VR64:$dst,
203 (v2i32 (vector_shuffle immAllZerosV,
204 (v2i32 (scalar_to_vector GR32:$src)),
205 MMX_MOVL_shuffle_mask)))]>;
202 [(set VR64:$dst,
203 (v2i32 (X86zvmovl (v2i32 (scalar_to_vector GR32:$src)))))]>;
206204 let AddedComplexity = 20 in
207205 def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
208206 "movd\t{$src, $dst|$dst, $src}",
209 [(set VR64:$dst,
210 (v2i32 (vector_shuffle immAllZerosV,
211 (v2i32 (scalar_to_vector
212 (loadi32 addr:$src))),
213 MMX_MOVL_shuffle_mask)))]>;
207 [(set VR64:$dst,
208 (v2i32 (X86zvmovl (v2i32
209 (scalar_to_vector (loadi32 addr:$src))))))]>;
214210
215211 // Arithmetic Instructions
216212
563559 // Move scalar to XMM zero-extended
564560 // movd to XMM register zero-extends
565561 let AddedComplexity = 15 in {
566 def : Pat<(v8i8 (vector_shuffle immAllZerosV_bc,
567 (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))),
568 MMX_MOVL_shuffle_mask)),
569 (MMX_MOVZDI2PDIrr GR32:$src)>;
570 def : Pat<(v4i16 (vector_shuffle immAllZerosV_bc,
571 (bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))),
572 MMX_MOVL_shuffle_mask)),
573 (MMX_MOVZDI2PDIrr GR32:$src)>;
562 def : Pat<(v8i8 (X86zvmovl (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))))),
563 (MMX_MOVZDI2PDIrr GR32:$src)>;
564 def : Pat<(v4i16 (X86zvmovl (bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))))),
565 (MMX_MOVZDI2PDIrr GR32:$src)>;
574566 }
575567
576568 // Scalar to v4i16 / v8i8. The source may be a GR32, but only the lower
4646 def X86insrtps : SDNode<"X86ISD::INSERTPS",
4747 SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
4848 SDTCisVT<2, f32>, SDTCisPtrTy<3>]>>;
49 def X86zvmovl : SDNode<"X86ISD::ZEXT_VMOVL", SDTUnaryOp>;
4950
5051 //===----------------------------------------------------------------------===//
5152 // SSE Complex Patterns
10061007 let AddedComplexity = 20 in
10071008 def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
10081009 "movss\t{$src, $dst|$dst, $src}",
1009 [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV_bc,
1010 (v4f32 (scalar_to_vector (loadf32 addr:$src))),
1011 MOVL_shuffle_mask)))]>;
1012
1010 [(set VR128:$dst, (v4f32 (X86zvmovl (v4f32 (scalar_to_vector
1011 (loadf32 addr:$src))))))]>;
1012
1013 def : Pat<(v4f32 (X86zvmovl (memopv4f32 addr:$src))),
1014 (MOVZSS2PSrm addr:$src)>;
10131015
10141016 //===----------------------------------------------------------------------===//
10151017 // SSE2 Instructions
22632265 def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
22642266 "movsd\t{$src, $dst|$dst, $src}",
22652267 [(set VR128:$dst,
2266 (v2f64 (vector_shuffle immAllZerosV_bc,
2267 (v2f64 (scalar_to_vector
2268 (loadf64 addr:$src))),
2269 MOVL_shuffle_mask)))]>;
2268 (v2f64 (X86zvmovl (v2f64 (scalar_to_vector
2269 (loadf64 addr:$src))))))]>;
2270
2271 def : Pat<(v2f64 (X86zvmovl (memopv2f64 addr:$src))),
2272 (MOVZSD2PDrm addr:$src)>;
22702273
22712274 // movd / movq to XMM register zero-extends
22722275 let AddedComplexity = 15 in {
22732276 def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
22742277 "movd\t{$src, $dst|$dst, $src}",
2275 [(set VR128:$dst,
2276 (v4i32 (vector_shuffle immAllZerosV,
2277 (v4i32 (scalar_to_vector GR32:$src)),
2278 MOVL_shuffle_mask)))]>;
2278 [(set VR128:$dst, (v4i32 (X86zvmovl
2279 (v4i32 (scalar_to_vector GR32:$src)))))]>;
22792280 // This is X86-64 only.
22802281 def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
22812282 "mov{d|q}\t{$src, $dst|$dst, $src}",
2282 [(set VR128:$dst,
2283 (v2i64 (vector_shuffle immAllZerosV_bc,
2284 (v2i64 (scalar_to_vector GR64:$src)),
2285 MOVL_shuffle_mask)))]>;
2286 }
2287
2288 // Handle the v2f64 form of 'MOVZQI2PQIrr' for PR2108. FIXME: this would be
2289 // better written as a dag combine xform.
2290 let AddedComplexity = 15 in
2291 def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
2292 (v2f64 (scalar_to_vector
2293 (f64 (bitconvert GR64:$src)))),
2294 MOVL_shuffle_mask)),
2295 (MOVZQI2PQIrr GR64:$src)>, Requires<[HasSSE2]>;
2296
2283 [(set VR128:$dst, (v2i64 (X86zvmovl
2284 (v2i64 (scalar_to_vector GR64:$src)))))]>;
2285 }
22972286
22982287 let AddedComplexity = 20 in {
22992288 def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
23002289 "movd\t{$src, $dst|$dst, $src}",
23012290 [(set VR128:$dst,
2302 (v4i32 (vector_shuffle immAllZerosV,
2303 (v4i32 (scalar_to_vector (loadi32 addr:$src))),
2304 MOVL_shuffle_mask)))]>;
2291 (v4i32 (X86zvmovl (v4i32 (scalar_to_vector
2292 (loadi32 addr:$src))))))]>;
23052293 def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
23062294 "movq\t{$src, $dst|$dst, $src}",
23072295 [(set VR128:$dst,
2308 (v2i64 (vector_shuffle immAllZerosV_bc,
2309 (v2i64 (scalar_to_vector (loadi64 addr:$src))),
2310 MOVL_shuffle_mask)))]>, XS,
2296 (v2i64 (X86zvmovl (v2i64 (scalar_to_vector
2297 (loadi64 addr:$src))))))]>, XS,
23112298 Requires<[HasSSE2]>;
23122299 }
23132300
23162303 let AddedComplexity = 15 in
23172304 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
23182305 "movq\t{$src, $dst|$dst, $src}",
2319 [(set VR128:$dst, (v2i64 (vector_shuffle immAllZerosV_bc,
2320 VR128:$src,
2321 MOVL_shuffle_mask)))]>,
2306 [(set VR128:$dst, (v2i64 (X86zvmovl (v2i64 VR128:$src))))]>,
23222307 XS, Requires<[HasSSE2]>;
23232308
23242309 let AddedComplexity = 20 in
23252310 def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
23262311 "movq\t{$src, $dst|$dst, $src}",
2327 [(set VR128:$dst, (v2i64 (vector_shuffle immAllZerosV_bc,
2328 (memopv2i64 addr:$src),
2329 MOVL_shuffle_mask)))]>,
2312 [(set VR128:$dst, (v2i64 (X86zvmovl
2313 (memopv2i64 addr:$src))))]>,
23302314 XS, Requires<[HasSSE2]>;
23312315
23322316 //===----------------------------------------------------------------------===//
27732757 // movd to XMM register zero-extends
27742758 let AddedComplexity = 15 in {
27752759 // Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
2776 def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
2777 (v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)),
2760 def : Pat<(v2f64 (X86zvmovl (v2f64 (scalar_to_vector FR64:$src)))),
27782761 (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
2779 def : Pat<(v4f32 (vector_shuffle immAllZerosV_bc,
2780 (v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)),
2762 def : Pat<(v4f32 (X86zvmovl (v4f32 (scalar_to_vector FR32:$src)))),
27812763 (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>;
27822764 }
27832765
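The two Pats above spell out why the register-register case needs two instructions: movss/movsd between XMM registers replace only the low element and keep the destination's upper bits, so the pattern first materializes an all-zero register with V_SET0 and then merges the scalar into it. A plain-array sketch of that merge semantics:

  #include <array>
  #include <cstdio>

  // Register-register movss semantics: only element 0 of the destination
  // is replaced; the upper elements are preserved, not zeroed.
  static std::array<float, 4> movssRR(std::array<float, 4> Dst, float Src) {
    Dst[0] = Src;
    return Dst;
  }

  int main() {
    std::array<float, 4> Stale = {1.f, 2.f, 3.f, 4.f};
    std::array<float, 4> Zero  = {0.f, 0.f, 0.f, 0.f};
    std::array<float, 4> A = movssRR(Stale, 9.f); // {9, 2, 3, 4}: upper lanes kept
    std::array<float, 4> B = movssRR(Zero, 9.f);  // {9, 0, 0, 0}: zeroed first
    std::printf("%g %g %g %g / %g %g %g %g\n",
                A[0], A[1], A[2], A[3], B[0], B[1], B[2], B[3]);
    return 0;
  }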
0 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
11 ; RUN: grep movlhps %t | count 1
2 ; RUN: grep unpcklps %t | count 1
3 ; RUN: grep punpckldq %t | count 1
42 ; RUN: grep movq %t | count 1
3 ; RUN: grep movsd %t | count 1
54
65 define <4 x float> @test1(float %a, float %b) nounwind {
76 %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1]
0 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
1 ; RUN: grep unpcklps %t | count 1
1 ; RUN: grep movss %t | count 1
2 ; RUN: grep movups %t | count 1
23 ; RUN: grep shufps %t | count 1
34
45 define <4 x float> @test(float %a, float %b, float %c) nounwind {
0 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq
1 ; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2 | grep movd
2
3 define <2 x i64> @t1(i64 %x) nounwind {
4 %tmp8 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
5 ret <2 x i64> %tmp8
6 }
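The test above builds a <2 x i64> by inserting a scalar into zeroinitializer and, per the RUN lines, expects movq on x86-32 and movd (the 64-bit GPR-to-XMM form) on x86-64. The same zero-extending moves are exposed directly by the SSE2 intrinsics; a small example, assuming a compiler providing <emmintrin.h> (the 64-bit intrinsic is only available when targeting x86-64):

  #include <emmintrin.h>
  #include <cstdint>

  // movd: <X, 0, 0, 0>
  static __m128i fromU32(std::uint32_t X) {
    return _mm_cvtsi32_si128(static_cast<int>(X));
  }

  #if defined(__x86_64__) || defined(_M_X64)
  // movq / 64-bit movd: <X, 0>
  static __m128i fromU64(std::uint64_t X) {
    return _mm_cvtsi64_si128(static_cast<long long>(X));
  }
  #endif

  int main() {
    __m128i V = fromU32(0xdeadbeefu);
    return _mm_cvtsi128_si32(V) == static_cast<int>(0xdeadbeefu) ? 0 : 1;
  }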
0 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq
1
2 define <4 x i32> @t(i32 %x, i32 %y) nounwind {
3 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
4 %tmp2 = insertelement <4 x i32> %tmp1, i32 %y, i32 1
5 ret <4 x i32> %tmp2
6 }