llvm.org GIT mirror llvm / 50e64cf
Modify the code that lowers shuffles to blends so that it uses vblendXX instead of blendvXX. blendv takes its selection mask in a register, while vblend takes it as an immediate. On Sandy Bridge both forms have the same latency and execute on the same execution ports. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154396 91177308-0d34-0410-b5e6-96231b3b80d8 Nadav Rotem 8 years ago
6 changed file(s) with 93 addition(s) and 41 deletion(s). Raw diff Collapse all Expand all
53905390 SDValue V1 = SVOp->getOperand(0);
53915391 SDValue V2 = SVOp->getOperand(1);
53925392 DebugLoc dl = SVOp->getDebugLoc();
5393 LLVMContext *Context = DAG.getContext();
53945393 EVT VT = Op.getValueType();
53955394 EVT InVT = V1.getValueType();
53965395 EVT EltVT = VT.getVectorElementType();
5397 unsigned EltSize = EltVT.getSizeInBits();
53985396 int MaskSize = VT.getVectorNumElements();
53995397 int InSize = InVT.getVectorNumElements();
54005398
5401 // TODO: At the moment we only use AVX blends. We could also use SSE4 blends.
5402 if (!Subtarget->hasAVX())
5399 if (!Subtarget->hasSSE41())
54035400 return SDValue();
54045401
54055402 if (MaskSize != InSize)
54065403 return SDValue();
54075404
5408 SmallVector<Constant*,16> MaskVals;
5409 ConstantInt *Zero = ConstantInt::get(*Context, APInt(EltSize, 0));
5410 ConstantInt *NegOne = ConstantInt::get(*Context, APInt(EltSize, -1));
5405 int ISDNo = 0;
5406 MVT OpTy;
5407
5408 switch (VT.getSimpleVT().SimpleTy) {
5409 default: return SDValue();
5410 case MVT::v8i16:
5411 ISDNo = X86ISD::BLENDPW;
5412 OpTy = MVT::v8i16;
5413 break;
5414 case MVT::v4i32:
5415 case MVT::v4f32:
5416 ISDNo = X86ISD::BLENDPS;
5417 OpTy = MVT::v4f32;
5418 break;
5419 case MVT::v2i64:
5420 case MVT::v2f64:
5421 ISDNo = X86ISD::BLENDPD;
5422 OpTy = MVT::v2f64;
5423 break;
5424 case MVT::v8i32:
5425 case MVT::v8f32:
5426 if (!Subtarget->hasAVX())
5427 return SDValue();
5428 ISDNo = X86ISD::BLENDPS;
5429 OpTy = MVT::v8f32;
5430 break;
5431 case MVT::v4i64:
5432 case MVT::v4f64:
5433 if (!Subtarget->hasAVX())
5434 return SDValue();
5435 ISDNo = X86ISD::BLENDPD;
5436 OpTy = MVT::v4f64;
5437 break;
5438 case MVT::v16i16:
5439 if (!Subtarget->hasAVX2())
5440 return SDValue();
5441 ISDNo = X86ISD::BLENDPW;
5442 OpTy = MVT::v16i16;
5443 break;
5444 }
5445 assert(ISDNo && "Invalid Op Number");
5446
5447 unsigned MaskVals = 0;
54115448
54125449 for (int i = 0; i < MaskSize; ++i) {
54135450 int EltIdx = SVOp->getMaskElt(i);
54145451 if (EltIdx == i || EltIdx == -1)
5415 MaskVals.push_back(NegOne);
5452 MaskVals |= (1<<i);
54165453 else if (EltIdx == (i + MaskSize))
5417 MaskVals.push_back(Zero);
5454 continue; // Bit is set to zero;
54185455 else return SDValue();
54195456 }
54205457
5421 Constant *MaskC = ConstantVector::get(MaskVals);
5422 EVT MaskTy = EVT::getEVT(MaskC->getType());
5423 assert(MaskTy.getSizeInBits() == VT.getSizeInBits() && "Invalid mask size");
5424 SDValue MaskIdx = DAG.getConstantPool(MaskC, PtrTy);
5425 unsigned Alignment = cast<ConstantPoolSDNode>(MaskIdx)->getAlignment();
5426 SDValue Mask = DAG.getLoad(MaskTy, dl, DAG.getEntryNode(), MaskIdx,
5427 MachinePointerInfo::getConstantPool(),
5428 false, false, false, Alignment);
5429
5430 if (Subtarget->hasAVX2() && MaskTy == MVT::v32i8)
5431 return DAG.getNode(ISD::VSELECT, dl, VT, Mask, V1, V2);
5432
5433 if (Subtarget->hasAVX()) {
5434 switch (MaskTy.getSimpleVT().SimpleTy) {
5435 default: return SDValue();
5436 case MVT::v16i8:
5437 case MVT::v4i32:
5438 case MVT::v2i64:
5439 case MVT::v8i32:
5440 case MVT::v4i64:
5441 return DAG.getNode(ISD::VSELECT, dl, VT, Mask, V1, V2);
5442 }
5443 }
5444
5445 return SDValue();
5458 V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1);
5459 V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2);
5460 SDValue Ret = DAG.getNode(ISDNo, dl, OpTy, V1, V2,
5461 DAG.getConstant(MaskVals, MVT::i32));
5462 return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
54465463 }
54475464
54485465 // v8i16 shuffles - Prefer shuffles in the following order:
1104911066 case X86ISD::ANDNP: return "X86ISD::ANDNP";
1105011067 case X86ISD::PSIGN: return "X86ISD::PSIGN";
1105111068 case X86ISD::BLENDV: return "X86ISD::BLENDV";
11069 case X86ISD::BLENDPW: return "X86ISD::BLENDPW";
11070 case X86ISD::BLENDPS: return "X86ISD::BLENDPS";
11071 case X86ISD::BLENDPD: return "X86ISD::BLENDPD";
1105211072 case X86ISD::HADD: return "X86ISD::HADD";
1105311073 case X86ISD::HSUB: return "X86ISD::HSUB";
1105411074 case X86ISD::FHADD: return "X86ISD::FHADD";
174174 /// PSIGN - Copy integer sign.
175175 PSIGN,
176176
177 /// BLEND family of opcodes
177 /// BLENDV - Blend where the selector is an XMM.
178178 BLENDV,
179
180 /// BLENDxx - Blend where the selector is an immediate.
181 BLENDPW,
182 BLENDPS,
183 BLENDPD,
179184
180185 /// HADD - Integer horizontal add.
181186 HADD,
125125 SDTCisSameAs<0,2>, SDTCisInt<3>]>;
126126
127127 def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
128 def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
129 SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
128130
129131 def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>;
130132
156158 def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
157159
158160 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
161
162 def X86Blendpw : SDNode<"X86ISD::BLENDPW", SDTBlend>;
163 def X86Blendps : SDNode<"X86ISD::BLENDPS", SDTBlend>;
164 def X86Blendpd : SDNode<"X86ISD::BLENDPD", SDTBlend>;
159165
160166 //===----------------------------------------------------------------------===//
161167 // SSE Complex Patterns
67346734 def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
67356735 (v4f64 VR256:$src2))),
67366736 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6737
6738 def : Pat<(v8f32 (X86Blendps (v8f32 VR256:$src1), (v8f32 VR256:$src2),
6739 (imm:$mask))),
6740 (VBLENDPSYrri VR256:$src2, VR256:$src1, imm:$mask)>;
6741 def : Pat<(v4f64 (X86Blendpd (v4f64 VR256:$src1), (v4f64 VR256:$src2),
6742 (imm:$mask))),
6743 (VBLENDPDYrri VR256:$src2, VR256:$src1, imm:$mask)>;
67376744 }
67386745
67396746 let Predicates = [HasAVX2] in {
67406747 def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
67416748 (v32i8 VR256:$src2))),
67426749 (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6750 def : Pat<(v16i16 (X86Blendpw (v16i16 VR256:$src1), (v16i16 VR256:$src2),
6751 (imm:$mask))),
6752 (VPBLENDWYrri VR256:$src2, VR256:$src1, imm:$mask)>;
67436753 }
67446754
67456755 /// SS41I_ternary_int - SSE 4.1 ternary operator
67886798 def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
67896799 (v2f64 VR128:$src2))),
67906800 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6801
6802 def : Pat<(v8i16 (X86Blendpw (v8i16 VR128:$src1), (v8i16 VR128:$src2),
6803 (imm:$mask))),
6804 (VPBLENDWrri VR128:$src2, VR128:$src1, imm:$mask)>;
6805 def : Pat<(v4f32 (X86Blendps (v4f32 VR128:$src1), (v4f32 VR128:$src2),
6806 (imm:$mask))),
6807 (VBLENDPSrri VR128:$src2, VR128:$src1, imm:$mask)>;
6808 def : Pat<(v2f64 (X86Blendpd (v2f64 VR128:$src1), (v2f64 VR128:$src2),
6809 (imm:$mask))),
6810 (VBLENDPDrri VR128:$src2, VR128:$src1, imm:$mask)>;
6811
67916812 }
67926813
67936814 let Predicates = [HasAVX] in
163163 }
164164
165165 ; CHECK: blend1
166 ; CHECK: vblendvps
166 ; CHECK: vblendps
167167 ; CHECK: ret
168168 define <4 x i32> @blend1(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
169169 %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
171171 }
172172
173173 ; CHECK: blend2
174 ; CHECK: vblendvps
174 ; CHECK: vblendps
175175 ; CHECK: ret
176176 define <4 x i32> @blend2(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
177177 %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
179179 }
180180
181181 ; CHECK: blend2a
182 ; CHECK: vblendvps
182 ; CHECK: vblendps
183183 ; CHECK: ret
184184 define <4 x float> @blend2a(<4 x float> %a, <4 x float> %b) nounwind alwaysinline {
185185 %t = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
187187 }
188188
189189 ; CHECK: blend3
190 ; CHECK-NOT: vblendvps
190 ; CHECK-NOT: vblendps
191191 ; CHECK: ret
192192 define <4 x i32> @blend3(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
193193 %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
195195 }
196196
197197 ; CHECK: blend4
198 ; CHECK: vblendvpd
198 ; CHECK: vblendpd
199199 ; CHECK: ret
200200 define <4 x i64> @blend4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
201201 %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32>
None ; RUN: llc < %s -o /dev/null -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 3
0 ; RUN: llc < %s -o /dev/null -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 2
11
22 define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) nounwind {
33 entry: