llvm.org GIT mirror: llvm @ cc6487e
[X86][SSE] Disable shouldFoldConstantShiftPairToMask for btver1/btver2 targets (PR40758)

As detailed on PR40758, Bobcat/Jaguar can perform vector immediate shifts on the same pipes as vector ANDs, with the same latency - so it doesn't make sense to replace a shl+lshr pair with a shift+and pair: the AND needs an additional mask constant, with the extra constant-pool, load and register-pressure costs that implies.

Differential Revision: https://reviews.llvm.org/D61068

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359293 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim (1 year, 5 months ago)
5 changed files with 40 additions and 8 deletions.
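The trade-off described in the commit message can be checked with plain scalar arithmetic. The following standalone C++ sketch (illustrative only, not part of the patch; the function names are invented) models, per 32-bit lane, the two lowerings compared in the v4i32 test at the end of this diff: a lshr-by-2 followed by a shl-by-5 versus a single shl-by-3 plus an AND with a mask, the mask being the extra constant the commit message is worried about.

// Illustrative sketch only - not part of the patch. Models, per 32-bit lane,
// the two instruction sequences compared in the shl_srl_v4i32 test below.
#include <cassert>
#include <cstdint>

// Shift pair, as kept on btver1/btver2 after this change (psrld $2; pslld $5).
static uint32_t shiftPair(uint32_t X) { return (X >> 2) << 5; }

// Generic fold: one shift plus an AND (pslld $3; pand MASK); the mask
// (~0u << 5 here) is the extra constant that has to be loaded for vectors.
static uint32_t shiftAndMask(uint32_t X) { return (X << 3) & (~0u << 5); }

int main() {
  const uint32_t Vals[] = {0u, 1u, 0x12345678u, 0xFFFFFFFFu};
  for (uint32_t X : Vals)
    assert(shiftPair(X) == shiftAndMask(X)); // both lowerings agree
  return 0;
}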
   // (and (srl x, (sub c1, c2), MASK)
   // Only fold this if the inner shift has no other uses -- if it does, folding
   // this will increase the total number of instructions.
+  // TODO - drop hasOneUse requirement if c1 == c2?
+  // TODO - support non-uniform vector shift amounts.
   if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&
       TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
     if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
   }

   // fold (srl (shl x, c), c) -> (and x, cst2)
+  // TODO - (srl (shl x, c1), c2).
   if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
       isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
     SDLoc DL(N);
423423 "Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
424424 "normal vector instructions with shuffles", [FeatureSSE3]>;
425425
426 def FeatureFastVectorShiftMasks
427 : SubtargetFeature<
428 "fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
429 "Prefer a left/right vector logical shift pair over a shift+and pair">;
430
426431 // Merge branches using three-way conditional code.
427432 def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
428433 "ThreewayBranchProfitable", "true",
                                        FeaturePOPCNT,
                                        FeatureSlowSHLD,
                                        FeatureLAHFSAHF,
-                                       FeatureFast15ByteNOP];
+                                       FeatureFast15ByteNOP,
+                                       FeatureFastVectorShiftMasks];
   list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures;

   // Jaguar

 bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
     const SDNode *N, CombineLevel Level) const {
-  // TODO - some targets prefer immediate vector shifts to shift+mask.
+  assert(((N->getOpcode() == ISD::SHL &&
+           N->getOperand(0).getOpcode() == ISD::SRL) ||
+          (N->getOpcode() == ISD::SRL &&
+           N->getOperand(0).getOpcode() == ISD::SHL)) &&
+         "Expected shift-shift mask");
+
+  if (Subtarget.hasFastVectorShiftMasks() && N->getValueType(0).isVector()) {
+    // Only fold if the shift values are equal - so it folds to AND.
+    // TODO - we should fold if either is non-uniform but we don't do the
+    // fold for non-splats yet.
+    return N->getOperand(1) == N->getOperand(0).getOperand(1);
+  }
   return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
 }

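The new hook only lets the combine through when the two constant shift amounts match: in that case the shift pair folds to a single AND with no shift left over, so the result is still one instruction even on Bobcat/Jaguar. A small scalar C++ sketch of that case for the shl-of-srl direction (illustrative only, not part of the patch, names invented):

// Illustrative sketch only. With equal amounts, (shl (srl x, c), c) is just
// an AND that clears the low c bits - a single instruction either way.
#include <cassert>
#include <cstdint>

static uint32_t shlOfSrl(uint32_t X, unsigned C) { return (X >> C) << C; }
static uint32_t clearLow(uint32_t X, unsigned C) { return X & (~0u << C); }

int main() {
  for (unsigned C = 0; C != 32; ++C)
    assert(shlOfSrl(0xCAFEBABEu, C) == clearLow(0xCAFEBABEu, C));
  return 0;
}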

   /// Try harder to combine to horizontal vector ops if they are fast.
   bool HasFastHorizontalOps = false;
+
+  /// Prefer a left/right vector logical shift pair over a shift+and pair.
+  bool HasFastVectorShiftMasks = false;

   /// Use a retpoline thunk rather than indirect calls to block speculative
   /// execution.
   bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
   bool hasFastBEXTR() const { return HasFastBEXTR; }
   bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
+  bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
   bool hasMacroFusion() const { return HasMacroFusion; }
   bool hasBranchFusion() const { return HasBranchFusion; }
   bool hasERMSB() const { return HasERMSB; }
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,MASK
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2,+fast-vector-shift-masks | FileCheck %s --check-prefixes=CHECK,SHIFT
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver1 | FileCheck %s --check-prefixes=CHECK,SHIFT

 ; SSE2 Logical Shift Left

 }

 define <4 x i32> @shl_srl_v4i32(<4 x i32> %x) nounwind {
-; CHECK-LABEL: shl_srl_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pslld $3, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    retq
+; MASK-LABEL: shl_srl_v4i32:
+; MASK:       # %bb.0:
+; MASK-NEXT:    pslld $3, %xmm0
+; MASK-NEXT:    pand {{.*}}(%rip), %xmm0
+; MASK-NEXT:    retq
+;
+; SHIFT-LABEL: shl_srl_v4i32:
+; SHIFT:       # %bb.0:
+; SHIFT-NEXT:    psrld $2, %xmm0
+; SHIFT-NEXT:    pslld $5, %xmm0
+; SHIFT-NEXT:    retq
   %shl0 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
   %shl1 = shl <4 x i32> %shl0, <i32 5, i32 5, i32 5, i32 5>
   ret <4 x i32> %shl1