llvm.org GIT mirror: llvm / d6482ac
Committed by Hans Wennborg.

Merging r243361:
------------------------------------------------------------------------
r243361 | spatel | 2015-07-27 17:48:32 -0700 (Mon, 27 Jul 2015) | 17 lines

fix invalid load folding with SSE/AVX FP logical instructions (PR22371)

This is a follow-up to the FIXME that was added with D7474
( http://reviews.llvm.org/rL229531 ).

I thought this load folding bug had been made hard-to-hit, but it turns out
to be very easy when targeting 32-bit x86 and causes a miscompile/crash in
Wine:
https://bugs.winehq.org/show_bug.cgi?id=38826
https://llvm.org/bugs/show_bug.cgi?id=22371#c25

The quick fix is to simply remove the scalar FP logical instructions from
the load folding table in X86InstrInfo, but that causes us to miss load
folds that should be possible when lowering fabs, fneg, fcopysign. So the
majority of this patch is altering those lowerings to use *vector* FP
logical instructions (because that's all x86 gives us anyway). That lets us
do the load folding legally.

Differential Revision: http://reviews.llvm.org/D11477
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_37@243435 91177308-0d34-0410-b5e6-96231b3b80d8

6 changed files with 106 additions and 67 deletions.
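For orientation before the diff: fabs, fneg, and fnabs are pure bit operations on the IEEE-754 representation, which is why the lowering below builds a sign-bit mask constant and a single FP logic node (X86ISD::FAND, FOR, or FXOR). Here is a minimal stand-alone sketch of those bit operations in plain C++ (illustrative only, not LLVM code; the helper names are made up):

  // Sketch only: the bit-level semantics behind X86ISD::FAND / FOR / FXOR
  // when lowering fabs, fnabs, and fneg on an f64 value.
  #include <cstdint>
  #include <cstring>

  static double applyMask(double X, uint64_t Mask, char Op) {
    uint64_t Bits;
    std::memcpy(&Bits, &X, sizeof(Bits)); // reinterpret the f64 bits as i64
    if (Op == '&') Bits &= Mask;          // fabs:  clear the sign bit (FAND)
    if (Op == '|') Bits |= Mask;          // fnabs: set the sign bit   (FOR)
    if (Op == '^') Bits ^= Mask;          // fneg:  flip the sign bit  (FXOR)
    double R;
    std::memcpy(&R, &Bits, sizeof(R));
    return R;
  }

  double myFabs(double X)  { return applyMask(X, ~(1ULL << 63), '&'); }
  double myFneg(double X)  { return applyMask(X,  (1ULL << 63), '^'); }
  double myFnabs(double X) { return applyMask(X,  (1ULL << 63), '|'); }

Because SSE/AVX provide these FP logic operations only as 128-bit (or wider) vector instructions, the patch performs the logic op in a vector type even for scalar inputs, which keeps the 16-byte mask load legally foldable into the instruction.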
   if (User->getOpcode() == ISD::FNEG)
     return Op;
 
-  SDValue Op0 = Op.getOperand(0);
-  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
-
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
-  // Assume scalar op for initialization; update for vector if needed.
-  // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
-  // generate a 16-byte vector constant and logic op even for the scalar case.
-  // Using a 16-byte mask allows folding the load of the mask with
-  // the logic op, so it can save (~4 bytes) on code size.
-  MVT EltVT = VT;
-  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+
   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
   // decide if we should generate a 16-byte constant mask when we only need 4 or
   // 8 bytes for the scalar case.
+
+  MVT LogicVT;
+  MVT EltVT;
+  unsigned NumElts;
+
   if (VT.isVector()) {
+    LogicVT = VT;
     EltVT = VT.getVectorElementType();
     NumElts = VT.getVectorNumElements();
+  } else {
+    // There are no scalar bitwise logical SSE/AVX instructions, so we
+    // generate a 16-byte vector constant and logic op even for the scalar case.
+    // Using a 16-byte mask allows folding the load of the mask with
+    // the logic op, so it can save (~4 bytes) on code size.
+    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+    EltVT = VT;
+    NumElts = (VT == MVT::f64) ? 2 : 4;
   }
 
   unsigned EltBits = EltVT.getSizeInBits();
   ...
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
-  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+  SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, Alignment);
 
-  if (VT.isVector()) {
-    // For a vector, cast operands to a vector type, perform the logic op,
-    // and cast the result back to the original value type.
-    MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-    SDValue MaskCasted = DAG.getBitcast(VecVT, Mask);
-    SDValue Operand = IsFNABS ? DAG.getBitcast(VecVT, Op0.getOperand(0))
-                              : DAG.getBitcast(VecVT, Op0);
-    unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
-    return DAG.getBitcast(VT,
-                          DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
-  }
-
-  // If not vector, then scalar.
-  unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
+  SDValue Op0 = Op.getOperand(0);
+  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
+  unsigned LogicOp =
+      IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
-  return DAG.getNode(BitOp, dl, VT, Operand, Mask);
+
+  if (VT.isVector())
+    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+
+  // For the scalar case extend to a 128-bit vector, perform the logic op,
+  // and extract the scalar result back out.
+  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
+  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
+                     DAG.getIntPtrConstant(0, dl));
 }
 
 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   Constant *C = ConstantVector::get(CV);
   auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
   SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
-  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+
+  // Perform all logic operations as 16-byte vectors because there are no
+  // scalar FP logic instructions in SSE. This allows load folding of the
+  // constants into the logic instructions.
+  MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+  SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
                               MachinePointerInfo::getConstantPool(),
                               false, false, false, 16);
-  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+  Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
+  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
 
   // Next, clear the sign bit from the first operand (magnitude).
   // If it's a constant, we can clear it here.
   ...
     APFloat APF = Op0CN->getValueAPF();
     // If the magnitude is a positive zero, the sign bit alone is enough.
     if (APF.isPosZero())
-      return SignBit;
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
+                         DAG.getIntPtrConstant(0, dl));
     APF.clearSign();
     CV[0] = ConstantFP::get(*Context, APF);
   } else {
   ...
   }
   C = ConstantVector::get(CV);
   CPIdx = DAG.getConstantPool(C, PtrVT, 16);
-  SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+  SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
                             MachinePointerInfo::getConstantPool(),
                             false, false, false, 16);
   // If the magnitude operand wasn't a constant, we need to AND out the sign.
-  if (!isa<ConstantFPSDNode>(Op0))
-    Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
-
+  if (!isa<ConstantFPSDNode>(Op0)) {
+    Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
+    Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
+  }
   // OR the magnitude value with the sign bit.
-  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+  Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
+                     DAG.getIntPtrConstant(0, dl));
 }
 
 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
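LowerFCOPYSIGN follows the same scheme: AND the sign operand with a sign-bit mask, AND the magnitude operand with the inverted mask, then OR the two partial results, now all done in a 128-bit vector type so the constant-pool masks stay foldable. A small stand-alone sketch of the bit-level computation in plain C++ (illustrative only, not the LLVM code; it ignores the mixed f32/f64 operand case that SrcVT handles and the constant-magnitude shortcut above):

  // Sketch only: the bit-level result LowerFCOPYSIGN builds with FAND/FOR.
  #include <cstdint>
  #include <cstring>

  double myCopysign(double Mag, double Sgn) {
    uint64_t MagBits, SgnBits;
    std::memcpy(&MagBits, &Mag, sizeof(MagBits));
    std::memcpy(&SgnBits, &Sgn, sizeof(SgnBits));
    const uint64_t SignMask = 1ULL << 63;
    uint64_t SignBit    = SgnBits & SignMask;   // FAND with the sign-bit mask
    uint64_t Magnitude  = MagBits & ~SignMask;  // FAND with the inverted mask
    uint64_t ResultBits = Magnitude | SignBit;  // FOR of the partial results
    double Result;
    std::memcpy(&Result, &ResultBits, sizeof(Result));
    return Result;
  }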
   { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
   { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
 
-  // FIXME: We should not be folding Fs* scalar loads into vector
-  // instructions because the vector instructions require vector-sized
-  // loads. Lowering should create vector-sized instructions (the Fv*
-  // variants below) to allow load folding.
-  { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 },
-  { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 },
-  { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 },
-  { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 },
-  { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 },
-  { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 },
-  { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 },
-  { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 },
+  // Do not fold Fs* scalar logical op loads because there are no scalar
+  // load variants for these instructions. When folded, the load is required
+  // to be 128-bits, so the load size would not match.
 
   { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 },
   { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 },
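This table change is the other half of the fix. An entry here permits the peephole to rewrite the register form on the left into the memory form on the right, and the memory forms of these logic instructions always read a full 128 bits. A hypothetical sketch of the width check that makes folding illegal for the Fs* scalar aliases but fine for the Fv* vector forms (illustrative only, not the real X86InstrInfo API):

  // Hypothetical sketch, not the real X86InstrInfo interface: a fold is only
  // legal when the load being folded is exactly as wide as the memory operand
  // the folded instruction will read.
  struct FoldTableEntry {
    const char *RegForm;     // e.g. "FsANDPSrr" or "FvANDPSrr"
    const char *MemForm;     // e.g. "FsANDPSrm" or "FvANDPSrm"
    unsigned MemOperandBits; // width the memory form reads, 128 here
  };

  static bool isLegalToFold(const FoldTableEntry &E, unsigned LoadSizeInBits) {
    // Folding a narrower load would widen the memory access (PR22371): the
    // new 16-byte read can cross into unmapped memory or pick up junk bits.
    return LoadSizeInBits == E.MemOperandBits;
  }

  // FsANDPSrr consumes a scalar f32 that came from a 4-byte load, so
  // isLegalToFold({"FsANDPSrr", "FsANDPSrm", 128}, 32) is false, while the
  // Fv* forms are fed by genuine 128-bit vector loads and fold safely:
  // isLegalToFold({"FvANDPSrr", "FvANDPSrm", 128}, 128) is true.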
     defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                 VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>,
                 PD, VEX_4V;
+
+    defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+                VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>,
+                PS, VEX_4V, VEX_L;
+
+    defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+                VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>,
+                PD, VEX_4V, VEX_L;
   }
 
   let Constraints = "$src1 = $dst" in {
 ; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
 ; PR2656
-
-; CHECK: {{xorps.*sp}}
-; CHECK-NOT: {{xorps.*sp}}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin9.4.0"
 %struct.anon = type <{ float, float }>
 @.str = internal constant [17 x i8] c"pt: %.0f, %.0f\0A\00\00"		; <[17 x i8]*> [#uses=1]
 
+; We can not fold either stack load into an 'xor' instruction because that
+; would change what should be a 4-byte load into a 16-byte load.
+; We can fold the 16-byte constant load into either 'xor' instruction,
+; but we do not. It has more than one use, so it gets loaded into a register.
+
 define void @foo(%struct.anon* byval %p) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-NEXT: xorps %xmm2, %xmm0
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm0
+; CHECK-NEXT: xorps %xmm2, %xmm1
 entry:
   %tmp = getelementptr %struct.anon, %struct.anon* %p, i32 0, i32 0		; <float*> [#uses=1]
   %tmp1 = load float, float* %tmp		; <float> [#uses=1]
   ...
 }
 
 declare i32 @printf(...)
+
+; We can not fold the load from the stack into the 'and' instruction because
+; that changes an 8-byte load into a 16-byte load (illegal memory access).
+; We can fold the load of the constant because it is a 16-byte vector constant.
+
+define double @PR22371(double %x) {
+; CHECK-LABEL: PR22371:
+; CHECK: movsd 16(%esp), %xmm0
+; CHECK-NEXT: andpd LCPI1_0, %xmm0
+; CHECK-NEXT: movlpd %xmm0, (%esp)
+  %call = tail call double @fabs(double %x) #0
+  ret double %call
+}
+
+declare double @fabs(double) #0
+attributes #0 = { readnone }
+
 
 define float @int1(float %a, float %b) {
 ; X32-LABEL: @int1
-; X32: movss 12(%esp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movss 8(%esp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: andps .LCPI2_0, %xmm1
-; X32-NEXT: andps .LCPI2_1, %xmm0
-; X32-NEXT: orps %xmm1, %xmm0
-; X32-NEXT: movss %xmm0, (%esp)
+; X32: movss 8(%esp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: andps .LCPI2_0, %xmm0
+; X32-NEXT: movss 12(%esp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: andps .LCPI2_1, %xmm1
+; X32-NEXT: orps %xmm0, %xmm1
+; X32-NEXT: movss %xmm1, (%esp)
 ; X32-NEXT: flds (%esp)
 ; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ...
 
 define double @int2(double %a, float %b, float %c) {
 ; X32-LABEL: @int2
-; X32: movsd 8(%ebp), %xmm0 {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: movss 16(%ebp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: addss 20(%ebp), %xmm1
-; X32-NEXT: andpd .LCPI3_0, %xmm0
-; X32-NEXT: cvtss2sd %xmm1, %xmm1
-; X32-NEXT: andpd .LCPI3_1, %xmm1
-; X32-NEXT: orpd %xmm0, %xmm1
-; X32-NEXT: movsd %xmm1, (%esp)
+; X32: movss 16(%ebp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: addss 20(%ebp), %xmm0
+; X32-NEXT: movsd 8(%ebp), %xmm1 {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: andpd .LCPI3_0, %xmm1
+; X32-NEXT: cvtss2sd %xmm0, %xmm0
+; X32-NEXT: andpd .LCPI3_1, %xmm0
+; X32-NEXT: orpd %xmm1, %xmm0
+; X32-NEXT: movlpd %xmm0, (%esp)
 ; X32-NEXT: fldl (%esp)
 ; X32-NEXT: movl %ebp, %esp
 ; X32-NEXT: popl %ebp
 ...
 ;
 ; X64-LABEL: @int2
 ; X64: addss %xmm2, %xmm1
-; X64-NEXT: andpd .LCPI3_0(%rip), %xmm0
 ; X64-NEXT: cvtss2sd %xmm1, %xmm1
-; X64-NEXT: andpd .LCPI3_1(%rip), %xmm1
+; X64-NEXT: andpd .LCPI3_0(%rip), %xmm1
+; X64-NEXT: andpd .LCPI3_1(%rip), %xmm0
 ; X64-NEXT: orpd %xmm1, %xmm0
 ; X64-NEXT: retq
   %tmp1 = fadd float %b, %c
 define <2 x double> @fabs_v2f64(<2 x double> %p)
 {
 ; CHECK-LABEL: fabs_v2f64
-; CHECK: vandps
+; CHECK: vandpd
   %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
   ret <2 x double> %t
 }
 ...
 define <4 x double> @fabs_v4f64(<4 x double> %p)
 {
 ; CHECK-LABEL: fabs_v4f64
-; CHECK: vandps
+; CHECK: vandpd
   %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
   ret <4 x double> %t
 }