llvm (llvm.org GIT mirror), commit 544843c
Author: Sanjay Patel

prevent folding a scalar FP load into a packed logical FP instruction (PR22371)

Change the memory operands in sse12_fp_packed_scalar_logical_alias from scalars to vectors. That's what the hardware packed logical FP instructions define: 128-bit memory operands. There are no scalar versions of these instructions... because this is x86.

Generating the wrong code (folding a scalar load into a 128-bit load) is still possible using the peephole optimization pass and the load folding tables. We won't completely solve this bug until we either fix the lowering in fabs/fneg/fcopysign and any other places where scalar FP logic is created, or fix the load folding in foldMemoryOperandImpl() to make sure it isn't changing the size of the load.

Differential Revision: http://reviews.llvm.org/D7474

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229531 91177308-0d34-0410-b5e6-96231b3b80d8

5 changed files with 120 additions and 21 deletions.
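The longer-term fix suggested above is to make the load folding in foldMemoryOperandImpl() verify that a fold never changes the size of the load. The snippet below is only a sketch of that idea, not code from this patch: the function name, the FoldedMemBytes parameter, and the assumption that MachineMemOperand::getSize() returns a plain byte count (as in the 2015-era API) are illustrative.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineMemOperand.h"

using namespace llvm;

// Sketch: return true if folding LoadMI into an instruction whose memory
// operand is FoldedMemBytes wide would widen the access -- e.g. a 4-byte
// scalar load folded into the 16-byte read of a packed ANDPS.
static bool foldWouldWidenLoad(const MachineInstr &LoadMI,
                               uint64_t FoldedMemBytes) {
  for (const MachineMemOperand *MMO : LoadMI.memoperands())
    if (MMO->getSize() < FoldedMemBytes)
      return true; // The fold would read bytes the original load never touched.
  return false;
}

// A caller inside the folding logic could bail out before consulting the
// fold tables, e.g.:
//   if (foldWouldWidenLoad(LoadMI, RegClassSizeInBytes))
//     return nullptr; // refuse the fold instead of widening the load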
 def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
 def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;

+// These are needed to match a scalar load that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so it must be converted
+// from a vector to a scalar.
+def loadf32_128 : PatFrag<(ops node:$ptr),
+  (f32 (vector_extract (loadv4f32 node:$ptr), (iPTR 0)))>;
+def loadf64_128 : PatFrag<(ops node:$ptr),
+  (f64 (vector_extract (loadv2f64 node:$ptr), (iPTR 0)))>;
+
 // Like 'store', but always requires 128-bit vector alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                            (store node:$val, node:$ptr), [{

[...]

 def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
 def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;

+// These are needed to match a scalar memop that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so it must be converted
+// from a vector to a scalar.
+def memopfsf32_128 : PatFrag<(ops node:$ptr),
+  (f32 (vector_extract (memopv4f32 node:$ptr), (iPTR 0)))>;
+def memopfsf64_128 : PatFrag<(ops node:$ptr),
+  (f64 (vector_extract (memopv2f64 node:$ptr), (iPTR 0)))>;
+
+
 // SSSE3 uses MMX registers for some instructions. They aren't aligned on a
 // 16-byte boundary.
 // FIXME: 8 byte alignment for mmx reads is not required
[...]

 { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 },
 { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
 { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
+
+// FIXME: We should not be folding Fs* scalar loads into vector
+// instructions because the vector instructions require vector-sized
+// loads. Lowering should create vector-sized instructions (the Fv*
+// variants below) to allow load folding.
 { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 },
 { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 },
 { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 },

[...]

 { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 },
 { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 },
 { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 },
+
+{ X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 },
+{ X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 },
+{ X86::FvANDPDrr, X86::FvANDPDrm, TB_ALIGN_16 },
+{ X86::FvANDPSrr, X86::FvANDPSrm, TB_ALIGN_16 },
+{ X86::FvORPDrr, X86::FvORPDrm, TB_ALIGN_16 },
+{ X86::FvORPSrr, X86::FvORPSrm, TB_ALIGN_16 },
+{ X86::FvXORPDrr, X86::FvXORPDrm, TB_ALIGN_16 },
+{ X86::FvXORPSrr, X86::FvXORPSrm, TB_ALIGN_16 },
 { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
 { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
 { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
[...]

 { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 },
 { X86::VDPPDrri, X86::VDPPDrmi, 0 },
 { X86::VDPPSrri, X86::VDPPSrmi, 0 },
-{ X86::VFsANDNPDrr, X86::VFsANDNPDrm, 0 },
-{ X86::VFsANDNPSrr, X86::VFsANDNPSrm, 0 },
-{ X86::VFsANDPDrr, X86::VFsANDPDrm, 0 },
-{ X86::VFsANDPSrr, X86::VFsANDPSrm, 0 },
-{ X86::VFsORPDrr, X86::VFsORPDrm, 0 },
-{ X86::VFsORPSrr, X86::VFsORPSrm, 0 },
-{ X86::VFsXORPDrr, X86::VFsXORPDrm, 0 },
-{ X86::VFsXORPSrr, X86::VFsXORPSrm, 0 },
+// Do not fold VFs* loads because there are no scalar load variants for
+// these instructions. When folded, the load is required to be 128-bits, so
+// the load size would not match.
+{ X86::VFvANDNPDrr, X86::VFvANDNPDrm, 0 },
+{ X86::VFvANDNPSrr, X86::VFvANDNPSrm, 0 },
+{ X86::VFvANDPDrr, X86::VFvANDPDrm, 0 },
+{ X86::VFvANDPSrr, X86::VFvANDPSrm, 0 },
+{ X86::VFvORPDrr, X86::VFvORPDrm, 0 },
+{ X86::VFvORPSrr, X86::VFvORPSrm, 0 },
+{ X86::VFvXORPDrr, X86::VFvXORPDrm, 0 },
+{ X86::VFvXORPSrr, X86::VFvXORPSrm, 0 },
 { X86::VHADDPDrr, X86::VHADDPDrm, 0 },
 { X86::VHADDPSrr, X86::VHADDPSrm, 0 },
 { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
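For context on the tables edited above (this explanation is not part of the patch): each entry pairs the register-register form of an instruction with its register-memory form, plus flags such as a required 16-byte alignment, and the peephole/load-folding code only performs a fold when a matching entry exists. That is why deleting the VFs* entries is enough to stop the bad fold for AVX. A toy model of such a lookup, with made-up names rather than the real X86InstrInfo types, might look like:

#include <cstddef>
#include <cstdint>
#include <optional>

// Toy model of a memory-folding table entry; the real tables use LLVM's
// opcode enums and flag encodings (e.g. TB_ALIGN_16).
struct FoldTableEntry {
  unsigned RegOpc;  // register-register opcode, e.g. the *rr form
  unsigned MemOpc;  // register-memory opcode, e.g. the *rm form
  uint16_t Flags;   // fold constraints, e.g. a 16-byte alignment requirement
};

constexpr uint16_t kAlign16 = 1; // stand-in for a TB_ALIGN_16-style flag

// Return the memory-form opcode if a fold is registered for RegOpc and the
// operand's alignment satisfies the entry's requirement; otherwise decline.
std::optional<unsigned> lookupFold(const FoldTableEntry *Table, std::size_t N,
                                   unsigned RegOpc, unsigned AlignBytes) {
  for (std::size_t I = 0; I != N; ++I) {
    if (Table[I].RegOpc != RegOpc)
      continue;
    if ((Table[I].Flags & kAlign16) && AlignBytes < 16)
      return std::nullopt; // alignment requirement not met
    return Table[I].MemOpc; // fold is allowed: use the memory form
  }
  return std::nullopt; // no entry: the load stays a separate instruction
}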
[...]

 multiclass sse12_fp_packed_scalar_logical_alias<
     bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
   defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
-               FR32, f32, f128mem, loadf32, SSEPackedSingle, itins, 0>,
-               PS, VEX_4V;
+               FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
+               PS, VEX_4V;

   defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
-               FR64, f64, f128mem, loadf64, SSEPackedDouble, itins, 0>,
-               PD, VEX_4V;
+               FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
+               PD, VEX_4V;

   let Constraints = "$src1 = $dst" in {
     defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
-               f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
-               PS;
+               f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;

     defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
-               f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
-               PD;
+               f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
   }
 }

[...]

+; RUN: llc < %s -mcpu=x86-64 -mattr=sse2,sse-unaligned-mem | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mcpu=x86-64 -mattr=avx | FileCheck %s --check-prefix=AVX
+
+; Although we have the ability to fold an unaligned load with AVX
+; and under special conditions with some SSE implementations, we
+; can not fold the load under any circumstances in these test
+; cases because they are not 16-byte loads. The load must be
+; executed as a scalar ('movs*') with a zero extension to
+; 128-bits and then used in the packed logical ('andp*') op.
+; PR22371 - http://llvm.org/bugs/show_bug.cgi?id=22371
+
+define double @load_double_no_fold(double %x, double %y) {
+; SSE2-LABEL: load_double_no_fold:
+; SSE2:       ## BB#0:
+; SSE2-NEXT:    cmplesd %xmm0, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    andpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: load_double_no_fold:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vcmplesd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %cmp = fcmp oge double %x, %y
+  %zext = zext i1 %cmp to i32
+  %conv = sitofp i32 %zext to double
+  ret double %conv
+}
+
+define float @load_float_no_fold(float %x, float %y) {
+; SSE2-LABEL: load_float_no_fold:
+; SSE2:       ## BB#0:
+; SSE2-NEXT:    cmpless %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: load_float_no_fold:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vcmpless %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %cmp = fcmp oge float %x, %y
+  %zext = zext i1 %cmp to i32
+  %conv = sitofp i32 %zext to float
+  ret float %conv
+}
+
[...]

 ; RUN: llc < %s -relocation-model=static -mcpu=yonah | FileCheck %s

-; The double argument is at 4(esp) which is 16-byte aligned, allowing us to
-; fold the load into the andpd.
+; The double argument is at 4(esp) which is 16-byte aligned, but we
+; are required to read in extra bytes of memory in order to fold the
+; load. Bad Things may happen when reading/processing undefined bytes,
+; so don't fold the load.
+; PR22371 / http://reviews.llvm.org/D7474

 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin8"

[...]

   %tmp = getelementptr { double, double }* %z, i32 0, i32 0   ; <double*> [#uses=1]
   %tmp1 = load volatile double* %tmp, align 8                 ; <double> [#uses=1]
   %tmp2 = tail call double @fabs( double %tmp1 ) readnone     ; <double> [#uses=1]
-; CHECK: andpd{{.*}}4(%esp), %xmm
   %tmp6 = fadd double %tmp4, %tmp2                            ; <double> [#uses=1]
   store volatile double %tmp6, double* %P, align 8
   ret void
+
+; CHECK-LABEL: test:
+; CHECK: movsd {{.*}}G, %xmm{{.*}}
+; CHECK: andpd %xmm{{.*}}, %xmm{{.*}}
+; CHECK: movsd 4(%esp), %xmm{{.*}}
+; CHECK: andpd %xmm{{.*}}, %xmm{{.*}}
+
+
 }

 define void @test2() alignstack(16) nounwind {
 entry:
-; CHECK: andl{{.*}}$-16, %esp
+; CHECK-LABEL: test2:
+; CHECK: andl{{.*}}$-16, %esp
   ret void
 }

 ; Use a call to force a spill.
 define <2 x double> @test3(<2 x double> %x, <2 x double> %y) alignstack(32) nounwind {
 entry:
-; CHECK: andl{{.*}}$-32, %esp
+; CHECK-LABEL: test3:
+; CHECK: andl{{.*}}$-32, %esp
   call void @test2()
   %A = fmul <2 x double> %x, %y
   ret <2 x double> %A