llvm.org GIT mirror llvm / bfa4a37
[X86] Add missing memory variants to AVX false dependency breaking
Adds missing memory instruction variants to AVX false dependency breaking handling. (SSE was handled in r224246.)
Differential Revision: http://reviews.llvm.org/D6780
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224900 91177308-0d34-0410-b5e6-96231b3b80d8
Michael Kuperstein 5 years ago
4 changed file(s) with 227 addition(s) and 192 deletion(s). Raw diff Collapse all Expand all
45534553 static bool hasUndefRegUpdate(unsigned Opcode) {
45544554 switch (Opcode) {
45554555 case X86::VCVTSI2SSrr:
4556 case X86::VCVTSI2SSrm:
45564557 case X86::Int_VCVTSI2SSrr:
4558 case X86::Int_VCVTSI2SSrm:
45574559 case X86::VCVTSI2SS64rr:
4560 case X86::VCVTSI2SS64rm:
45584561 case X86::Int_VCVTSI2SS64rr:
4562 case X86::Int_VCVTSI2SS64rm:
45594563 case X86::VCVTSI2SDrr:
4564 case X86::VCVTSI2SDrm:
45604565 case X86::Int_VCVTSI2SDrr:
4566 case X86::Int_VCVTSI2SDrm:
45614567 case X86::VCVTSI2SD64rr:
4568 case X86::VCVTSI2SD64rm:
45624569 case X86::Int_VCVTSI2SD64rr:
4570 case X86::Int_VCVTSI2SD64rm:
45634571 case X86::VCVTSD2SSrr:
4572 case X86::VCVTSD2SSrm:
45644573 case X86::Int_VCVTSD2SSrr:
4574 case X86::Int_VCVTSD2SSrm:
45654575 case X86::VCVTSS2SDrr:
4576 case X86::VCVTSS2SDrm:
45664577 case X86::Int_VCVTSS2SDrr:
4578 case X86::Int_VCVTSS2SDrm:
45674579 case X86::VRCPSSr:
4580 case X86::VRCPSSm:
4581 case X86::VRCPSSm_Int:
45684582 case X86::VROUNDSDr:
4583 case X86::VROUNDSDm:
45694584 case X86::VROUNDSDr_Int:
45704585 case X86::VROUNDSSr:
4586 case X86::VROUNDSSm:
45714587 case X86::VROUNDSSr_Int:
45724588 case X86::VRSQRTSSr:
4589 case X86::VRSQRTSSm:
4590 case X86::VRSQRTSSm_Int:
45734591 case X86::VSQRTSSr:
4574
4575 // AVX-512
4592 case X86::VSQRTSSm:
4593 case X86::VSQRTSSm_Int:
4594 case X86::VSQRTSDr:
4595 case X86::VSQRTSDm:
4596 case X86::VSQRTSDm_Int:
4597 // AVX-512
45764598 case X86::VCVTSD2SSZrr:
4599 case X86::VCVTSD2SSZrm:
45774600 case X86::VCVTSS2SDZrr:
4601 case X86::VCVTSS2SDZrm:
45784602 return true;
45794603 }
45804604
+0
-29
test/CodeGen/X86/break-avx-dep.ll less more
None ; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
1 ;
2 ; rdar:15221834 False AVX register dependencies cause 5x slowdown on
3 ; flops-6. Make sure the unused register read by vcvtsi2sdq is zeroed
4 ; to avoid cyclic dependence on a write to the same register in a
5 ; previous iteration.
6
7 ; CHECK-LABEL: t1:
8 ; CHECK-LABEL: %loop
9 ; CHECK: vxorps %[[REG:xmm.]], %{{xmm.}}, %{{xmm.}}
10 ; CHECK: vcvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]], %{{xmm.}}
11 define i64 @t1(i64* nocapture %x, double* nocapture %y) nounwind {
12 entry:
13 %vx = load i64* %x
14 br label %loop
15 loop:
16 %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
17 %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
18 %fi = sitofp i64 %i to double
19 %vy = load double* %y
20 %fipy = fadd double %fi, %vy
21 %iipy = fptosi double %fipy to i64
22 %s2 = add i64 %s1, %iipy
23 %inc = add nsw i64 %i, 1
24 %exitcond = icmp eq i64 %inc, 156250000
25 br i1 %exitcond, label %ret, label %loop
26 ret:
27 ret i64 %s2
28 }
0 ; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE
1 ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE
2 ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx -mcpu=corei7-avx | FileCheck %s --check-prefix=AVX
3
4 define double @t1(float* nocapture %x) nounwind readonly ssp {
5 entry:
6 ; SSE-LABEL: t1:
7 ; SSE: movss ([[A0:%rdi|%rcx]]), %xmm0
8 ; SSE: cvtss2sd %xmm0, %xmm0
9
10 %0 = load float* %x, align 4
11 %1 = fpext float %0 to double
12 ret double %1
13 }
14
15 define float @t2(double* nocapture %x) nounwind readonly ssp optsize {
16 entry:
17 ; SSE-LABEL: t2:
18 ; SSE: cvtsd2ss ([[A0]]), %xmm0
19 %0 = load double* %x, align 8
20 %1 = fptrunc double %0 to float
21 ret float %1
22 }
23
24 define float @squirtf(float* %x) nounwind {
25 entry:
26 ; SSE-LABEL: squirtf:
27 ; SSE: movss ([[A0]]), %xmm0
28 ; SSE: sqrtss %xmm0, %xmm0
29 %z = load float* %x
30 %t = call float @llvm.sqrt.f32(float %z)
31 ret float %t
32 }
33
34 define double @squirt(double* %x) nounwind {
35 entry:
36 ; SSE-LABEL: squirt:
37 ; SSE: movsd ([[A0]]), %xmm0
38 ; SSE: sqrtsd %xmm0, %xmm0
39 %z = load double* %x
40 %t = call double @llvm.sqrt.f64(double %z)
41 ret double %t
42 }
43
44 define float @squirtf_size(float* %x) nounwind optsize {
45 entry:
46 ; SSE-LABEL: squirtf_size:
47 ; SSE: sqrtss ([[A0]]), %xmm0
48 %z = load float* %x
49 %t = call float @llvm.sqrt.f32(float %z)
50 ret float %t
51 }
52
53 define double @squirt_size(double* %x) nounwind optsize {
54 entry:
55 ; SSE-LABEL: squirt_size:
56 ; SSE: sqrtsd ([[A0]]), %xmm0
57 %z = load double* %x
58 %t = call double @llvm.sqrt.f64(double %z)
59 ret double %t
60 }
61
62 declare float @llvm.sqrt.f32(float)
63 declare double @llvm.sqrt.f64(double)
64
65 ; SSE-LABEL: loopdep1
66 ; SSE: for.body
67 ;
68 ; This loop contains two cvtsi2ss instructions that update the same xmm
69 ; register. Verify that the execution dependency fix pass breaks those
70 ; dependencies by inserting xorps instructions.
71 ;
72 ; If the register allocator chooses different registers for the two cvtsi2ss
73 ; instructions, they are still dependent on themselves.
74 ; SSE: xorps [[XMM1:%xmm[0-9]+]]
75 ; SSE: , [[XMM1]]
76 ; SSE: cvtsi2ssl %{{.*}}, [[XMM1]]
77 ; SSE: xorps [[XMM2:%xmm[0-9]+]]
78 ; SSE: , [[XMM2]]
79 ; SSE: cvtsi2ssl %{{.*}}, [[XMM2]]
80 ;
81 define float @loopdep1(i32 %m) nounwind uwtable readnone ssp {
82 entry:
83 %tobool3 = icmp eq i32 %m, 0
84 br i1 %tobool3, label %for.end, label %for.body
85
86 for.body: ; preds = %entry, %for.body
87 %m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ]
88 %s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
89 %s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ]
90 %n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ]
91 %conv = sitofp i32 %n.04 to float
92 %add = fadd float %s1.06, %conv
93 %conv1 = sitofp i32 %m.addr.07 to float
94 %add2 = fadd float %s2.05, %conv1
95 %inc = add nsw i32 %n.04, 1
96 %dec = add nsw i32 %m.addr.07, -1
97 %tobool = icmp eq i32 %dec, 0
98 br i1 %tobool, label %for.end, label %for.body
99
100 for.end: ; preds = %for.body, %entry
101 %s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
102 %s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ]
103 %sub = fsub float %s1.0.lcssa, %s2.0.lcssa
104 ret float %sub
105 }
106
107 ; rdar:15221834 False AVX register dependencies cause 5x slowdown on
108 ; flops-6. Make sure the unused register read by vcvtsi2sdq is zeroed
109 ; to avoid cyclic dependence on a write to the same register in a
110 ; previous iteration.
111
112 ; AVX-LABEL: loopdep2:
113 ; AVX-LABEL: %loop
114 ; AVX: vxorps %[[REG:xmm.]], %{{xmm.}}, %{{xmm.}}
115 ; AVX: vcvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]], %{{xmm.}}
116 ; SSE-LABEL: loopdep2:
117 ; SSE-LABEL: %loop
118 ; SSE: xorps %[[REG:xmm.]], %[[REG]]
119 ; SSE: cvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]]
120 define i64 @loopdep2(i64* nocapture %x, double* nocapture %y) nounwind {
121 entry:
122 %vx = load i64* %x
123 br label %loop
124 loop:
125 %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
126 %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
127 %fi = sitofp i64 %i to double
128 %vy = load double* %y
129 %fipy = fadd double %fi, %vy
130 %iipy = fptosi double %fipy to i64
131 %s2 = add i64 %s1, %iipy
132 %inc = add nsw i64 %i, 1
133 %exitcond = icmp eq i64 %inc, 156250000
134 br i1 %exitcond, label %ret, label %loop
135 ret:
136 ret i64 %s2
137 }
138
139 ; This loop contains a cvtsi2sd instruction that has a loop-carried
140 ; false dependency on an xmm that is modified by other scalar instructions
141 ; that follow it in the loop. Additionally, the source of convert is a
142 ; memory operand. Verify the execution dependency fix pass breaks this
143 ; dependency by inserting a xor before the convert.
144 @x = common global [1024 x double] zeroinitializer, align 16
145 @y = common global [1024 x double] zeroinitializer, align 16
146 @z = common global [1024 x double] zeroinitializer, align 16
147 @w = common global [1024 x double] zeroinitializer, align 16
148 @v = common global [1024 x i32] zeroinitializer, align 16
149
150 define void @loopdep3() {
151 entry:
152 br label %for.cond1.preheader
153
154 for.cond1.preheader: ; preds = %for.inc14, %entry
155 %i.025 = phi i32 [ 0, %entry ], [ %inc15, %for.inc14 ]
156 br label %for.body3
157
158 for.body3:
159 %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
160 %arrayidx = getelementptr inbounds [1024 x i32]* @v, i64 0, i64 %indvars.iv
161 %0 = load i32* %arrayidx, align 4
162 %conv = sitofp i32 %0 to double
163 %arrayidx5 = getelementptr inbounds [1024 x double]* @x, i64 0, i64 %indvars.iv
164 %1 = load double* %arrayidx5, align 8
165 %mul = fmul double %conv, %1
166 %arrayidx7 = getelementptr inbounds [1024 x double]* @y, i64 0, i64 %indvars.iv
167 %2 = load double* %arrayidx7, align 8
168 %mul8 = fmul double %mul, %2
169 %arrayidx10 = getelementptr inbounds [1024 x double]* @z, i64 0, i64 %indvars.iv
170 %3 = load double* %arrayidx10, align 8
171 %mul11 = fmul double %mul8, %3
172 %arrayidx13 = getelementptr inbounds [1024 x double]* @w, i64 0, i64 %indvars.iv
173 store double %mul11, double* %arrayidx13, align 8
174 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
175 %exitcond = icmp eq i64 %indvars.iv.next, 1024
176 br i1 %exitcond, label %for.inc14, label %for.body3
177
178 for.inc14: ; preds = %for.body3
179 %inc15 = add nsw i32 %i.025, 1
180 %exitcond26 = icmp eq i32 %inc15, 100000
181 br i1 %exitcond26, label %for.end16, label %for.cond1.preheader
182
183 for.end16: ; preds = %for.inc14
184 ret void
185
186 ;SSE-LABEL:@loopdep3
187 ;SSE: xorps [[XMM0:%xmm[0-9]+]], [[XMM0]]
188 ;SSE-NEXT: cvtsi2sdl {{.*}}, [[XMM0]]
189 ;SSE-NEXT: mulsd {{.*}}, [[XMM0]]
190 ;SSE-NEXT: mulsd {{.*}}, [[XMM0]]
191 ;SSE-NEXT: mulsd {{.*}}, [[XMM0]]
192 ;SSE-NEXT: movsd [[XMM0]],
193 ;AVX-LABEL:@loopdep3
194 ;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]]
195 ;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], [[XMM0]]
196 ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
197 ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
198 ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
199 ;AVX-NEXT: vmovsd [[XMM0]],
200 }
+0
-161
test/CodeGen/X86/break-sse-dep.ll less more
None ; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s
1 ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s
2
3 define double @t1(float* nocapture %x) nounwind readonly ssp {
4 entry:
5 ; CHECK-LABEL: t1:
6 ; CHECK: movss ([[A0:%rdi|%rcx]]), %xmm0
7 ; CHECK: cvtss2sd %xmm0, %xmm0
8
9 %0 = load float* %x, align 4
10 %1 = fpext float %0 to double
11 ret double %1
12 }
13
14 define float @t2(double* nocapture %x) nounwind readonly ssp optsize {
15 entry:
16 ; CHECK-LABEL: t2:
17 ; CHECK: cvtsd2ss ([[A0]]), %xmm0
18 %0 = load double* %x, align 8
19 %1 = fptrunc double %0 to float
20 ret float %1
21 }
22
23 define float @squirtf(float* %x) nounwind {
24 entry:
25 ; CHECK-LABEL: squirtf:
26 ; CHECK: movss ([[A0]]), %xmm0
27 ; CHECK: sqrtss %xmm0, %xmm0
28 %z = load float* %x
29 %t = call float @llvm.sqrt.f32(float %z)
30 ret float %t
31 }
32
33 define double @squirt(double* %x) nounwind {
34 entry:
35 ; CHECK-LABEL: squirt:
36 ; CHECK: movsd ([[A0]]), %xmm0
37 ; CHECK: sqrtsd %xmm0, %xmm0
38 %z = load double* %x
39 %t = call double @llvm.sqrt.f64(double %z)
40 ret double %t
41 }
42
43 define float @squirtf_size(float* %x) nounwind optsize {
44 entry:
45 ; CHECK-LABEL: squirtf_size:
46 ; CHECK: sqrtss ([[A0]]), %xmm0
47 %z = load float* %x
48 %t = call float @llvm.sqrt.f32(float %z)
49 ret float %t
50 }
51
52 define double @squirt_size(double* %x) nounwind optsize {
53 entry:
54 ; CHECK-LABEL: squirt_size:
55 ; CHECK: sqrtsd ([[A0]]), %xmm0
56 %z = load double* %x
57 %t = call double @llvm.sqrt.f64(double %z)
58 ret double %t
59 }
60
61 declare float @llvm.sqrt.f32(float)
62 declare double @llvm.sqrt.f64(double)
63
64 ; CHECK-LABEL: loopdep1
65 ; CHECK: for.body
66 ;
67 ; This loop contains two cvtsi2ss instructions that update the same xmm
68 ; register. Verify that the execution dependency fix pass breaks those
69 ; dependencies by inserting xorps instructions.
70 ;
71 ; If the register allocator chooses different registers for the two cvtsi2ss
72 ; instructions, they are still dependent on themselves.
73 ; CHECK: xorps [[XMM1:%xmm[0-9]+]]
74 ; CHECK: , [[XMM1]]
75 ; CHECK: cvtsi2ssl %{{.*}}, [[XMM1]]
76 ; CHECK: xorps [[XMM2:%xmm[0-9]+]]
77 ; CHECK: , [[XMM2]]
78 ; CHECK: cvtsi2ssl %{{.*}}, [[XMM2]]
79 ;
80 define float @loopdep1(i32 %m) nounwind uwtable readnone ssp {
81 entry:
82 %tobool3 = icmp eq i32 %m, 0
83 br i1 %tobool3, label %for.end, label %for.body
84
85 for.body: ; preds = %entry, %for.body
86 %m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ]
87 %s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
88 %s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ]
89 %n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ]
90 %conv = sitofp i32 %n.04 to float
91 %add = fadd float %s1.06, %conv
92 %conv1 = sitofp i32 %m.addr.07 to float
93 %add2 = fadd float %s2.05, %conv1
94 %inc = add nsw i32 %n.04, 1
95 %dec = add nsw i32 %m.addr.07, -1
96 %tobool = icmp eq i32 %dec, 0
97 br i1 %tobool, label %for.end, label %for.body
98
99 for.end: ; preds = %for.body, %entry
100 %s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
101 %s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ]
102 %sub = fsub float %s1.0.lcssa, %s2.0.lcssa
103 ret float %sub
104 }
105
106 ; This loop contains a cvtsi2sd instruction that has a loop-carried
107 ; false dependency on an xmm that is modified by other scalar instructions
108 ; that follow it in the loop. Additionally, the source of convert is a
109 ; memory operand. Verify the execution dependency fix pass breaks this
110 ; dependency by inserting a xor before the convert.
111 @x = common global [1024 x double] zeroinitializer, align 16
112 @y = common global [1024 x double] zeroinitializer, align 16
113 @z = common global [1024 x double] zeroinitializer, align 16
114 @w = common global [1024 x double] zeroinitializer, align 16
115 @v = common global [1024 x i32] zeroinitializer, align 16
116
117 define void @loopdep2() {
118 entry:
119 br label %for.cond1.preheader
120
121 for.cond1.preheader: ; preds = %for.inc14, %entry
122 %i.025 = phi i32 [ 0, %entry ], [ %inc15, %for.inc14 ]
123 br label %for.body3
124
125 for.body3:
126 %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
127 %arrayidx = getelementptr inbounds [1024 x i32]* @v, i64 0, i64 %indvars.iv
128 %0 = load i32* %arrayidx, align 4
129 %conv = sitofp i32 %0 to double
130 %arrayidx5 = getelementptr inbounds [1024 x double]* @x, i64 0, i64 %indvars.iv
131 %1 = load double* %arrayidx5, align 8
132 %mul = fmul double %conv, %1
133 %arrayidx7 = getelementptr inbounds [1024 x double]* @y, i64 0, i64 %indvars.iv
134 %2 = load double* %arrayidx7, align 8
135 %mul8 = fmul double %mul, %2
136 %arrayidx10 = getelementptr inbounds [1024 x double]* @z, i64 0, i64 %indvars.iv
137 %3 = load double* %arrayidx10, align 8
138 %mul11 = fmul double %mul8, %3
139 %arrayidx13 = getelementptr inbounds [1024 x double]* @w, i64 0, i64 %indvars.iv
140 store double %mul11, double* %arrayidx13, align 8
141 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
142 %exitcond = icmp eq i64 %indvars.iv.next, 1024
143 br i1 %exitcond, label %for.inc14, label %for.body3
144
145 for.inc14: ; preds = %for.body3
146 %inc15 = add nsw i32 %i.025, 1
147 %exitcond26 = icmp eq i32 %inc15, 100000
148 br i1 %exitcond26, label %for.end16, label %for.cond1.preheader
149
150 for.end16: ; preds = %for.inc14
151 ret void
152
153 ;CHECK-LABEL:@loopdep2
154 ;CHECK: xorps [[XMM0:%xmm[0-9]+]], [[XMM0]]
155 ;CHECK-NEXT: cvtsi2sdl {{.*}}, [[XMM0]]
156 ;CHECK-NEXT: mulsd {{.*}}, [[XMM0]]
157 ;CHECK-NEXT: mulsd {{.*}}, [[XMM0]]
158 ;CHECK-NEXT: mulsd {{.*}}, [[XMM0]]
159 ;CHECK-NEXT: movsd [[XMM0]],
160 }