llvm.org GIT mirror: llvm / 46f7257
SHLD/SHRD are VectorPath (microcode) instructions known to have poor latency on certain architectures. While generating SHLD/SHRD instructions is acceptable when optimizing for size, optimizing for speed on these platforms should instead use alternative sequences built from add, adc, shr, shl, or, and lea, which are DirectPath instructions. These alternative instructions not only have lower latency, they also increase decode bandwidth by allowing a third DirectPath instruction to be decoded simultaneously.

AMD's processor families K7, K8, K10, K12, K15, and K16 are known to have SHLD/SHRD instructions with very poor latency, and the optimization guides for these processors recommend using an alternative sequence of instructions. For these AMD processors, I disabled the folding of (or (x << c) | (y >> (64 - c))) into SHLD/SHRD when we are not optimizing for size.

It might be beneficial to disable this folding for some Intel processors as well. However, since I could not find specific recommendations about using SHLD/SHRD on Intel processors, I have not disabled this peephole for Intel.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@195383 91177308-0d34-0410-b5e6-96231b3b80d8

Ekaterina Romanova (6 years ago)
8 changed files with 328 additions and 18 deletions.
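To make the trade-off concrete, here is a rough sketch of the two possible lowerings for one of the patterns exercised by the new tests below (the bdver1 case); the exact register assignments are illustrative and are not taken from the commit:

    # uint64_t lshift1(uint64_t a, uint64_t b) { return (a << 1) | (b >> 63); }

    # When optimizing for size (or on a target without the new slow-shld
    # feature), a single double-precision shift is emitted:
    shldq   $1, %rsi, %rdi          # rdi = (a << 1) | (b >> 63)

    # When optimizing for speed on a slow-SHLD target, only DirectPath
    # instructions are used (this matches the CHECK lines in the first test):
    addq    %rdi, %rdi              # a << 1
    shrq    $63, %rsi               # b >> 63
    leaq    (%rsi,%rdi), %rax       # combine the two halves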
@@ -72,6 +72,8 @@
                          [Feature64Bit]>;
 def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
                                         "Bit testing of memory is slow">;
+def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
+                                       "SHLD instruction is slow">;
 def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
                                         "IsUAMemFast", "true",
                                         "Fast unaligned memory access">;
@@ -267,46 +269,53 @@
 def : Proc<"k6",            [FeatureMMX]>;
 def : Proc<"k6-2",          [Feature3DNow]>;
 def : Proc<"k6-3",          [Feature3DNow]>;
-def : Proc<"athlon",        [Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-tbird",  [Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-4",      [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-xp",     [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-mp",     [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon",        [Feature3DNowA, FeatureSlowBTMem,
+                             FeatureSlowSHLD]>;
+def : Proc<"athlon-tbird",  [Feature3DNowA, FeatureSlowBTMem,
+                             FeatureSlowSHLD]>;
+def : Proc<"athlon-4",      [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+                             FeatureSlowSHLD]>;
+def : Proc<"athlon-xp",     [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+                             FeatureSlowSHLD]>;
+def : Proc<"athlon-mp",     [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+                             FeatureSlowSHLD]>;
 def : Proc<"k8",            [FeatureSSE2, Feature3DNowA, Feature64Bit,
-                             FeatureSlowBTMem]>;
+                             FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"opteron",       [FeatureSSE2, Feature3DNowA, Feature64Bit,
-                             FeatureSlowBTMem]>;
+                             FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"athlon64",      [FeatureSSE2, Feature3DNowA, Feature64Bit,
-                             FeatureSlowBTMem]>;
+                             FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"athlon-fx",     [FeatureSSE2, Feature3DNowA, Feature64Bit,
-                             FeatureSlowBTMem]>;
+                             FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"k8-sse3",       [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
-                             FeatureSlowBTMem]>;
+                             FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"opteron-sse3",  [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
-                             FeatureSlowBTMem]>;
+                             FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
-                             FeatureSlowBTMem]>;
+                             FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"amdfam10",      [FeatureSSE4A,
                              Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
-                             FeaturePOPCNT, FeatureSlowBTMem]>;
+                             FeaturePOPCNT, FeatureSlowBTMem,
+                             FeatureSlowSHLD]>;
 // Bobcat
 def : Proc<"btver1",        [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
-                             FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT]>;
+                             FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT,
+                             FeatureSlowSHLD]>;
 // Jaguar
 def : Proc<"btver2",        [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
                              FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                              FeatureBMI, FeatureF16C, FeatureMOVBE,
-                             FeatureLZCNT, FeaturePOPCNT]>;
+                             FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
 // Bulldozer
 def : Proc<"bdver1",        [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
                              FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
-                             FeatureLZCNT, FeaturePOPCNT]>;
+                             FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
 // Piledriver
 def : Proc<"bdver2",        [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
                              FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
                              FeatureF16C, FeatureLZCNT,
-                             FeaturePOPCNT, FeatureBMI, FeatureTBM,
-                             FeatureFMA]>;
+                             FeaturePOPCNT, FeatureBMI, FeatureTBM,
+                             FeatureFMA, FeatureSlowSHLD]>;
 
 // Steamroller
 def : Proc<"bdver3",        [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
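Because FeatureSlowSHLD is defined as an ordinary SubtargetFeature, the flag should also be controllable independently of -mcpu through the generic attribute mechanism; for example (a hypothetical invocation, not one of the RUN lines added by this commit):

    llc < test.ll -march=x86-64 -mattr=+slow-shld
    llc < test.ll -march=x86-64 -mcpu=bdver1 -mattr=-slow-shld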
@@ -17891,6 +17891,18 @@
     return SDValue();
 
   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool OptForSize = MF.getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+  // SHLD/SHRD instructions have lower register pressure, but on some
+  // platforms they have higher latency than the equivalent
+  // series of shifts/or that would otherwise be generated.
+  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
+  // have higher latencies and we are not optimizing for size.
+  if (!OptForSize && Subtarget->isSHLDSlow())
+    return SDValue();
+
   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
     std::swap(N0, N1);
   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
@@ -260,6 +260,15 @@
   if (IsAMD || (Family == 6 && Model >= 13)) {
     IsBTMemSlow = true;
     ToggleFeature(X86::FeatureSlowBTMem);
+  }
+
+  // Determine if SHLD/SHRD instructions have higher latency than the
+  // equivalent series of shifts/or instructions.
+  // FIXME: Add Intel's processors that have SHLD instructions with very
+  // poor latency.
+  if (IsAMD) {
+    IsSHLDSlow = true;
+    ToggleFeature(X86::FeatureSlowSHLD);
   }
 
   // If it's an Intel chip since Nehalem and not an Atom chip, unaligned
@@ -513,6 +522,7 @@
   HasPRFCHW = false;
   HasRDSEED = false;
   IsBTMemSlow = false;
+  IsSHLDSlow = false;
   IsUAMemFast = false;
   HasVectorUAMem = false;
   HasCmpxchg16b = false;
@@ -140,6 +140,9 @@
 
   /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
   bool IsBTMemSlow;
+
+  /// IsSHLDSlow - True if SHLD instructions are slow.
+  bool IsSHLDSlow;
 
   /// IsUAMemFast - True if unaligned memory access is fast.
   bool IsUAMemFast;
@@ -291,6 +294,7 @@
   bool hasPRFCHW() const { return HasPRFCHW; }
   bool hasRDSEED() const { return HasRDSEED; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
+  bool isSHLDSlow() const { return IsSHLDSlow; }
   bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
   bool hasVectorUAMem() const { return HasVectorUAMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
; Verify that, for the architectures that are known to have poor latency
; double precision shift instructions, we generate an alternative sequence
; of instructions with lower latency instead of the shld instruction.

;uint64_t lshift1(uint64_t a, uint64_t b)
;{
;    return (a << 1) | (b >> 63);
;}

; CHECK: lshift1:
; CHECK: addq {{.*}},{{.*}}
; CHECK-NEXT: shrq $63, {{.*}}
; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}


define i64 @lshift1(i64 %a, i64 %b) nounwind readnone uwtable {
entry:
  %shl = shl i64 %a, 1
  %shr = lshr i64 %b, 63
  %or = or i64 %shr, %shl
  ret i64 %or
}

;uint64_t lshift2(uint64_t a, uint64_t b)
;{
;    return (a << 2) | (b >> 62);
;}

; CHECK: lshift2:
; CHECK: shlq $2, {{.*}}
; CHECK-NEXT: shrq $62, {{.*}}
; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}

define i64 @lshift2(i64 %a, i64 %b) nounwind readnone uwtable {
entry:
  %shl = shl i64 %a, 2
  %shr = lshr i64 %b, 62
  %or = or i64 %shr, %shl
  ret i64 %or
}

;uint64_t lshift7(uint64_t a, uint64_t b)
;{
;    return (a << 7) | (b >> 57);
;}

; CHECK: lshift7:
; CHECK: shlq $7, {{.*}}
; CHECK-NEXT: shrq $57, {{.*}}
; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}

define i64 @lshift7(i64 %a, i64 %b) nounwind readnone uwtable {
entry:
  %shl = shl i64 %a, 7
  %shr = lshr i64 %b, 57
  %or = or i64 %shr, %shl
  ret i64 %or
}

;uint64_t lshift63(uint64_t a, uint64_t b)
;{
;    return (a << 63) | (b >> 1);
;}

; CHECK: lshift63:
; CHECK: shlq $63, {{.*}}
; CHECK-NEXT: shrq {{.*}}
; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}

define i64 @lshift63(i64 %a, i64 %b) nounwind readnone uwtable {
entry:
  %shl = shl i64 %a, 63
  %shr = lshr i64 %b, 1
  %or = or i64 %shr, %shl
  ret i64 %or
}
; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
; Verify that, for the architectures that are known to have poor latency
; double precision shift instructions, we generate an alternative sequence
; of instructions with lower latency instead of the shrd instruction.

;uint64_t rshift1(uint64_t a, uint64_t b)
;{
;    return (a >> 1) | (b << 63);
;}

; CHECK: rshift1:
; CHECK: shrq {{.*}}
; CHECK-NEXT: shlq $63, {{.*}}
; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}

define i64 @rshift1(i64 %a, i64 %b) nounwind readnone uwtable {
  %1 = lshr i64 %a, 1
  %2 = shl i64 %b, 63
  %3 = or i64 %2, %1
  ret i64 %3
}

;uint64_t rshift2(uint64_t a, uint64_t b)
;{
;    return (a >> 2) | (b << 62);
;}

; CHECK: rshift2:
; CHECK: shrq $2, {{.*}}
; CHECK-NEXT: shlq $62, {{.*}}
; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}


define i64 @rshift2(i64 %a, i64 %b) nounwind readnone uwtable {
  %1 = lshr i64 %a, 2
  %2 = shl i64 %b, 62
  %3 = or i64 %2, %1
  ret i64 %3
}

;uint64_t rshift7(uint64_t a, uint64_t b)
;{
;    return (a >> 7) | (b << 57);
;}

; CHECK: rshift7:
; CHECK: shrq $7, {{.*}}
; CHECK-NEXT: shlq $57, {{.*}}
; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}


define i64 @rshift7(i64 %a, i64 %b) nounwind readnone uwtable {
  %1 = lshr i64 %a, 7
  %2 = shl i64 %b, 57
  %3 = or i64 %2, %1
  ret i64 %3
}

;uint64_t rshift63(uint64_t a, uint64_t b)
;{
;    return (a >> 63) | (b << 1);
;}

; CHECK: rshift63:
; CHECK: shrq $63, {{.*}}
; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
; CHECK-NEXT: orq {{.*}}, {{.*}}

define i64 @rshift63(i64 %a, i64 %b) nounwind readnone uwtable {
  %1 = lshr i64 %a, 63
  %2 = shl i64 %b, 1
  %3 = or i64 %2, %1
  ret i64 %3
}
; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s

; clang -Oz -c test1.cpp -emit-llvm -S -o
; Verify that we generate a shld instruction when we are optimizing for size,
; even for X86_64 processors that are known to have poor latency double
; precision shift instructions.
; uint64_t lshift10(uint64_t a, uint64_t b)
; {
;   return (a << 10) | (b >> 54);
; }

; Function Attrs: minsize nounwind optsize readnone uwtable
define i64 @_Z8lshift10mm(i64 %a, i64 %b) #0 {
entry:
; CHECK: shldq $10
  %shl = shl i64 %a, 10
  %shr = lshr i64 %b, 54
  %or = or i64 %shr, %shl
  ret i64 %or
}

attributes #0 = { minsize nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }


; clang -Os -c test2.cpp -emit-llvm -S
; Verify that we generate a shld instruction when we are optimizing for size,
; even for X86_64 processors that are known to have poor latency double
; precision shift instructions.
; uint64_t lshift11(uint64_t a, uint64_t b)
; {
;   return (a << 11) | (b >> 53);
; }

; Function Attrs: nounwind optsize readnone uwtable
define i64 @_Z8lshift11mm(i64 %a, i64 %b) #1 {
entry:
; CHECK: shldq $11
  %shl = shl i64 %a, 11
  %shr = lshr i64 %b, 53
  %or = or i64 %shr, %shl
  ret i64 %or
}

attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

; clang -O2 -c test2.cpp -emit-llvm -S
; Verify that we do not generate a shld instruction when we are not optimizing
; for size for X86_64 processors that are known to have poor latency double
; precision shift instructions.
; uint64_t lshift12(uint64_t a, uint64_t b)
; {
;   return (a << 12) | (b >> 52);
; }

; Function Attrs: nounwind readnone uwtable
define i64 @_Z8lshift12mm(i64 %a, i64 %b) #2 {
entry:
; CHECK: shlq $12
; CHECK-NEXT: shrq $52
  %shl = shl i64 %a, 12
  %shr = lshr i64 %b, 52
  %or = or i64 %shr, %shl
  ret i64 %or
}

attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
; RUN: llc < %s -march=x86-64 -mcpu=athlon | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=athlon-tbird | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=athlon-4 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=athlon-xp | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=athlon-mp | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=k8 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=opteron | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=athlon64 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=athlon-fx | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=k8-sse3 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=opteron-sse3 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=athlon64-sse3 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=amdfam10 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=btver1 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=btver2 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=bdver2 | FileCheck %s

; Verify that, for the X86_64 processors that are known to have poor latency
; double precision shift instructions, we do not generate 'shld' or 'shrd'
; instructions.

;uint64_t lshift(uint64_t a, uint64_t b, int c)
;{
;    return (a << c) | (b >> (64-c));
;}

define i64 @lshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
entry:
; CHECK-NOT: shld
  %sh_prom = zext i32 %c to i64
  %shl = shl i64 %a, %sh_prom
  %sub = sub nsw i32 64, %c
  %sh_prom1 = zext i32 %sub to i64
  %shr = lshr i64 %b, %sh_prom1
  %or = or i64 %shr, %shl
  ret i64 %or
}

;uint64_t rshift(uint64_t a, uint64_t b, int c)
;{
;    return (a >> c) | (b << (64-c));
;}

define i64 @rshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
entry:
; CHECK-NOT: shrd
  %sh_prom = zext i32 %c to i64
  %shr = lshr i64 %a, %sh_prom
  %sub = sub nsw i32 64, %c
  %sh_prom1 = zext i32 %sub to i64
  %shl = shl i64 %b, %sh_prom1
  %or = or i64 %shl, %shr
  ret i64 %or
}
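As a usage note: lit expands %s in the RUN lines above to the path of the test file, so any of these checks can be reproduced by hand with the in-tree llc and FileCheck binaries, roughly as follows (the path is a placeholder, since the test file names are not shown in this view):

    llc < path/to/shift-test.ll -march=x86-64 -mcpu=bdver1 | FileCheck path/to/shift-test.ll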