llvm.org GIT mirror llvm / 6c4ec69
[ARM64] Ports the Cortex-A53 Machine Model description from AArch64. Summary: This port includes the rudimentary latencies that were provided for the Cortex-A53 Machine Model in the AArch64 backend. It also changes the SchedAlias for COPY in the Cyclone model to an explicit WriteRes mapping to avoid conflicts in other subtargets. Differential Revision: http://reviews.llvm.org/D3427 Patch by Dave Estes <cestes@codeaurora.org>! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@206652 91177308-0d34-0410-b5e6-96231b3b80d8 Chad Rosier 6 years ago
4 changed file(s) with 247 addition(s) and 4 deletion(s). Raw diff Collapse all Expand all
2020 //
2121
2222 def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
23 "Enable ARMv8 FP">;
23 "Enable ARMv8 FP">;
2424
2525 def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
2626 "Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
5555 //===----------------------------------------------------------------------===//
5656 // ARM64 Processors supported.
5757 //
58 include "ARM64SchedA53.td"
5859 include "ARM64SchedCyclone.td"
5960
6061 def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
7879
7980 def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, FeatureNEON]>;
8081
81 def : ProcessorModel<"cortex-a53", NoSchedModel, [ProcA53]>;
82 def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
8283 def : ProcessorModel<"cortex-a57", NoSchedModel, [ProcA57]>;
83
8484 def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
8585
8686 //===----------------------------------------------------------------------===//
0 //=- ARM64SchedA53.td - ARM Cortex-A53 Scheduling Definitions -*- tablegen -*-=//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the machine model for the ARM Cortex-A53 to support instruction scheduling and other instruction cost heuristics.
10 //
11 //===----------------------------------------------------------------------===//
12
13 // ===---------------------------------------------------------------------===//
14 // The following definitions describe the simpler per-operand machine model.
15 // This works with MachineScheduler. See MCSchedModel.h for details.
16
17 // Cortex-A53 machine model for scheduling and other instruction cost heuristics.
18 def CortexA53Model : SchedMachineModel {
19 let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order.
20 let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
21 let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency.
22 let LoadLatency = 2; // Optimistic load latency assuming bypass.
23 // This is overridden by OperandCycles if the
24 // Itineraries are queried instead.
25 let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
26 // Specification - Instruction Timings"
27 // v 1.0 Spreadsheet
28 }
29
30
31 //===----------------------------------------------------------------------===//
32 // Define each kind of processor resource and number available.
33
34 // Modeling each pipeline as a ProcResource using the BufferSize = 0 since
35 // Cortex-A53 is in-order.
36
37 def A53UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU
38 def A53UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC
39 def A53UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division
40 def A53UnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store
41 def A53UnitB : ProcResource<1> { let BufferSize = 0; } // Branch
42 def A53UnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU
43 def A53UnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt
44
45
46 //===----------------------------------------------------------------------===//
47 // Subtarget-specific SchedWrite types which both map the ProcResources and
48 // set the latency.
49
50 let SchedModel = CortexA53Model in {
51
52 // ALU - These are reduced to 1 despite a true latency of 4 in order to easily
53 // model forwarding logic. Once forwarding is properly modelled, then
54 // they'll be corrected.
55 def : WriteRes { let Latency = 1; }
56 def : WriteRes { let Latency = 1; }
57 def : WriteRes { let Latency = 1; }
58 def : WriteRes { let Latency = 1; }
59 def : WriteRes { let Latency = 1; }
60 def : WriteRes { let Latency = 1; }
61 def : WriteRes { let Latency = 1; }
62
63 // MAC
64 def : WriteRes { let Latency = 4; }
65 def : WriteRes { let Latency = 4; }
66
67 // Div
68 def : WriteRes { let Latency = 4; }
69 def : WriteRes { let Latency = 4; }
70
71 // Load
72 def : WriteRes { let Latency = 4; }
73 def : WriteRes { let Latency = 4; }
74 def : WriteRes { let Latency = 4; }
75 def : WriteRes { let Latency = 4; }
76
77 // Store
78 def : WriteRes { let Latency = 4; }
79 def : WriteRes { let Latency = 4; }
80 def : WriteRes { let Latency = 4; }
81 def : WriteRes { let Latency = 4; }
82 def : WriteRes { let Latency = 4; }
83
84 // Branch
85 def : WriteRes;
86 def : WriteRes;
87 def : WriteRes;
88 def : WriteRes;
89 def : WriteRes;
90
91 // FP ALU
92 def : WriteRes { let Latency = 6; }
93 def : WriteRes { let Latency = 6; }
94 def : WriteRes { let Latency = 6; }
95 def : WriteRes { let Latency = 6; }
96 def : WriteRes { let Latency = 6; }
97 def : WriteRes { let Latency = 6; }
98
99 // FP Mul, Div, Sqrt
100 def : WriteRes { let Latency = 6; }
101 def : WriteRes { let Latency = 33;
102 let ResourceCycles = [29]; }
103 def A53WriteFDiv : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33;
104 let ResourceCycles = [29]; }
105 def A53WriteFSqrt : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32;
106 let ResourceCycles = [28]; }
107
108 //===----------------------------------------------------------------------===//
109 // Subtarget-specific SchedRead types.
110
111 // While there is no forwarding information defined for these SchedRead types,
112 // they are still used by some instructions via a SchedRW list and so these zero
113 // SchedReadAdvances are required.
114
115 def : ReadAdvance;
116 def : ReadAdvance;
117 def : ReadAdvance;
118
119 //===----------------------------------------------------------------------===//
120 // Subtarget-specific InstRWs.
121
122 def : InstRW<[WriteI], (instrs COPY)>;
123 def : InstRW<[WriteLD], (instregex "LD[1-4]")>;
124 def : InstRW<[WriteST], (instregex "ST[1-4]")>;
125 def : InstRW<[A53WriteFDiv], (instregex "^FDIV")>;
126 def : InstRW<[A53WriteFSqrt], (instregex ".*SQRT.*")>;
127
128 }
341341 // INS V[x],V[y] is a WriteV.
342342
343343 // FMOVWSr,FMOVXDr,FMOVXDHighr
344 def : SchedAlias;
344 def : WriteRes {
345 let Latency = 5;
346 }
345347
346348 // FMOVSWr,FMOVDXr
347349 def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
0 ; REQUIRES: asserts
1 ; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
2 ;
3 ; The Cortex-A53 machine model will cause the MADD instruction to be scheduled
4 ; much higher than the ADD instructions in order to hide latency. When not
5 ; specifying a subtarget, the MADD will remain near the end of the block.
6 ;
7 ; CHECK: ********** MI Scheduling **********
8 ; CHECK: main
9 ; CHECK: *** Final schedule for BB#2 ***
10 ; CHECK: SU(13)
11 ; CHECK: MADDWrrr
12 ; CHECK: SU(4)
13 ; CHECK: ADDWri
14 ; CHECK: ********** INTERVALS **********
15 @main.x = private unnamed_addr constant [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 4
16 @main.y = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2], align 4
17
18 ; Function Attrs: nounwind
19 define i32 @main() #0 {
20 entry:
21 %retval = alloca i32, align 4
22 %x = alloca [8 x i32], align 4
23 %y = alloca [8 x i32], align 4
24 %i = alloca i32, align 4
25 %xx = alloca i32, align 4
26 %yy = alloca i32, align 4
27 store i32 0, i32* %retval
28 %0 = bitcast [8 x i32]* %x to i8*
29 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([8 x i32]* @main.x to i8*), i64 32, i32 4, i1 false)
30 %1 = bitcast [8 x i32]* %y to i8*
31 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ([8 x i32]* @main.y to i8*), i64 32, i32 4, i1 false)
32 store i32 0, i32* %xx, align 4
33 store i32 0, i32* %yy, align 4
34 store i32 0, i32* %i, align 4
35 br label %for.cond
36
37 for.cond: ; preds = %for.inc, %entry
38 %2 = load i32* %i, align 4
39 %cmp = icmp slt i32 %2, 8
40 br i1 %cmp, label %for.body, label %for.end
41
42 for.body: ; preds = %for.cond
43 %3 = load i32* %i, align 4
44 %idxprom = sext i32 %3 to i64
45 %arrayidx = getelementptr inbounds [8 x i32]* %x, i32 0, i64 %idxprom
46 %4 = load i32* %arrayidx, align 4
47 %add = add nsw i32 %4, 1
48 store i32 %add, i32* %xx, align 4
49 %5 = load i32* %xx, align 4
50 %add1 = add nsw i32 %5, 12
51 store i32 %add1, i32* %xx, align 4
52 %6 = load i32* %xx, align 4
53 %add2 = add nsw i32 %6, 23
54 store i32 %add2, i32* %xx, align 4
55 %7 = load i32* %xx, align 4
56 %add3 = add nsw i32 %7, 34
57 store i32 %add3, i32* %xx, align 4
58 %8 = load i32* %i, align 4
59 %idxprom4 = sext i32 %8 to i64
60 %arrayidx5 = getelementptr inbounds [8 x i32]* %y, i32 0, i64 %idxprom4
61 %9 = load i32* %arrayidx5, align 4
62 %10 = load i32* %yy, align 4
63 %mul = mul nsw i32 %10, %9
64 store i32 %mul, i32* %yy, align 4
65 br label %for.inc
66
67 for.inc: ; preds = %for.body
68 %11 = load i32* %i, align 4
69 %inc = add nsw i32 %11, 1
70 store i32 %inc, i32* %i, align 4
71 br label %for.cond
72
73 for.end: ; preds = %for.cond
74 %12 = load i32* %xx, align 4
75 %13 = load i32* %yy, align 4
76 %add6 = add nsw i32 %12, %13
77 ret i32 %add6
78 }
79
80
81 ; The Cortex-A53 machine model will cause the FDIVv4f32 to be raised to
82 ; hide latency. Whereas normally there would only be a single FADDv4f32
83 ; after it, this test checks to make sure there is more than one.
84 ;
85 ; CHECK: ********** MI Scheduling **********
86 ; CHECK: neon4xfloat:BB#0
87 ; CHECK: *** Final schedule for BB#0 ***
88 ; CHECK: FDIVv4f32
89 ; CHECK: FADDv4f32
90 ; CHECK: FADDv4f32
91 ; CHECK: ********** INTERVALS **********
92 define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) {
93 %tmp1 = fadd <4 x float> %A, %B;
94 %tmp2 = fadd <4 x float> %A, %tmp1;
95 %tmp3 = fadd <4 x float> %A, %tmp2;
96 %tmp4 = fadd <4 x float> %A, %tmp3;
97 %tmp5 = fadd <4 x float> %A, %tmp4;
98 %tmp6 = fadd <4 x float> %A, %tmp5;
99 %tmp7 = fadd <4 x float> %A, %tmp6;
100 %tmp8 = fadd <4 x float> %A, %tmp7;
101 %tmp9 = fdiv <4 x float> %A, %B;
102 %tmp10 = fadd <4 x float> %tmp8, %tmp9;
103
104 ret <4 x float> %tmp10
105 }
106
107 ; Function Attrs: nounwind
108 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
109
110 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
111 attributes #1 = { nounwind }