llvm.org GIT mirror llvm / 3f27fbb
[MCA] Add an experimental MicroOpQueue stage. This patch adds an experimental stage named MicroOpQueueStage. MicroOpQueueStage can be used to simulate a hardware micro-op queue (basically, a decoupling queue between 'decode' and 'dispatch'). Users can specify a queue size, as well as an optional MaxIPC (which - in the absence of a "Decoders" stage - can be used to simulate a different throughput from the decoders). This stage is added to the default pipeline between the EntryStage and the DispatchStage only if PipelineOptions::MicroOpQueueSize is different from zero. By default, llvm-mca sets PipelineOptions::MicroOpQueueSize to the value of the hidden flag -micro-op-queue-size. Throughput from the decoders can be simulated via another hidden flag named -decoder-throughput. That flag allows us to quickly experiment with different frontend throughputs. For targets that declare a loop buffer, flag -decoder-throughput allows users to do multiple runs, each time simulating a different throughput from the decoders. This stage can/will be extended in the future. For example, we could add a "buffer full" event to notify bottlenecks caused by backpressure. Flag -decoder-throughput would probably go away if, in the future, we delegate to another stage (DecoderStage?) the simulation of a (potentially variable) throughput from the decoders. For now, flag -decoder-throughput is "good enough" to run some simple experiments. Differential Revision: https://reviews.llvm.org/D59928 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@357248 91177308-0d34-0410-b5e6-96231b3b80d8 Andrea Di Biagio 6 months ago
7 changed file(s) with 289 addition(s) and 6 deletion(s). Raw diff Collapse all Expand all
3030 /// This is a convenience struct to hold the parameters necessary for creating
3131 /// the pre-built "default" out-of-order pipeline.
3232 struct PipelineOptions {
33 PipelineOptions(unsigned DW, unsigned RFS, unsigned LQS, unsigned SQS,
34 bool NoAlias, bool ShouldEnableBottleneckAnalysis = false)
35 : DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
33 PipelineOptions(unsigned UOPQSize, unsigned DecThr, unsigned DW, unsigned RFS,
34 unsigned LQS, unsigned SQS, bool NoAlias,
35 bool ShouldEnableBottleneckAnalysis = false)
36 : MicroOpQueueSize(UOPQSize), DecodersThroughput(DecThr),
37 DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
3638 StoreQueueSize(SQS), AssumeNoAlias(NoAlias),
3739 EnableBottleneckAnalysis(ShouldEnableBottleneckAnalysis) {}
40 unsigned MicroOpQueueSize;
41 unsigned DecodersThroughput; // Instructions per cycle.
3842 unsigned DispatchWidth;
3943 unsigned RegisterFileSize;
4044 unsigned LoadQueueSize;
0 //===---------------------- MicroOpQueueStage.h -----------------*- C++ -*-===//
1 //
2 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3 // See https://llvm.org/LICENSE.txt for license information.
4 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5 //
6 //===----------------------------------------------------------------------===//
7 /// \file
8 ///
9 /// This file defines a stage that implements a queue of micro opcodes.
10 /// It can be used to simulate a hardware micro-op queue that serves opcodes to
11 /// the out of order backend.
12 ///
13 //===----------------------------------------------------------------------===//
14
15 #ifndef LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
16 #define LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
17
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/MCA/Stages/Stage.h"
20
21 namespace llvm {
22 namespace mca {
23
24 /// A stage that simulates a queue of instruction opcodes.
// A pipeline stage that holds instructions in a fixed-size buffer until the
// next stage accepts them. Capacity is counted in micro-op entries, and the
// number of instructions accepted per cycle can optionally be capped (MaxIPC).
// NOTE(review): this listing appears to have lost its template arguments
// (`SmallVector Buffer`, `static_cast(...)`); presumably the buffer stores
// InstRef values and the cast targets `unsigned` -- confirm against upstream.
25 class MicroOpQueueStage : public Stage {
// Storage for queued instructions. Indices into it wrap around modulo
// Buffer.size(), so it is used as a circular buffer.
26 SmallVector Buffer;
// Slot where execute() stores the next incoming instruction.
27 unsigned NextAvailableSlotIdx;
// Slot of the next instruction that moveInstructions() tries to forward.
28 unsigned CurrentInstructionSlotIdx;
29
30 // Limits the number of instructions that can be written to this buffer every
31 // cycle. A value of zero means that there is no limit to the instruction
32 // throughput in input.
33 const unsigned MaxIPC;
// Number of instructions accepted by execute() during the current cycle.
// Reset to zero by cycleStart().
34 unsigned CurrentIPC;
35
36 // Number of entries that are available during this cycle.
37 unsigned AvailableEntries;
38
39 // True if instructions dispatched to this stage don't need to wait for the
40 // next cycle before moving to the next stage.
41 // False if this buffer acts as a one cycle delay in the execution pipeline.
42 bool IsZeroLatencyStage;
43
// Copying is disabled.
44 MicroOpQueueStage(const MicroOpQueueStage &Other) = delete;
45 MicroOpQueueStage &operator=(const MicroOpQueueStage &Other) = delete;
46
47 // By default, an instruction consumes a number of buffer entries equal to its
48 // number of micro opcodes (see field `InstrDesc::NumMicroOpcodes`). The
49 // number of entries consumed by an instruction is normalized to the
50 // minimum value between NumMicroOpcodes and the buffer size. This is to avoid
51 // problems with (microcoded) instructions that generate a number of micro
52 // opcodes that doesn't fit in the buffer.
// The result is never zero: an instruction always occupies at least one entry.
53 unsigned getNormalizedOpcodes(const InstRef &IR) const {
54 unsigned NormalizedOpcodes =
55 std::min(static_cast(Buffer.size()),
56 IR.getInstruction()->getDesc().NumMicroOps);
57 return NormalizedOpcodes ? NormalizedOpcodes : 1U;
58 }
59
// Forwards queued instructions to the next stage while it accepts them.
60 Error moveInstructions();
61
62 public:
// Size is the number of buffer entries, IPC is the per-cycle instruction
// limit (zero means unlimited), and ZeroLatencyStage selects whether queued
// instructions are forwarded at the end of the same cycle or at the start of
// the next one.
63 MicroOpQueueStage(unsigned Size, unsigned IPC = 0,
64 bool ZeroLatencyStage = true);
65
// Returns true if IR can be accepted during this cycle: the per-cycle limit
// (when one is set) has not been reached, and enough entries are free to hold
// the normalized micro-op count of IR.
66 bool isAvailable(const InstRef &IR) const override {
67 if (MaxIPC && CurrentIPC == MaxIPC)
68 return false;
69 unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
70 if (NormalizedOpcodes > AvailableEntries)
71 return false;
72 return true;
73 }
74
// Returns true while at least one buffer entry is still occupied.
75 bool hasWorkToComplete() const override {
76 return AvailableEntries != Buffer.size();
77 }
78
79 Error execute(InstRef &IR) override;
80 Error cycleStart() override;
81 Error cycleEnd() override;
82 };
83
84 } // namespace mca
85 } // namespace llvm
86
87 #endif // LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
1313 Stages/EntryStage.cpp
1414 Stages/ExecuteStage.cpp
1515 Stages/InstructionTables.cpp
16 Stages/MicroOpQueueStage.cpp
1617 Stages/RetireStage.cpp
1718 Stages/Stage.cpp
1819 Support.cpp
2020 #include "llvm/MCA/Stages/DispatchStage.h"
2121 #include "llvm/MCA/Stages/EntryStage.h"
2222 #include "llvm/MCA/Stages/ExecuteStage.h"
23 #include "llvm/MCA/Stages/MicroOpQueueStage.h"
2324 #include "llvm/MCA/Stages/RetireStage.h"
2425
2526 namespace llvm {
5455 // Build the pipeline.
5556 auto StagePipeline = llvm::make_unique();
5657 StagePipeline->appendStage(std::move(Fetch));
58 if (Opts.MicroOpQueueSize)
59 StagePipeline->appendStage(llvm::make_unique(
60 Opts.MicroOpQueueSize, Opts.DecodersThroughput));
5761 StagePipeline->appendStage(std::move(Dispatch));
5862 StagePipeline->appendStage(std::move(Execute));
5963 StagePipeline->appendStage(std::move(Retire));
0 //===---------------------- MicroOpQueueStage.cpp ---------------*- C++ -*-===//
1 //
2 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3 // See https://llvm.org/LICENSE.txt for license information.
4 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5 //
6 //===----------------------------------------------------------------------===//
7 /// \file
8 ///
9 /// This file defines the MicroOpQueueStage.
10 ///
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/MCA/Stages/MicroOpQueueStage.h"
14
15 namespace llvm {
16 namespace mca {
17
18 #define DEBUG_TYPE "llvm-mca"
19
// Forwards queued instructions to the next stage, starting from the slot at
// CurrentInstructionSlotIdx. The loop stops when that slot's InstRef tests
// false or when checkNextStage() rejects the instruction. Returns the error
// produced by moveToTheNextStage() if a move fails, and success otherwise.
20 Error MicroOpQueueStage::moveInstructions() {
21 InstRef IR = Buffer[CurrentInstructionSlotIdx];
22 while (IR && checkNextStage(IR)) {
23 if (llvm::Error Val = moveToTheNextStage(IR))
24 return Val;
25
// The instruction has left the queue: clear its slot, advance the read index
// (wrapping around the buffer), and release the entries it occupied.
26 Buffer[CurrentInstructionSlotIdx].invalidate();
27 unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
28 CurrentInstructionSlotIdx += NormalizedOpcodes;
29 CurrentInstructionSlotIdx %= Buffer.size();
30 AvailableEntries += NormalizedOpcodes;
31 IR = Buffer[CurrentInstructionSlotIdx];
32 }
33
34 return llvm::ErrorSuccess();
35 }
36
// Builds an empty queue. Both slot indices and the per-cycle counter start at
// zero, and every buffer entry is initially available.
37 MicroOpQueueStage::MicroOpQueueStage(unsigned Size, unsigned IPC,
38 bool ZeroLatencyStage)
39 : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0), MaxIPC(IPC),
40 CurrentIPC(0), IsZeroLatencyStage(ZeroLatencyStage) {
// A requested size of zero is treated as a single-entry buffer, so
// Buffer.size() is never zero when used as a modulus.
41 Buffer.resize(Size ? Size : 1);
42 AvailableEntries = Buffer.size();
43 }
44
// Stores IR in the slot at NextAvailableSlotIdx, consumes as many entries as
// its normalized micro-op count, and counts it against the per-cycle limit.
// Always returns success.
// NOTE(review): no capacity check is performed here; presumably callers only
// invoke this after isAvailable(IR) returned true -- confirm against the
// pipeline driver.
45 Error MicroOpQueueStage::execute(InstRef &IR) {
46 Buffer[NextAvailableSlotIdx] = IR;
47 unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
// Advance the write index, wrapping around the end of the buffer.
48 NextAvailableSlotIdx += NormalizedOpcodes;
49 NextAvailableSlotIdx %= Buffer.size();
50 AvailableEntries -= NormalizedOpcodes;
51 ++CurrentIPC;
52 return llvm::ErrorSuccess();
53 }
54
// Starts a new cycle by resetting the count of instructions accepted per
// cycle. When the stage is not zero-latency, instructions queued earlier are
// forwarded to the next stage here, at the start of the cycle.
55 Error MicroOpQueueStage::cycleStart() {
56 CurrentIPC = 0;
57 if (!IsZeroLatencyStage)
58 return moveInstructions();
59 return llvm::ErrorSuccess();
60 }
61
// Ends the current cycle. When the stage is zero-latency, queued instructions
// are forwarded to the next stage here, at the end of the cycle; otherwise
// nothing is done and success is returned.
62 Error MicroOpQueueStage::cycleEnd() {
63 if (IsZeroLatencyStage)
64 return moveInstructions();
65 return llvm::ErrorSuccess();
66 }
67
68 } // namespace mca
69 } // namespace llvm
0 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
1 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-1
2 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-2
3 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=3 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-3
4 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-4
5 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-DEC-2
6
7 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-1
8 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-2
9 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-DEC-1
10
11 add %eax, %eax
12 add %ebx, %ebx
13 add %ecx, %ecx
14 add %edx, %edx
15
16 # BTVER2-DEC-2: Iterations: 1500
17 # BTVER2-DEC-2-NEXT: Instructions: 6000
18 # BTVER2-DEC-2-NEXT: Total Cycles: 3003
19 # BTVER2-DEC-2-NEXT: Total uOps: 6000
20
21 # BTVER2-DEC-2: Dispatch Width: 2
22 # BTVER2-DEC-2-NEXT: uOps Per Cycle: 2.00
23 # BTVER2-DEC-2-NEXT: IPC: 2.00
24 # BTVER2-DEC-2-NEXT: Block RThroughput: 2.0
25
26 # BTVER2-DEC-1: Iterations: 1500
27 # BTVER2-DEC-1-NEXT: Instructions: 6000
28 # BTVER2-DEC-1-NEXT: Total Cycles: 6003
29 # BTVER2-DEC-1-NEXT: Total uOps: 6000
30
31 # BTVER2-UOPQ-1: Iterations: 1500
32 # BTVER2-UOPQ-1-NEXT: Instructions: 6000
33 # BTVER2-UOPQ-1-NEXT: Total Cycles: 6003
34 # BTVER2-UOPQ-1-NEXT: Total uOps: 6000
35
36 # BTVER2-UOPQ-2: Iterations: 1500
37 # BTVER2-UOPQ-2-NEXT: Instructions: 6000
38 # BTVER2-UOPQ-2-NEXT: Total Cycles: 3003
39 # BTVER2-UOPQ-2-NEXT: Total uOps: 6000
40
41 # HASWELL-DEC-2: Iterations: 1500
42 # HASWELL-DEC-2-NEXT: Instructions: 6000
43 # HASWELL-DEC-2-NEXT: Total Cycles: 3003
44 # HASWELL-DEC-2-NEXT: Total uOps: 6000
45
46 # HASWELL-UOPQ-1: Iterations: 1500
47 # HASWELL-UOPQ-1-NEXT: Instructions: 6000
48 # HASWELL-UOPQ-1-NEXT: Total Cycles: 6003
49 # HASWELL-UOPQ-1-NEXT: Total uOps: 6000
50
51 # HASWELL-UOPQ-2: Iterations: 1500
52 # HASWELL-UOPQ-2-NEXT: Instructions: 6000
53 # HASWELL-UOPQ-2-NEXT: Total Cycles: 3003
54 # HASWELL-UOPQ-2-NEXT: Total uOps: 6000
55
56 # HASWELL-UOPQ-3: Iterations: 1500
57 # HASWELL-UOPQ-3-NEXT: Instructions: 6000
58 # HASWELL-UOPQ-3-NEXT: Total Cycles: 2003
59 # HASWELL-UOPQ-3-NEXT: Total uOps: 6000
60
61 # HASWELL-UOPQ-4: Iterations: 1500
62 # HASWELL-UOPQ-4-NEXT: Instructions: 6000
63 # HASWELL-UOPQ-4-NEXT: Total Cycles: 1503
64 # HASWELL-UOPQ-4-NEXT: Total uOps: 6000
65
66 # BTVER2-DEC-1: Dispatch Width: 2
67 # BTVER2-DEC-1-NEXT: uOps Per Cycle: 1.00
68 # BTVER2-DEC-1-NEXT: IPC: 1.00
69 # BTVER2-DEC-1-NEXT: Block RThroughput: 2.0
70
71 # BTVER2-UOPQ-1: Dispatch Width: 2
72 # BTVER2-UOPQ-1-NEXT: uOps Per Cycle: 1.00
73 # BTVER2-UOPQ-1-NEXT: IPC: 1.00
74 # BTVER2-UOPQ-1-NEXT: Block RThroughput: 2.0
75
76 # BTVER2-UOPQ-2: Dispatch Width: 2
77 # BTVER2-UOPQ-2-NEXT: uOps Per Cycle: 2.00
78 # BTVER2-UOPQ-2-NEXT: IPC: 2.00
79 # BTVER2-UOPQ-2-NEXT: Block RThroughput: 2.0
80
81 # HASWELL-DEC-2: Dispatch Width: 4
82 # HASWELL-DEC-2-NEXT: uOps Per Cycle: 2.00
83 # HASWELL-DEC-2-NEXT: IPC: 2.00
84 # HASWELL-DEC-2-NEXT: Block RThroughput: 1.0
85
86 # HASWELL-UOPQ-1: Dispatch Width: 4
87 # HASWELL-UOPQ-1-NEXT: uOps Per Cycle: 1.00
88 # HASWELL-UOPQ-1-NEXT: IPC: 1.00
89 # HASWELL-UOPQ-1-NEXT: Block RThroughput: 1.0
90
91 # HASWELL-UOPQ-2: Dispatch Width: 4
92 # HASWELL-UOPQ-2-NEXT: uOps Per Cycle: 2.00
93 # HASWELL-UOPQ-2-NEXT: IPC: 2.00
94 # HASWELL-UOPQ-2-NEXT: Block RThroughput: 1.0
95
96 # HASWELL-UOPQ-3: Dispatch Width: 4
97 # HASWELL-UOPQ-3-NEXT: uOps Per Cycle: 3.00
98 # HASWELL-UOPQ-3-NEXT: IPC: 3.00
99 # HASWELL-UOPQ-3-NEXT: Block RThroughput: 1.0
100
101 # HASWELL-UOPQ-4: Dispatch Width: 4
102 # HASWELL-UOPQ-4-NEXT: uOps Per Cycle: 3.99
103 # HASWELL-UOPQ-4-NEXT: IPC: 3.99
104 # HASWELL-UOPQ-4-NEXT: Block RThroughput: 1.0
9999 "be used for register mappings"),
100100 cl::cat(ToolOptions), cl::init(0));
101101
// Hidden option whose value is passed to mca::PipelineOptions as the micro-op
// queue size. It defaults to 0.
// NOTE(review): the template argument of cl::opt appears to have been stripped
// from this listing -- confirm the value type against upstream.
102 static cl::opt
103 MicroOpQueue("micro-op-queue-size", cl::Hidden,
104 cl::desc("Number of entries in the micro-op queue"),
105 cl::cat(ToolOptions), cl::init(0));
106
// Hidden option whose value is passed to mca::PipelineOptions as the decoders'
// throughput, in instructions per cycle. It defaults to 0.
107 static cl::opt
108 DecoderThroughput("decoder-throughput", cl::Hidden,
109 cl::desc("Maximum throughput from the decoders "
110 "(instructions per cycle)"),
111 cl::cat(ToolOptions), cl::init(0));
112
102113 static cl::opt
103114 PrintRegisterFileStats("register-file-stats",
104115 cl::desc("Print register file statistics"),
386397 // Create a context to control ownership of the pipeline hardware.
387398 mca::Context MCA(*MRI, *STI);
388399
389 mca::PipelineOptions PO(DispatchWidth, RegisterFileSize, LoadQueueSize,
390 StoreQueueSize, AssumeNoAlias,
391 EnableBottleneckAnalysis);
400 mca::PipelineOptions PO(MicroOpQueue, DecoderThroughput, DispatchWidth,
401 RegisterFileSize, LoadQueueSize, StoreQueueSize,
402 AssumeNoAlias, EnableBottleneckAnalysis);
392403
393404 // Number each region in the sequence.
394405 unsigned RegionIdx = 0;