[llvm-mca][MC] Add the ability to declare which processor resources model load/store queues (PR36666).

This patch adds the ability to specify via tablegen which processor resources are load/store queue resources. A new tablegen class named MemoryQueue can be optionally used to mark resources that model load/store queues. Information about the load/store queue is collected at the 'CodeGenSchedule' stage, and analyzed by the 'SubtargetEmitter' to initialize two new fields in struct MCExtraProcessorInfo named `LoadQueueID` and `StoreQueueID`. Those two fields are identifiers for buffered resources used to describe the load queue and the store queue. Field `BufferSize` is interpreted as the number of entries in the queue, while the number of units is a throughput indicator (i.e. the number of available pickers for loads/stores).

At construction time, LSUnit in llvm-mca checks for the presence of extra processor information (i.e. MCExtraProcessorInfo) in the scheduling model. If that information is available, and fields LoadQueueID and StoreQueueID are set to a value different from zero (i.e. the invalid processor resource index), then LSUnit initializes its LoadQueue/StoreQueue based on the BufferSize value declared by the two processor resources.

With this patch, we more accurately track dynamic dispatch stalls caused by the lack of LS tokens (i.e. load/store queue full). This is also shown by the differences in two BdVer2 tests. Stalls that were previously classified as generic SCHEDULER FULL stalls are now correctly classified as either "load queue full" or "store queue full".

About the differences in the -scheduler-stats view: those differences are expected, because entries in the load/store queue are not released at the instruction issue stage. Instead, they are released at the instruction executed stage. This is the main reason why, for the modified tests, the load/store queues get full before PdEX is full.

Differential Revision: https://reviews.llvm.org/D54957

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@347857 91177308-0d34-0410-b5e6-96231b3b80d8

Andrea Di Biagio
14 changed file(s) with 312 addition(s) and 107 deletion(s).
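Before the diff: the consumer-side behavior described above can be summarized with a small sketch. This is a simplified reading of the LSUnit changes further down, not the in-tree implementation; the helper name getMemoryQueueSizes is illustrative only.

    #include "llvm/MC/MCSchedule.h"

    // Resolve the load/store queue sizes declared by a scheduling model.
    // A queue identifier of zero means "no queue descriptor was declared",
    // and BufferSize is interpreted as the number of queue entries.
    static void getMemoryQueueSizes(const llvm::MCSchedModel &SM,
                                    unsigned &LQSize, unsigned &SQSize) {
      LQSize = SQSize = 0;
      if (!SM.hasExtraProcessorInfo())
        return;
      const llvm::MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
      if (EPI.LoadQueueID)
        LQSize = SM.getProcResource(EPI.LoadQueueID)->BufferSize;
      if (EPI.StoreQueueID)
        SQSize = SM.getProcResource(EPI.StoreQueueID)->BufferSize;
    }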
182182 unsigned NumRegisterFiles;
183183 const MCRegisterCostEntry *RegisterCostTable;
184184 unsigned NumRegisterCostEntries;
185 unsigned LoadQueueID;
186 unsigned StoreQueueID;
185187 };
186188
187189 /// Machine model for scheduling, bundling, and heuristics.
560560 int MaxRetirePerCycle = retirePerCycle;
561561 SchedMachineModel SchedModel = ?;
562562 }
563
564 // Base class for Load/StoreQueue. It is used to identify processor resources
565 // which describe load/store queues in the LS unit.
566 class MemoryQueue<ProcResource PR> {
567 ProcResource QueueDescriptor = PR;
568 SchedMachineModel SchedModel = ?;
569 }
570
571 class LoadQueue<ProcResource LQ> : MemoryQueue<LQ>;
572 class StoreQueue<ProcResource SQ> : MemoryQueue<SQ>;
135135 let BufferSize = 40;
136136 }
137137
138 def PdLoadQueue : LoadQueue<PdLoad>;
139
138140 let Super = PdAGLU01 in
139141 def PdStore : ProcResource<1> {
140142 // For Piledriver, the store queue is 24 entries deep.
141143 let BufferSize = 24;
142144 }
145
146 def PdStoreQueue : StoreQueue<PdStore>;
143147
144148 //===----------------------------------------------------------------------===//
145149 // Integer Execution Units
7878 # CHECK: Dynamic Dispatch Stall Cycles:
7979 # CHECK-NEXT: RAT - Register unavailable: 0
8080 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
81 # CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
82 # CHECK-NEXT: LQ - Load queue full: 0
81 # CHECK-NEXT: SCHEDQ - Scheduler full: 0
82 # CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
8383 # CHECK-NEXT: SQ - Store queue full: 0
8484 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
8585
8686 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
8787 # CHECK-NEXT: [# dispatched], [# cycles]
88 # CHECK-NEXT: 0, 26 (12.6%)
89 # CHECK-NEXT: 2, 162 (78.3%)
90 # CHECK-NEXT: 4, 19 (9.2%)
88 # CHECK-NEXT: 0, 21 (10.1%)
89 # CHECK-NEXT: 2, 172 (83.1%)
90 # CHECK-NEXT: 4, 14 (6.8%)
9191
9292 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
9393 # CHECK-NEXT: [# issued], [# cycles]
101101 # CHECK-NEXT: [4] Total number of buffer entries.
102102
103103 # CHECK: [1] [2] [3] [4]
104 # CHECK-NEXT: PdEX 35 40 40
104 # CHECK-NEXT: PdEX 27 30 40
105105 # CHECK-NEXT: PdFPU 0 0 64
106 # CHECK-NEXT: PdLoad 35 40 40
106 # CHECK-NEXT: PdLoad 36 40 40
107107 # CHECK-NEXT: PdStore 0 0 24
108108
109109 # CHECK: Resources:
191191 # CHECK: Dynamic Dispatch Stall Cycles:
192192 # CHECK-NEXT: RAT - Register unavailable: 0
193193 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
194 # CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
195 # CHECK-NEXT: LQ - Load queue full: 0
194 # CHECK-NEXT: SCHEDQ - Scheduler full: 0
195 # CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
196196 # CHECK-NEXT: SQ - Store queue full: 0
197197 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
198198
199199 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
200200 # CHECK-NEXT: [# dispatched], [# cycles]
201 # CHECK-NEXT: 0, 26 (12.6%)
202 # CHECK-NEXT: 2, 162 (78.3%)
203 # CHECK-NEXT: 4, 19 (9.2%)
201 # CHECK-NEXT: 0, 21 (10.1%)
202 # CHECK-NEXT: 2, 172 (83.1%)
203 # CHECK-NEXT: 4, 14 (6.8%)
204204
205205 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
206206 # CHECK-NEXT: [# issued], [# cycles]
214214 # CHECK-NEXT: [4] Total number of buffer entries.
215215
216216 # CHECK: [1] [2] [3] [4]
217 # CHECK-NEXT: PdEX 35 40 40
217 # CHECK-NEXT: PdEX 27 30 40
218218 # CHECK-NEXT: PdFPU 0 0 64
219 # CHECK-NEXT: PdLoad 35 40 40
219 # CHECK-NEXT: PdLoad 36 40 40
220220 # CHECK-NEXT: PdStore 0 0 24
221221
222222 # CHECK: Resources:
304304 # CHECK: Dynamic Dispatch Stall Cycles:
305305 # CHECK-NEXT: RAT - Register unavailable: 0
306306 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
307 # CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
308 # CHECK-NEXT: LQ - Load queue full: 0
307 # CHECK-NEXT: SCHEDQ - Scheduler full: 0
308 # CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
309309 # CHECK-NEXT: SQ - Store queue full: 0
310310 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
311311
312312 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
313313 # CHECK-NEXT: [# dispatched], [# cycles]
314 # CHECK-NEXT: 0, 26 (12.6%)
315 # CHECK-NEXT: 2, 162 (78.3%)
316 # CHECK-NEXT: 4, 19 (9.2%)
314 # CHECK-NEXT: 0, 21 (10.1%)
315 # CHECK-NEXT: 2, 172 (83.1%)
316 # CHECK-NEXT: 4, 14 (6.8%)
317317
318318 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
319319 # CHECK-NEXT: [# issued], [# cycles]
327327 # CHECK-NEXT: [4] Total number of buffer entries.
328328
329329 # CHECK: [1] [2] [3] [4]
330 # CHECK-NEXT: PdEX 35 40 40
330 # CHECK-NEXT: PdEX 27 30 40
331331 # CHECK-NEXT: PdFPU 0 0 64
332 # CHECK-NEXT: PdLoad 35 40 40
332 # CHECK-NEXT: PdLoad 36 40 40
333333 # CHECK-NEXT: PdStore 0 0 24
334334
335335 # CHECK: Resources:
417417 # CHECK: Dynamic Dispatch Stall Cycles:
418418 # CHECK-NEXT: RAT - Register unavailable: 0
419419 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
420 # CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
421 # CHECK-NEXT: LQ - Load queue full: 0
420 # CHECK-NEXT: SCHEDQ - Scheduler full: 0
421 # CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
422422 # CHECK-NEXT: SQ - Store queue full: 0
423423 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
424424
425425 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
426426 # CHECK-NEXT: [# dispatched], [# cycles]
427 # CHECK-NEXT: 0, 26 (12.6%)
428 # CHECK-NEXT: 2, 162 (78.3%)
429 # CHECK-NEXT: 4, 19 (9.2%)
427 # CHECK-NEXT: 0, 21 (10.1%)
428 # CHECK-NEXT: 2, 172 (83.1%)
429 # CHECK-NEXT: 4, 14 (6.8%)
430430
431431 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
432432 # CHECK-NEXT: [# issued], [# cycles]
440440 # CHECK-NEXT: [4] Total number of buffer entries.
441441
442442 # CHECK: [1] [2] [3] [4]
443 # CHECK-NEXT: PdEX 35 40 40
443 # CHECK-NEXT: PdEX 27 30 40
444444 # CHECK-NEXT: PdFPU 0 0 64
445 # CHECK-NEXT: PdLoad 35 40 40
445 # CHECK-NEXT: PdLoad 36 40 40
446446 # CHECK-NEXT: PdStore 0 0 24
447447
448448 # CHECK: Resources:
530530 # CHECK: Dynamic Dispatch Stall Cycles:
531531 # CHECK-NEXT: RAT - Register unavailable: 0
532532 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
533 # CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
534 # CHECK-NEXT: LQ - Load queue full: 0
533 # CHECK-NEXT: SCHEDQ - Scheduler full: 0
534 # CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
535535 # CHECK-NEXT: SQ - Store queue full: 0
536536 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
537537
538538 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
539539 # CHECK-NEXT: [# dispatched], [# cycles]
540 # CHECK-NEXT: 0, 26 (12.6%)
541 # CHECK-NEXT: 2, 162 (78.3%)
542 # CHECK-NEXT: 4, 19 (9.2%)
540 # CHECK-NEXT: 0, 21 (10.1%)
541 # CHECK-NEXT: 2, 172 (83.1%)
542 # CHECK-NEXT: 4, 14 (6.8%)
543543
544544 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
545545 # CHECK-NEXT: [# issued], [# cycles]
553553 # CHECK-NEXT: [4] Total number of buffer entries.
554554
555555 # CHECK: [1] [2] [3] [4]
556 # CHECK-NEXT: PdEX 35 40 40
557 # CHECK-NEXT: PdFPU 35 40 64
558 # CHECK-NEXT: PdLoad 35 40 40
556 # CHECK-NEXT: PdEX 27 30 40
557 # CHECK-NEXT: PdFPU 27 30 64
558 # CHECK-NEXT: PdLoad 36 40 40
559559 # CHECK-NEXT: PdStore 0 0 24
560560
561561 # CHECK: Resources:
643643 # CHECK: Dynamic Dispatch Stall Cycles:
644644 # CHECK-NEXT: RAT - Register unavailable: 0
645645 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
646 # CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
647 # CHECK-NEXT: LQ - Load queue full: 0
646 # CHECK-NEXT: SCHEDQ - Scheduler full: 0
647 # CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
648648 # CHECK-NEXT: SQ - Store queue full: 0
649649 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
650650
651651 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
652652 # CHECK-NEXT: [# dispatched], [# cycles]
653 # CHECK-NEXT: 0, 26 (12.6%)
654 # CHECK-NEXT: 2, 162 (78.3%)
655 # CHECK-NEXT: 4, 19 (9.2%)
653 # CHECK-NEXT: 0, 21 (10.1%)
654 # CHECK-NEXT: 2, 172 (83.1%)
655 # CHECK-NEXT: 4, 14 (6.8%)
656656
657657 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
658658 # CHECK-NEXT: [# issued], [# cycles]
666666 # CHECK-NEXT: [4] Total number of buffer entries.
667667
668668 # CHECK: [1] [2] [3] [4]
669 # CHECK-NEXT: PdEX 35 40 40
670 # CHECK-NEXT: PdFPU 35 40 64
671 # CHECK-NEXT: PdLoad 35 40 40
669 # CHECK-NEXT: PdEX 27 30 40
670 # CHECK-NEXT: PdFPU 27 30 64
671 # CHECK-NEXT: PdLoad 36 40 40
672672 # CHECK-NEXT: PdStore 0 0 24
673673
674674 # CHECK: Resources:
780780 # CHECK: [1] [2] [3] [4]
781781 # CHECK-NEXT: PdEX 1 2 40
782782 # CHECK-NEXT: PdFPU 1 2 64
783 # CHECK-NEXT: PdLoad 1 2 40
783 # CHECK-NEXT: PdLoad 11 12 40
784784 # CHECK-NEXT: PdStore 0 0 24
785785
786786 # CHECK: Resources:
7878 # CHECK: Dynamic Dispatch Stall Cycles:
7979 # CHECK-NEXT: RAT - Register unavailable: 0
8080 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
81 # CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
81 # CHECK-NEXT: SCHEDQ - Scheduler full: 0
8282 # CHECK-NEXT: LQ - Load queue full: 0
83 # CHECK-NEXT: SQ - Store queue full: 0
83 # CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
8484 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
8585
8686 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
8787 # CHECK-NEXT: [# dispatched], [# cycles]
88 # CHECK-NEXT: 0, 26 (6.5%)
89 # CHECK-NEXT: 1, 369 (91.6%)
90 # CHECK-NEXT: 3, 1 (0.2%)
88 # CHECK-NEXT: 0, 25 (6.2%)
89 # CHECK-NEXT: 1, 370 (91.8%)
90 # CHECK-NEXT: 2, 1 (0.2%)
9191 # CHECK-NEXT: 4, 7 (1.7%)
9292
9393 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
102102 # CHECK-NEXT: [4] Total number of buffer entries.
103103
104104 # CHECK: [1] [2] [3] [4]
105 # CHECK-NEXT: PdEX 22 24 40
105 # CHECK-NEXT: PdEX 22 23 40
106106 # CHECK-NEXT: PdFPU 0 0 64
107107 # CHECK-NEXT: PdLoad 0 0 40
108 # CHECK-NEXT: PdStore 22 24 24
108 # CHECK-NEXT: PdStore 23 24 24
109109
110110 # CHECK: Resources:
111111 # CHECK-NEXT: [0.0] - PdAGLU01
192192 # CHECK: Dynamic Dispatch Stall Cycles:
193193 # CHECK-NEXT: RAT - Register unavailable: 0
194194 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
195 # CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
195 # CHECK-NEXT: SCHEDQ - Scheduler full: 0
196196 # CHECK-NEXT: LQ - Load queue full: 0
197 # CHECK-NEXT: SQ - Store queue full: 0
197 # CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
198198 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
199199
200200 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
201201 # CHECK-NEXT: [# dispatched], [# cycles]
202 # CHECK-NEXT: 0, 26 (6.5%)
203 # CHECK-NEXT: 1, 369 (91.6%)
204 # CHECK-NEXT: 3, 1 (0.2%)
202 # CHECK-NEXT: 0, 25 (6.2%)
203 # CHECK-NEXT: 1, 370 (91.8%)
204 # CHECK-NEXT: 2, 1 (0.2%)
205205 # CHECK-NEXT: 4, 7 (1.7%)
206206
207207 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
216216 # CHECK-NEXT: [4] Total number of buffer entries.
217217
218218 # CHECK: [1] [2] [3] [4]
219 # CHECK-NEXT: PdEX 22 24 40
219 # CHECK-NEXT: PdEX 22 23 40
220220 # CHECK-NEXT: PdFPU 0 0 64
221221 # CHECK-NEXT: PdLoad 0 0 40
222 # CHECK-NEXT: PdStore 22 24 24
222 # CHECK-NEXT: PdStore 23 24 24
223223
224224 # CHECK: Resources:
225225 # CHECK-NEXT: [0.0] - PdAGLU01
306306 # CHECK: Dynamic Dispatch Stall Cycles:
307307 # CHECK-NEXT: RAT - Register unavailable: 0
308308 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
309 # CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
309 # CHECK-NEXT: SCHEDQ - Scheduler full: 0
310310 # CHECK-NEXT: LQ - Load queue full: 0
311 # CHECK-NEXT: SQ - Store queue full: 0
311 # CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
312312 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
313313
314314 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
315315 # CHECK-NEXT: [# dispatched], [# cycles]
316 # CHECK-NEXT: 0, 26 (6.5%)
317 # CHECK-NEXT: 1, 369 (91.6%)
318 # CHECK-NEXT: 3, 1 (0.2%)
316 # CHECK-NEXT: 0, 25 (6.2%)
317 # CHECK-NEXT: 1, 370 (91.8%)
318 # CHECK-NEXT: 2, 1 (0.2%)
319319 # CHECK-NEXT: 4, 7 (1.7%)
320320
321321 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
330330 # CHECK-NEXT: [4] Total number of buffer entries.
331331
332332 # CHECK: [1] [2] [3] [4]
333 # CHECK-NEXT: PdEX 22 24 40
333 # CHECK-NEXT: PdEX 22 23 40
334334 # CHECK-NEXT: PdFPU 0 0 64
335335 # CHECK-NEXT: PdLoad 0 0 40
336 # CHECK-NEXT: PdStore 22 24 24
336 # CHECK-NEXT: PdStore 23 24 24
337337
338338 # CHECK: Resources:
339339 # CHECK-NEXT: [0.0] - PdAGLU01
420420 # CHECK: Dynamic Dispatch Stall Cycles:
421421 # CHECK-NEXT: RAT - Register unavailable: 0
422422 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
423 # CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
423 # CHECK-NEXT: SCHEDQ - Scheduler full: 0
424424 # CHECK-NEXT: LQ - Load queue full: 0
425 # CHECK-NEXT: SQ - Store queue full: 0
425 # CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
426426 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
427427
428428 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
429429 # CHECK-NEXT: [# dispatched], [# cycles]
430 # CHECK-NEXT: 0, 26 (6.5%)
431 # CHECK-NEXT: 1, 369 (91.6%)
432 # CHECK-NEXT: 3, 1 (0.2%)
430 # CHECK-NEXT: 0, 25 (6.2%)
431 # CHECK-NEXT: 1, 370 (91.8%)
432 # CHECK-NEXT: 2, 1 (0.2%)
433433 # CHECK-NEXT: 4, 7 (1.7%)
434434
435435 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
444444 # CHECK-NEXT: [4] Total number of buffer entries.
445445
446446 # CHECK: [1] [2] [3] [4]
447 # CHECK-NEXT: PdEX 22 24 40
447 # CHECK-NEXT: PdEX 22 23 40
448448 # CHECK-NEXT: PdFPU 0 0 64
449449 # CHECK-NEXT: PdLoad 0 0 40
450 # CHECK-NEXT: PdStore 22 24 24
450 # CHECK-NEXT: PdStore 23 24 24
451451
452452 # CHECK: Resources:
453453 # CHECK-NEXT: [0.0] - PdAGLU01
534534 # CHECK: Dynamic Dispatch Stall Cycles:
535535 # CHECK-NEXT: RAT - Register unavailable: 0
536536 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
537 # CHECK-NEXT: SCHEDQ - Scheduler full: 745 (92.8%)
537 # CHECK-NEXT: SCHEDQ - Scheduler full: 0
538538 # CHECK-NEXT: LQ - Load queue full: 0
539 # CHECK-NEXT: SQ - Store queue full: 0
539 # CHECK-NEXT: SQ - Store queue full: 747 (93.0%)
540540 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
541541
542542 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
543543 # CHECK-NEXT: [# dispatched], [# cycles]
544 # CHECK-NEXT: 0, 423 (52.7%)
545 # CHECK-NEXT: 1, 373 (46.5%)
546 # CHECK-NEXT: 3, 1 (0.1%)
544 # CHECK-NEXT: 0, 422 (52.6%)
545 # CHECK-NEXT: 1, 374 (46.6%)
546 # CHECK-NEXT: 2, 1 (0.1%)
547547 # CHECK-NEXT: 4, 6 (0.7%)
548548
549549 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
558558 # CHECK-NEXT: [4] Total number of buffer entries.
559559
560560 # CHECK: [1] [2] [3] [4]
561 # CHECK-NEXT: PdEX 23 24 40
562 # CHECK-NEXT: PdFPU 23 24 64
561 # CHECK-NEXT: PdEX 22 23 40
562 # CHECK-NEXT: PdFPU 22 23 64
563563 # CHECK-NEXT: PdLoad 0 0 40
564564 # CHECK-NEXT: PdStore 23 24 24
565565
649649 # CHECK: Dynamic Dispatch Stall Cycles:
650650 # CHECK-NEXT: RAT - Register unavailable: 0
651651 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
652 # CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
652 # CHECK-NEXT: SCHEDQ - Scheduler full: 0
653653 # CHECK-NEXT: LQ - Load queue full: 0
654 # CHECK-NEXT: SQ - Store queue full: 0
654 # CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
655655 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
656656
657657 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
658658 # CHECK-NEXT: [# dispatched], [# cycles]
659 # CHECK-NEXT: 0, 26 (6.5%)
660 # CHECK-NEXT: 1, 369 (91.6%)
661 # CHECK-NEXT: 3, 1 (0.2%)
659 # CHECK-NEXT: 0, 25 (6.2%)
660 # CHECK-NEXT: 1, 370 (91.8%)
661 # CHECK-NEXT: 2, 1 (0.2%)
662662 # CHECK-NEXT: 4, 7 (1.7%)
663663
664664 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
673673 # CHECK-NEXT: [4] Total number of buffer entries.
674674
675675 # CHECK: [1] [2] [3] [4]
676 # CHECK-NEXT: PdEX 22 24 40
677 # CHECK-NEXT: PdFPU 22 24 64
676 # CHECK-NEXT: PdEX 22 23 40
677 # CHECK-NEXT: PdFPU 22 23 64
678678 # CHECK-NEXT: PdLoad 0 0 40
679 # CHECK-NEXT: PdStore 22 24 24
679 # CHECK-NEXT: PdStore 23 24 24
680680
681681 # CHECK: Resources:
682682 # CHECK-NEXT: [0.0] - PdAGLU01
788788 # CHECK-NEXT: PdEX 1 1 40
789789 # CHECK-NEXT: PdFPU 1 1 64
790790 # CHECK-NEXT: PdLoad 0 0 40
791 # CHECK-NEXT: PdStore 1 1 24
791 # CHECK-NEXT: PdStore 2 2 24
792792
793793 # CHECK: Resources:
794794 # CHECK-NEXT: [0.0] - PdAGLU01
1818 namespace llvm {
1919 namespace mca {
2020
21 SchedulerStatistics::SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
22 : SM(STI.getSchedModel()), LQResourceID(0), SQResourceID(0), NumIssued(0),
23 NumCycles(0), MostRecentLoadDispatched(~0U),
24 MostRecentStoreDispatched(~0U),
25 IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
26 Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {
27 if (SM.hasExtraProcessorInfo()) {
28 const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
29 LQResourceID = EPI.LoadQueueID;
30 SQResourceID = EPI.StoreQueueID;
31 }
32 }
33
34 // FIXME: This implementation works under the assumption that load/store queue
35 // entries are reserved at 'instruction dispatched' stage, and released at
36 // 'instruction executed' stage. This currently matches the behavior of LSUnit.
37 //
38 // The current design minimizes the number of events generated by the
39 // Dispatch/Execute stages, at the cost of doing extra bookkeeping in method
40 // `onEvent`. However, it introduces a subtle dependency between this view and
41 // how the LSUnit works.
42 //
43 // In future we should add a new "memory queue" event type, so that we stop
44 // making assumptions on how LSUnit internally works (See PR39828).
2145 void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) {
2246 if (Event.Type == HWInstructionEvent::Issued)
2347 ++NumIssued;
48 else if (Event.Type == HWInstructionEvent::Dispatched) {
49 const Instruction &Inst = *Event.IR.getInstruction();
50 const unsigned Index = Event.IR.getSourceIndex();
51 if (LQResourceID && Inst.getDesc().MayLoad &&
52 MostRecentLoadDispatched != Index) {
53 Usage[LQResourceID].SlotsInUse++;
54 MostRecentLoadDispatched = Index;
55 }
56 if (SQResourceID && Inst.getDesc().MayStore &&
57 MostRecentStoreDispatched != Index) {
58 Usage[SQResourceID].SlotsInUse++;
59 MostRecentStoreDispatched = Index;
60 }
61 } else if (Event.Type == HWInstructionEvent::Executed) {
62 const Instruction &Inst = *Event.IR.getInstruction();
63 if (LQResourceID && Inst.getDesc().MayLoad) {
64 assert(Usage[LQResourceID].SlotsInUse);
65 Usage[LQResourceID].SlotsInUse--;
66 }
67 if (SQResourceID && Inst.getDesc().MayStore) {
68 assert(Usage[SQResourceID].SlotsInUse);
69 Usage[SQResourceID].SlotsInUse--;
70 }
71 }
2472 }
2573
2674 void SchedulerStatistics::onReservedBuffers(const InstRef & /* unused */,
2775 ArrayRef<unsigned> Buffers) {
2876 for (const unsigned Buffer : Buffers) {
29 BufferUsage &BU = Usage[Buffer];
30 BU.SlotsInUse++;
31 BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
77 if (Buffer == LQResourceID || Buffer == SQResourceID)
78 continue;
79 Usage[Buffer].SlotsInUse++;
3280 }
3381 }
3482
3583 void SchedulerStatistics::onReleasedBuffers(const InstRef & /* unused */,
3684 ArrayRef<unsigned> Buffers) {
37 for (const unsigned Buffer : Buffers)
85 for (const unsigned Buffer : Buffers) {
86 if (Buffer == LQResourceID || Buffer == SQResourceID)
87 continue;
3888 Usage[Buffer].SlotsInUse--;
89 }
3990 }
4091
4192 void SchedulerStatistics::updateHistograms() {
42 for (BufferUsage &BU : Usage)
93 for (BufferUsage &BU : Usage) {
4394 BU.CumulativeNumUsedSlots += BU.SlotsInUse;
95 BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
96 }
97
4498 IssuedPerCycle[NumIssued]++;
4599 NumIssued = 0;
46100 }
4646
4747 class SchedulerStatistics final : public View {
4848 const llvm::MCSchedModel &SM;
49 unsigned LQResourceID;
50 unsigned SQResourceID;
51
4952 unsigned NumIssued;
5053 unsigned NumCycles;
54
55 unsigned MostRecentLoadDispatched;
56 unsigned MostRecentStoreDispatched;
5157
5258 // Tracks the usage of a scheduler's queue.
5359 struct BufferUsage {
6470 void printSchedulerUsage(llvm::raw_ostream &OS) const;
6571
6672 public:
67 SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
68 : SM(STI.getSchedModel()), NumIssued(0), NumCycles(0),
69 IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
70 Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {}
71
73 SchedulerStatistics(const llvm::MCSubtargetInfo &STI);
7274 void onEvent(const HWInstructionEvent &Event) override;
7375 void onCycleBegin() override { NumCycles++; }
7476 void onCycleEnd() override { updateHistograms(); }
1717
1818 #include "HardwareUnits/HardwareUnit.h"
1919 #include "llvm/ADT/SmallSet.h"
20 #include "llvm/MC/MCSchedule.h"
2021
2122 namespace llvm {
2223 namespace mca {
9899 // If true, loads will never alias with stores. This is the default.
99100 bool NoAlias;
100101
102 // When a `MayLoad` instruction is dispatched to the schedulers for execution,
103 // the LSUnit reserves an entry in the `LoadQueue` for it.
104 //
105 // LoadQueue keeps track of all the loads that are in-flight. A load
106 // instruction is eventually removed from the LoadQueue when it reaches
107 // completion stage. That means a load leaves the queue when it is 'executed',
108 // and its value can be forwarded on the data path to outside units.
109 //
110 // This class doesn't know about the latency of a load instruction. So, it
111 // conservatively/pessimistically assumes that the latency of a load opcode
112 // matches the instruction latency.
113 //
114 // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
115 // and load/store conflicts, the latency of a load is determined by the depth
116 // of the load pipeline. So, we could use field `LoadLatency` in the
117 // MCSchedModel to model that latency.
118 // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
119 // L1D, and it usually already accounts for any extra latency due to data
120 // forwarding.
121 // When doing throughput analysis, `LoadLatency` is likely to
122 // be a better predictor of load latency than instruction latency. This is
123 // particularly true when simulating code with temporal/spatial locality of
124 // memory accesses.
125 // Using `LoadLatency` (instead of the instruction latency) is also expected
126 // to improve the load queue allocation for long latency instructions with
127 // folded memory operands (See PR39829).
128 //
129 // FIXME: On some processors, load/store operations are split into multiple
130 // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
131 // not 256-bit data types. So, a 256-bit load is effectively split into two
132 // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
133 // simplicity, this class optimistically assumes that a load instruction only
134 // consumes one entry in the LoadQueue. Similarly, store instructions only
135 // consume a single entry in the StoreQueue.
136 // In future, we should reassess the quality of this design, and consider
137 // alternative approaches that let instructions specify the number of
138 // load/store queue entries which they consume at dispatch stage (See
139 // PR39830).
101140 SmallSet<unsigned, 16> LoadQueue;
102141 SmallSet<unsigned, 16> StoreQueue;
103142
121160 bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; }
122161
123162 public:
124 LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false)
125 : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {}
163 LSUnit(const MCSchedModel &SM, unsigned LQ = 0, unsigned SQ = 0,
164 bool AssumeNoAlias = false);
126165
127166 #ifndef NDEBUG
128167 void dump() const;
148187 // 5. A load has to wait until an older load barrier is fully executed.
149188 // 6. A store has to wait until an older store barrier is fully executed.
150189 virtual bool isReady(const InstRef &IR) const;
190
191 // Load and store instructions are tracked by their corresponding queues from
192 // dispatch until the "instruction executed" event.
193 // Only when a load instruction reaches the 'Executed' stage, its value
194 // becomes available to the users. At that point, the load no longer needs to
195 // be tracked by the load queue.
196 // FIXME: For simplicity, we optimistically assume a similar behavior for
197 // store instructions. In practice, store operations don't tend to leave the
198 // store queue until they reach the 'Retired' stage (See PR39830).
151199 void onInstructionExecuted(const InstRef &IR);
152200 };
153201
3434 // Create the hardware units defining the backend.
3535 auto RCU = llvm::make_unique<RetireControlUnit>(SM);
3636 auto PRF = llvm::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
37 auto LSU = llvm::make_unique<LSUnit>(Opts.LoadQueueSize, Opts.StoreQueueSize,
38 Opts.AssumeNoAlias);
37 auto LSU = llvm::make_unique<LSUnit>(SM, Opts.LoadQueueSize,
38 Opts.StoreQueueSize, Opts.AssumeNoAlias);
3939 auto HWS = llvm::make_unique<Scheduler>(SM, LSU.get());
4040
4141 // Create the pipeline stages.
2020
2121 namespace llvm {
2222 namespace mca {
23
24 LSUnit::LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ,
25 bool AssumeNoAlias)
26 : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {
27 if (SM.hasExtraProcessorInfo()) {
28 const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
29 if (!LQ_Size && EPI.LoadQueueID) {
30 const MCProcResourceDesc &LdQDesc = *SM.getProcResource(EPI.LoadQueueID);
31 LQ_Size = LdQDesc.BufferSize;
32 }
33
34 if (!SQ_Size && EPI.StoreQueueID) {
35 const MCProcResourceDesc &StQDesc = *SM.getProcResource(EPI.StoreQueueID);
36 SQ_Size = StQDesc.BufferSize;
37 }
38 }
39 }
2340
2441 #ifndef NDEBUG
2542 void LSUnit::dump() const {
150150
151151 static cl::opt<unsigned>
152152 LoadQueueSize("lqueue",
153 cl::desc("Size of the load queue (unbound by default)"),
153 cl::desc("Size of the load queue"),
154154 cl::cat(ToolOptions), cl::init(0));
155155
156156 static cl::opt<unsigned>
157157 StoreQueueSize("squeue",
158 cl::desc("Size of the store queue (unbound by default)"),
158 cl::desc("Size of the store queue"),
159159 cl::cat(ToolOptions), cl::init(0));
160160
161161 static cl::opt
478478 }
479479 }
480480
481 void CodeGenSchedModels::collectLoadStoreQueueInfo() {
482 RecVec Queues = Records.getAllDerivedDefinitions("MemoryQueue");
483
484 for (Record *Queue : Queues) {
485 CodeGenProcModel &PM = getProcModel(Queue->getValueAsDef("SchedModel"));
486 if (Queue->isSubClassOf("LoadQueue")) {
487 if (PM.LoadQueue) {
488 PrintError(Queue->getLoc(),
489 "Expected a single LoadQueue definition");
490 PrintNote(PM.LoadQueue->getLoc(),
491 "Previous definition of LoadQueue was here");
492 }
493
494 PM.LoadQueue = Queue;
495 }
496
497 if (Queue->isSubClassOf("StoreQueue")) {
498 if (PM.StoreQueue) {
499 PrintError(Queue->getLoc(),
500 "Expected a single StoreQueue definition");
501 PrintNote(PM.StoreQueue->getLoc(),
502 "Previous definition of StoreQueue was here");
503 }
504
505 PM.StoreQueue = Queue;
506 }
507 }
508 }
509
481510 /// Collect optional processor information.
482511 void CodeGenSchedModels::collectOptionalProcessorInfo() {
483512 // Find register file definitions for each processor.
485514
486515 // Collect processor RetireControlUnit descriptors if available.
487516 collectRetireControlUnits();
517
518 // Collect information about load/store queues.
519 collectLoadStoreQueueInfo();
488520
489521 checkCompleteness();
490522 }
245245 // Optional Retire Control Unit definition.
246246 Record *RetireControlUnit;
247247
248 // Load/Store queue descriptors.
249 Record *LoadQueue;
250 Record *StoreQueue;
251
248252 CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef,
249253 Record *IDef) :
250254 Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef),
251 RetireControlUnit(nullptr) {}
255 RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {}
252256
253257 bool hasItineraries() const {
254258 return !ItinsDef->getValueAsListOfDefs("IID").empty();
259263 }
260264
261265 bool hasExtraProcessorInfo() const {
262 return RetireControlUnit || !RegisterFiles.empty();
266 return RetireControlUnit || LoadQueue || StoreQueue ||
267 !RegisterFiles.empty();
263268 }
264269
265270 unsigned getProcResourceIdx(Record *PRDef) const;
606611
607612 void collectSTIPredicates();
608613
614 void collectLoadStoreQueueInfo();
615
609616 void checkCompleteness();
610617
611618 void inferFromRW(ArrayRef<unsigned> OperWrites, ArrayRef<unsigned> OperReads,
9292 &ProcItinLists);
9393 unsigned EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
9494 raw_ostream &OS);
95 void EmitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel,
96 raw_ostream &OS);
9597 void EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
9698 raw_ostream &OS);
9799 void EmitProcessorProp(raw_ostream &OS, const Record *R, StringRef Name,
696698 return CostTblIndex;
697699 }
698700
701 void SubtargetEmitter::EmitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel,
702 raw_ostream &OS) {
703 unsigned QueueID = 0;
704 if (ProcModel.LoadQueue) {
705 const Record *Queue = ProcModel.LoadQueue->getValueAsDef("QueueDescriptor");
706 QueueID =
707 1 + std::distance(ProcModel.ProcResourceDefs.begin(),
708 std::find(ProcModel.ProcResourceDefs.begin(),
709 ProcModel.ProcResourceDefs.end(), Queue));
710 }
711 OS << " " << QueueID << ", // Resource Descriptor for the Load Queue\n";
712
713 QueueID = 0;
714 if (ProcModel.StoreQueue) {
715 const Record *Queue =
716 ProcModel.StoreQueue->getValueAsDef("QueueDescriptor");
717 QueueID =
718 1 + std::distance(ProcModel.ProcResourceDefs.begin(),
719 std::find(ProcModel.ProcResourceDefs.begin(),
720 ProcModel.ProcResourceDefs.end(), Queue));
721 }
722 OS << " " << QueueID << ", // Resource Descriptor for the Store Queue\n";
723 }
724
699725 void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
700726 raw_ostream &OS) {
701727 // Generate a table of register file descriptors (one entry per each user
713739 // file descriptors and register costs).
714740 EmitRegisterFileInfo(ProcModel, ProcModel.RegisterFiles.size(),
715741 NumCostEntries, OS);
742
743 // Add information about load/store queues.
744 EmitLoadStoreQueueInfo(ProcModel, OS);
716745
717746 OS << "};\n";
718747 }