llvm.org GIT mirror llvm / 60873e2
[AMDGPU] Prevent post-RA scheduler from breaking memory clauses The pre-RA scheduler does load/store clustering, but post-RA scheduler undoes it. Add mutation to prevent it. Differential Revision: https://reviews.llvm.org/D38014 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@313670 91177308-0d34-0410-b5e6-96231b3b80d8 Stanislav Mekhanoshin 2 years ago
25 changed file(s) with 180 addition(s) and 90 deletion(s). Raw diff Collapse all Expand all
523523
524524 return MaxNumVGPRs - getReservedNumVGPRs(MF);
525525 }
526
527 struct MemOpClusterMutation : ScheduleDAGMutation {
528 const SIInstrInfo *TII;
529
530 MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
531
532 void apply(ScheduleDAGInstrs *DAGInstrs) override {
533 ScheduleDAGMI *DAG = static_cast(DAGInstrs);
534
535 SUnit *SUa = nullptr;
536 // Search for two consequent memory operations and link them
537 // to prevent scheduler from moving them apart.
538 // In DAG pre-process SUnits are in the original order of
539 // the instructions before scheduling.
540 for (SUnit &SU : DAG->SUnits) {
541 MachineInstr &MI2 = *SU.getInstr();
542 if (!MI2.mayLoad() && !MI2.mayStore()) {
543 SUa = nullptr;
544 continue;
545 }
546 if (!SUa) {
547 SUa = &SU;
548 continue;
549 }
550
551 MachineInstr &MI1 = *SUa->getInstr();
552 if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
553 (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
554 (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
555 (TII->isDS(MI1) && TII->isDS(MI2))) {
556 SU.addPredBarrier(SUa);
557
558 for (const SDep &SI : SU.Preds) {
559 if (SI.getSUnit() != SUa)
560 SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
561 }
562
563 if (&SU != &DAG->ExitSU) {
564 for (const SDep &SI : SUa->Succs) {
565 if (SI.getSUnit() != &SU)
566 SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
567 }
568 }
569 }
570
571 SUa = &SU;
572 }
573 }
574 };
575
576 void SISubtarget::getPostRAMutations(
577 std::vector> &Mutations) const {
578 Mutations.push_back(llvm::make_unique(&InstrInfo));
579 }
882882 /// subtarget's specifications, or does not meet number of waves per execution
883883 /// unit requirement.
884884 unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
885
886 void getPostRAMutations(
887 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
888 const override;
885889 };
886890
887891 } // end namespace llvm
218218 }
219219
220220 ; FUNC-LABEL: {{^}}s_and_multi_use_inline_imm_i64:
221 ; SI: s_load_dwordx2
221222 ; SI: s_load_dword [[A:s[0-9]+]]
222223 ; SI: s_load_dword [[B:s[0-9]+]]
223 ; SI: s_load_dwordx2
224224 ; SI: s_load_dwordx2
225225 ; SI-NOT: and
226226 ; SI: s_lshl_b32 [[A]], [[A]], 1
1010 ; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1111 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1212
13 ; CI: v_ashrrev_i32_e32
14 ; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
15 ; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
16 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
13 ; CI-DAG: v_ashrrev_i32_e32
14 ; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
15 ; CI-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
16 ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
1717 ; CI: v_or_b32_e32
1818 define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
1919 %result = ashr <2 x i16> %lhs, %rhs
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
55 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
66
7 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
8 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
9 ; SI: v_cmp_nlt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
7 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
8 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
9 ; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
1010 ; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
1111 ; GCN: s_cbranch_vccnz
1212
1313 ; GCN: one{{$}}
14 ; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[B_F32]]
14 ; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
1515 ; GCN: buffer_store_short
1616 ; GCN: s_endpgm
1717
1818 ; GCN: two{{$}}
19 ; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[A_F32]]
19 ; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
2020 ; GCN: buffer_store_short v[[B_F16]]
2121 ; GCN: s_endpgm
2222 define amdgpu_kernel void @br_cc_f16(
399399 ; GCN-DAG: buffer_load_dwordx4 v[24:27], off
400400 ; GCN-DAG: buffer_load_dwordx4 v[28:31], off
401401
402 ; GCN: s_waitcnt
402403 ; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}}
403 ; GCN: s_waitcnt
404 ; GCN-NEXT: s_swappc_b64
404 ; GCN: s_swappc_b64
405405 ; GCN-NEXT: s_endpgm
406406 define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
407407 %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
451451 ; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8
452452 ; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12
453453
454 ; HSA: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]] offset:4
455 ; HSA: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:8
454 ; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]] offset:4
455 ; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:8
456456
457457
458458 ; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8
459459 ; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12
460460
461 ; MESA: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]] offset:4
462 ; MESA: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:8
461 ; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]] offset:4
462 ; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:8
463463
464464 ; GCN-NEXT: s_swappc_b64
465465 ; GCN-NOT: [[SP]]
486486 ; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
487487
488488 ; GCN-NOT: s_add_u32 [[SP]]
489 ; GCN: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
490 ; GCN: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8
489 ; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
490 ; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8
491491 ; GCN-NEXT: s_swappc_b64
492492 ; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16
493493 ; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20
326326 ; Requires loading and storing to stack slot.
327327 ; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
328328 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
329 ; GCN: s_add_u32 s32, s32, 0x400{{$}}
329330 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
330 ; GCN: s_add_u32 s32, s32, 0x400{{$}}
331331
332332 ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}}
333333
0 # RUN: llc -march=amdgcn -mcpu=tonga -run-pass post-RA-sched -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
1
2 # GCN: FLAT_LOAD_DWORD
3 # GCN-NEXT: FLAT_LOAD_DWORD
4 # GCN: FLAT_STORE_DWORD
5 # GCN-NEXT: FLAT_STORE_DWORD
6
7 ---
8 name: cluster_loads_post_ra
9 tracksRegLiveness: true
10 registers:
11 liveins:
12 - { reg: '%vgpr0' }
13 body: |
14 bb.0:
15 liveins: %vgpr0
16
17 %vgpr0_vgpr1 = IMPLICIT_DEF
18 %vgpr4_vgpr5 = IMPLICIT_DEF
19 %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4)
20 %vgpr4 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4)
21 %vgpr2 = IMPLICIT_DEF
22 %vgpr3 = IMPLICIT_DEF
23 %vgpr6 = IMPLICIT_DEF
24 %vgpr0 = V_ADD_I32_e32 16, %vgpr2, implicit-def %vcc, implicit %exec
25 %vgpr1 = V_ADDC_U32_e32 %vgpr3, killed %vgpr6, implicit-def dead %vcc, implicit %vcc, implicit %exec
26 FLAT_STORE_DWORD %vgpr2_vgpr3, killed %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4)
27 FLAT_STORE_DWORD %vgpr0_vgpr1, killed %vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4)
28 S_ENDPGM
29
30 ...
1111 declare i32 @llvm.amdgcn.workitem.id.x()
1212
1313 ; GCN-LABEL: {{^}}test_copysign_f16:
14 ; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
1415 ; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
15 ; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
1616 ; SI: s_brev_b32 s[[CONST:[0-9]+]], -2
1717 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
1818 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
1919 ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
2020 ; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
21 ; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
2122 ; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
22 ; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
2323 ; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
2424 ; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
2525 ; GCN: buffer_store_short v[[OUT]]
2323 }
2424
2525 ; FUNC-LABEL: {{^}}test_copysign_f64_f32:
26 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}
26 ; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
27 ; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
2728 ; GCN-DAG: s_load_dword s[[SSIGN:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
2829 ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2{{$}}
2930 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
1111
1212 define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 {
1313 entry:
14 ; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9
1415 ; CHECK: s_load_dword s2, s[0:1], 0xb
15 ; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9
1616 ; CHECK: s_load_dword s0, s[0:1], 0xc
1717 ; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1818 ; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2626 ; CHECK: s_lshl_b32 s0, s0, 2
2727 ; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen
2828 ; CHECK: v_add_i32_e32 v0, vcc, s0, v0
29 ; CHECK: buffer_load_dword v0, v0, s[8:11], s3 offen
3029 ; CHECK: s_mov_b32 s7, 0xf000
3130 ; CHECK: s_mov_b32 s6, -1
31 ; CHECK: buffer_load_dword v0, v0, s[8:11], s3 offen
3232 ; CHECK: s_waitcnt vmcnt(0)
3333 ; CHECK: buffer_store_dword v0, off, s[4:7], 0
3434 ; CHECK: s_endpgm
420420 }
421421
422422 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
423 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
423424 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
424425 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
425426 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
426427
427 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
428428 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
429429
430430 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
449449 }
450450
451451 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
452 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
452453 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
453454 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
454455 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
455456
456 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
457457 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
458458
459459 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
9494 }
9595
9696 ; GCN-LABEL: {{^}}fmuladd_v2f16
97 ; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
9798 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
98 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
99 ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
99100 ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
100101
101102 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
123124 ; VI-FLUSH-NOT: v_and_b32
124125 ; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]]
125126
126 ; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
127 ; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
128 ; VI-DENORM: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
129 ; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
130 ; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
127 ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
128 ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
129 ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
130 ; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]], v[[C_V2_F16]]
131 ; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]], v[[C_F16_1]]
131132 ; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
132133 ; VI-DENORM-NOT: v_and_b32
133134 ; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
423423 ; GCN-NOHSA: buffer_store_dwordx4
424424 ; GCN-NOHSA: buffer_store_dwordx4
425425
426 ; GCN-HSA: flat_store_dwordx4
427 ; GCN-HSA: flat_store_dwordx4
428 ; GCN-HSA: flat_store_dwordx4
429 ; GCN-HSA: flat_store_dwordx4
430
431 ; GCN-HSA: flat_store_dwordx4
432 ; GCN-HSA: flat_store_dwordx4
433 ; GCN-HSA: flat_store_dwordx4
434 ; GCN-HSA: flat_store_dwordx4
435
436 ; GCN-HSA: flat_store_dwordx4
437 ; GCN-HSA: flat_store_dwordx4
438 ; GCN-HSA: flat_store_dwordx4
439 ; GCN-HSA: flat_store_dwordx4
440
441 ; GCN-HSA: flat_store_dwordx4
442 ; GCN-HSA: flat_store_dwordx4
443 ; GCN-HSA: flat_store_dwordx4
444 ; GCN-HSA: flat_store_dwordx4
426 ; GCN-HSA-DAG: flat_store_dwordx4
427 ; GCN-HSA-DAG: flat_store_dwordx4
428 ; GCN-HSA-DAG: flat_store_dwordx4
429 ; GCN-HSA-DAG: flat_store_dwordx4
430
431 ; GCN-HSA-DAG: flat_store_dwordx4
432 ; GCN-HSA-DAG: flat_store_dwordx4
433 ; GCN-HSA-DAG: flat_store_dwordx4
434 ; GCN-HSA-DAG: flat_store_dwordx4
435
436 ; GCN-HSA-DAG: flat_store_dwordx4
437 ; GCN-HSA-DAG: flat_store_dwordx4
438 ; GCN-HSA-DAG: flat_store_dwordx4
439 ; GCN-HSA-DAG: flat_store_dwordx4
440
441 ; GCN-HSA-DAG: flat_store_dwordx4
442 ; GCN-HSA-DAG: flat_store_dwordx4
443 ; GCN-HSA-DAG: flat_store_dwordx4
444 ; GCN-HSA-DAG: flat_store_dwordx4
445445
446446 define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
447447 %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
44 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
55
66 ; FUNC-LABEL: {{^}}load_i24:
7 ; SI: {{flat|buffer}}_load_ubyte
8 ; SI: {{flat|buffer}}_load_ushort
7 ; SI-DAG: {{flat|buffer}}_load_ubyte
8 ; SI-DAG: {{flat|buffer}}_load_ushort
99 ; SI: {{flat|buffer}}_store_dword
1010 define amdgpu_kernel void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
1111 %1 = load i24, i24 addrspace(1)* %in
99
1010 ; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1111 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
12 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
13 ; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
14 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
12 ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
13 ; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
14 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
1515 define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
1616 %result = lshr <2 x i16> %lhs, %rhs
1717 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
55 ; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32
66
77 ; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add:
8 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
9 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
8 ; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
109
1110 ; GCN-NOT: v_mov_b32
1211 ; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
12 ; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
1313 ; GCN-NOT: v_mov_b32
1414 ; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
1515 ; GCN-NOT: v_mov_b32
169169 ; CI.
170170
171171 ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
172 ; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
173 ; GCN-NOHSA-NOT: v_add
172174 ; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
173 ; GCN-NOHSA-NOT: v_add
174 ; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
175175 ; GCN-NOHSA-NOT: v_add
176176 ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
177177 ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
11 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
22
33 ; FUNC-LABEL: {{^}}cluster_arg_loads:
4 ; FIXME: Due to changes in the load clustering heuristics. We no longer
5 ; cluster all argument loads together on SI.
6 ; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
74 ; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
85 ; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
6 ; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
97 ; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
10 ; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
118 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
12 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
13 ; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
9 ; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
10 ; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
11 ; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
1412 define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
1513 store i32 %x, i32 addrspace(1)* %out0, align 4
1614 store i32 %y, i32 addrspace(1)* %out1, align 4
214214 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
215215 ; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
216216
217 ; GCN: v_cndmask_b32_e32
218 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
219 ; GCN: v_cndmask_b32_e32
217 ; GCN-DAG: v_cndmask_b32_e32
218 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
219 ; GCN-DAG: v_cndmask_b32_e32
220220 ; GCN: buffer_store_dwordx2
221221 define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
222222 %cmp = icmp eq i32 %c, 0
77 ; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
88 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
99 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
10 ; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
11 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
12 ; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
10 ; SI-DAG: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
11 ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
12 ; SI-DAG: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
1313 ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
1414 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
1515 ; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
3838 ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
3939 ; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
4040 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
41 ; SI: v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]]
42 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
43 ; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
41 ; SI-DAG: v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]]
42 ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
43 ; SI-DAG: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
4444 ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
4545 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
4646 ; VI: v_cmp_lt_f16_e32 vcc, 0.5, v[[B_F16]]
6767 ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
6868 ; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
6969 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
70 ; SI: v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]]
71 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
72 ; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
70 ; SI-DAG: v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]]
71 ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
72 ; SI-DAG: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
7373 ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
7474 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
7575
1111 ; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1212 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1313
14 ; CI: v_lshlrev_b32_e32
15 ; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
16 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
17 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
14 ; CI-DAG: v_lshlrev_b32_e32
15 ; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
16 ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
17 ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
1818 ; CI: v_or_b32_e32
1919 define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
2020 %result = shl <2 x i16> %lhs, %rhs
6262 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
6363 ; CI: buffer_store_dword
6464
65 ; GFX9: global_store_dword
66 ; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
65 ; GFX9-DAG: global_store_dword
66 ; GFX9-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
6767 ; GFX9: s_barrier
68 ; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
69 ; GFX9: global_store_dword
68 ; GFX9-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
69 ; GFX9-DAG: global_store_dword
7070 define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
7171 %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
7272
255255 ; CI: v_mov_b32
256256
257257 ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
258 ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
259258
260259 ; CI: v_add_i32
261260 ; CI: v_add_i32
261
262 ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
262263
263264 ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}}
264265 ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}}
121121 ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32:
122122 ; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
123123 ; GCN-DAG: s_load_dword [[X:s[0-9]+]]
124 ; GCN: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
124 ; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
125125 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]]
126126 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
127127 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
472472 ;CHECK: image_sample
473473 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
474474 ;CHECK: image_sample
475 ;CHECK: v_cmp
476 ;CHECK: store
475 ;CHECK-DAG: v_cmp
476 ;CHECK-DAG: store
477477 define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
478478 main_body:
479479 %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0