llvm.org GIT mirror llvm / 7725fd8
AMDGPU/SI: Improve MachineSchedModel definition

This patch contains a few improvements to the model, including:
- Using a single resource with a defined buffer size for each memory unit.
- Setting the IssueWidth correctly.
- Fixing latency values for memory instructions.

shader-db stats: 16429 shaders in 3231 tests
Totals:
SGPRS: 318232 -> 312328 (-1.86 %)
VGPRS: 208996 -> 209346 (0.17 %)
Code Size: 7147044 -> 7166440 (0.27 %) bytes
LDS: 83 -> 83 (0.00 %) blocks
Scratch: 1862656 -> 1459200 (-21.66 %) bytes per wave
Max Waves: 49182 -> 49243 (0.12 %)
Wait states: 0 -> 0 (0.00 %)

Differential Revision: http://reviews.llvm.org/D18453

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@264877 91177308-0d34-0410-b5e6-96231b3b80d8

Tom Stellard 4 years ago
10 changed file(s) with 209 addition(s) and 195 deletion(s). Raw diff Collapse all Expand all
3838 // instructions and have VALU rates, but write to the SALU (i.e. VOPC
3939 // instructions)
4040
41 def SIFullSpeedModel : SchedMachineModel {
41 class SISchedMachineModel : SchedMachineModel {
4242 let CompleteModel = 0;
43 }
44 def SIQuarterSpeedModel : SchedMachineModel {
45 let CompleteModel = 0;
43 let IssueWidth = 1;
4644 }
4745
48 // BufferSize = 0 means the processors are in-order.
49 let BufferSize = 0 in {
46 def SIFullSpeedModel : SISchedMachineModel;
47 def SIQuarterSpeedModel : SISchedMachineModel;
5048
5149 // XXX: Are the resource counts correct?
52 def HWBranch : ProcResource<1>;
53 def HWExport : ProcResource<7>; // Taken from S_WAITCNT
54 def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT
55 def HWSALU : ProcResource<1>;
56 def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT
57 def HWVALU : ProcResource<1>;
58
50 def HWBranch : ProcResource<1> {
51 let BufferSize = 1;
52 }
53 def HWExport : ProcResource<1> {
54 let BufferSize = 7; // Taken from S_WAITCNT
55 }
56 def HWLGKM : ProcResource<1> {
57 let BufferSize = 31; // Taken from S_WAITCNT
58 }
59 def HWSALU : ProcResource<1> {
60 let BufferSize = 1;
61 }
62 def HWVMEM : ProcResource<1> {
63 let BufferSize = 15; // Taken from S_WAITCNT
64 }
65 def HWVALU : ProcResource<1> {
66 let BufferSize = 1;
5967 }
6068
6169 class HWWriteRes resources,
7381 // The latency values are 1 / (operations / cycle) / 4.
7482 multiclass SICommonWriteRes {
7583
76 def : HWWriteRes; // XXX: Guessed ???
77 def : HWWriteRes; // XXX: Guessed ???
78 def : HWWriteRes; // 2 - 64
79 def : HWWriteRes;
80 def : HWWriteRes; // XXX: Guessed ???
81 def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600
84 def : HWWriteRes<WriteBranch, [HWBranch], 8>;
85 def : HWWriteRes;
86 def : HWWriteRes; // Can be between 2 and 64
87 def : HWWriteRes;
88 def : HWWriteRes;
89 def : HWWriteRes;
8290 def : HWWriteRes; // XXX: Guessed ???
8391
8492 def : HWVALUWriteRes;
77
88 ; SI-LABEL: {{^}}offset_order:
99
10 ; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}}
11 ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:3 offset1:2
12 ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
10 ; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:4{{$}}
11 ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
12 ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:14 offset1:12
1313 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
1414
1515 define void @offset_order(float addrspace(1)* %out) {
1111 ; FUNC-LABEL: {{^}}fceil_f64:
1212 ; CI: v_ceil_f64_e32
1313 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
14 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
15 ; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
16 ; SI: s_lshr_b64
17 ; SI: s_not_b64
18 ; SI: s_and_b64
14 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
15 ; SI-DAG: s_add_i32 [[A:s[0-9]+]], [[SEXP]], 0xfffffc01
16 ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[A]]
17 ; SI-DAG: s_not_b64
18 ; SI-DAG: s_and_b64
1919 ; SI-DAG: cmp_gt_i32
2020 ; SI-DAG: cndmask_b32
2121 ; SI-DAG: cndmask_b32
2323 ; CI: v_trunc_f64_e32
2424
2525 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
26 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
27 ; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
28 ; SI: s_lshr_b64
29 ; SI: s_not_b64
30 ; SI: s_and_b64
26 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
27 ; SI-DAG: s_add_i32 [[A:s[0-9]+]], [[SEXP]], 0xfffffc01
28 ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[A]]
29 ; SI-DAG: s_not_b64
30 ; SI-DAG: s_and_b64
3131 ; SI-DAG: cmp_gt_i32
3232 ; SI-DAG: cndmask_b32
3333 ; SI-DAG: cndmask_b32
55 ; FUNC-LABEL: {{^}}rsq_clamped_f64:
66 ; SI: v_rsq_clamp_f64_e32
77
8 ; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3]
8 ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}]
99 ; TODO: this constant should be folded:
10 ; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
11 ; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
12 ; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
10 ; VI-DAG: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
11 ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
12 ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
13 ; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
1314 ; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
14 ; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
1515 ; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
1616 ; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
1717
2323 ; FUNC-LABEL: {{^}}rsq_clamp_f64:
2424 ; SI: v_rsq_clamp_f64_e32
2525
26 ; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3]
2726 ; TODO: this constant should be folded:
28 ; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
29 ; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
30 ; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
27 ; VI-DAG: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
28 ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
29 ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
30 ; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
31 ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}
3132 ; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
32 ; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
3333 ; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
3434 ; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
3535 define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
55
66
77 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
8 ; SI: ds_read_u8
9 ; SI: ds_read_u8
10 ; SI: ds_read_u8
11 ; SI: ds_read_u8
12 ; SI: ds_read_u8
13 ; SI: ds_read_u8
14 ; SI: ds_read_u8
15 ; SI: ds_read_u8
16
17 ; SI: ds_read_u8
18 ; SI: ds_read_u8
19 ; SI: ds_read_u8
20 ; SI: ds_read_u8
21 ; SI: ds_read_u8
22 ; SI: ds_read_u8
23 ; SI: ds_read_u8
24 ; SI: ds_read_u8
25
26 ; SI: ds_read_u8
27 ; SI: ds_read_u8
28 ; SI: ds_read_u8
29 ; SI: ds_read_u8
30 ; SI: ds_read_u8
31 ; SI: ds_read_u8
32 ; SI: ds_read_u8
33 ; SI: ds_read_u8
34
35 ; SI: ds_read_u8
36 ; SI: ds_read_u8
37 ; SI: ds_read_u8
38 ; SI: ds_read_u8
39 ; SI: ds_read_u8
40 ; SI: ds_read_u8
41 ; SI: ds_read_u8
42 ; SI: ds_read_u8
43
44 ; SI: ds_write_b8
45 ; SI: ds_write_b8
46 ; SI: ds_write_b8
47 ; SI: ds_write_b8
48 ; SI: ds_write_b8
49 ; SI: ds_write_b8
50 ; SI: ds_write_b8
51 ; SI: ds_write_b8
52
53 ; SI: ds_write_b8
54 ; SI: ds_write_b8
55 ; SI: ds_write_b8
56 ; SI: ds_write_b8
57 ; SI: ds_write_b8
58 ; SI: ds_write_b8
59 ; SI: ds_write_b8
60 ; SI: ds_write_b8
61
62 ; SI: ds_write_b8
63 ; SI: ds_write_b8
64 ; SI: ds_write_b8
65 ; SI: ds_write_b8
66 ; SI: ds_write_b8
67 ; SI: ds_write_b8
68 ; SI: ds_write_b8
69 ; SI: ds_write_b8
70
71 ; SI: ds_write_b8
72 ; SI: ds_write_b8
73 ; SI: ds_write_b8
74 ; SI: ds_write_b8
75 ; SI: ds_write_b8
76 ; SI: ds_write_b8
77 ; SI: ds_write_b8
78 ; SI: ds_write_b8
8 ; SI-DAG: ds_read_u8
9 ; SI-DAG: ds_read_u8
10 ; SI-DAG: ds_read_u8
11 ; SI-DAG: ds_read_u8
12 ; SI-DAG: ds_read_u8
13 ; SI-DAG: ds_read_u8
14 ; SI-DAG: ds_read_u8
15 ; SI-DAG: ds_read_u8
16
17 ; SI-DAG: ds_read_u8
18 ; SI-DAG: ds_read_u8
19 ; SI-DAG: ds_read_u8
20 ; SI-DAG: ds_read_u8
21 ; SI-DAG: ds_read_u8
22 ; SI-DAG: ds_read_u8
23 ; SI-DAG: ds_read_u8
24 ; SI-DAG: ds_read_u8
25
26 ; SI-DAG: ds_read_u8
27 ; SI-DAG: ds_read_u8
28 ; SI-DAG: ds_read_u8
29 ; SI-DAG: ds_read_u8
30 ; SI-DAG: ds_read_u8
31 ; SI-DAG: ds_read_u8
32 ; SI-DAG: ds_read_u8
33 ; SI-DAG: ds_read_u8
34
35 ; SI-DAG: ds_read_u8
36 ; SI-DAG: ds_read_u8
37 ; SI-DAG: ds_read_u8
38 ; SI-DAG: ds_read_u8
39 ; SI-DAG: ds_read_u8
40 ; SI-DAG: ds_read_u8
41 ; SI-DAG: ds_read_u8
42 ; SI-DAG: ds_read_u8
43
44 ; SI-DAG: ds_write_b8
45 ; SI-DAG: ds_write_b8
46 ; SI-DAG: ds_write_b8
47 ; SI-DAG: ds_write_b8
48 ; SI-DAG: ds_write_b8
49 ; SI-DAG: ds_write_b8
50 ; SI-DAG: ds_write_b8
51 ; SI-DAG: ds_write_b8
52
53 ; SI-DAG: ds_write_b8
54 ; SI-DAG: ds_write_b8
55 ; SI-DAG: ds_write_b8
56 ; SI-DAG: ds_write_b8
57 ; SI-DAG: ds_write_b8
58 ; SI-DAG: ds_write_b8
59 ; SI-DAG: ds_write_b8
60 ; SI-DAG: ds_write_b8
61
62 ; SI-DAG: ds_write_b8
63 ; SI-DAG: ds_write_b8
64 ; SI-DAG: ds_write_b8
65 ; SI-DAG: ds_write_b8
66 ; SI-DAG: ds_write_b8
67 ; SI-DAG: ds_write_b8
68 ; SI-DAG: ds_write_b8
69 ; SI-DAG: ds_write_b8
70
71 ; SI-DAG: ds_write_b8
72 ; SI-DAG: ds_write_b8
73 ; SI-DAG: ds_write_b8
74 ; SI-DAG: ds_write_b8
75 ; SI-DAG: ds_write_b8
76 ; SI-DAG: ds_write_b8
77 ; SI-DAG: ds_write_b8
78 ; SI-DAG: ds_write_b8
7979
8080 ; SI: s_endpgm
8181 define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
8686 }
8787
8888 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
89 ; SI: ds_read_u16
90 ; SI: ds_read_u16
91 ; SI: ds_read_u16
92 ; SI: ds_read_u16
93 ; SI: ds_read_u16
94 ; SI: ds_read_u16
95 ; SI: ds_read_u16
96 ; SI: ds_read_u16
97
98 ; SI: ds_read_u16
99 ; SI: ds_read_u16
100 ; SI: ds_read_u16
101 ; SI: ds_read_u16
102 ; SI: ds_read_u16
103 ; SI: ds_read_u16
104 ; SI: ds_read_u16
105 ; SI: ds_read_u16
106
107 ; SI: ds_write_b16
108 ; SI: ds_write_b16
109 ; SI: ds_write_b16
110 ; SI: ds_write_b16
111 ; SI: ds_write_b16
112 ; SI: ds_write_b16
113 ; SI: ds_write_b16
114 ; SI: ds_write_b16
115
116 ; SI: ds_write_b16
117 ; SI: ds_write_b16
118 ; SI: ds_write_b16
119 ; SI: ds_write_b16
120 ; SI: ds_write_b16
121 ; SI: ds_write_b16
122 ; SI: ds_write_b16
123 ; SI: ds_write_b16
89 ; SI-DAG: ds_read_u16
90 ; SI-DAG: ds_read_u16
91 ; SI-DAG: ds_read_u16
92 ; SI-DAG: ds_read_u16
93 ; SI-DAG: ds_read_u16
94 ; SI-DAG: ds_read_u16
95 ; SI-DAG: ds_read_u16
96 ; SI-DAG: ds_read_u16
97
98 ; SI-DAG: ds_read_u16
99 ; SI-DAG: ds_read_u16
100 ; SI-DAG: ds_read_u16
101 ; SI-DAG: ds_read_u16
102 ; SI-DAG: ds_read_u16
103 ; SI-DAG: ds_read_u16
104 ; SI-DAG: ds_read_u16
105 ; SI-DAG: ds_read_u16
106
107 ; SI-DAG: ds_write_b16
108 ; SI-DAG: ds_write_b16
109 ; SI-DAG: ds_write_b16
110 ; SI-DAG: ds_write_b16
111 ; SI-DAG: ds_write_b16
112 ; SI-DAG: ds_write_b16
113 ; SI-DAG: ds_write_b16
114 ; SI-DAG: ds_write_b16
115
116 ; SI-DAG: ds_write_b16
117 ; SI-DAG: ds_write_b16
118 ; SI-DAG: ds_write_b16
119 ; SI-DAG: ds_write_b16
120 ; SI-DAG: ds_write_b16
121 ; SI-DAG: ds_write_b16
122 ; SI-DAG: ds_write_b16
123 ; SI-DAG: ds_write_b16
124124
125125 ; SI: s_endpgm
126126 define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
3131 ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
3232 ; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], vcc, 16, v{{[0-9]+}}
3333 ; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
34 ; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
35 ; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
34 ; CI-DAG: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
35 ; CI-DAG: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
3636
3737 define void @local_memory_two_objects(i32 addrspace(1)* %out) {
3838 entry:
155155 }
156156
157157 ; FUNC-LABEL: @reorder_local_offsets
158 ; FIXME: The scheduler doesn't think it's profitable to re-order the
159 ; loads and stores, and I'm not sure that it really is.
160 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
158161 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
159162 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
160 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
161163 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
162164 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
163165 ; CI: buffer_store_dword
106106 ; EG-DAG: CNDE_INT
107107 ; EG-DAG: CNDE_INT
108108
109 ; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]]
110 ; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
111 ; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
112 ; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[FIRST_RCP_LO]]
113 ; SI-DAG: v_cndmask_b32_e64
114 ; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
115 ; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]]
116 ; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]]
117 ; SI-DAG: v_cndmask_b32_e64
118 ; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
119 ; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
120 ; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], vcc, [[FIRST_Num_S_Remainder]], v{{[0-9]+}}
121 ; SI-DAG: v_cndmask_b32_e64
122 ; SI-DAG: v_cndmask_b32_e64
123 ; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
124 ; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
125 ; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
126 ; SI-DAG: v_cndmask_b32_e64
127 ; SI-DAG: v_cndmask_b32_e64
128 ; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
129 ; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
130 ; SI-DAG: v_cndmask_b32_e64
131 ; SI-DAG: v_cndmask_b32_e64
132 ; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]]
133 ; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
134 ; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
135 ; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[SECOND_RCP_LO]]
136 ; SI-DAG: v_cndmask_b32_e64
137 ; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
138 ; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], vcc, [[SECOND_E]], [[SECOND_RCP]]
139 ; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], vcc, [[SECOND_E]], [[SECOND_RCP]]
140 ; SI-DAG: v_cndmask_b32_e64
141 ; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
142 ; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
143 ; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], vcc, [[SECOND_Num_S_Remainder]], v{{[0-9]+}}
144 ; SI-DAG: v_cndmask_b32_e64
145 ; SI-DAG: v_cndmask_b32_e64
146 ; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
147 ; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
148 ; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
149 ; SI-DAG: v_cndmask_b32_e64
150 ; SI-DAG: v_cndmask_b32_e64
151 ; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
152 ; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
109 ; For SI, we used to have checks for the input and output registers
110 ; of the instructions, but these are way too fragile. The division for
111 ; the two vector elements can be intermixed which makes it impossible to
112 ; accurately check all the operands.
113 ; SI-DAG: v_rcp_iflag_f32_e32
114 ; SI-DAG: v_mul_hi_u32
115 ; SI-DAG: v_mul_lo_i32
116 ; SI-DAG: v_sub_i32_e32
117 ; SI-DAG: v_cndmask_b32_e64
118 ; SI-DAG: v_mul_hi_u32
119 ; SI-DAG: v_add_i32_e32
120 ; SI-DAG: v_subrev_i32_e32
121 ; SI-DAG: v_cndmask_b32_e64
122 ; SI-DAG: v_mul_hi_u32
123 ; SI-DAG: v_mul_lo_i32
124 ; SI-DAG: v_subrev_i32_e32
125 ; SI-DAG: v_cndmask_b32_e64
126 ; SI-DAG: v_cndmask_b32_e64
127 ; SI-DAG: v_and_b32_e32
128 ; SI-DAG: v_add_i32_e32
129 ; SI-DAG: v_subrev_i32_e32
130 ; SI-DAG: v_cndmask_b32_e64
131 ; SI-DAG: v_cndmask_b32_e64
132 ; SI-DAG: v_add_i32_e32
133 ; SI-DAG: v_subrev_i32_e32
134 ; SI-DAG: v_cndmask_b32_e64
135 ; SI-DAG: v_cndmask_b32_e64
136 ; SI-DAG: v_rcp_iflag_f32_e32
137 ; SI-DAG: v_mul_hi_u32
138 ; SI-DAG: v_mul_lo_i32
139 ; SI-DAG: v_sub_i32_e32
140 ; SI-DAG: v_cndmask_b32_e64
141 ; SI-DAG: v_mul_hi_u32
142 ; SI-DAG: v_add_i32_e32
143 ; SI-DAG: v_subrev_i32_e32
144 ; SI-DAG: v_cndmask_b32_e64
145 ; SI-DAG: v_mul_hi_u32
146 ; SI-DAG: v_mul_lo_i32
147 ; SI-DAG: v_subrev_i32_e32
148 ; SI-DAG: v_cndmask_b32_e64
149 ; SI-DAG: v_cndmask_b32_e64
150 ; SI-DAG: v_and_b32_e32
151 ; SI-DAG: v_add_i32_e32
152 ; SI-DAG: v_subrev_i32_e32
153 ; SI-DAG: v_cndmask_b32_e64
154 ; SI-DAG: v_cndmask_b32_e64
155 ; SI-DAG: v_add_i32_e32
156 ; SI-DAG: v_subrev_i32_e32
153157 ; SI-DAG: v_cndmask_b32_e64
154158 ; SI-DAG: v_cndmask_b32_e64
155159 ; SI: s_endpgm