llvm.org GIT mirror llvm / adb194b
AMDGPU/SI: Add support for shrinking v_cndmask_b32_e32 instructions Reviewers: arsenm Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D11061 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242146 91177308-0d34-0410-b5e6-96231b3b80d8 Tom Stellard 4 years ago
8 changed file(s) with 119 addition(s) and 96 deletion(s). Raw diff Collapse all Expand all
9494 // a register allocation hint pre-regalloc and then do the shrinking
9595 // post-regalloc.
9696 if (Src2) {
97 if (MI.getOpcode() != AMDGPU::V_MAC_F32_e64)
98 return false;
99
100 const MachineOperand *Src2Mod =
101 TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
102 if (!isVGPR(Src2, TRI, MRI) || (Src2Mod && Src2Mod->getImm() != 0))
103 return false;
97 switch (MI.getOpcode()) {
98 default: return false;
99
100 case AMDGPU::V_MAC_F32_e64:
101 if (!isVGPR(Src2, TRI, MRI) ||
102 TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
103 return false;
104 break;
105
106 case AMDGPU::V_CNDMASK_B32_e64:
107 break;
108 }
104109 }
105110
106111 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
249254 continue;
250255 }
251256
257 if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
258 // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
259 // instructions.
260 const MachineOperand *Src2 =
261 TII->getNamedOperand(MI, AMDGPU::OpName::src2);
262 if (!Src2->isReg())
263 continue;
264 unsigned SReg = Src2->getReg();
265 if (TargetRegisterInfo::isVirtualRegister(SReg)) {
266 MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
267 continue;
268 }
269 if (SReg != AMDGPU::VCC)
270 continue;
271 }
272
252273 // We can shrink this instruction
253274 DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';);
254275
88 ; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
99 ; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
1010 ; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
11 ; SI: v_cmp_le_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0.5, |[[SUB]]|
12 ; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]]
11 ; SI: v_cmp_le_f32_e64 vcc, 0.5, |[[SUB]]|
12 ; SI: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]]
1313 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
1414 ; SI: buffer_store_dword [[RESULT]]
1515
55
66
77 ; FUNC-LABEL: {{^}}select_v4i8:
8 ; SI: v_cndmask_b32_e64
9 ; SI: v_cndmask_b32_e64
10 ; SI: v_cndmask_b32_e64
11 ; SI: v_cndmask_b32_e64
8 ; SI: v_cndmask_b32_e32
9 ; SI: v_cndmask_b32_e32
10 ; SI: v_cndmask_b32_e32
11 ; SI: v_cndmask_b32_e32
1212 define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
1313 %cmp = icmp eq i8 %c, 0
1414 %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
1717 }
1818
1919 ; FUNC-LABEL: {{^}}select_v4i16:
20 ; SI: v_cndmask_b32_e64
21 ; SI: v_cndmask_b32_e64
22 ; SI: v_cndmask_b32_e64
23 ; SI: v_cndmask_b32_e64
20 ; SI: v_cndmask_b32_e32
21 ; SI: v_cndmask_b32_e32
22 ; SI: v_cndmask_b32_e32
23 ; SI: v_cndmask_b32_e32
2424 define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
2525 %cmp = icmp eq i32 %c, 0
2626 %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
2929 }
3030
3131 ; FUNC-LABEL: {{^}}select_v2i32:
32 ; SI: v_cndmask_b32_e64
33 ; SI: v_cndmask_b32_e64
32 ; SI: v_cndmask_b32_e32
33 ; SI: v_cndmask_b32_e32
3434 ; SI: buffer_store_dwordx2
3535 define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
3636 %cmp = icmp eq i32 %c, 0
4040 }
4141
4242 ; FUNC-LABEL: {{^}}select_v4i32:
43 ; SI: v_cndmask_b32_e64
44 ; SI: v_cndmask_b32_e64
45 ; SI: v_cndmask_b32_e64
46 ; SI: v_cndmask_b32_e64
43 ; SI: v_cndmask_b32_e32
44 ; SI: v_cndmask_b32_e32
45 ; SI: v_cndmask_b32_e32
46 ; SI: v_cndmask_b32_e32
4747 ; SI: buffer_store_dwordx4
4848 define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
4949 %cmp = icmp eq i32 %c, 0
5353 }
5454
5555 ; FUNC-LABEL: {{^}}select_v8i32:
56 ; SI: v_cndmask_b32_e64
57 ; SI: v_cndmask_b32_e64
58 ; SI: v_cndmask_b32_e64
59 ; SI: v_cndmask_b32_e64
60 ; SI: v_cndmask_b32_e64
61 ; SI: v_cndmask_b32_e64
62 ; SI: v_cndmask_b32_e64
63 ; SI: v_cndmask_b32_e64
56 ; SI: v_cndmask_b32_e32
57 ; SI: v_cndmask_b32_e32
58 ; SI: v_cndmask_b32_e32
59 ; SI: v_cndmask_b32_e32
60 ; SI: v_cndmask_b32_e32
61 ; SI: v_cndmask_b32_e32
62 ; SI: v_cndmask_b32_e32
63 ; SI: v_cndmask_b32_e32
6464 define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
6565 %cmp = icmp eq i32 %c, 0
6666 %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
8787 }
8888
8989 ; FUNC-LABEL: {{^}}select_v8f32:
90 ; SI: v_cndmask_b32_e64
91 ; SI: v_cndmask_b32_e64
92 ; SI: v_cndmask_b32_e64
93 ; SI: v_cndmask_b32_e64
94 ; SI: v_cndmask_b32_e64
95 ; SI: v_cndmask_b32_e64
96 ; SI: v_cndmask_b32_e64
97 ; SI: v_cndmask_b32_e64
90 ; SI: v_cndmask_b32_e32
91 ; SI: v_cndmask_b32_e32
92 ; SI: v_cndmask_b32_e32
93 ; SI: v_cndmask_b32_e32
94 ; SI: v_cndmask_b32_e32
95 ; SI: v_cndmask_b32_e32
96 ; SI: v_cndmask_b32_e32
97 ; SI: v_cndmask_b32_e32
9898 define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
9999 %cmp = icmp eq i32 %c, 0
100100 %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
103103 }
104104
105105 ; FUNC-LABEL: {{^}}select_v2f64:
106 ; SI: v_cndmask_b32_e64
107 ; SI: v_cndmask_b32_e64
108 ; SI: v_cndmask_b32_e64
109 ; SI: v_cndmask_b32_e64
106 ; SI: v_cndmask_b32_e32
107 ; SI: v_cndmask_b32_e32
108 ; SI: v_cndmask_b32_e32
109 ; SI: v_cndmask_b32_e32
110110 define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
111111 %cmp = icmp eq i32 %c, 0
112112 %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
115115 }
116116
117117 ; FUNC-LABEL: {{^}}select_v4f64:
118 ; SI: v_cndmask_b32_e64
119 ; SI: v_cndmask_b32_e64
120 ; SI: v_cndmask_b32_e64
121 ; SI: v_cndmask_b32_e64
122 ; SI: v_cndmask_b32_e64
123 ; SI: v_cndmask_b32_e64
124 ; SI: v_cndmask_b32_e64
125 ; SI: v_cndmask_b32_e64
118 ; SI: v_cndmask_b32_e32
119 ; SI: v_cndmask_b32_e32
120 ; SI: v_cndmask_b32_e32
121 ; SI: v_cndmask_b32_e32
122 ; SI: v_cndmask_b32_e32
123 ; SI: v_cndmask_b32_e32
124 ; SI: v_cndmask_b32_e32
125 ; SI: v_cndmask_b32_e32
126126 define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
127127 %cmp = icmp eq i32 %c, 0
128128 %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
131131 }
132132
133133 ; FUNC-LABEL: {{^}}select_v8f64:
134 ; SI: v_cndmask_b32_e64
135 ; SI: v_cndmask_b32_e64
136 ; SI: v_cndmask_b32_e64
137 ; SI: v_cndmask_b32_e64
138 ; SI: v_cndmask_b32_e64
139 ; SI: v_cndmask_b32_e64
140 ; SI: v_cndmask_b32_e64
141 ; SI: v_cndmask_b32_e64
142 ; SI: v_cndmask_b32_e64
143 ; SI: v_cndmask_b32_e64
144 ; SI: v_cndmask_b32_e64
145 ; SI: v_cndmask_b32_e64
146 ; SI: v_cndmask_b32_e64
147 ; SI: v_cndmask_b32_e64
148 ; SI: v_cndmask_b32_e64
149 ; SI: v_cndmask_b32_e64
134 ; SI: v_cndmask_b32_e32
135 ; SI: v_cndmask_b32_e32
136 ; SI: v_cndmask_b32_e32
137 ; SI: v_cndmask_b32_e32
138 ; SI: v_cndmask_b32_e32
139 ; SI: v_cndmask_b32_e32
140 ; SI: v_cndmask_b32_e32
141 ; SI: v_cndmask_b32_e32
142 ; SI: v_cndmask_b32_e32
143 ; SI: v_cndmask_b32_e32
144 ; SI: v_cndmask_b32_e32
145 ; SI: v_cndmask_b32_e32
146 ; SI: v_cndmask_b32_e32
147 ; SI: v_cndmask_b32_e32
148 ; SI: v_cndmask_b32_e32
149 ; SI: v_cndmask_b32_e32
150150 define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
151151 %cmp = icmp eq i32 %c, 0
152152 %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
5454 ; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0
5555 ; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]]
5656 ; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]]
57 ; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}}
58 ; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}}
57 ; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}}
58 ; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}}
5959 ; CHECK: s_endpgm
6060 define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
6161 %cmp = icmp ugt i32 %cond, 5
1111
1212 ; FIXME: select on 0, 0
1313 ; SI-LABEL: {{^}}sint_to_fp_i1_f64:
14 ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
14 ; SI: v_cmp_eq_i32_e64 vcc,
1515 ; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
16 ; uses an SGPR for [[CMP]]
17 ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
18 ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]]
16 ; uses an SGPR (implicit vcc).
17 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
18 ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc
1919 ; SI: buffer_store_dwordx2
2020 ; SI: s_endpgm
2121 define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
7171
7272 ; FIXME: select on 0, 0
7373 ; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
74 ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
75 ; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
76 ; uses an SGPR for [[CMP]]
77 ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
78 ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]]
74 ; SI: v_cmp_eq_i32_e64 vcc
75 ; We can't fold the SGPRs into v_cndmask_b32_e32, because it already
76 ; uses an SGPR (implicit vcc).
77 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
78 ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc
7979 ; SI: buffer_store_dwordx2
8080 ; SI: s_endpgm
8181 define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
None ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
1 ;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
2 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
0 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
1 ;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
2 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=VI --check-prefix=FUNC %s
33
4 ;EG: {{^}}test_select_v2i32:
4 ;FUNC-LABEL: {{^}}test_select_v2i32:
5
56 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
67 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
78
8 ;SI: {{^}}test_select_v2i32:
99 ;SI: v_cndmask_b32_e64
10 ;SI: v_cndmask_b32_e64
10 ;SI: v_cndmask_b32_e32
1111
1212 define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
1313 entry:
1919 ret void
2020 }
2121
22 ;EG: {{^}}test_select_v2f32:
22 ;FUNC-LABEL: {{^}}test_select_v2f32:
23
2324 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2425 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2526
26 ;SI: {{^}}test_select_v2f32:
2727 ;SI: v_cndmask_b32_e64
28 ;SI: v_cndmask_b32_e64
28 ;SI: v_cndmask_b32_e32
2929
3030 define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
3131 entry:
3737 ret void
3838 }
3939
40 ;EG: {{^}}test_select_v4i32:
40 ;FUNC-LABEL: {{^}}test_select_v4i32:
41
4142 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
4243 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
4344 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
4445 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
4546
46 ;SI: {{^}}test_select_v4i32:
47 ;SI: v_cndmask_b32_e64
48 ;SI: v_cndmask_b32_e64
49 ;SI: v_cndmask_b32_e64
50 ;SI: v_cndmask_b32_e64
47 ; FIXME: The shrinking does not happen on tonga
48
49 ;SI: v_cndmask_b32
50 ;SI: v_cndmask_b32
51 ;SI: v_cndmask_b32
52 ;SI: v_cndmask_b32
5153
5254 define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
5355 entry:
5961 ret void
6062 }
6163
62 ;EG: {{^}}test_select_v4f32:
64 ;FUNC-LABEL: {{^}}test_select_v4f32:
6365 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
6466 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
6567 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
4141
4242 ; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}}
4343 ; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}}
44 ; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
45 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]]
44 ; SI: s_xor_b64 [[XOR:vcc]], [[CMP0]], [[CMP1]]
45 ; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
4646 ; SI: buffer_store_dword [[RESULT]]
4747 ; SI: s_endpgm
4848 define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {