llvm.org GIT mirror llvm / 0991c31
R600: Expand vector float operations for both SI and R600
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188596 91177308-0d34-0410-b5e6-96231b3b80d8
Tom Stellard, 6 years ago
6 changed file(s) with 129 addition(s) and 89 deletion(s). Raw diff Collapse all Expand all
114114 setOperationAction(ISD::VSELECT, MVT::v2f32, Expand);
115115 setOperationAction(ISD::VSELECT, MVT::v4f32, Expand);
116116
117 static const int types[] = {
117 static const int IntTypes[] = {
118118 (int)MVT::v2i32,
119119 (int)MVT::v4i32
120120 };
121 const size_t NumTypes = array_lengthof(types);
122
123 for (unsigned int x = 0; x < NumTypes; ++x) {
124 MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
121 const size_t NumIntTypes = array_lengthof(IntTypes);
122
123 for (unsigned int x = 0; x < NumIntTypes; ++x) {
124 MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x];
125125 //Expand the following operations for the current type by default
126126 setOperationAction(ISD::ADD, VT, Expand);
127127 setOperationAction(ISD::AND, VT, Expand);
139139 setOperationAction(ISD::UREM, VT, Expand);
140140 setOperationAction(ISD::VSELECT, VT, Expand);
141141 setOperationAction(ISD::XOR, VT, Expand);
142 }
143
144 static const int FloatTypes[] = {
145 (int)MVT::v2f32,
146 (int)MVT::v4f32
147 };
148 const size_t NumFloatTypes = array_lengthof(FloatTypes);
149
150 for (unsigned int x = 0; x < NumFloatTypes; ++x) {
151 MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x];
152 setOperationAction(ISD::FADD, VT, Expand);
153 setOperationAction(ISD::FDIV, VT, Expand);
154 setOperationAction(ISD::FMUL, VT, Expand);
155 setOperationAction(ISD::FSUB, VT, Expand);
142156 }
143157 }
144158
3636 addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
3737
3838 computeRegisterProperties();
39
40 setOperationAction(ISD::FADD, MVT::v4f32, Expand);
41 setOperationAction(ISD::FADD, MVT::v2f32, Expand);
42 setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
43 setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
44 setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
45 setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
46 setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
47 setOperationAction(ISD::FSUB, MVT::v2f32, Expand);
4839
4940 setOperationAction(ISD::FCOS, MVT::f32, Custom);
5041 setOperationAction(ISD::FSIN, MVT::f32, Custom);
None ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
0 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
1 ; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
12
2 ; CHECK: @fadd_f32
3 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
4
5 define void @fadd_f32() {
6 %r0 = call float @llvm.R600.load.input(i32 0)
7 %r1 = call float @llvm.R600.load.input(i32 1)
8 %r2 = fadd float %r0, %r1
9 call void @llvm.AMDGPU.store.output(float %r2, i32 0)
3 ; R600-CHECK: @fadd_f32
4 ; R600-CHECK: ADD * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
5 ; SI-CHECK: @fadd_f32
6 ; SI-CHECK: V_ADD_F32
7 define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) {
8 entry:
9 %0 = fadd float %a, %b
10 store float %0, float addrspace(1)* %out
1011 ret void
1112 }
1213
13 declare float @llvm.R600.load.input(i32) readnone
14
15 declare void @llvm.AMDGPU.store.output(float, i32)
16
17 ; CHECK: @fadd_v2f32
18 ; CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
19 ; CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
14 ; R600-CHECK: @fadd_v2f32
15 ; R600-CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
16 ; R600-CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
17 ; SI-CHECK: @fadd_v2f32
18 ; SI-CHECK: V_ADD_F32
19 ; SI-CHECK: V_ADD_F32
2020 define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
2121 entry:
2222 %0 = fadd <2 x float> %a, %b
2424 ret void
2525 }
2626
27 ; CHECK: @fadd_v4f32
28 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
29 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
30 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
31 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
32
27 ; R600-CHECK: @fadd_v4f32
28 ; R600-CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
29 ; R600-CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
30 ; R600-CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
31 ; R600-CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
32 ; SI-CHECK: @fadd_v4f32
33 ; SI-CHECK: V_ADD_F32
34 ; SI-CHECK: V_ADD_F32
35 ; SI-CHECK: V_ADD_F32
36 ; SI-CHECK: V_ADD_F32
3337 define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
3438 %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
3539 %a = load <4 x float> addrspace(1) * %in
None ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
0 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
1 ; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
12
23 ; These tests check that fdiv is expanded correctly and also test that the
34 ; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
45 ; instruction groups.
56
6 ; CHECK: @fdiv_v2f32
7 ; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
8 ; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
9 ; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
10 ; CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
7 ; R600-CHECK: @fdiv_v2f32
8 ; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
9 ; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
10 ; R600-CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
11 ; R600-CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
12 ; SI-CHECK: @fdiv_v2f32
13 ; SI-CHECK-DAG: V_RCP_F32
14 ; SI-CHECK-DAG: V_MUL_F32
15 ; SI-CHECK-DAG: V_RCP_F32
16 ; SI-CHECK-DAG: V_MUL_F32
1117 define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
1218 entry:
1319 %0 = fdiv <2 x float> %a, %b
1521 ret void
1622 }
1723
18 ; CHECK: @fdiv_v4f32
19 ; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
20 ; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
21 ; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22 ; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
23 ; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
24 ; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
25 ; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
26 ; CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
27
24 ; R600-CHECK: @fdiv_v4f32
25 ; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
26 ; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
27 ; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
28 ; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
29 ; R600-CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
30 ; R600-CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
31 ; R600-CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
32 ; R600-CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
33 ; SI-CHECK: @fdiv_v4f32
34 ; SI-CHECK-DAG: V_RCP_F32
35 ; SI-CHECK-DAG: V_MUL_F32
36 ; SI-CHECK-DAG: V_RCP_F32
37 ; SI-CHECK-DAG: V_MUL_F32
38 ; SI-CHECK-DAG: V_RCP_F32
39 ; SI-CHECK-DAG: V_MUL_F32
40 ; SI-CHECK-DAG: V_RCP_F32
41 ; SI-CHECK-DAG: V_MUL_F32
2842 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
2943 %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
3044 %a = load <4 x float> addrspace(1) * %in
None ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
0 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
1 ; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
12
2 ; CHECK: @fmul_f32
3 ; CHECK: MUL_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
4
5 define void @fmul_f32() {
6 %r0 = call float @llvm.R600.load.input(i32 0)
7 %r1 = call float @llvm.R600.load.input(i32 1)
8 %r2 = fmul float %r0, %r1
9 call void @llvm.AMDGPU.store.output(float %r2, i32 0)
10 ret void
3 ; R600-CHECK: @fmul_f32
4 ; R600-CHECK: MUL_IEEE * {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
5 ; SI-CHECK: @fmul_f32
6 ; SI-CHECK: V_MUL_F32
7 define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
8 entry:
9 %0 = fmul float %a, %b
10 store float %0, float addrspace(1)* %out
11 ret void
1112 }
1213
1314 declare float @llvm.R600.load.input(i32) readnone
1415
1516 declare void @llvm.AMDGPU.store.output(float, i32)
1617
17 ; CHECK: @fmul_v2f32
18 ; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
19 ; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
18 ; R600-CHECK: @fmul_v2f32
19 ; R600-CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
20 ; R600-CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
21 ; SI-CHECK: @fmul_v2f32
22 ; SI-CHECK: V_MUL_F32
23 ; SI-CHECK: V_MUL_F32
2024 define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
2125 entry:
2226 %0 = fmul <2 x float> %a, %b
2428 ret void
2529 }
2630
27 ; CHECK: @fmul_v4f32
28 ; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
29 ; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
30 ; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
31 ; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
32
31 ; R600-CHECK: @fmul_v4f32
32 ; R600-CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
33 ; R600-CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
34 ; R600-CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
35 ; R600-CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
36 ; SI-CHECK: @fmul_v4f32
37 ; SI-CHECK: V_MUL_F32
38 ; SI-CHECK: V_MUL_F32
39 ; SI-CHECK: V_MUL_F32
40 ; SI-CHECK: V_MUL_F32
3341 define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
3442 %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
3543 %a = load <4 x float> addrspace(1) * %in
None ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
0 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
1 ; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
12
2 ; CHECK: @fsub_f32
3 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
4
5 define void @fsub_f32() {
6 %r0 = call float @llvm.R600.load.input(i32 0)
7 %r1 = call float @llvm.R600.load.input(i32 1)
8 %r2 = fsub float %r0, %r1
9 call void @llvm.AMDGPU.store.output(float %r2, i32 0)
10 ret void
3 ; R600-CHECK: @fsub_f32
4 ; R600-CHECK: ADD * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W
5 ; SI-CHECK: @fsub_f32
6 ; SI-CHECK: V_SUB_F32
7 define void @fsub_f32(float addrspace(1)* %out, float %a, float %b) {
8 entry:
9 %0 = fsub float %a, %b
10 store float %0, float addrspace(1)* %out
11 ret void
1112 }
1213
1314 declare float @llvm.R600.load.input(i32) readnone
1415
1516 declare void @llvm.AMDGPU.store.output(float, i32)
1617
17 ; CHECK: @fsub_v2f32
18 ; CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
19 ; CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
18 ; R600-CHECK: @fsub_v2f32
19 ; R600-CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
20 ; R600-CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
21 ; SI-CHECK: @fsub_v2f32
22 ; SI-CHECK: V_SUB_F32
23 ; SI-CHECK: V_SUB_F32
2024 define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
2125 entry:
2226 %0 = fsub <2 x float> %a, %b
2428 ret void
2529 }
2630
27 ; CHECK: @fsub_v4f32
28 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
29 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
30 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
31 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
31 ; R600-CHECK: @fsub_v4f32
32 ; R600-CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
33 ; R600-CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
34 ; R600-CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
35 ; R600-CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
36 ; SI-CHECK: @fsub_v4f32
37 ; SI-CHECK: V_SUB_F32
38 ; SI-CHECK: V_SUB_F32
39 ; SI-CHECK: V_SUB_F32
40 ; SI-CHECK: V_SUB_F32
3241 define void @fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
3342 %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
3443 %a = load <4 x float> addrspace(1) * %in