llvm.org GIT mirror llvm / f76a315
AMDGPU: Fix min3/max3 combines for f16/i16 Fix missing instruction definitions for min3/max3. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303284 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 3 years ago
8 changed file(s) with 365 addition(s) and 109 deletion(s). Raw diff Collapse all Expand all
288288 return getGeneration() >= GFX9;
289289 }
290290
291 bool hasMin3Max3_16() const {
292 return getGeneration() >= GFX9;
293 }
294
291295 bool hasCARRY() const {
292296 return (getGeneration() >= EVERGREEN);
293297 }
44904490
44914491
44924492 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
4493 VT != MVT::f64) {
4493 VT != MVT::f64 &&
4494 ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
44944495 // max(max(a, b), c) -> max3(a, b, c)
44954496 // min(min(a, b), c) -> min3(a, b, c)
44964497 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
299299 def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile>;
300300
301301 def V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile>;
302
302303 def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile, AMDGPUfmed3>;
303304 def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile, AMDGPUsmed3>;
304305 def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile, AMDGPUumed3>;
305 }
306
307 def V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile, AMDGPUfmin3>;
308 def V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile, AMDGPUsmin3>;
309 def V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile, AMDGPUumin3>;
310
311 def V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile, AMDGPUfmax3>;
312 def V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile, AMDGPUsmax3>;
313 def V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile, AMDGPUumax3>;
314 } // End SubtargetPredicate = isGFX9
306315
307316
308317 //===----------------------------------------------------------------------===//
508517 defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>;
509518
510519 defm V_XAD_U32 : VOP3_Real_vi <0x1f3>;
520
521 defm V_MIN3_F16 : VOP3_Real_vi <0x1f4>;
522 defm V_MIN3_I16 : VOP3_Real_vi <0x1f5>;
523 defm V_MIN3_U16 : VOP3_Real_vi <0x1f6>;
524
525 defm V_MAX3_F16 : VOP3_Real_vi <0x1f7>;
526 defm V_MAX3_I16 : VOP3_Real_vi <0x1f8>;
527 defm V_MAX3_U16 : VOP3_Real_vi <0x1f9>;
528
511529 defm V_MED3_F16 : VOP3_Real_vi <0x1fa>;
512530 defm V_MED3_I16 : VOP3_Real_vi <0x1fb>;
513531 defm V_MED3_U16 : VOP3_Real_vi <0x1fc>;
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
23
3 declare float @llvm.maxnum.f32(float, float) nounwind readnone
4
5 ; SI-LABEL: {{^}}test_fmax3_olt_0:
6 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
7 ; SI: buffer_load_dword [[REGB:v[0-9]+]]
8 ; SI: buffer_load_dword [[REGA:v[0-9]+]]
9 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
10 ; SI: buffer_store_dword [[RESULT]],
11 ; SI: s_endpgm
12 define amdgpu_kernel void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
4 ; GCN-LABEL: {{^}}test_fmax3_olt_0_f32:
5 ; GCN: buffer_load_dword [[REGC:v[0-9]+]]
6 ; GCN: buffer_load_dword [[REGB:v[0-9]+]]
7 ; GCN: buffer_load_dword [[REGA:v[0-9]+]]
8 ; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
9 ; GCN: buffer_store_dword [[RESULT]],
10 ; GCN: s_endpgm
11 define amdgpu_kernel void @test_fmax3_olt_0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
1312 %a = load volatile float, float addrspace(1)* %aptr, align 4
1413 %b = load volatile float, float addrspace(1)* %bptr, align 4
1514 %c = load volatile float, float addrspace(1)* %cptr, align 4
16 %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
17 %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone
15 %f0 = call float @llvm.maxnum.f32(float %a, float %b)
16 %f1 = call float @llvm.maxnum.f32(float %f0, float %c)
1817 store float %f1, float addrspace(1)* %out, align 4
1918 ret void
2019 }
2120
2221 ; Commute operand of second fmax
23 ; SI-LABEL: {{^}}test_fmax3_olt_1:
24 ; SI: buffer_load_dword [[REGB:v[0-9]+]]
25 ; SI: buffer_load_dword [[REGA:v[0-9]+]]
26 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
27 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
28 ; SI: buffer_store_dword [[RESULT]],
29 ; SI: s_endpgm
30 define amdgpu_kernel void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
22 ; GCN-LABEL: {{^}}test_fmax3_olt_1_f32:
23 ; GCN: buffer_load_dword [[REGB:v[0-9]+]]
24 ; GCN: buffer_load_dword [[REGA:v[0-9]+]]
25 ; GCN: buffer_load_dword [[REGC:v[0-9]+]]
26 ; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
27 ; GCN: buffer_store_dword [[RESULT]],
28 ; GCN: s_endpgm
29 define amdgpu_kernel void @test_fmax3_olt_1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
3130 %a = load volatile float, float addrspace(1)* %aptr, align 4
3231 %b = load volatile float, float addrspace(1)* %bptr, align 4
3332 %c = load volatile float, float addrspace(1)* %cptr, align 4
34 %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
35 %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone
33 %f0 = call float @llvm.maxnum.f32(float %a, float %b)
34 %f1 = call float @llvm.maxnum.f32(float %c, float %f0)
3635 store float %f1, float addrspace(1)* %out, align 4
3736 ret void
3837 }
38
39 ; GCN-LABEL: {{^}}test_fmax3_olt_0_f16:
40 ; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
41 ; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
42 ; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
43
44 ; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]],
45 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
46
47 ; VI: v_max_f16_e32
48 ; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
49
50 ; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
51 ; GCN: buffer_store_short [[RESULT]],
52 define amdgpu_kernel void @test_fmax3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
53 %a = load volatile half, half addrspace(1)* %aptr, align 2
54 %b = load volatile half, half addrspace(1)* %bptr, align 2
55 %c = load volatile half, half addrspace(1)* %cptr, align 2
56 %f0 = call half @llvm.maxnum.f16(half %a, half %b)
57 %f1 = call half @llvm.maxnum.f16(half %f0, half %c)
58 store half %f1, half addrspace(1)* %out, align 2
59 ret void
60 }
61
62 ; Commute operand of second fmax
63 ; GCN-LABEL: {{^}}test_fmax3_olt_1_f16:
64 ; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
65 ; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
66 ; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
67
68 ; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]],
69 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
70
71 ; VI: v_max_f16_e32
72 ; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
73
74 ; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
75 ; GCN: buffer_store_short [[RESULT]],
76 define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
77 %a = load volatile half, half addrspace(1)* %aptr, align 2
78 %b = load volatile half, half addrspace(1)* %bptr, align 2
79 %c = load volatile half, half addrspace(1)* %cptr, align 2
80 %f0 = call half @llvm.maxnum.f16(half %a, half %b)
81 %f1 = call half @llvm.maxnum.f16(half %c, half %f0)
82 store half %f1, half addrspace(1)* %out, align 2
83 ret void
84 }
85
86 declare i32 @llvm.amdgcn.workitem.id.x() #1
87 declare float @llvm.maxnum.f32(float, float) #1
88 declare half @llvm.maxnum.f16(half, half) #1
89
90 attributes #0 = { nounwind }
91 attributes #1 = { nounwind readnone speculatable }
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
33
4 declare float @llvm.minnum.f32(float, float) nounwind readnone
5
6 ; SI-LABEL: {{^}}test_fmin3_olt_0:
7 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
8 ; SI: buffer_load_dword [[REGB:v[0-9]+]]
9 ; SI: buffer_load_dword [[REGA:v[0-9]+]]
10 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
11 ; SI: buffer_store_dword [[RESULT]],
12 ; SI: s_endpgm
13 define amdgpu_kernel void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
4 ; GCN-LABEL: {{^}}test_fmin3_olt_0_f32:
5 ; GCN: buffer_load_dword [[REGC:v[0-9]+]]
6 ; GCN: buffer_load_dword [[REGB:v[0-9]+]]
7 ; GCN: buffer_load_dword [[REGA:v[0-9]+]]
8 ; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
9 ; GCN: buffer_store_dword [[RESULT]],
10 define amdgpu_kernel void @test_fmin3_olt_0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
1411 %a = load volatile float, float addrspace(1)* %aptr, align 4
1512 %b = load volatile float, float addrspace(1)* %bptr, align 4
1613 %c = load volatile float, float addrspace(1)* %cptr, align 4
17 %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
18 %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone
14 %f0 = call float @llvm.minnum.f32(float %a, float %b)
15 %f1 = call float @llvm.minnum.f32(float %f0, float %c)
1916 store float %f1, float addrspace(1)* %out, align 4
2017 ret void
2118 }
2219
2320 ; Commute operand of second fmin
24 ; SI-LABEL: {{^}}test_fmin3_olt_1:
25 ; SI: buffer_load_dword [[REGB:v[0-9]+]]
26 ; SI: buffer_load_dword [[REGA:v[0-9]+]]
27 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
28 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
29 ; SI: buffer_store_dword [[RESULT]],
30 ; SI: s_endpgm
31 define amdgpu_kernel void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
21 ; GCN-LABEL: {{^}}test_fmin3_olt_1_f32:
22 ; GCN: buffer_load_dword [[REGB:v[0-9]+]]
23 ; GCN: buffer_load_dword [[REGA:v[0-9]+]]
24 ; GCN: buffer_load_dword [[REGC:v[0-9]+]]
25 ; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
26 ; GCN: buffer_store_dword [[RESULT]],
27 define amdgpu_kernel void @test_fmin3_olt_1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
3228 %a = load volatile float, float addrspace(1)* %aptr, align 4
3329 %b = load volatile float, float addrspace(1)* %bptr, align 4
3430 %c = load volatile float, float addrspace(1)* %cptr, align 4
35 %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
36 %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone
31 %f0 = call float @llvm.minnum.f32(float %a, float %b)
32 %f1 = call float @llvm.minnum.f32(float %c, float %f0)
3733 store float %f1, float addrspace(1)* %out, align 4
3834 ret void
3935 }
36
37 ; GCN-LABEL: {{^}}test_fmin3_olt_0_f16:
38 ; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
39 ; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
40 ; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
41
42 ; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]],
43 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
44
45 ; VI: v_min_f16_e32
46 ; VI: v_min_f16_e32 [[RESULT:v[0-9]+]],
47
48 ; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
49 ; GCN: buffer_store_short [[RESULT]],
50 define amdgpu_kernel void @test_fmin3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
51 %a = load volatile half, half addrspace(1)* %aptr, align 2
52 %b = load volatile half, half addrspace(1)* %bptr, align 2
53 %c = load volatile half, half addrspace(1)* %cptr, align 2
54 %f0 = call half @llvm.minnum.f16(half %a, half %b)
55 %f1 = call half @llvm.minnum.f16(half %f0, half %c)
56 store half %f1, half addrspace(1)* %out, align 2
57 ret void
58 }
59
60 ; Commute operand of second fmin
61 ; GCN-LABEL: {{^}}test_fmin3_olt_1_f16:
62 ; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
63 ; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
64 ; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
65
66 ; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]],
67 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
68
69 ; VI: v_min_f16_e32
70 ; VI: v_min_f16_e32 [[RESULT:v[0-9]+]],
71
72 ; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
73 ; GCN: buffer_store_short [[RESULT]],
74 define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
75 %a = load volatile half, half addrspace(1)* %aptr, align 2
76 %b = load volatile half, half addrspace(1)* %bptr, align 2
77 %c = load volatile half, half addrspace(1)* %cptr, align 2
78 %f0 = call half @llvm.minnum.f16(half %a, half %b)
79 %f1 = call half @llvm.minnum.f16(half %c, half %f0)
80 store half %f1, half addrspace(1)* %out, align 2
81 ret void
82 }
83
84 declare i32 @llvm.amdgcn.workitem.id.x() #1
85 declare float @llvm.minnum.f32(float, float) #1
86 declare half @llvm.minnum.f16(half, half) #1
87
88 attributes #0 = { nounwind }
89 attributes #1 = { nounwind readnone speculatable }
None ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
13
2 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
3
4 ; FUNC-LABEL: @v_test_imax3_sgt_i32
5 ; SI: v_max3_i32
6 define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
7 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
4 ; GCN-LABEL: {{^}}v_test_imax3_sgt_i32:
5 ; GCN: v_max3_i32
6 define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
7 %tid = call i32 @llvm.amdgcn.workitem.id.x()
88 %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
99 %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
1010 %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
1111 %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
12 %a = load i32, i32 addrspace(1)* %gep0, align 4
13 %b = load i32, i32 addrspace(1)* %gep1, align 4
14 %c = load i32, i32 addrspace(1)* %gep2, align 4
12 %a = load i32, i32 addrspace(1)* %gep0
13 %b = load i32, i32 addrspace(1)* %gep1
14 %c = load i32, i32 addrspace(1)* %gep2
1515 %icmp0 = icmp sgt i32 %a, %b
1616 %i0 = select i1 %icmp0, i32 %a, i32 %b
1717 %icmp1 = icmp sgt i32 %i0, %c
1818 %i1 = select i1 %icmp1, i32 %i0, i32 %c
19 store i32 %i1, i32 addrspace(1)* %out, align 4
19 store i32 %i1, i32 addrspace(1)* %out
2020 ret void
2121 }
2222
23 ; FUNC-LABEL: @v_test_umax3_ugt_i32
24 ; SI: v_max3_u32
25 define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
26 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
23 ; GCN-LABEL: {{^}}v_test_umax3_ugt_i32:
24 ; GCN: v_max3_u32
25 define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
26 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2727 %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
2828 %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
2929 %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
3030 %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
31 %a = load i32, i32 addrspace(1)* %gep0, align 4
32 %b = load i32, i32 addrspace(1)* %gep1, align 4
33 %c = load i32, i32 addrspace(1)* %gep2, align 4
31 %a = load i32, i32 addrspace(1)* %gep0
32 %b = load i32, i32 addrspace(1)* %gep1
33 %c = load i32, i32 addrspace(1)* %gep2
3434 %icmp0 = icmp ugt i32 %a, %b
3535 %i0 = select i1 %icmp0, i32 %a, i32 %b
3636 %icmp1 = icmp ugt i32 %i0, %c
3737 %i1 = select i1 %icmp1, i32 %i0, i32 %c
38 store i32 %i1, i32 addrspace(1)* %out, align 4
38 store i32 %i1, i32 addrspace(1)* %out
3939 ret void
4040 }
41
42 ; GCN-LABEL: {{^}}v_test_imax3_sgt_i16:
43 ; SI: v_max3_i32
44
45 ; VI: v_max_i16
46 ; VI: v_max_i16
47
48 ; GFX9: v_max3_i16
49 define amdgpu_kernel void @v_test_imax3_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
50 %tid = call i32 @llvm.amdgcn.workitem.id.x()
51 %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
52 %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
53 %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
54 %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
55 %a = load i16, i16 addrspace(1)* %gep0
56 %b = load i16, i16 addrspace(1)* %gep1
57 %c = load i16, i16 addrspace(1)* %gep2
58 %icmp0 = icmp sgt i16 %a, %b
59 %i0 = select i1 %icmp0, i16 %a, i16 %b
60 %icmp1 = icmp sgt i16 %i0, %c
61 %i1 = select i1 %icmp1, i16 %i0, i16 %c
62 store i16 %i1, i16 addrspace(1)* %out
63 ret void
64 }
65
66 ; GCN-LABEL: {{^}}v_test_umax3_ugt_i16:
67 ; SI: v_max3_u32
68
69 ; VI: v_max_u16
70 ; VI: v_max_u16
71
72 ; GFX9: v_max3_u16
73 define amdgpu_kernel void @v_test_umax3_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
74 %tid = call i32 @llvm.amdgcn.workitem.id.x()
75 %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
76 %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
77 %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
78 %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
79 %a = load i16, i16 addrspace(1)* %gep0
80 %b = load i16, i16 addrspace(1)* %gep1
81 %c = load i16, i16 addrspace(1)* %gep2
82 %icmp0 = icmp ugt i16 %a, %b
83 %i0 = select i1 %icmp0, i16 %a, i16 %b
84 %icmp1 = icmp ugt i16 %i0, %c
85 %i1 = select i1 %icmp1, i16 %i0, i16 %c
86 store i16 %i1, i16 addrspace(1)* %out
87 ret void
88 }
89
90 declare i32 @llvm.amdgcn.workitem.id.x() #1
91
92 attributes #0 = { nounwind }
93 attributes #1 = { nounwind readnone speculatable }
None ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
13
2 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
3
4 ; FUNC-LABEL: @v_test_imin3_slt_i32
5 ; SI: v_min3_i32
6 define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
7 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
4 ; GCN-LABEL: {{^}}v_test_imin3_slt_i32:
5 ; GCN: v_min3_i32
6 define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
7 %tid = call i32 @llvm.amdgcn.workitem.id.x()
88 %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
99 %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
1010 %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
1111 %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
12 %a = load i32, i32 addrspace(1)* %gep0, align 4
13 %b = load i32, i32 addrspace(1)* %gep1, align 4
14 %c = load i32, i32 addrspace(1)* %gep2, align 4
12 %a = load i32, i32 addrspace(1)* %gep0
13 %b = load i32, i32 addrspace(1)* %gep1
14 %c = load i32, i32 addrspace(1)* %gep2
1515 %icmp0 = icmp slt i32 %a, %b
1616 %i0 = select i1 %icmp0, i32 %a, i32 %b
1717 %icmp1 = icmp slt i32 %i0, %c
1818 %i1 = select i1 %icmp1, i32 %i0, i32 %c
19 store i32 %i1, i32 addrspace(1)* %outgep, align 4
19 store i32 %i1, i32 addrspace(1)* %outgep
2020 ret void
2121 }
2222
23 ; FUNC-LABEL: @v_test_umin3_ult_i32
24 ; SI: v_min3_u32
25 define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
26 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
23 ; GCN-LABEL: {{^}}v_test_umin3_ult_i32:
24 ; GCN: v_min3_u32
25 define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
26 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2727 %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
2828 %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
2929 %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
3030 %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
31 %a = load i32, i32 addrspace(1)* %gep0, align 4
32 %b = load i32, i32 addrspace(1)* %gep1, align 4
33 %c = load i32, i32 addrspace(1)* %gep2, align 4
31 %a = load i32, i32 addrspace(1)* %gep0
32 %b = load i32, i32 addrspace(1)* %gep1
33 %c = load i32, i32 addrspace(1)* %gep2
3434 %icmp0 = icmp ult i32 %a, %b
3535 %i0 = select i1 %icmp0, i32 %a, i32 %b
3636 %icmp1 = icmp ult i32 %i0, %c
3737 %i1 = select i1 %icmp1, i32 %i0, i32 %c
38 store i32 %i1, i32 addrspace(1)* %outgep, align 4
38 store i32 %i1, i32 addrspace(1)* %outgep
3939 ret void
4040 }
4141
42 ; FUNC-LABEL: @v_test_umin_umin_umin
43 ; SI: v_min_i32
44 ; SI: v_min3_i32
45 define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
46 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
42 ; GCN-LABEL: {{^}}v_test_umin_umin_umin:
43 ; GCN: v_min_i32
44 ; GCN: v_min3_i32
45 define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
46 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4747 %tid2 = mul i32 %tid, 2
4848 %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
4949 %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
5656 %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
5757 %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
5858
59 %a = load i32, i32 addrspace(1)* %gep0, align 4
60 %b = load i32, i32 addrspace(1)* %gep1, align 4
61 %c = load i32, i32 addrspace(1)* %gep2, align 4
62 %d = load i32, i32 addrspace(1)* %gep3, align 4
59 %a = load i32, i32 addrspace(1)* %gep0
60 %b = load i32, i32 addrspace(1)* %gep1
61 %c = load i32, i32 addrspace(1)* %gep2
62 %d = load i32, i32 addrspace(1)* %gep3
6363
6464 %icmp0 = icmp slt i32 %a, %b
6565 %i0 = select i1 %icmp0, i32 %a, i32 %b
7070 %icmp2 = icmp slt i32 %i0, %i1
7171 %i2 = select i1 %icmp2, i32 %i0, i32 %i1
7272
73 store i32 %i2, i32 addrspace(1)* %outgep1, align 4
73 store i32 %i2, i32 addrspace(1)* %outgep1
7474 ret void
7575 }
7676
77 ; FUNC-LABEL: @v_test_umin3_2_uses
78 ; SI-NOT: v_min3
79 define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
80 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
77 ; GCN-LABEL: {{^}}v_test_umin3_2_uses:
78 ; GCN-NOT: v_min3
79 define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
80 %tid = call i32 @llvm.amdgcn.workitem.id.x()
8181 %tid2 = mul i32 %tid, 2
8282 %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
8383 %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
9090 %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
9191 %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
9292
93 %a = load i32, i32 addrspace(1)* %gep0, align 4
94 %b = load i32, i32 addrspace(1)* %gep1, align 4
95 %c = load i32, i32 addrspace(1)* %gep2, align 4
96 %d = load i32, i32 addrspace(1)* %gep3, align 4
93 %a = load i32, i32 addrspace(1)* %gep0
94 %b = load i32, i32 addrspace(1)* %gep1
95 %c = load i32, i32 addrspace(1)* %gep2
96 %d = load i32, i32 addrspace(1)* %gep3
9797
9898 %icmp0 = icmp slt i32 %a, %b
9999 %i0 = select i1 %icmp0, i32 %a, i32 %b
104104 %icmp2 = icmp slt i32 %i0, %c
105105 %i2 = select i1 %icmp2, i32 %i0, i32 %c
106106
107 store i32 %i2, i32 addrspace(1)* %outgep0, align 4
108 store i32 %i0, i32 addrspace(1)* %outgep1, align 4
107 store i32 %i2, i32 addrspace(1)* %outgep0
108 store i32 %i0, i32 addrspace(1)* %outgep1
109109 ret void
110110 }
111
112 ; GCN-LABEL: {{^}}v_test_imin3_slt_i16:
113 ; SI: v_min3_i32
114
115 ; VI: v_min_i16
116 ; VI: v_min_i16
117
118 ; GFX9: v_min3_i16
119 define amdgpu_kernel void @v_test_imin3_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
120 %tid = call i32 @llvm.amdgcn.workitem.id.x()
121 %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
122 %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
123 %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
124 %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
125 %a = load i16, i16 addrspace(1)* %gep0
126 %b = load i16, i16 addrspace(1)* %gep1
127 %c = load i16, i16 addrspace(1)* %gep2
128 %icmp0 = icmp slt i16 %a, %b
129 %i0 = select i1 %icmp0, i16 %a, i16 %b
130 %icmp1 = icmp slt i16 %i0, %c
131 %i1 = select i1 %icmp1, i16 %i0, i16 %c
132 store i16 %i1, i16 addrspace(1)* %outgep
133 ret void
134 }
135
136 ; GCN-LABEL: {{^}}v_test_umin3_ult_i16:
137 ; SI: v_min3_u32
138
139 ; VI: v_min_u16
140 ; VI: v_min_u16
141
142 ; GFX9: v_min3_u16
143 define amdgpu_kernel void @v_test_umin3_ult_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
144 %tid = call i32 @llvm.amdgcn.workitem.id.x()
145 %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
146 %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
147 %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
148 %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
149 %a = load i16, i16 addrspace(1)* %gep0
150 %b = load i16, i16 addrspace(1)* %gep1
151 %c = load i16, i16 addrspace(1)* %gep2
152 %icmp0 = icmp ult i16 %a, %b
153 %i0 = select i1 %icmp0, i16 %a, i16 %b
154 %icmp1 = icmp ult i16 %i0, %c
155 %i1 = select i1 %icmp1, i16 %i0, i16 %c
156 store i16 %i1, i16 addrspace(1)* %outgep
157 ret void
158 }
159
160 declare i32 @llvm.amdgcn.workitem.id.x() #1
161
162 attributes #0 = { nounwind }
163 attributes #1 = { nounwind readnone speculatable }
3434 // GFX9: v_xad_u32 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf3,0xd1,0x02,0x07,0x12,0x04]
3535 // NOVI: :1: error: instruction not supported on this GPU
3636
37 v_min3_f16 v1, v2, v3, v4
38 // GFX9: v_min3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf4,0xd1,0x02,0x07,0x12,0x04]
39 // NOVI: :1: error: instruction not supported on this GPU
40
41 v_min3_i16 v1, v2, v3, v4
42 // GFX9: v_min3_i16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf5,0xd1,0x02,0x07,0x12,0x04]
43 // NOVI: :1: error: instruction not supported on this GPU
44
45 v_min3_u16 v1, v2, v3, v4
46 // GFX9: v_min3_u16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf6,0xd1,0x02,0x07,0x12,0x04]
47 // NOVI: :1: error: instruction not supported on this GPU
48
49 v_max3_f16 v1, v2, v3, v4
50 // GFX9: v_max3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf7,0xd1,0x02,0x07,0x12,0x04]
51 // NOVI: :1: error: instruction not supported on this GPU
52
53 v_max3_i16 v1, v2, v3, v4
54 // GFX9: v_max3_i16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf8,0xd1,0x02,0x07,0x12,0x04]
55 // NOVI: :1: error: instruction not supported on this GPU
56
57 v_max3_u16 v1, v2, v3, v4
58 // GFX9: v_max3_u16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf9,0xd1,0x02,0x07,0x12,0x04]
59 // NOVI: :1: error: instruction not supported on this GPU
60
3761 v_med3_f16 v1, v2, v3, v4
3862 // GFX9: v_med3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfa,0xd1,0x02,0x07,0x12,0x04]
3963 // NOVI: :1: error: instruction not supported on this GPU