llvm.org GIT mirror: llvm commit 8763b3a
AMDGPU: Add macro fusion schedule DAG mutation

Try to increase opportunities to shrink vcc uses.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@307313 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Matt Arsenault
18 changed files with 511 additions and 308 deletions.
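Why clustering helps (a hand-written sketch of the intended effect, not output from this commit; registers and surrounding code are made up): the 4-byte VOP2 encodings of v_addc_u32, v_subb_u32 and v_cndmask_b32 can only take their carry/condition operand from vcc, while the 8-byte VOP3 (_e64) forms accept an arbitrary SGPR pair. SIShrinkInstructions can rewrite an _e64 instruction to the _e32 form only when the condition value can be allocated to vcc, which requires vcc to be free between its def and its use; scheduling the def directly before the use maximizes that chance.

    ; Condition def far from its use: vcc is needed for something else in
    ; between, so the condition must live in an SGPR pair and both
    ; instructions keep the 8-byte VOP3 encoding.
    v_cmp_eq_u32_e64 s[0:1], v0, v1
    ; ... intervening code that clobbers vcc ...
    v_cndmask_b32_e64 v2, v3, v4, s[0:1]

    ; Def and use scheduled back to back: the condition can live in vcc and
    ; both instructions shrink to 4-byte encodings.
    v_cmp_eq_u32_e32 vcc, v0, v1
    v_cndmask_b32_e32 v2, v3, v4, vcc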
0 //===--- AMDGPUMacroFusion.cpp - AMDGPU Macro Fusion ----------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file contains the AMDGPU implementation of the DAG scheduling
10 /// mutation to pair instructions back to back.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPUMacroFusion.h"
15 #include "AMDGPUSubtarget.h"
16 #include "SIInstrInfo.h"
17
18 #include "llvm/CodeGen/MacroFusion.h"
19
20 using namespace llvm;
21
22 namespace {
23
24 /// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
25 /// together. When FirstMI is unspecified, check whether SecondMI may be
26 /// part of a fused pair at all.
27 static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
28 const TargetSubtargetInfo &TSI,
29 const MachineInstr *FirstMI,
30 const MachineInstr &SecondMI) {
31 const SIInstrInfo &TII = static_cast<const SIInstrInfo &>(TII_);
32
33 switch (SecondMI.getOpcode()) {
34 case AMDGPU::V_ADDC_U32_e64:
35 case AMDGPU::V_SUBB_U32_e64:
36 case AMDGPU::V_CNDMASK_B32_e64: {
37 // Try to cluster defs of condition registers to their uses. This improves
38 // the chance VCC will be available which will allow shrinking to VOP2
39 // encodings.
40 if (!FirstMI)
41 return true;
42
43 const MachineOperand *Src2 = TII.getNamedOperand(SecondMI,
44 AMDGPU::OpName::src2);
45 return FirstMI->definesRegister(Src2->getReg());
46 }
47 default:
48 return false;
49 }
50
51 return false;
52 }
53
54 } // end namespace
55
56
57 namespace llvm {
58
59 std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation() {
60 return createMacroFusionDAGMutation(shouldScheduleAdjacent);
61 }
62
63 } // end namespace llvm
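For context, here is how the generic mutation returned by createMacroFusionDAGMutation consumes the predicate above (a condensed, hand-written sketch of llvm/lib/CodeGen/MacroFusion.cpp, not a verbatim copy): for each scheduling unit whose instruction qualifies as a SecondMI, it scans that unit's predecessors for a matching FirstMI and, on success, adds cluster edges so the scheduler keeps the pair adjacent.

    // Condensed sketch of the framework side; the ExitSU case and edge
    // bookkeeping are omitted.
    void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
      ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
      for (SUnit &AnchorSU : DAG->SUnits) {
        const MachineInstr &AnchorMI = *AnchorSU.getInstr();
        // May AnchorMI be the second instruction of a fused pair at all?
        // For AMDGPU this is the opcode switch above.
        if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(),
                                    /*FirstMI=*/nullptr, AnchorMI))
          continue;
        // Look among the predecessors for an instruction the predicate
        // accepts as FirstMI; for AMDGPU, the def of the src2 condition
        // register.
        for (SDep &Dep : AnchorSU.Preds) {
          SUnit &DepSU = *Dep.getSUnit();
          if (Dep.isWeak() || DepSU.isBoundaryNode())
            continue;
          if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(),
                                      DepSU.getInstr(), AnchorMI))
            continue;
          // Add cluster edges (SDep::Cluster) so the scheduler emits the
          // pair back to back.
          fuseInstructionPair(*DAG, DepSU, AnchorSU);
          break;
        }
      }
    }

The DAG->addMutation(createAMDGPUMacroFusionDAGMutation()) call added to AMDGPUTargetMachine.cpp below is what actually enables this for the GCN max-occupancy scheduler.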
0 //===- AMDGPUMacroFusion.h - AMDGPU Macro Fusion ----------------*- C++ -*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "llvm/CodeGen/MachineScheduler.h"
10
11 namespace llvm {
12
13 /// Note that you have to add:
14 /// DAG.addMutation(createAMDGPUMacroFusionDAGMutation());
15 /// to AMDGPUPassConfig::createMachineScheduler() to have an effect.
16 std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation();
17
18 } // llvm
1818 #include "AMDGPUCallLowering.h"
1919 #include "AMDGPUInstructionSelector.h"
2020 #include "AMDGPULegalizerInfo.h"
21 #include "AMDGPUMacroFusion.h"
2122 #include "AMDGPUTargetObjectFile.h"
2223 #include "AMDGPUTargetTransformInfo.h"
2324 #include "GCNIterativeScheduler.h"
172173 new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
173174 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
174175 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
176 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
175177 return DAG;
176178 }
177179
4646 AMDGPUIntrinsicInfo.cpp
4747 AMDGPUISelDAGToDAG.cpp
4848 AMDGPULowerIntrinsics.cpp
49 AMDGPUMacroFusion.cpp
4950 AMDGPUMCInstLower.cpp
5051 AMDGPUMachineCFGStructurizer.cpp
5152 AMDGPUMachineFunction.cpp
99 ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
1010 ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
1111 ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
12
12 ; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
13 ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
14 ; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
15 ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
16
17 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
1318 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
1419 ; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16)
1520 ; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
1621 ; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
1722
1823 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
19
20 ; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
21
22 ; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
23 ; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
24 ; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
25 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
24 ; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
25 ; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
26 ; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
27 ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
2628
2729 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
2830
4749 ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
4850 ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
4951
52 ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
53 ; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
54 ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
55 ; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
56 ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
57
5058 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
5159 ; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(15, 0, 16)
5260 ; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
5462
5563 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
5664
57 ; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
58
59 ; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
60 ; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
61 ; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
62 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
65 ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
66 ; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
67 ; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
68 ; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
69 ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
6370
6471 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
6572
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
33
44 declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
55 declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
3434
3535 ; FUNC-LABEL: {{^}}v_ctlz_i32:
3636 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
37 ; GCN-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
38 ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[CTLZ]]
37 ; GCN: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
38 ; GCN: v_cmp_ne_u32_e32 vcc, 0, [[VAL]]
3939 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[CTLZ]], vcc
4040 ; GCN: buffer_store_dword [[RESULT]],
4141 ; GCN: s_endpgm
103103
104104 ; FUNC-LABEL: {{^}}v_ctlz_i8:
105105 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
106 ; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
107 ; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
106 ; SI-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
107 ; VI-DAG: v_ffbh_u32_sdwa [[FFBH:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
108 ; SI: v_cmp_ne_u32_e32 vcc, 0, [[VAL]]
109 ; VI: v_cmp_ne_u16_e32 vcc, 0, [[VAL]]
110
111 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 32, [[FFBH]], vcc
112
113 ; SI: v_subrev_i32_e32 [[RESULT:v[0-9]+]], vcc, 24, [[SELECT]]
114 ; VI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, -16, [[SELECT]]
108115 ; GCN: buffer_store_byte [[RESULT]],
109116 ; GCN: s_endpgm
110117 define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
141148
142149 ; FUNC-LABEL: {{^}}v_ctlz_i64:
143150 ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
144 ; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
151 ; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]]
145152 ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
146153 ; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
147154 ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
148 ; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]
155 ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], vcc
149156 ; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]
150157 ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
151158 ; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
123123
124124 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64:
125125 ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
126 ; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
126 ; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]]
127127 ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
128128 ; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
129129 ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
130 ; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
130 ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
131131 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI:[0-9]+]]{{\]}}
132132 define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
133133 %tid = call i32 @llvm.r600.read.tidig.x()
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}fcmp_f16_lt
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
350350 ret void
351351 }
352352
353 ; GCN-LABEL: {{^}}fcmp_v2f16_lt
354 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
355 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
356 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
357 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
358 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
359 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
360 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
361 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
362 ; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
363 ; SI: v_cmp_lt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
364 ; VI: v_cmp_lt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
365 ; VI: v_cmp_lt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
366 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
367 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
368 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
369 ; GCN: s_endpgm
353 ; GCN-LABEL: {{^}}fcmp_v2f16_lt:
354 ; SI: v_cmp_lt_f32_e32 vcc,
355 ; SI: v_cmp_lt_f32_e32 vcc,
356
357 ; VI: v_cmp_lt_f16_e32 vcc,
358 ; VI: v_cmp_lt_f16_e32 vcc,
370359 define amdgpu_kernel void @fcmp_v2f16_lt(
371360 <2 x i32> addrspace(1)* %r,
372361 <2 x half> addrspace(1)* %a,
381370 }
382371
383372 ; GCN-LABEL: {{^}}fcmp_v2f16_eq
384 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
385 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
386 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
387 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
388 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
389 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
390 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
391 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
392 ; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
393 ; SI: v_cmp_eq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
394 ; VI: v_cmp_eq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
395 ; VI: v_cmp_eq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
396 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
397 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
398 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
399 ; GCN: s_endpgm
373 ; SI: v_cmp_eq_f32_e32 vcc,
374 ; SI: v_cmp_eq_f32_e32 vcc,
375
376 ; VI: v_cmp_eq_f16_e32 vcc,
377 ; VI: v_cmp_eq_f16_e32 vcc,
400378 define amdgpu_kernel void @fcmp_v2f16_eq(
401379 <2 x i32> addrspace(1)* %r,
402380 <2 x half> addrspace(1)* %a,
410388 ret void
411389 }
412390
413 ; GCN-LABEL: {{^}}fcmp_v2f16_le
414 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
415 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
416 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
417 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
418 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
419 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
420 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
421 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
422 ; SI: v_cmp_le_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
423 ; SI: v_cmp_le_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
424 ; VI: v_cmp_le_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
425 ; VI: v_cmp_le_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
426 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
427 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
428 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
429 ; GCN: s_endpgm
391 ; GCN-LABEL: {{^}}fcmp_v2f16_le:
392 ; SI: v_cmp_le_f32_e32 vcc
393 ; SI: v_cmp_le_f32_e32 vcc
394 ; VI: v_cmp_le_f16_e32 vcc
395 ; VI: v_cmp_le_f16_e32 vcc
430396 define amdgpu_kernel void @fcmp_v2f16_le(
431397 <2 x i32> addrspace(1)* %r,
432398 <2 x half> addrspace(1)* %a,
440406 ret void
441407 }
442408
443 ; GCN-LABEL: {{^}}fcmp_v2f16_gt
444 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
445 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
446 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
447 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
448 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
449 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
450 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
451 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
452 ; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
453 ; SI: v_cmp_gt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
454 ; VI: v_cmp_gt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
455 ; VI: v_cmp_gt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
456 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
457 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
458 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
459 ; GCN: s_endpgm
409 ; GCN-LABEL: {{^}}fcmp_v2f16_gt:
410 ; SI: v_cmp_gt_f32_e32 vcc,
411 ; SI: v_cmp_gt_f32_e32 vcc,
412
413 ; VI: v_cmp_gt_f16_e32 vcc,
414 ; VI: v_cmp_gt_f16_e32 vcc,
460415 define amdgpu_kernel void @fcmp_v2f16_gt(
461416 <2 x i32> addrspace(1)* %r,
462417 <2 x half> addrspace(1)* %a,
470425 ret void
471426 }
472427
473 ; GCN-LABEL: {{^}}fcmp_v2f16_lg
474 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
475 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
476 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
477 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
478 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
479 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
480 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
481 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
482 ; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
483 ; SI: v_cmp_lg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
484 ; VI: v_cmp_lg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
485 ; VI: v_cmp_lg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
486 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
487 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
488 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
489 ; GCN: s_endpgm
428 ; GCN-LABEL: {{^}}fcmp_v2f16_lg:
429 ; SI: v_cmp_lg_f32_e32 vcc,
430 ; SI: v_cmp_lg_f32_e32 vcc,
431
432 ; VI: v_cmp_lg_f16_e32 vcc,
433 ; VI: v_cmp_lg_f16_e32 vcc,
490434 define amdgpu_kernel void @fcmp_v2f16_lg(
491435 <2 x i32> addrspace(1)* %r,
492436 <2 x half> addrspace(1)* %a,
500444 ret void
501445 }
502446
503 ; GCN-LABEL: {{^}}fcmp_v2f16_ge
504 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
505 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
506 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
507 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
508 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
509 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
510 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
511 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
512 ; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
513 ; SI: v_cmp_ge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
514 ; VI: v_cmp_ge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
515 ; VI: v_cmp_ge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
516 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
517 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
518 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
519 ; GCN: s_endpgm
447 ; GCN-LABEL: {{^}}fcmp_v2f16_ge:
448 ; SI: v_cmp_ge_f32_e32 vcc,
449 ; SI: v_cmp_ge_f32_e32 vcc,
450
451 ; VI: v_cmp_ge_f16_e32 vcc,
452 ; VI: v_cmp_ge_f16_e32 vcc,
520453 define amdgpu_kernel void @fcmp_v2f16_ge(
521454 <2 x i32> addrspace(1)* %r,
522455 <2 x half> addrspace(1)* %a,
530463 ret void
531464 }
532465
533 ; GCN-LABEL: {{^}}fcmp_v2f16_o
534 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
535 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
536 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
537 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
538 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
539 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
540 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
541 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
542 ; SI: v_cmp_o_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
543 ; SI: v_cmp_o_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
544 ; VI: v_cmp_o_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
545 ; VI: v_cmp_o_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
546 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
547 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
548 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
549 ; GCN: s_endpgm
466 ; GCN-LABEL: {{^}}fcmp_v2f16_o:
467 ; SI: v_cmp_o_f32_e32 vcc,
468 ; SI: v_cmp_o_f32_e32 vcc,
469
470 ; VI: v_cmp_o_f16_e32 vcc,
471 ; VI: v_cmp_o_f16_e32 vcc,
550472 define amdgpu_kernel void @fcmp_v2f16_o(
551473 <2 x i32> addrspace(1)* %r,
552474 <2 x half> addrspace(1)* %a,
560482 ret void
561483 }
562484
563 ; GCN-LABEL: {{^}}fcmp_v2f16_u
564 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
565 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
566 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
567 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
568 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
569 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
570 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
571 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
572 ; SI: v_cmp_u_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
573 ; SI: v_cmp_u_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
574 ; VI: v_cmp_u_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
575 ; VI: v_cmp_u_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
576 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
577 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
578 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
579 ; GCN: s_endpgm
485 ; GCN-LABEL: {{^}}fcmp_v2f16_u:
486 ; SI: v_cmp_u_f32_e32 vcc,
487 ; SI: v_cmp_u_f32_e32 vcc,
488
489 ; VI: v_cmp_u_f16_e32 vcc,
490 ; VI: v_cmp_u_f16_e32 vcc,
580491 define amdgpu_kernel void @fcmp_v2f16_u(
581492 <2 x i32> addrspace(1)* %r,
582493 <2 x half> addrspace(1)* %a,
591502 }
592503
593504 ; GCN-LABEL: {{^}}fcmp_v2f16_nge
594 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
595 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
596 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
597 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
598 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
599 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
600 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
601 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
602 ; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
603 ; SI: v_cmp_nge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
604 ; VI: v_cmp_nge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
605 ; VI: v_cmp_nge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
606 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
607 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
608 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
609 ; GCN: s_endpgm
505 ; SI: v_cmp_nge_f32_e32 vcc,
506 ; SI: v_cmp_nge_f32_e32 vcc,
507
508 ; VI: v_cmp_nge_f16_e32 vcc,
509 ; VI: v_cmp_nge_f16_e32 vcc,
610510 define amdgpu_kernel void @fcmp_v2f16_nge(
611511 <2 x i32> addrspace(1)* %r,
612512 <2 x half> addrspace(1)* %a,
621521 }
622522
623523 ; GCN-LABEL: {{^}}fcmp_v2f16_nlg
624 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
625 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
626 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
627 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
628 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
629 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
630 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
631 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
632 ; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
633 ; SI: v_cmp_nlg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
634 ; VI: v_cmp_nlg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
635 ; VI: v_cmp_nlg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
636 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
637 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
638 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
639 ; GCN: s_endpgm
524 ; SI: v_cmp_nlg_f32_e32 vcc
525 ; SI: v_cmp_nlg_f32_e32 vcc
526
527 ; VI: v_cmp_nlg_f16_e32 vcc
528 ; VI: v_cmp_nlg_f16_e32 vcc
640529 define amdgpu_kernel void @fcmp_v2f16_nlg(
641530 <2 x i32> addrspace(1)* %r,
642531 <2 x half> addrspace(1)* %a,
651540 }
652541
653542 ; GCN-LABEL: {{^}}fcmp_v2f16_ngt
654 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
655 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
656 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
657 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
658 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
659 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
660 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
661 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
662 ; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
663 ; SI: v_cmp_ngt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
664 ; VI: v_cmp_ngt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
665 ; VI: v_cmp_ngt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
666 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
667 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
668 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
669 ; GCN: s_endpgm
543 ; SI: v_cmp_ngt_f32_e32 vcc,
544 ; SI: v_cmp_ngt_f32_e32 vcc,
545
546 ; VI: v_cmp_ngt_f16_e32 vcc,
547 ; VI: v_cmp_ngt_f16_e32 vcc,
670548 define amdgpu_kernel void @fcmp_v2f16_ngt(
671549 <2 x i32> addrspace(1)* %r,
672550 <2 x half> addrspace(1)* %a,
681559 }
682560
683561 ; GCN-LABEL: {{^}}fcmp_v2f16_nle
684 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
685 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
686 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
687 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
688 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
689 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
690 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
691 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
692 ; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
693 ; SI: v_cmp_nle_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
694 ; VI: v_cmp_nle_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
695 ; VI: v_cmp_nle_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
696 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
697 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
698 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
699 ; GCN: s_endpgm
562 ; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
563 ; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
564
565 ; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
566 ; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
700567 define amdgpu_kernel void @fcmp_v2f16_nle(
701568 <2 x i32> addrspace(1)* %r,
702569 <2 x half> addrspace(1)* %a,
711578 }
712579
713580 ; GCN-LABEL: {{^}}fcmp_v2f16_neq
714 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
715 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
716 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
717 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
718 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
719 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
720 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
721 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
722 ; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
723 ; SI: v_cmp_neq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
724 ; VI: v_cmp_neq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
725 ; VI: v_cmp_neq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
726 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
727 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
728 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
729 ; GCN: s_endpgm
581 ; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
582 ; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
583
584 ; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
585 ; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
730586 define amdgpu_kernel void @fcmp_v2f16_neq(
731587 <2 x i32> addrspace(1)* %r,
732588 <2 x half> addrspace(1)* %a,
743599 ; GCN-LABEL: {{^}}fcmp_v2f16_nlt
744600 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
745601 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
746 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
747 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
748 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
749 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
750 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
751 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
752 ; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
753 ; SI: v_cmp_nlt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
754 ; VI: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
755 ; VI: v_cmp_nlt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
602 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
603 ; GCN-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
604 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
605 ; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
606
607 ; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
608 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
609 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
610 ; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]]
611 ; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
756612 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
613
614 ; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16_1]], v[[B_F16_1]]
757615 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
758616 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
759617 ; GCN: s_endpgm
221221 ; FIXME: Should be scheduled to shrink vcc
222222 ; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:
223223 ; CHECK: v_cmp_eq_u32_e32 vcc, 1, v0
224 ; CHECK: v_cmp_eq_u32_e64 s[0:1], 1, v1
225224 ; CHECK: v_cndmask_b32_e64 v0, 0, -1, vcc
226 ; CHECK: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
225 ; CHECK: v_cmp_eq_u32_e32 vcc, 1, v1
226 ; CHECK: v_cndmask_b32_e64 v1, 0, -1, vcc
227227 define amdgpu_kernel void @i1_input_phys_vgpr_x2() {
228228 entry:
229229 %val0 = load volatile i1, i1 addrspace(1)* undef
0 # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s
1
2 # GCN-LABEL: name: cluster_add_addc
3 # GCN: S_NOP 0, implicit-def %vcc
4 # GCN: dead %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
5 # GCN: dead %4, dead %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec
6 name: cluster_add_addc
7 registers:
8 - { id: 0, class: vgpr_32 }
9 - { id: 1, class: vgpr_32 }
10 - { id: 2, class: vgpr_32 }
11 - { id: 3, class: sreg_64 }
12 - { id: 4, class: vgpr_32 }
13 - { id: 5, class: sreg_64 }
14 - { id: 6, class: vgpr_32 }
15 - { id: 7, class: vgpr_32 }
16
17 body: |
18 bb.0:
19 %0 = V_MOV_B32_e32 0, implicit %exec
20 %1 = V_MOV_B32_e32 0, implicit %exec
21 %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
22 %6 = V_MOV_B32_e32 0, implicit %exec
23 %7 = V_MOV_B32_e32 0, implicit %exec
24 S_NOP 0, implicit def %vcc
25 %4, %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec
26 ...
27
28 # GCN-LABEL: name: interleave_add64s
29 # GCN: dead %8, %9 = V_ADD_I32_e64 %0, %1, implicit %exec
30 # GCN-NEXT: dead %12, dead %13 = V_ADDC_U32_e64 %4, %5, %9, implicit %exec
31 # GCN-NEXT: dead %10, %11 = V_ADD_I32_e64 %2, %3, implicit %exec
32 # GCN-NEXT: dead %14, dead %15 = V_ADDC_U32_e64 %6, %7, %11, implicit %exec
33 name: interleave_add64s
34 registers:
35 - { id: 0, class: vgpr_32 }
36 - { id: 1, class: vgpr_32 }
37 - { id: 2, class: vgpr_32 }
38 - { id: 3, class: vgpr_32 }
39 - { id: 4, class: vgpr_32 }
40 - { id: 5, class: vgpr_32 }
41 - { id: 6, class: vgpr_32 }
42 - { id: 7, class: vgpr_32 }
43 - { id: 8, class: vgpr_32 }
44 - { id: 9, class: sreg_64 }
45 - { id: 10, class: vgpr_32 }
46 - { id: 11, class: sreg_64 }
47 - { id: 12, class: vgpr_32 }
48 - { id: 13, class: sreg_64 }
49 - { id: 14, class: vgpr_32 }
50 - { id: 15, class: sreg_64 }
51
52 body: |
53 bb.0:
54 %0 = V_MOV_B32_e32 0, implicit %exec
55 %1 = V_MOV_B32_e32 0, implicit %exec
56 %2 = V_MOV_B32_e32 0, implicit %exec
57 %3 = V_MOV_B32_e32 0, implicit %exec
58 %4 = V_MOV_B32_e32 0, implicit %exec
59 %5 = V_MOV_B32_e32 0, implicit %exec
60 %6 = V_MOV_B32_e32 0, implicit %exec
61 %7 = V_MOV_B32_e32 0, implicit %exec
62
63 %8, %9 = V_ADD_I32_e64 %0, %1, implicit %exec
64 %10, %11 = V_ADD_I32_e64 %2, %3, implicit %exec
65
66
67 %12, %13 = V_ADDC_U32_e64 %4, %5, %9, implicit %exec
68 %14, %15 = V_ADDC_U32_e64 %6, %7, %11, implicit %exec
69 ...
70
71 # GCN-LABEL: name: cluster_mov_addc
72 # GCN: S_NOP 0, implicit-def %vcc
73 # GCN-NEXT: %2 = S_MOV_B64 0
74 # GCN-NEXT: dead %3, dead %4 = V_ADDC_U32_e64 %0, %1, %2, implicit %exec
75 name: cluster_mov_addc
76 registers:
77 - { id: 0, class: vgpr_32 }
78 - { id: 1, class: vgpr_32 }
79 - { id: 2, class: sreg_64 }
80 - { id: 3, class: vgpr_32 }
81 - { id: 4, class: sreg_64 }
82 - { id: 6, class: vgpr_32 }
83 - { id: 7, class: vgpr_32 }
84
85 body: |
86 bb.0:
87 %0 = V_MOV_B32_e32 0, implicit %exec
88 %1 = V_MOV_B32_e32 0, implicit %exec
89 %2 = S_MOV_B64 0
90 S_NOP 0, implicit def %vcc
91 %3, %4 = V_ADDC_U32_e64 %0, %1, %2, implicit %exec
92 ...
93
94 # GCN-LABEL: name: no_cluster_add_addc_diff_sgpr
95 # GCN: dead %2, dead %3 = V_ADD_I32_e64 %0, %1, implicit %exec
96 # GCN-NEXT: %6 = V_MOV_B32_e32 0, implicit %exec
97 # GCN-NEXT: %7 = V_MOV_B32_e32 0, implicit %exec
98 # GCN-NEXT: S_NOP 0, implicit-def %vcc
99 # GCN-NEXT: %8 = S_MOV_B64 0
100 # GCN-NEXT: dead %4, dead %5 = V_ADDC_U32_e64 %6, %7, %8, implicit %exec
101 name: no_cluster_add_addc_diff_sgpr
102 registers:
103 - { id: 0, class: vgpr_32 }
104 - { id: 1, class: vgpr_32 }
105 - { id: 2, class: vgpr_32 }
106 - { id: 3, class: sreg_64 }
107 - { id: 4, class: vgpr_32 }
108 - { id: 5, class: sreg_64 }
109 - { id: 6, class: vgpr_32 }
110 - { id: 7, class: vgpr_32 }
111 - { id: 8, class: sreg_64 }
112 body: |
113 bb.0:
114 %0 = V_MOV_B32_e32 0, implicit %exec
115 %1 = V_MOV_B32_e32 0, implicit %exec
116 %8 = S_MOV_B64 0
117 %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
118 %6 = V_MOV_B32_e32 0, implicit %exec
119 %7 = V_MOV_B32_e32 0, implicit %exec
120 S_NOP 0, implicit def %vcc
121 %4, %5 = V_ADDC_U32_e64 %6, %7, %8, implicit %exec
122 ...
123 # GCN-LABEL: name: cluster_sub_subb
124 # GCN: S_NOP 0, implicit-def %vcc
125 # GCN: dead %2, %3 = V_SUB_I32_e64 %0, %1, implicit %exec
126 # GCN: dead %4, dead %5 = V_SUBB_U32_e64 %6, %7, %3, implicit %exec
127 name: cluster_sub_subb
128 registers:
129 - { id: 0, class: vgpr_32 }
130 - { id: 1, class: vgpr_32 }
131 - { id: 2, class: vgpr_32 }
132 - { id: 3, class: sreg_64 }
133 - { id: 4, class: vgpr_32 }
134 - { id: 5, class: sreg_64 }
135 - { id: 6, class: vgpr_32 }
136 - { id: 7, class: vgpr_32 }
137
138 body: |
139 bb.0:
140 %0 = V_MOV_B32_e32 0, implicit %exec
141 %1 = V_MOV_B32_e32 0, implicit %exec
142 %2, %3 = V_SUB_I32_e64 %0, %1, implicit %exec
143 %6 = V_MOV_B32_e32 0, implicit %exec
144 %7 = V_MOV_B32_e32 0, implicit %exec
145 S_NOP 0, implicit def %vcc
146 %4, %5 = V_SUBB_U32_e64 %6, %7, %3, implicit %exec
147 ...
148
149 # GCN-LABEL: name: cluster_cmp_cndmask
150 # GCN: S_NOP 0, implicit-def %vcc
151 # GCN-NEXT: %3 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
152 # GCN-NEXT: dead %4 = V_CNDMASK_B32_e64 %0, %1, %3, implicit %exec
153 name: cluster_cmp_cndmask
154 registers:
155 - { id: 0, class: vgpr_32 }
156 - { id: 1, class: vgpr_32 }
157 - { id: 2, class: vgpr_32 }
158 - { id: 3, class: sreg_64 }
159 - { id: 4, class: vgpr_32 }
160 - { id: 5, class: sreg_64 }
161 - { id: 6, class: vgpr_32 }
162 - { id: 7, class: vgpr_32 }
163
164 body: |
165 bb.0:
166 %0 = V_MOV_B32_e32 0, implicit %exec
167 %1 = V_MOV_B32_e32 0, implicit %exec
168 %3 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
169 S_NOP 0, implicit def %vcc
170 %4 = V_CNDMASK_B32_e64 %0, %1, %3, implicit %exec
171 ...
172
173 # GCN-LABEL: name: cluster_multi_use_cmp_cndmask
174 # GCN: %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
175 # GCN-NEXT: dead %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
176 # GCN-NEXT: dead %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
177 name: cluster_multi_use_cmp_cndmask
178 registers:
179 - { id: 0, class: vgpr_32 }
180 - { id: 1, class: vgpr_32 }
181 - { id: 2, class: vgpr_32 }
182 - { id: 3, class: vgpr_32 }
183 - { id: 4, class: sreg_64 }
184 - { id: 5, class: vgpr_32 }
185 - { id: 6, class: vgpr_32 }
186 - { id: 7, class: vgpr_32 }
187
188 body: |
189 bb.0:
190 %0 = V_MOV_B32_e32 0, implicit %exec
191 %1 = V_MOV_B32_e32 0, implicit %exec
192 %2 = V_MOV_B32_e32 0, implicit %exec
193 %3 = V_MOV_B32_e32 0, implicit %exec
194
195 %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
196 S_NOP 0, implicit def %vcc
197 %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
198 %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
199 ...
200
201 # GCN-LABEL: name: cluster_multi_use_cmp_cndmask2
202 # GCN: %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
203 # GCN-NEXT: dead %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
204 # GCN-NEXT: %3 = V_MOV_B32_e32 0, implicit %exec
205 # GCN-NEXT: dead %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
206 name: cluster_multi_use_cmp_cndmask2
207 registers:
208 - { id: 0, class: vgpr_32 }
209 - { id: 1, class: vgpr_32 }
210 - { id: 2, class: vgpr_32 }
211 - { id: 3, class: vgpr_32 }
212 - { id: 4, class: sreg_64 }
213 - { id: 5, class: vgpr_32 }
214 - { id: 6, class: vgpr_32 }
215 - { id: 7, class: vgpr_32 }
216
217 body: |
218 bb.0:
219 %0 = V_MOV_B32_e32 0, implicit %exec
220 %1 = V_MOV_B32_e32 0, implicit %exec
221 %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
222 %2 = V_MOV_B32_e32 0, implicit %exec
223 %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
224 %3 = V_MOV_B32_e32 0, implicit %exec
225 %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
226 ...
6565
6666 ; FIXME: Why is this compare essentially repeated?
6767 ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
68 ; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
6968 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
70 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1
69 ; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
70 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
7171
7272 ; GCN: ; %Flow1
7373 ; GCN-NEXT: s_or_b64 exec, exec
133133
134134 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2:
135135 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
136 ; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
137 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
136 ; GCN-DAG: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
137 ; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
138138 define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
139139 %icmp0 = icmp ugt i32 %a, %b
140140 %sub0 = sub i32 %a, %b
103103 ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc
104104 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
105105
106 ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}}
106107 ; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
107 ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}}
108108 ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
109109 ; GCN: buffer_store_short v[[R_F16]]
110110 ; GCN: s_endpgm
133133 ; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
134134 ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]]
135135 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
136 ; VI: v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}}
136137 ; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
137 ; VI: v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}}
138138 ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
139139 ; GCN: buffer_store_short v[[R_F16]]
140140 ; GCN: s_endpgm
158158 ; SI: v_cvt_f32_f16_e32
159159 ; SI: v_cvt_f32_f16_e32
160160 ; SI: v_cvt_f32_f16_e32
161 ; SI: v_cmp_lt_f32_e64
162161 ; SI: v_cmp_lt_f32_e32
163162 ; SI: v_cndmask_b32_e32
164 ; SI: v_cndmask_b32_e64
163 ; SI: v_cmp_lt_f32_e32
164 ; SI: v_cndmask_b32_e32
165165 ; SI: v_cvt_f16_f32_e32
166166 ; SI: v_cvt_f16_f32_e32
167167
168 ; VI: v_cmp_lt_f16_e64
169168 ; VI: v_cmp_lt_f16_e32
170 ; VI: v_cndmask_b32_e64
169 ; VI: v_cndmask_b32_e32
170 ; VI: v_cmp_lt_f16_e32
171171 ; VI: v_cndmask_b32_e32
172172
173173 ; GCN: s_endpgm
195195 ; SI: v_cvt_f32_f16_e32
196196 ; SI: v_cvt_f32_f16_e32
197197 ; SI: v_cvt_f32_f16_e32
198 ; SI-DAG: v_cmp_gt_f32_e64
199 ; SI-DAG: v_cmp_lt_f32_e32 vcc, 0.5
200
201 ; VI: v_cmp_lt_f16_e32
202 ; VI: v_cmp_gt_f16_e64
203 ; GCN: v_cndmask_b32_e32
204 ; GCN: v_cndmask_b32_e64
198
199 ; SI: v_cmp_lt_f32_e32 vcc, 0.5
200 ; SI: v_cndmask_b32_e32
201 ; SI: v_cmp_gt_f32_e32
202 ; SI: v_cndmask_b32_e32
203
204 ; VI: v_cmp_lt_f16_e32
205 ; VI: v_cndmask_b32_e32
206 ; VI: v_cmp_gt_f16_e32
207 ; VI: v_cndmask_b32_e32
208
205209 ; SI: v_cvt_f16_f32_e32
206210 ; SI: v_cvt_f16_f32_e32
207211 ; GCN: s_endpgm
227231 ; SI: v_cvt_f32_f16_e32
228232 ; SI: v_cvt_f32_f16_e32
229233 ; SI: v_cvt_f32_f16_e32
230 ; SI-DAG: v_cmp_lt_f32_e64
231 ; SI-DAG: v_cmp_gt_f32_e32 vcc, 0.5
232
233 ; VI: v_cmp_gt_f16_e32
234 ; VI: v_cmp_lt_f16_e64
235 ; GCN: v_cndmask_b32_e32
236 ; GCN: v_cndmask_b32_e64
234
235 ; SI: v_cmp_gt_f32_e32 vcc, 0.5
236 ; SI: v_cndmask_b32_e32
237 ; SI: v_cmp_lt_f32_e32
238 ; SI: v_cndmask_b32_e32
239
240 ; VI: v_cmp_gt_f16_e32
241 ; VI: v_cndmask_b32_e32
242 ; VI: v_cmp_lt_f16_e32
243 ; VI: v_cndmask_b32_e32
237244
238245 ; SI: v_cvt_f16_f32_e32
239246 ; SI: v_cvt_f16_f32_e32
262269 ; SI: v_cvt_f32_f16_e32
263270
264271 ; SI: v_cmp_nlt_f32_e32
265 ; SI: v_cmp_nlt_f32_e64
266 ; SI: v_cndmask_b32_e64
272 ; SI: v_cndmask_b32_e32
273 ; SI: v_cmp_nlt_f32_e32
267274 ; SI: v_cndmask_b32_e32
268275
269276 ; VI: v_cmp_nlt_f16_e32
297304 ; SI: v_cvt_f32_f16_e32
298305 ; SI: v_cvt_f32_f16_e32
299306 ; SI: v_cvt_f32_f16_e32
300 ; SI: v_cmp_lt_f32_e64
307
301308 ; SI: v_cmp_lt_f32_e32
309 ; SI: v_cndmask_b32
310 ; SI: v_cmp_lt_f32_e32
311 ; SI: v_cndmask_b32
302312
303313 ; VI: v_cmp_lt_f16_e32
304 ; VI: v_cmp_lt_f16_e64
305 ; GCN: v_cndmask_b32
306 ; GCN: v_cndmask_b32
314 ; VI: v_cndmask_b32
315 ; VI: v_cmp_lt_f16_e32
316 ; VI: v_cndmask_b32
317
307318 ; SI: v_cvt_f16_f32_e32
308319 ; SI: v_cvt_f16_f32_e32
309320 ; GCN: s_endpgm
66 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
77 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
88
9 ; GCN-DAG: v_cmp_eq_u32_e32
10 ; GCN-DAG: v_cmp_eq_u32_e64
9 ; GCN: v_cmp_eq_u32_e32
10 ; GCN: v_cmp_eq_u32_e32
1111 define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
1212 %result = icmp eq <2 x i32> %a, %b
1313 %sext = sext <2 x i1> %result to <2 x i32>
2222 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2323
2424 ; GCN: v_cmp_eq_u32_e32
25 ; GCN: v_cmp_eq_u32_e64
26 ; GCN: v_cmp_eq_u32_e64
27 ; GCN: v_cmp_eq_u32_e64
25 ; GCN: v_cmp_eq_u32_e32
26 ; GCN: v_cmp_eq_u32_e32
27 ; GCN: v_cmp_eq_u32_e32
2828 define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
2929 %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
3030 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
5757 }
5858
5959 ; FUNC-LABEL: {{^}}v_uaddo_i32_novcc:
60 ; GCN: v_add_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
61 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]]
60 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
61 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
6262
6363 ; EG: ADDC_UINT
6464 ; EG: ADD_INT
5757 }
5858
5959 ; FUNC-LABEL: {{^}}v_usubo_i32_novcc:
60 ; GCN: v_sub_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
61 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]]
60 ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
61 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
6262
6363 ; EG-DAG: SUBB_UINT
6464 ; EG-DAG: SUB_INT
199199 ; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
200200 ; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc
201201
202 ; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
203 ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s
204 ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s
202 ; VI-DAG: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
203 ; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
204 ; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc
205205 define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
206206 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
207207 %tid.ext = sext i32 %tid to i64
291291 ; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1:
292292 ; GCN: load_dword
293293 ; GCN: load_ubyte
294 ; GCN-DAG: v_cmp_gt_i32_e64 s{{\[[0-9]+:[0-9]+\]}}, 0, v
294 ; GCN-DAG: v_cmp_gt_i32_e32 vcc, 0, v
295295 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1,
296 ; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1, v
297 ; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc
296 ; GCN-DAG: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, v
297 ; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}}
298298 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
299299 ; GCN: store_byte
300300 define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
66 ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
77 ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
88
9 ; SI: v_cndmask_b32_e64
9 ; SI: v_cmp_gt_i32_e32 vcc
10 ; SI: v_cndmask_b32_e32
11 ; SI: v_cmp_gt_i32_e32 vcc
1012 ; SI: v_cndmask_b32_e32
1113
1214 define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
2426 ; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2527 ; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2628
27 ;SI: v_cndmask_b32_e64
28 ;SI: v_cndmask_b32_e32
29
30 ; SI: v_cmp_neq_f32_e32 vcc
31 ; SI: v_cndmask_b32_e32
32 ; SI: v_cmp_neq_f32_e32 vcc
33 ; SI: v_cndmask_b32_e32
2934
3035 define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
3136 entry:
4449 ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
4550 ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
4651
47 ; FIXME: The shrinking does not happen on tonga
48
49 ; SI: v_cndmask_b32
50 ; SI: v_cndmask_b32
51 ; SI: v_cndmask_b32
52 ; SI: v_cndmask_b32
52 ; SI: v_cndmask_b32_e32
53 ; SI: v_cndmask_b32_e32
54 ; SI: v_cndmask_b32_e32
55 ; SI: v_cndmask_b32_e32
5356
5457 define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
5558 entry:
6770 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
6871 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
6972
73 ; SI: v_cndmask_b32_e32
74 ; SI: v_cndmask_b32_e32
75 ; SI: v_cndmask_b32_e32
76 ; SI: v_cndmask_b32_e32
7077 define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
7178 entry:
7279 %0 = load <4 x float>, <4 x float> addrspace(1)* %in0