llvm.org GIT mirror llvm / f0c3d71
[AMDGPU] Allow SDWA in instructions with immediates and SGPRs The instruction encoding does not allow SDWA to be used in an instruction with scalar operands, either literals or SGPRs. It is, however, possible to copy these operands into a VGPR first. Several copies of the value are produced if multiple SDWA conversions are done. To clean up, MachineLICM (to hoist copies out of loops), MachineCSE (to remove duplicate copies) and SIFoldOperands (to replace an SGPR-to-VGPR copy with an immediate copy directly into the VGPR) runs are added after the SDWA pass. Differential Revision: https://reviews.llvm.org/D33583 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304219 91177308-0d34-0410-b5e6-96231b3b80d8 Stanislav Mekhanoshin 2 years ago
51 changed file(s) with 746 addition(s) and 245 deletion(s). Raw diff Collapse all Expand all
735735 addPass(createSIShrinkInstructionsPass());
736736 if (EnableSDWAPeephole) {
737737 addPass(&SIPeepholeSDWAID);
738 addPass(&MachineLICMID);
739 addPass(&MachineCSEID);
740 addPass(&SIFoldOperandsID);
738741 addPass(&DeadMachineInstructionElimID);
739742 }
740743 }
246246
247247 // If the use operand doesn't care about the value, this may be an operand only
248248 // used for register indexing, in which case it is unsafe to fold.
249 static bool isUseSafeToFold(const MachineInstr &MI,
249 static bool isUseSafeToFold(const SIInstrInfo *TII,
250 const MachineInstr &MI,
250251 const MachineOperand &UseMO) {
251 return !UseMO.isUndef();
252 return !UseMO.isUndef() && !TII->isSDWA(MI);
252253 //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
253254 }
254255
260261 SmallVectorImpl &CopiesToReplace) const {
261262 const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
262263
263 if (!isUseSafeToFold(*UseMI, UseOp))
264 if (!isUseSafeToFold(TII, *UseMI, UseOp))
264265 return;
265266
266267 // FIXME: Fold operands with subregs.
5454
5555 std::unordered_map> SDWAOperands;
5656 std::unordered_map PotentialMatches;
57 SmallVector ConvertedInstructions;
5758
5859 Optional foldToImm(const MachineOperand &Op) const;
5960
6869 void matchSDWAOperands(MachineFunction &MF);
6970 bool isConvertibleToSDWA(const MachineInstr &MI) const;
7071 bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
72 void legalizeScalarOperands(MachineInstr &MI) const;
7173
7274 StringRef getPassName() const override { return "SI Peephole SDWA"; }
7375
288290 MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
289291 MachineOperand *SrcMods =
290292 TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
291 assert(Src && Src->isReg());
293 assert(Src && (Src->isReg() || Src->isImm()));
292294 if (!isSameReg(*Src, *getReplacedOperand())) {
293295 // If this is not src0 then it should be src1
294296 Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
579581 }
580582
581583 bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
582 // Check if this instruction can be converted to SDWA:
583 // 1. Does this opcode support SDWA
584 if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
585 return false;
586
587 // 2. Are all operands - VGPRs
588 for (const MachineOperand &Operand : MI.explicit_operands()) {
589 if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg()))
590 return false;
591 }
592
593 return true;
584 // Check if this instruction has opcode that supports SDWA
585 return AMDGPU::getSDWAOp(MI.getOpcode()) != -1;
594586 }
595587
596588 bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
684676 if (PotentialMatches.count(Operand->getParentInst()) == 0)
685677 Converted |= Operand->convertToSDWA(*SDWAInst, TII);
686678 }
687 if (!Converted) {
679 if (Converted) {
680 ConvertedInstructions.push_back(SDWAInst);
681 } else {
688682 SDWAInst->eraseFromParent();
689683 return false;
690684 }
695689
696690 MI.eraseFromParent();
697691 return true;
692 }
693
694 // If an instruction was converted to SDWA it should not have immediates or SGPR
695 // operands. Copy its scalar operands into VGPRs.
696 void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const {
697 const MCInstrDesc &Desc = TII->get(MI.getOpcode());
698 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
699 MachineOperand &Op = MI.getOperand(I);
700 if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
701 continue;
702 if (Desc.OpInfo[I].RegClass == -1 ||
703 !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
704 continue;
705 unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
706 auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
707 TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
708 if (Op.isImm())
709 Copy.addImm(Op.getImm());
710 else if (Op.isReg())
711 Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
712 Op.getSubReg());
713 Op.ChangeToRegister(VGPR, false);
714 }
698715 }
699716
700717 bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
727744
728745 PotentialMatches.clear();
729746 SDWAOperands.clear();
747
748 while (!ConvertedInstructions.empty())
749 legalizeScalarOperands(*ConvertedInstructions.pop_back_val());
750
730751 return false;
731752 }
2222 ; GFX9: s_load_dword [[VAL0:s[0-9]+]]
2323 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
2424 ; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
25 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]]
25 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VAL0]], [[VVAL1]]
2626
2727 ; VI: s_add_i32
2828 ; VI: s_add_i32
4949
5050 ; FIXME: VI should not scalarize arg access.
5151 ; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg:
52 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
52 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
5353
5454 ; VI: v_add_i32
5555 ; VI: v_add_i32_sdwa
6161
6262 ; GCN-LABEL: {{^}}v_test_add_v2i16_constant:
6363 ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
64 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
64 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
6565
6666 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
67 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}}
67 ; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8
68 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
6869 define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
6970 %tid = call i32 @llvm.amdgcn.workitem.id.x()
7071 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
7879 ; FIXME: Need to handle non-uniform case for function below (load without gep).
7980 ; GCN-LABEL: {{^}}v_test_add_v2i16_neg_constant:
8081 ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
81 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
82 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
8283
8384 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
84 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}}
85 ; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21
86 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
8587 define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
8688 %tid = call i32 @llvm.amdgcn.workitem.id.x()
8789 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
9597 ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1:
9698 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}}
9799
100 ; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
98101 ; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
99102 ; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
100 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD0]]
103 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], [[LOAD0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
101104 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
102 ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
103105 ; VI: v_or_b32_e32
104106 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
105107 %tid = call i32 @llvm.amdgcn.workitem.id.x()
113115
114116 ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi:
115117 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}}
116 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
118 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]{{$}}
117119
118120 ; VI-NOT: v_add_u16
119121 ; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}}
133135 ; The high element gives fp
134136 ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_fp_split:
135137 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
136 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
137
138 ; VI-NOT: v_add_u16
139 ; VI: v_add_u16_e32 v{{[0-9]+}}, 0x3f80, v{{[0-9]+}}
140 ; VI-NOT: v_add_u16
141 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
138 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]{{$}}
139
140 ; VI-NOT: v_add_u16
141 ; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80
142 ; VI: v_add_u16_sdwa v{{[0-9]+}}, v[[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
143 ; VI-NOT: v_add_u16
142144 ; VI: v_or_b32_e32
143145 define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
144146 %tid = call i32 @llvm.amdgcn.workitem.id.x()
190192 ; GFX9: flat_load_dword [[A:v[0-9]+]]
191193 ; GFX9: flat_load_dword [[B:v[0-9]+]]
192194
193 ; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
194195 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
195196 ; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
196197 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
197198 ; GFX9: buffer_store_dwordx4
198199
200 ; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
199201 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
200202 ; VI: flat_load_ushort v[[A_HI:[0-9]+]]
201203 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
202204 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
203205
204 ; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
205 ; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
206206 ; VI-DAG: v_add_u16_e32
207207 ; VI-DAG: v_add_u16_e32
208208
None ; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck --check-prefix=GCN --check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 < %s | FileCheck --check-prefix=GCN --check-prefix=VI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck --check-prefix=GCN --check-prefix=VI-SDWA %s
12 ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN --check-prefix=CI %s
23
34 ; GCN-LABEL: {{^}}bfe_combine8:
45 ; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 8, 8
56 ; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], 2, v[[BFE]]
7 ; VI-SDWA: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
8 ; VI-SDWA: v_lshlrev_b32_sdwa v[[ADDRBASE:[0-9]+]], v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
69 ; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 6, v{{[0-9]+}}
710 ; CI: v_and_b32_e32 v[[ADDRLO:[0-9]+]], 0x3fc, v[[SHR]]
811 ; VI: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]]
12 ; VI-SDWA: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]]
913 ; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]:
1014 define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x) {
1115 %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2
2125 ; GCN-LABEL: {{^}}bfe_combine16:
2226 ; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 16, 16
2327 ; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], {{[^,]+}}, v[[BFE]]
28 ; VI-SDWA: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 15
29 ; VI-SDWA: v_lshlrev_b32_sdwa v[[ADDRBASE1:[0-9]+]], v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
30 ; VI-SDWA: v_lshlrev_b64 v{{\[}}[[ADDRBASE:[0-9]+]]:{{[^\]+}}], 2, v{{\[}}[[ADDRBASE1]]:{{[^\]+}}]
31 ; VI-SDWA: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]]
2432 ; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 1, v{{[0-9]+}}
2533 ; CI: v_and_b32_e32 v[[AND:[0-9]+]], 0x7fff8000, v[[SHR]]
2634 ; CI: v_lshl_b64 v{{\[}}[[ADDRLO:[0-9]+]]:{{[^\]+}}], v{{\[}}[[AND]]:{{[^\]+}}], 2
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
11
22 declare i32 @llvm.amdgcn.workitem.id.x() #0
33
5050 ; FUNC-LABEL: @commute_add_lit_fabs_f32
5151 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
5252 ; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
53 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[K]], |[[X]]|
53 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
5454 ; SI: buffer_store_dword [[REG]]
5555 define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
5656 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
0 ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
22
33 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
44 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
9393 ; GCN-DAG: v_cvt_f32_ubyte3_e32
9494
9595 ; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24
96 ; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16
9796
9897 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
9998 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8
5454 ; SI-LABEL: {{^}}fabs_fold_f64:
5555 ; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
5656 ; SI-NOT: and
57 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|
57 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
5858 ; SI: s_endpgm
5959 define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
6060 %fabs = call double @llvm.fabs.f64(double %in0)
6666 ; SI-LABEL: {{^}}fabs_fn_fold_f64:
6767 ; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
6868 ; SI-NOT: and
69 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|
69 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
7070 ; SI: s_endpgm
7171 define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
7272 %fabs = call double @fabs(double %in0)
7474 ; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
7575 ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
7676 ; GCN-NOT: and
77 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]|
77 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
7878 define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
7979 %fabs = call float @fabs(float %in0)
8080 %fmul = fmul float %fabs, %in1
8686 ; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
8787 ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
8888 ; GCN-NOT: and
89 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]|
89 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
9090 define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
9191 %fabs = call float @llvm.fabs.f32(float %in0)
9292 %fmul = fmul float %fabs, %in1
9595 }
9696
9797 ; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
98 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
98 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
9999 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
100 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
100 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
101101 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
102102 ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
103103 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
106106 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
107107 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
108108
109 ; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
109 ; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000
110 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
110111 ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
111 ; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
112112 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
113113
114114 ; GCN: buffer_store_dword v[[R_V2_F16]]
124124 }
125125
126126 ; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
127 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
127 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
128128 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
129 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
129 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
130130 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
131131 ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]]
132132 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
135135 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
136136 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
137137
138 ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[A_F16_1]]
138 ; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
139 ; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[CONST1]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
139140 ; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]]
140 ; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
141 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_1]]
141 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
142142
143143 ; GCN: buffer_store_dword v[[R_V2_F16]]
144144 ; GCN: s_endpgm
1212 }
1313
1414 ; CHECK-LABEL: {{^}}s_fadd_f64:
15 ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
15 ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
1616 define amdgpu_kernel void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) {
1717 %r2 = fadd double %r0, %r1
1818 store double %r2, double addrspace(1)* %out
204204 }
205205
206206 ; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16:
207 ; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, {{v[0-9]+}}
207 ; VI: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
208 ; VI-DAG: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
208209 ; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}}
209 ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
210210 ; VI-NOT: v_and_b32
211211
212212 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
222222 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
223223 ; VI-DAG: v_bfe_u32
224224 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
225 ; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
225 ; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
226 ; VI: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
226227 ; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
227228 ; VI-NOT: 0xffff
228229 ; VI: v_or_b32
239240 }
240241
241242 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
242 ; VI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
243 ; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
244 ; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
243 ; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
244 ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
245 ; VI-DAG: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
246 ; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
245247 ; VI: v_or_b32
246248
247249 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
258260
259261 ; FIXME: Fold modifier
260262 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
261 ; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
262 ; VI-DAG: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]]
263 ; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]]
263 ; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
264 ; VI-DAG: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
265 ; VI-DAG: v_mul_f16_sdwa [[REG1:v[0-9]+]], v[[CONST1]], [[FNEG]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
264266 ; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]]
265 ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
266267 ; VI-NOT: 0xffff
267268
268269 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
9595 }
9696
9797 ; GCN-LABEL: {{^}}fmul_v2f16_imm_a:
98 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
98 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
9999 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
100 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
100 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
101101 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
102102 ; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
103103 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
104104 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
105105 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
106 ; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
106 ; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400
107 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
107108 ; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
108 ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
109 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
109110 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
110111 ; GCN: buffer_store_dword v[[R_V2_F16]]
111112 ; GCN: s_endpgm
120121 }
121122
122123 ; GCN-LABEL: {{^}}fmul_v2f16_imm_b:
123 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
124 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
124125 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
125 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
126 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
126127 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
127128 ; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
128129 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
129130 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
130131 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
131 ; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
132 ; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200
133 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST3]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
132134 ; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
133 ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
135 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
134136 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
135137 ; GCN: buffer_store_dword v[[R_V2_F16]]
136138 ; GCN: s_endpgm
7070 ; FIXME: single bit op
7171 ; GCN-LABEL: {{^}}s_fneg_fabs_v2f16:
7272 ; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
73 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
73 ; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
74 ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
75 ; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
7476 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
7577 ; CIVI: flat_store_dword
7678
8486
8587 ; GCN-LABEL: {{^}}fneg_fabs_v4f16:
8688 ; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
87 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
88 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
89 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
90 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
89 ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
90 ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
91 ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
92 ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
93 ; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
94 ; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
95 ; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
96 ; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
97 ; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
9198
9299 ; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
93100 ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
44 ; into 2 modifiers, although theoretically that should work.
55
66 ; GCN-LABEL: {{^}}fneg_fabs_fadd_f64:
7 ; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}}
7 ; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
88 define amdgpu_kernel void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
99 %fabs = call double @llvm.fabs.f64(double %x)
1010 %fsub = fsub double -0.000000e+00, %fabs
2424 }
2525
2626 ; GCN-LABEL: {{^}}fneg_fabs_fmul_f64:
27 ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}}
27 ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
2828 define amdgpu_kernel void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
2929 %fabs = call double @llvm.fabs.f64(double %x)
3030 %fsub = fsub double -0.000000e+00, %fabs
33
44 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
55 ; SI-NOT: and
6 ; SI: v_subrev_f32_e64 {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}}
6 ; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
77 define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
88 %fabs = call float @llvm.fabs.f32(float %x)
99 %fsub = fsub float -0.000000e+00, %fabs
1414
1515 ; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32:
1616 ; SI-NOT: and
17 ; SI: v_mul_f32_e64 {{v[0-9]+}}, -|{{v[0-9]+}}|, {{s[0-9]+}}
17 ; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}|
1818 ; SI-NOT: and
1919 define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
2020 %fabs = call float @llvm.fabs.f32(float %x)
129129 }
130130
131131 ; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16:
132 ; GCN: flat_load_dword [[VAL:v[0-9]+]]
132 ; GCN-DAG: flat_load_dword [[VAL:v[0-9]+]]
133133 ; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}
134134 ; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
135135
136 ; GFX89: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
136 ; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
137137 ; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
138 ; GFX89-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
138 ; GFX9-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
139 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
140 ; VI-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
139141 define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
140142 %val = load <2 x half>, <2 x half> addrspace(1)* %in
141143 %fneg = fsub <2 x half> , %val
1111 ; SI-DAG: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
1212 ; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
1313 ; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
14 ; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
14 ; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v{{\[}}[[UPLO]]:[[UPHI]]]
1515 ; SI-DAG: v_cmp_class_f64_e64 vcc, v{{\[}}[[LO]]:[[HI]]], 3
1616 ; SI: v_cndmask_b32_e32 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], vcc
1717 ; SI: v_cndmask_b32_e32 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], vcc
3838 ; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
3939 ; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
4040 ; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
41 ; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
41 ; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v{{\[}}[[UPLO]]:[[UPHI]]]
4242 ; SI-DAG: v_cmp_class_f64_e64 vcc, v{{\[}}[[LO]]:[[HI]]], 3
4343 ; SI: v_cndmask_b32_e32 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], vcc
4444 ; SI: v_cndmask_b32_e32 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], vcc
6666 ; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]|
6767 ; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
6868 ; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
69 ; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
69 ; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v{{\[}}[[UPLO]]:[[UPHI]]]
7070 ; SI-DAG: v_cmp_class_f64_e64 vcc, v{{\[}}[[LO]]:[[HI]]], 3
7171 ; SI: v_cndmask_b32_e32 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], vcc
7272 ; SI: v_cndmask_b32_e32 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], vcc
9898 }
9999
100100 ; GCN-LABEL: {{^}}fsub_v2f16_imm_a:
101 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
101 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
102102
103103 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
104104 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
110110 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
111111 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
112112
113 ; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
114 ; VI-DAG: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
113 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
114 ; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
115115 ; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
116 ; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
117116 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
118117
119118 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00
120 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
119 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] neg_lo:[1,0] neg_hi:[1,0]
121120
122121 ; GCN: buffer_store_dword v[[R_V2_F16]]
123122 ; GCN: s_endpgm
133132 }
134133
135134 ; GCN-LABEL: {{^}}fsub_v2f16_imm_b:
136 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
135 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
137136
138137 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
139138 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
145144 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
146145 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
147146
148 ; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
149 ; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
147 ; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
148 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONSTM1]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
150149 ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
151 ; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
152150 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
153151
154152 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000
155 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[A_V2_F16]]{{$}}
153 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]{{$}}
156154
157155 ; GCN: buffer_store_dword v[[R_V2_F16]]
158156 ; GCN: s_endpgm
3838 }
3939
4040 ; SI-LABEL: {{^}}s_fsub_f64:
41 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}
41 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
4242 define amdgpu_kernel void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
4343 %sub = fsub double %a, %b
4444 store double %sub, double addrspace(1)* %out
122122 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
123123 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
124124 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]]
125 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL1]]
125 ; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
126 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST0]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
126127 ; VI: v_or_b32
127128 ; VI: buffer_store_dword
128129 define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
139140 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
140141 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
141142 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]]
142 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL1]]
143 ; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
144 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST05]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
143145 ; VI: v_or_b32
144146 ; VI: buffer_store_dword
145147 define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
156158 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
157159 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
158160 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]]
159 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL1]]
161 ; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
162 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM05]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
160163 ; VI: v_or_b32
161164 ; VI: buffer_store_dword
162165 define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
173176 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
174177 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
175178 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]]
176 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL1]]
179 ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
180 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
177181 ; VI: v_or_b32
178182 ; VI: buffer_store_dword
179183 define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
190194 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
191195 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
192196 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]]
193 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL1]]
197 ; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
198 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
194199 ; VI: v_or_b32
195200 ; VI: buffer_store_dword
196201 define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
207212 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
208213 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
209214 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]]
210 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL1]]
215 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
216 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
211217 ; VI: v_or_b32
212218 ; VI: buffer_store_dword
213219 define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
224230 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
225231 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
226232 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]]
227 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL1]]
233 ; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
234 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
228235 ; VI: v_or_b32
229236 ; VI: buffer_store_dword
230237 define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
241248 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
242249 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
243250 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]]
244 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL1]]
251 ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
252 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST4]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
245253 ; VI: v_or_b32
246254 ; VI: buffer_store_dword
247255 define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
258266 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
259267 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
260268 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]]
261 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL1]]
269 ; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
270 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM4]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
262271 ; VI: v_or_b32
263272 ; VI: buffer_store_dword
264273 define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
272281 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
273282 ; GFX9: buffer_store_dword [[REG]]
274283
284 ; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
275285 ; VI: buffer_load_dword
276286 ; VI-NOT: and
277 ; VI: v_lshrrev_b32_e32 {{v[0-9]+}}, 16,
278 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
287 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST05]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
279288 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
280289 ; VI: v_or_b32
281290 ; VI: buffer_store_dword
289298 ; GCN-LABEL: {{^}}commute_add_literal_v2f16:
290299 ; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
291300 ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
292 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]] op_sel_hi:[0,1]{{$}}
301 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0]{{$}}
293302 ; GFX9: buffer_store_dword [[REG]]
294303
295304 ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
314323 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
315324 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
316325 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]]
317 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL1]]
326 ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1
327 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
318328 ; VI: v_or_b32
319329 ; VI: buffer_store_dword
320330 define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
331341 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
332342 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
333343 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]]
334 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL1]]
344 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2
345 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
335346 ; VI: v_or_b32
336347 ; VI: buffer_store_dword
337348 define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
348359 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
349360 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
350361 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]]
351 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL1]]
362 ; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16
363 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST16]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
352364 ; VI: v_or_b32
353365 ; VI: buffer_store_dword
354366 define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
365377 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
366378 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
367379 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL0]]
368 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL1]]
380 ; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xffff
381 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
369382 ; VI: v_or_b32
370383 ; VI: buffer_store_dword
371384 define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
382395 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
383396 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
384397 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL0]]
385 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL1]]
398 ; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xfffe
399 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
386400 ; VI: v_or_b32
387401 ; VI: buffer_store_dword
388402 define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
399413 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
400414 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
401415 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL0]]
402 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL1]]
416 ; VI-DAG: v_mov_b32_e32 [[CONSTM16:v[0-9]+]], 0xfff0
417 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM16]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
403418 ; VI: v_or_b32
404419 ; VI: buffer_store_dword
405420 define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
416431 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
417432 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
418433 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]]
419 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL1]]
434 ; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
435 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST63]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
420436 ; VI: v_or_b32
421437 ; VI: buffer_store_dword
422438 define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
433449 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
434450 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
435451 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]]
436 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL1]]
452 ; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
453 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST64]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
437454 ; VI: v_or_b32
438455 ; VI: buffer_store_dword
439456 define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
257257 ; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
258258
259259 ; GCN-LABEL: {{^}}v_insertelement_v2i16_1:
260 ; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
261 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
260 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e70000
261 ; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
262 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
263 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
262264
263265 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
264266 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
277279 }
278280
279281 ; GCN-LABEL: {{^}}v_insertelement_v2i16_1_inlineimm:
282 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xfff10000
280283 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
281 ; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
282 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]]
284 ; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
285 ; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
286 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]]
287 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
283288 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]]
284289 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
285290 define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
336341 }
337342
338343 ; GCN-LABEL: {{^}}v_insertelement_v2f16_1:
339 ; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
340 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
344 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x45000000
345 ; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
346 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
347 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
341348
342349 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500
343350 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
356363 }
357364
358365 ; GCN-LABEL: {{^}}v_insertelement_v2f16_1_inlineimm:
366 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x230000
359367 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
360 ; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
361 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]]
368 ; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
369 ; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
370 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]]
371 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
362372 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]]
363373 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
364374 define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
410420 }
411421
412422 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
423 ; GFX89: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
424 ; CI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
413425 ; GCN: flat_load_dword [[IDX:v[0-9]+]]
414426 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
415 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
416
417 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
427 ; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
428
418429 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
419430 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
420431
437448 }
438449
439450 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
451 ; GFX89: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
452 ; CI: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
440453 ; GCN: flat_load_dword [[IDX:v[0-9]+]]
441454 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
442 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
443
444 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
455 ; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
456
445457 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
446458 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
447459
2626 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
2727 ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
2828 ; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
29 ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
29 ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
3030 ; GCN: buffer_store_short v[[R_F16]]
3131 ; GCN: s_endpgm
3232 define amdgpu_kernel void @div_fixup_f16_imm_a(
4545 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
4646 ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
4747 ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
48 ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
48 ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
4949 ; GCN: buffer_store_short v[[R_F16]]
5050 ; GCN: s_endpgm
5151 define amdgpu_kernel void @div_fixup_f16_imm_b(
6464 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
6565 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
6666 ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
67 ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
67 ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
6868 ; GCN: buffer_store_short v[[R_F16]]
6969 ; GCN: s_endpgm
7070 define amdgpu_kernel void @div_fixup_f16_imm_c(
1616 ; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
1717 ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
1818 ; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
19 ; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]]
19 ; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]]
2020 ; GCN: buffer_store_dword [[RESULT]],
2121 ; GCN: s_endpgm
2222 define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
1313 }
1414
1515 ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_with_fabs:
16 ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, {{s[0-9]+}}
16 ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}, |{{v[0-9]+}}|
1717 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
1818 %temp = call float @llvm.fabs.f32(float %a)
1919 %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1)
2222 }
2323
2424 ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_both_operands_with_fabs:
25 ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, |{{s[0-9]+}}|
25 ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{s[0-9]+}}|, |{{v[0-9]+}}|
2626 define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
2727 %temp = call float @llvm.fabs.f32(float %a)
2828 %src_input = call float @llvm.fabs.f32(float %src)
2626 }
2727
2828 ; GCN-LABEL: {{^}}test_mul_legacy_fabs_f32:
29 ; GCN: v_mul_legacy_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |s{{[0-9]+}}|
29 ; GCN: v_mul_legacy_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|, |v{{[0-9]+}}|
3030 define amdgpu_kernel void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 {
3131 %a.fabs = call float @llvm.fabs.f32(float %a)
3232 %b.fabs = call float @llvm.fabs.f32(float %b)
3838 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
3939 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
4040 ; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
41 ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
41 ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
4242 ; GCN: buffer_store_short v[[R_F16]]
4343 ; GCN: s_endpgm
4444 define amdgpu_kernel void @fma_f16_imm_a(
6161 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
6262 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
6363 ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
64 ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
64 ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
6565 ; GCN: buffer_store_short v[[R_F16]]
6666 ; GCN: s_endpgm
6767 define amdgpu_kernel void @fma_f16_imm_b(
8484 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
8585 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
8686 ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
87 ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
87 ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
8888 ; GCN: buffer_store_short v[[R_F16]]
8989 ; GCN: s_endpgm
9090 define amdgpu_kernel void @fma_f16_imm_c(
4949 ; VI-FLUSH: buffer_store_short v[[C_F16]]
5050
5151 ; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200
52 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[KA]], v[[B_F16]], v[[C_F16]]
52 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]]
5353 ; VI-DENORM: buffer_store_short [[RESULT]]
5454
5555 ; GCN: s_endpgm
7777 ; VI-FLUSH: buffer_store_short v[[C_F16]]
7878
7979 ; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200
80 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[KA]], v[[A_F16]], v[[C_F16]]
80 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]]
8181 ; VI-DENORM: buffer_store_short [[RESULT]]
8282
8383
100100 }
101101
102102 ; GCN-LABEL: {{^}}maxnum_v2f16_imm_a:
103 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
103 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
104104 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
105 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
105 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
106106 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
107107 ; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
108108 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
109109 ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
110110 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
111 ; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
111 ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
112 ; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
112113 ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
113114
114 ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
115 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
115116 ; GCN-NOT: and
116117 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
117118 ; GCN: buffer_store_dword v[[R_V2_F16]]
127128 }
128129
129130 ; GCN-LABEL: {{^}}maxnum_v2f16_imm_b:
130 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
131 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
131132 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
132 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
133 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
133134 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
134135 ; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
135136 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
136137 ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
137138 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
138 ; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
139 ; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
140 ; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST3]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
139141 ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
140142
141 ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
143 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
142144 ; GCN-NOT: and
143145 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
144146 ; GCN: buffer_store_dword v[[R_V2_F16]]
9999 }
100100
101101 ; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
102 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
102 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
103103
104104 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
105105 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
109109 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
110110 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
111111
112 ; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
113 ; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
112 ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
113 ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
114114 ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
115115
116 ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
116 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
117117 ; GCN-NOT: and
118118 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
119119 ; GCN: buffer_store_dword v[[R_V2_F16]]
129129 }
130130
131131 ; GCN-LABEL: {{^}}minnum_v2f16_imm_b:
132 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
132 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
133133 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
134 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
134 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
135135 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
136136 ; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
137137 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
138138 ; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
139139 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
140 ; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
140 ; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
141 ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST3]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
141142 ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
142143
143 ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
144 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
144145 ; GCN-NOT: and
145146 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
146147 ; GCN: buffer_store_dword v[[R_V2_F16]]
99 ; GCN-LABEL: {{^}}get_global_id_0:
1010 ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
1111 ; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]]
12 ; GCN: v_mad_u32_u24 v{{[0-9]+}}, [[VWGSIZEX]], s8, v0
12 ; GCN: v_mad_u32_u24 v{{[0-9]+}}, s8, [[VWGSIZEX]], v0
1313 define amdgpu_kernel void @get_global_id_0(i32 addrspace(1)* %out) #1 {
1414 %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
1515 %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
150150 ; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
151151 ; GCN: buffer_load_dword [[VA:v[0-9]+]]
152152 ; GCN: buffer_load_dword [[VB:v[0-9]+]]
153 ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
153 ; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
154154 ; GCN: s_endpgm
155155 define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
156156 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
172172 ; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
173173 ; GCN: buffer_load_dword [[VA:v[0-9]+]]
174174 ; GCN: buffer_load_dword [[VB:v[0-9]+]]
175 ; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
175 ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
176176 ; GCN: s_endpgm
177177 define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
178178 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
128128 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
129129 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
130130 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
131 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], |[[VA]]|, [[VB]]
131 ; GCN: v_mad_f32 {{v[0-9]+}}, |[[VA]]|, [[VK]], [[VB]]
132132 define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
133133 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
134134 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
170170 ; GCN-LABEL: {{^}}madmk_add_inline_imm_f32:
171171 ; GCN: buffer_load_dword [[A:v[0-9]+]]
172172 ; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
173 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0
173 ; GCN: v_mad_f32 {{v[0-9]+}}, [[A]], [[VK]], 2.0
174174 define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
175175 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
176176 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
210210 ; SI: s_mul_i32
211211 ; SI: v_mul_hi_u32
212212 ; SI: s_mul_i32
213 ; SI: s_mul_i32
214 ; SI: v_mul_hi_u32
215 ; SI: v_mul_hi_u32
216 ; SI: s_mul_i32
213 ; SI-DAG: s_mul_i32
214 ; SI-DAG: v_mul_hi_u32
215 ; SI-DAG: v_mul_hi_u32
216 ; SI-DAG: s_mul_i32
217217 ; SI-DAG: s_mul_i32
218218 ; SI-DAG: v_mul_hi_u32
219219 ; SI: s_mul_i32
88 ; GCN-LABEL: {{^}}ps_main:
99
1010 ; GCN-DAG: s_mov_b32 [[SWO:s[0-9]+]], s0
11 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200
12 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0x400{{$}}
1311 ; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
1412 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
1513
16 ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]]
17 ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]]
14 ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]]
15 ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]]
1816
1917 ; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
2018 ; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
3535 ; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
3636 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
3737 ; SI-DAG: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
38 ; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]]
38 ; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[VAL]], [[MAGIC]]
3939 ; SI: v_add_i32
4040 ; SI: v_lshrrev_b32
4141 ; SI: v_ashrrev_i32
344344
345345 ; GCN-LABEL: {{^}}immediate_mul_v2i16:
346346 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
347 ; SDWA-NOT: v_mul_u32_u24_sdwa
347 ; SDWA-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141
348 ; SDWA-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b
349 ; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v[[M123]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
350 ; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v[[M321]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
348351
349352 define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
350353 entry:
0 # RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GCN %s
1
2 # GCN-LABEL: {{^}}sdwa_imm_operand:
3 # GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
4 # GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
5 # GCN: BB0_1:
6 # GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
7 # GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
8
9 # GCN-LABEL: {{^}}sdwa_sgpr_operand:
10 # GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
11 # GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
12 # GCN: BB1_1:
13 # GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
14 # GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
15
16 --- |
17 ; ModuleID = 'sdwa-scalar-ops.opt.ll'
18 source_filename = "sdwa-scalar-ops.opt.ll"
19 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
20
; Input kernel for the sdwa_imm_operand checks at the top of this file, which
; expect one v_mov_b32 of 2 before BB0_1 and two v_lshlrev_b32_sdwa
; instructions using it with src1_sel:BYTE_1.
;
; The body is a single loop (bb2). Each iteration loads two consecutive i32
; values from %arg at byte offset %lsr.iv, takes bits 8..15 of each
; (lshr 8, and 255), and stores the constant 1 to %arg indexed by that value.
; The byte offset advances by 8 per iteration and the loop exits to bb1 once
; its truncated value equals 4096.
21 define amdgpu_kernel void @sdwa_imm_operand(i32 addrspace(1)* nocapture %arg) {
22 bb:
23 br label %bb2
24
25 bb1: ; preds = %bb2
26 ret void
27
28 bb2: ; preds = %bb2, %bb
; %lsr.iv is the running byte offset into %arg (0 on entry from %bb).
29 %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
30 %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
31 %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
32 %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
; First element: extract byte 1 of the loaded word and use it as an i32 index.
33 %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
34 %tmp6 = lshr i32 %tmp5, 8
35 %tmp7 = and i32 %tmp6, 255
36 %tmp8 = zext i32 %tmp7 to i64
37 %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
38 store i32 1, i32 addrspace(1)* %tmp9, align 4
; Second element (the next i32 after %uglygep45): same extract-and-store.
39 %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
40 %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
41 %tmp14 = lshr i32 %tmp13, 8
42 %tmp15 = and i32 %tmp14, 255
43 %tmp16 = zext i32 %tmp15 to i64
44 %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
45 store i32 1, i32 addrspace(1)* %tmp17, align 4
; Advance by two i32 elements (8 bytes) and test the exit condition.
46 %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
47 %tmp1 = trunc i64 %lsr.iv.next to i32
48 %tmp19 = icmp eq i32 %tmp1, 4096
49 br i1 %tmp19, label %bb1, label %bb2
50 }
51
52 define amdgpu_kernel void @sdwa_sgpr_operand(i32 addrspace(1)* nocapture %arg) {
53 bb:
54 br label %bb2
55
56 bb1: ; preds = %bb2
57 ret void
58
59 bb2: ; preds = %bb2, %bb
60 %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
61 %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
62 %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
63 %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
64 %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
65 %tmp6 = lshr i32 %tmp5, 8
66 %tmp7 = and i32 %tmp6, 255
67 %tmp8 = zext i32 %tmp7 to i64
68 %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
69 store i32 1, i32 addrspace(1)* %tmp9, align 4
70 %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
71 %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
72 %tmp14 = lshr i32 %tmp13, 8
73 %tmp15 = and i32 %tmp14, 255
74 %tmp16 = zext i32 %tmp15 to i64
75 %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
76 store i32 1, i32 addrspace(1)* %tmp17, align 4
77 %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
78 %tmp1 = trunc i64 %lsr.iv.next to i32
79 %tmp19 = icmp eq i32 %tmp1, 4096
80 br i1 %tmp19, label %bb1, label %bb2
81 }
82
83 ...
84 ---
85 name: sdwa_imm_operand
86 alignment: 0
87 exposesReturnsTwice: false
88 noVRegs: false
89 legalized: false
90 regBankSelected: false
91 selected: false
92 tracksRegLiveness: true
93 registers:
94 - { id: 0, class: sreg_64 }
95 - { id: 1, class: sreg_64 }
96 - { id: 2, class: vgpr_32 }
97 - { id: 3, class: sgpr_128 }
98 - { id: 4, class: sgpr_64 }
99 - { id: 5, class: sreg_32_xm0 }
100 - { id: 6, class: sgpr_32 }
101 - { id: 7, class: sreg_64 }
102 - { id: 8, class: sreg_64 }
103 - { id: 9, class: sreg_64_xexec }
104 - { id: 10, class: sreg_32_xm0 }
105 - { id: 11, class: sreg_32_xm0 }
106 - { id: 12, class: sreg_32_xm0 }
107 - { id: 13, class: sreg_32_xm0 }
108 - { id: 14, class: sreg_32_xm0 }
109 - { id: 15, class: sreg_32_xm0 }
110 - { id: 16, class: sreg_64 }
111 - { id: 17, class: vgpr_32 }
112 - { id: 18, class: vreg_64 }
113 - { id: 19, class: sreg_32_xm0 }
114 - { id: 20, class: sreg_32 }
115 - { id: 21, class: sreg_32_xm0 }
116 - { id: 22, class: sreg_32_xm0 }
117 - { id: 23, class: sreg_32_xm0 }
118 - { id: 24, class: sreg_64 }
119 - { id: 25, class: sreg_32_xm0 }
120 - { id: 26, class: sreg_32_xm0 }
121 - { id: 27, class: sreg_32_xm0 }
122 - { id: 28, class: sreg_32_xm0 }
123 - { id: 29, class: sreg_64 }
124 - { id: 30, class: vgpr_32 }
125 - { id: 31, class: vreg_64 }
126 - { id: 32, class: sreg_32_xm0 }
127 - { id: 33, class: sreg_32_xm0 }
128 - { id: 34, class: sreg_64 }
129 - { id: 35, class: sreg_32_xm0 }
130 - { id: 36, class: sreg_32_xm0 }
131 - { id: 37, class: sreg_32_xm0 }
132 - { id: 38, class: sreg_32_xm0 }
133 - { id: 39, class: vreg_64 }
134 - { id: 40, class: vgpr_32 }
135 - { id: 41, class: vreg_64 }
136 - { id: 42, class: sreg_32_xm0 }
137 - { id: 43, class: sreg_32 }
138 - { id: 44, class: sreg_32_xm0 }
139 - { id: 45, class: sreg_64 }
140 - { id: 46, class: sreg_32_xm0 }
141 - { id: 47, class: sreg_32_xm0 }
142 - { id: 48, class: sreg_32_xm0 }
143 - { id: 49, class: sreg_32_xm0 }
144 - { id: 50, class: sreg_64 }
145 - { id: 51, class: vreg_64 }
146 - { id: 52, class: sreg_64 }
147 - { id: 53, class: sreg_32_xm0 }
148 - { id: 54, class: sreg_32_xm0 }
149 - { id: 55, class: sreg_32_xm0 }
150 - { id: 56, class: sreg_32_xm0 }
151 - { id: 57, class: sreg_64 }
152 - { id: 58, class: sreg_32_xm0 }
153 - { id: 59, class: sreg_32_xm0 }
154 - { id: 60, class: vgpr_32 }
155 - { id: 61, class: vgpr_32 }
156 - { id: 62, class: vreg_64 }
157 - { id: 63, class: vgpr_32 }
158 - { id: 64, class: vgpr_32 }
159 - { id: 65, class: vgpr_32 }
160 - { id: 66, class: vgpr_32 }
161 - { id: 67, class: vreg_64 }
162 - { id: 68, class: vgpr_32 }
163 - { id: 69, class: vgpr_32 }
164 - { id: 70, class: vgpr_32 }
165 - { id: 71, class: vgpr_32 }
166 - { id: 72, class: vgpr_32 }
167 - { id: 73, class: vgpr_32 }
168 - { id: 74, class: vgpr_32 }
169 - { id: 75, class: vreg_64 }
170 - { id: 76, class: vgpr_32 }
171 - { id: 77, class: vgpr_32 }
172 - { id: 78, class: vgpr_32 }
173 - { id: 79, class: vgpr_32 }
174 - { id: 80, class: vreg_64 }
175 - { id: 81, class: vgpr_32 }
176 - { id: 82, class: vgpr_32 }
177 - { id: 83, class: vgpr_32 }
178 liveins:
179 - { reg: '%sgpr4_sgpr5', virtual-reg: '%4' }
180 frameInfo:
181 isFrameAddressTaken: false
182 isReturnAddressTaken: false
183 hasStackMap: false
184 hasPatchPoint: false
185 stackSize: 0
186 offsetAdjustment: 0
187 maxAlignment: 0
188 adjustsStack: false
189 hasCalls: false
190 hasOpaqueSPAdjustment: false
191 hasVAStart: false
192 hasMustTailInVarArgFunc: false
193 body: |
194 bb.0.bb:
195 successors: %bb.2.bb2(0x80000000)
196 liveins: %sgpr4_sgpr5
197
198 %4 = COPY %sgpr4_sgpr5
199 %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
200 %8 = S_MOV_B64 0
201 %7 = COPY %9
202 %30 = V_MOV_B32_e32 1, implicit %exec
203 S_BRANCH %bb.2.bb2
204
205 bb.1.bb1:
206 S_ENDPGM
207
208 bb.2.bb2:
209 successors: %bb.1.bb1(0x04000000), %bb.2.bb2(0x7c000000)
210
211 %0 = PHI %8, %bb.0.bb, %1, %bb.2.bb2
212 %13 = COPY %7.sub1
213 %14 = S_ADD_U32 %7.sub0, %0.sub0, implicit-def %scc
214 %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead %scc, implicit %scc
215 %16 = REG_SEQUENCE %14, 1, %15, 2
216 %18 = COPY %16
217 %17 = FLAT_LOAD_DWORD %18, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.uglygep45)
218 %60 = V_BFE_U32 %17, 8, 8, implicit %exec
219 %61 = V_LSHLREV_B32_e32 2, killed %60, implicit %exec
220 %70 = V_ADD_I32_e32 %7.sub0, %61, implicit-def %vcc, implicit %exec
221 %66 = COPY %13
222 %65 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
223 %67 = REG_SEQUENCE %70, 1, killed %65, 2
224 FLAT_STORE_DWORD %67, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp9)
225 %37 = S_ADD_U32 %14, 4, implicit-def %scc
226 %38 = S_ADDC_U32 %15, 0, implicit-def dead %scc, implicit %scc
227 %71 = COPY killed %37
228 %72 = COPY killed %38
229 %41 = REG_SEQUENCE killed %71, 1, killed %72, 2
230 %40 = FLAT_LOAD_DWORD killed %41, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.scevgep)
231 %73 = V_BFE_U32 %40, 8, 8, implicit %exec
232 %74 = V_LSHLREV_B32_e32 2, killed %73, implicit %exec
233 %83 = V_ADD_I32_e32 %7.sub0, %74, implicit-def %vcc, implicit %exec
234 %78 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
235 %80 = REG_SEQUENCE %83, 1, killed %78, 2
236 FLAT_STORE_DWORD %80, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp17)
237 %55 = S_ADD_U32 %0.sub0, 8, implicit-def %scc
238 %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead %scc, implicit %scc
239 %57 = REG_SEQUENCE %55, 1, killed %56, 2
240 %1 = COPY %57
241 S_CMPK_EQ_I32 %55, 4096, implicit-def %scc
242 S_CBRANCH_SCC1 %bb.1.bb1, implicit %scc
243 S_BRANCH %bb.2.bb2
244
245 ...
246 ---
247 name: sdwa_sgpr_operand
248 alignment: 0
249 exposesReturnsTwice: false
250 noVRegs: false
251 legalized: false
252 regBankSelected: false
253 selected: false
254 tracksRegLiveness: true
255 registers:
256 - { id: 0, class: sreg_64 }
257 - { id: 1, class: sreg_64 }
258 - { id: 2, class: vgpr_32 }
259 - { id: 3, class: sgpr_128 }
260 - { id: 4, class: sgpr_64 }
261 - { id: 5, class: sreg_32_xm0 }
262 - { id: 6, class: sgpr_32 }
263 - { id: 7, class: sreg_64 }
264 - { id: 8, class: sreg_64 }
265 - { id: 9, class: sreg_64_xexec }
266 - { id: 10, class: sreg_32_xm0 }
267 - { id: 11, class: sreg_32_xm0 }
268 - { id: 12, class: sreg_32_xm0 }
269 - { id: 13, class: sreg_32_xm0 }
270 - { id: 14, class: sreg_32_xm0 }
271 - { id: 15, class: sreg_32_xm0 }
272 - { id: 16, class: sreg_64 }
273 - { id: 17, class: vgpr_32 }
274 - { id: 18, class: vreg_64 }
275 - { id: 19, class: sreg_32_xm0 }
276 - { id: 20, class: sreg_32 }
277 - { id: 21, class: sreg_32_xm0 }
278 - { id: 22, class: sreg_32_xm0 }
279 - { id: 23, class: sreg_32_xm0 }
280 - { id: 24, class: sreg_64 }
281 - { id: 25, class: sreg_32_xm0 }
282 - { id: 26, class: sreg_32_xm0 }
283 - { id: 27, class: sreg_32_xm0 }
284 - { id: 28, class: sreg_32_xm0 }
285 - { id: 29, class: sreg_64 }
286 - { id: 30, class: vgpr_32 }
287 - { id: 31, class: vreg_64 }
288 - { id: 32, class: sreg_32_xm0 }
289 - { id: 33, class: sreg_32_xm0 }
290 - { id: 34, class: sreg_64 }
291 - { id: 35, class: sreg_32_xm0 }
292 - { id: 36, class: sreg_32_xm0 }
293 - { id: 37, class: sreg_32_xm0 }
294 - { id: 38, class: sreg_32_xm0 }
295 - { id: 39, class: vreg_64 }
296 - { id: 40, class: vgpr_32 }
297 - { id: 41, class: vreg_64 }
298 - { id: 42, class: sreg_32_xm0 }
299 - { id: 43, class: sreg_32 }
300 - { id: 44, class: sreg_32_xm0 }
301 - { id: 45, class: sreg_64 }
302 - { id: 46, class: sreg_32_xm0 }
303 - { id: 47, class: sreg_32_xm0 }
304 - { id: 48, class: sreg_32_xm0 }
305 - { id: 49, class: sreg_32_xm0 }
306 - { id: 50, class: sreg_64 }
307 - { id: 51, class: vreg_64 }
308 - { id: 52, class: sreg_64 }
309 - { id: 53, class: sreg_32_xm0 }
310 - { id: 54, class: sreg_32_xm0 }
311 - { id: 55, class: sreg_32_xm0 }
312 - { id: 56, class: sreg_32_xm0 }
313 - { id: 57, class: sreg_64 }
314 - { id: 58, class: sreg_32_xm0 }
315 - { id: 59, class: sreg_32_xm0 }
316 - { id: 60, class: vgpr_32 }
317 - { id: 61, class: vgpr_32 }
318 - { id: 62, class: vreg_64 }
319 - { id: 63, class: vgpr_32 }
320 - { id: 64, class: vgpr_32 }
321 - { id: 65, class: vgpr_32 }
322 - { id: 66, class: vgpr_32 }
323 - { id: 67, class: vreg_64 }
324 - { id: 68, class: vgpr_32 }
325 - { id: 69, class: vgpr_32 }
326 - { id: 70, class: vgpr_32 }
327 - { id: 71, class: vgpr_32 }
328 - { id: 72, class: vgpr_32 }
329 - { id: 73, class: vgpr_32 }
330 - { id: 74, class: vgpr_32 }
331 - { id: 75, class: vreg_64 }
332 - { id: 76, class: vgpr_32 }
333 - { id: 77, class: vgpr_32 }
334 - { id: 78, class: vgpr_32 }
335 - { id: 79, class: vgpr_32 }
336 - { id: 80, class: vreg_64 }
337 - { id: 81, class: vgpr_32 }
338 - { id: 82, class: vgpr_32 }
339 - { id: 83, class: vgpr_32 }
340 - { id: 84, class: sreg_32_xm0 }
341 liveins:
342 - { reg: '%sgpr4_sgpr5', virtual-reg: '%4' }
343 frameInfo:
344 isFrameAddressTaken: false
345 isReturnAddressTaken: false
346 hasStackMap: false
347 hasPatchPoint: false
348 stackSize: 0
349 offsetAdjustment: 0
350 maxAlignment: 0
351 adjustsStack: false
352 hasCalls: false
353 hasOpaqueSPAdjustment: false
354 hasVAStart: false
355 hasMustTailInVarArgFunc: false
356 body: |
357 bb.0.bb:
358 successors: %bb.2.bb2(0x80000000)
359 liveins: %sgpr4_sgpr5
360
361 %4 = COPY %sgpr4_sgpr5
362 %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
363 %8 = S_MOV_B64 0
364 %7 = COPY %9
365 %30 = V_MOV_B32_e32 1, implicit %exec
366 %84 = S_MOV_B32 2
367 S_BRANCH %bb.2.bb2
368
369 bb.1.bb1:
370 S_ENDPGM
371
372 bb.2.bb2:
373 successors: %bb.1.bb1(0x04000000), %bb.2.bb2(0x7c000000)
374
375 %0 = PHI %8, %bb.0.bb, %1, %bb.2.bb2
376 %13 = COPY %7.sub1
377 %14 = S_ADD_U32 %7.sub0, %0.sub0, implicit-def %scc
378 %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead %scc, implicit %scc
379 %16 = REG_SEQUENCE %14, 1, %15, 2
380 %18 = COPY %16
381 %17 = FLAT_LOAD_DWORD %18, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.uglygep45)
382 %60 = V_BFE_U32 %17, 8, 8, implicit %exec
383 %61 = V_LSHLREV_B32_e32 %84, killed %60, implicit %exec
384 %70 = V_ADD_I32_e32 %7.sub0, %61, implicit-def %vcc, implicit %exec
385 %66 = COPY %13
386 %65 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
387 %67 = REG_SEQUENCE %70, 1, killed %65, 2
388 FLAT_STORE_DWORD %67, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp9)
389 %37 = S_ADD_U32 %14, 4, implicit-def %scc
390 %38 = S_ADDC_U32 %15, 0, implicit-def dead %scc, implicit %scc
391 %71 = COPY killed %37
392 %72 = COPY killed %38
393 %41 = REG_SEQUENCE killed %71, 1, killed %72, 2
394 %40 = FLAT_LOAD_DWORD killed %41, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.scevgep)
395 %73 = V_BFE_U32 %40, 8, 8, implicit %exec
396 %74 = V_LSHLREV_B32_e32 %84, killed %73, implicit %exec
397 %83 = V_ADD_I32_e32 %7.sub0, %74, implicit-def %vcc, implicit %exec
398 %78 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
399 %80 = REG_SEQUENCE %83, 1, killed %78, 2
400 FLAT_STORE_DWORD %80, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp17)
401 %55 = S_ADD_U32 %0.sub0, 8, implicit-def %scc
402 %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead %scc, implicit %scc
403 %57 = REG_SEQUENCE %55, 1, killed %56, 2
404 %1 = COPY %57
405 S_CMPK_EQ_I32 %55, 4096, implicit-def %scc
406 S_CBRANCH_SCC1 %bb.1.bb1, implicit %scc
407 S_BRANCH %bb.2.bb2
408
409 ...
195195 ; SI: v_cvt_f32_f16_e32
196196 ; SI: v_cvt_f32_f16_e32
197197 ; SI: v_cvt_f32_f16_e32
198 ; SI: v_cmp_lt_f32_e64
199 ; SI: v_cmp_lt_f32_e32 vcc, 0.5
198 ; SI-DAG: v_cmp_gt_f32_e64
199 ; SI-DAG: v_cmp_lt_f32_e32 vcc, 0.5
200200
201201 ; VI: v_cmp_lt_f16_e32
202 ; VI: v_cmp_lt_f16_e64
202 ; VI: v_cmp_gt_f16_e64
203203 ; GCN: v_cndmask_b32_e32
204204 ; GCN: v_cndmask_b32_e64
205205 ; SI: v_cvt_f16_f32_e32
227227 ; SI: v_cvt_f32_f16_e32
228228 ; SI: v_cvt_f32_f16_e32
229229 ; SI: v_cvt_f32_f16_e32
230 ; SI: v_cmp_gt_f32_e64
231 ; SI: v_cmp_gt_f32_e32 vcc, 0.5
230 ; SI-DAG: v_cmp_lt_f32_e64
231 ; SI-DAG: v_cmp_gt_f32_e32 vcc, 0.5
232232
233233 ; VI: v_cmp_gt_f16_e32
234 ; VI: v_cmp_gt_f16_e64
234 ; VI: v_cmp_lt_f16_e64
235235 ; GCN: v_cndmask_b32_e32
236236 ; GCN: v_cndmask_b32_e64
237237
44 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
55
66 ; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
7 ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}}
7 ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
88 ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
99 ; GCN: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
1010
2323
2424 ; Extract the high bit of the 2nd quarter
2525 ; GCN-LABEL: {{^}}v_uextract_bit_63_i128:
26 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
26 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
2727
2828 ; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
2929 ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
3030 ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
31 ; GCN: v_mov_b32_e32 v[[ZERO3:[0-9]+]], v[[ZERO0]]{{$}}
3132 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
3233
33 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
34 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO3]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
3435 ; GCN: s_endpgm
3536 define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
3637 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
4849 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
4950
5051 ; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
51 ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}}
52 ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
5253 ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
5354 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
5455
6768
6869 ; Extract the high bit of the 4th quarter
6970 ; GCN-LABEL: {{^}}v_uextract_bit_127_i128:
70 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
71 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
7172
72 ; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
73 ; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
7374 ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
7475 ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
76 ; GCN: v_mov_b32_e32 v[[ZERO3:[0-9]+]], v[[ZERO0]]{{$}}
7577 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
7678
77 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
79 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO3]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
7880 ; GCN: s_endpgm
7981 define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
8082 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
8991
9092 ; Spans more than 2 dword boundaries
9193 ; GCN-LABEL: {{^}}v_uextract_bit_34_100_i128:
92 ; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
94 ; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
9395
9496 ; GCN-DAG: v_lshl_b64 v{{\[}}[[SHLLO:[0-9]+]]:[[SHLHI:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, 30
9597 ; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v{{[[0-9]+}}
9698 ; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}}
9799 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
98100 ; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]]
101 ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
99102
100 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
103 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
101104 ; GCN: s_endpgm
102105 define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
103106 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
2020
2121 ; Extract the high bit of the high half
2222 ; GCN-LABEL: {{^}}v_uextract_bit_63_i64:
23 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
2324 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
2425 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
25 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
26 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
26 ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
27 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO1]]{{\]}}
2728 define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
2829 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
2930 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
6869 }
6970
7071 ; GCN-LABEL: {{^}}v_uextract_bit_32_i64:
71 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
72 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
7273 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]]
7374 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
74 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
75 ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
76 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}}
7577 define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
7678 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
7779 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
8486 }
8587
8688 ; GCN-LABEL: {{^}}v_uextract_bit_33_i64:
89 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
8790 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
8891 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}}
89 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
90 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
92 ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
93 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO1]]{{\]}}
9194 define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
9295 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
9396 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
166169 }
167170
168171 ; GCN-LABEL: {{^}}v_uextract_bit_32_33_i64:
172 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
169173 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
170174 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2
171 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
172 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
175 ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
176 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}}
173177 define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
174178 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
175179 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
182186 }
183187
184188 ; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64:
189 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
185190 ; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
186191 ; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 30
187192 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}}
188 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
189 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
193 ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
194 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}}
190195 define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
191196 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
192197 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
199204 }
200205
201206 ; GCN-LABEL: {{^}}v_uextract_bit_33_63_i64:
207 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
202208 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
203209 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
204 ; GCN-DAG: v_mov_b32_e32 v[[BFE:[0-9]+]], 0{{$}}
205 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
210 ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
211 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO1]]{{\]}}
206212 define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
207213 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
208214 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
215221 }
216222
217223 ; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64:
224 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
218225 ; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
219226 ; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31
220 ; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], 0{{$}}
227 ; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], v[[ZERO]]
221228 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
222229 define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
223230 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
299306
300307 ; GCN-LABEL: {{^}}and_not_mask_i64:
301308 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
302 ; GCN: v_mov_b32_e32 v[[SHRHI:[0-9]+]], 0{{$}}
309 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
310 ; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}}
303311 ; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]]
304312 ; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
305313 ; GCN-NOT: v[[SHRLO]]
320328 ; keeping the 32-bit and has a smaller encoding size than the bfe.
321329
322330 ; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64:
323 ; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
331 ; GCN-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
324332 ; GCN-DAG: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 27
325333 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]
326334 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
339347 }
340348
341349 ; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64:
342 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
343 ; GCN: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}}
350 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
351 ; GCN-DAG: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}}
344352 ; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]]
345353 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]]
346354 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
361369 ; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
362370 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
363371 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
372 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
364373 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:{{[0-9]+\]}}
365374 ; GCN: buffer_store_dword v[[ZERO]]
366375 define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
3939 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
4040 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
4141
42 ; VI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
4243 ; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
4344 ; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
4445 ; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
4546 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
4647 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
4748 ; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
48 ; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
49 ; VI: v_add_u16_sdwa v{{[0-9]+}}, [[TWO]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4950 ; VI-NOT: v_and_b32
5051 ; VI: v_or_b32_e32
5152 define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
205206 }
206207
207208 ; GCN-LABEL: {{^}}u_min_max_v2i16:
208 ; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
209 ; GFX9: v_pk_max_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
209210 ; GFX9: v_pk_min_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
210211 define amdgpu_kernel void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind {
211212 %cond0 = icmp ugt <2 x i16> %val0, %val1
1919
2020 ; FUNC-LABEL: {{^}}srem_i32_7:
2121 ; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493
22 ; SI: v_mul_hi_i32 {{v[0-9]+}}, [[MAGIC]],
22 ; SI: v_mul_hi_i32 {{v[0-9]+}}, {{v[0-9]+}}, [[MAGIC]]
2323 ; SI: v_mul_lo_i32
2424 ; SI: v_sub_i32
2525 ; SI: s_endpgm
2222 ; GFX9: s_load_dword [[VAL0:s[0-9]+]]
2323 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
2424 ; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
25 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]]
25 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[VAL0]], [[VVAL1]]
2626
2727 ; VI: s_sub_i32
2828 ; VI: s_sub_i32
4646
4747 ; FIXME: VI should not scalarize arg access.
4848 ; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg:
49 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
49 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
5050
5151 ; VI: v_subrev_i32_e32
5252 ; VI: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5858
5959 ; GCN-LABEL: {{^}}v_test_sub_v2i16_constant:
6060 ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
61 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
62
63 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffe38, v{{[0-9]+}}
61 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
62
63 ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38
64 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
6465 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
6566 define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
6667 %tid = call i32 @llvm.amdgcn.workitem.id.x()
7576 ; FIXME: Need to handle non-uniform case for function below (load without gep).
7677 ; GCN-LABEL: {{^}}v_test_sub_v2i16_neg_constant:
7778 ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
78 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
79
80 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x3df, v{{[0-9]+}}
79 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
80
81 ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3df
82 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}}
8183 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}}
8284 define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
8385 %tid = call i32 @llvm.amdgcn.workitem.id.x()
9294 ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1:
9395 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}}
9496
97 ; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
9598 ; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
9699 ; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
97 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD0]]
100 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[ONE]], [[LOAD0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
98101 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
99 ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
100102 ; VI: v_or_b32_e32
101103 define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
102104 %tid = call i32 @llvm.amdgcn.workitem.id.x()
110112
111113 ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi:
112114 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}}
113 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
115 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
114116
115117 ; VI-NOT: v_subrev_i16
116118 ; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffffe0, v{{[0-9]+}}
130132 ; The high element gives fp
131133 ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_fp_split:
132134 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
133 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
134
135 ; VI-NOT: v_subrev_i16
136 ; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffc080, v{{[0-9]+}}
137 ; VI-NOT: v_subrev_i16
138 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
135 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
136
137 ; VI-NOT: v_subrev_i16
138 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080
139 ; VI: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
140 ; VI-NOT: v_subrev_i16
139141 ; VI: v_or_b32_e32
140142 define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
141143 %tid = call i32 @llvm.amdgcn.workitem.id.x()
184186
185187 ; FIXME: Need to handle non-uniform case for function below (load without gep).
186188 ; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i64:
189 ; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
187190 ; GFX9: flat_load_dword [[A:v[0-9]+]]
188191 ; GFX9: flat_load_dword [[B:v[0-9]+]]
189192
190 ; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
191193 ; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
192194 ; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
193195 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
198200 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
199201 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
200202
201 ; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
202 ; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
203203 ; VI-DAG: v_subrev_u16_e32
204204 ; VI-DAG: v_subrev_u16_e32
205205
7373 ; FUNC-LABEL: {{^}}udiv_i32_div_k_even:
7474 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
7575 ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfabbd9c1
76 ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
76 ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]]
7777 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]]
7878 ; SI: buffer_store_dword [[RESULT]]
7979 define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
8787 ; FUNC-LABEL: {{^}}udiv_i32_div_k_odd:
8888 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
8989 ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x7d5deca3
90 ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
90 ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]]
9191 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]]
9292 ; SI: buffer_store_dword [[RESULT]]
9393 define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
175175
176176 ; FUNC-LABEL: {{^}}test_udiv_3_mulhu:
177177 ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
178 ; SI: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}}
178 ; SI: v_mul_hi_u32 v0, {{s[0-9]+}}, {{v[0-9]+}}
179179 ; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
180180 define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
181181 %i = udiv i32 %p, 3
1919
2020 ; FUNC-LABEL: {{^}}test_urem_i32_7:
2121 ; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925
22 ; SI: v_mul_hi_u32 {{v[0-9]+}}, [[MAGIC]]
22 ; SI: v_mul_hi_u32 [[MAGIC]], {{v[0-9]+}}
2323 ; SI: v_subrev_i32
2424 ; SI: v_mul_lo_i32
2525 ; SI: v_sub_i32
5353 ; VI: buffer_load_dword [[VA0:v[0-9]+]]
5454 ; VI: buffer_load_dword [[VA1:v[0-9]+]]
5555
56 ; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VA0]], [[SA]], [[VB]]
57 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]]
56 ; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SA]], [[VA0]], [[VB]]
57 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SA]], [[VA1]], [[VB]]
5858 ; GCN: buffer_store_dword [[RESULT0]]
5959 ; GCN: buffer_store_dword [[RESULT1]]
6060 define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
7373 ; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
7474 ; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
7575 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
76 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
76 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
7777 ; GCN: buffer_store_dword [[RESULT]]
7878 define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
7979 %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
8787 ; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
8888 ; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
8989 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
90 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
90 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
9191 ; GCN: buffer_store_dword [[RESULT]]
9292 define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
9393 %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
227227 ; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000
228228 ; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]]
229229
230 ; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VS1]], [[SGPR0]], [[VK0]]
230 ; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]]
231231 ; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000
232232 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]]
233233
250250
251251 ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]]
252252 ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]]
253 ; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}}
253 ; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}}
254254
255255 ; Same zero component is re-used for half of each immediate.
256256 ; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000
481481 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
482482 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
483483
484 ; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
485 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
484 ; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
485 ; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
486 ; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
486487 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
487488 ; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
488489
512513 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
513514 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
514515
515 ; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
516 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
516517 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
518 ; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
517519 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
518520 ; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
519521
543545 ; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
544546 ; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
545547
546 ; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
548 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
547549 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
550 ; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
548551 ; VI-DAG: v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
549552 ; VI-DAG: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
550553
348348
349349 ; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
350350 ; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
351 ; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
351 ; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]]
352352 ; CHECK: s_cbranch_vccz [[LOOPHDR]]
353353 ; CHECK: ; %break
354354