llvm.org GIT mirror llvm / a038a83
AMDGPU: Allow SIShrinkInstructions to work in non-SSA Immediates can be folded as long as the immediate is a vreg. Also undo commuting instructions if it didn't fold an immediate. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@307575 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 2 years ago
69 changed file(s) with 563 addition(s) and 514 deletion(s). Raw diff Collapse all Expand all
125125
126126 /// \brief This function checks \p MI for operands defined by a move immediate
127127 /// instruction and then folds the literal constant into the instruction if it
128 /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
129 /// and will only fold literal constants if we are still in SSA.
130 static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
128 /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions.
129 static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
131130 MachineRegisterInfo &MRI, bool TryToCommute = true) {
132
133 if (!MRI.isSSA())
134 return;
135
136131 assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
137132
138133 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
139134
140135 // Try to fold Src0
141136 MachineOperand &Src0 = MI.getOperand(Src0Idx);
142 if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
137 if (Src0.isReg()) {
143138 unsigned Reg = Src0.getReg();
144 MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
145 if (Def && Def->isMoveImmediate()) {
146 MachineOperand &MovSrc = Def->getOperand(1);
147 bool ConstantFolded = false;
148
149 if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
150 isUInt<32>(MovSrc.getImm()))) {
151 Src0.ChangeToImmediate(MovSrc.getImm());
152 ConstantFolded = true;
153 }
154 if (ConstantFolded) {
155 if (MRI.use_empty(Reg))
139 if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
140 MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
141 if (Def && Def->isMoveImmediate()) {
142 MachineOperand &MovSrc = Def->getOperand(1);
143 bool ConstantFolded = false;
144
145 if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
146 isUInt<32>(MovSrc.getImm()))) {
147 // It's possible to have only one component of a super-reg defined by
148 // a single mov, so we need to clear any subregister flag.
149 Src0.setSubReg(0);
150 Src0.ChangeToImmediate(MovSrc.getImm());
151 ConstantFolded = true;
152 }
153
154 if (ConstantFolded) {
155 assert(MRI.use_empty(Reg));
156156 Def->eraseFromParent();
157 ++NumLiteralConstantsFolded;
158 return;
157 ++NumLiteralConstantsFolded;
158 return true;
159 }
159160 }
160161 }
161162 }
162163
163164 // We have failed to fold src0, so commute the instruction and try again.
164 if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
165 foldImmediates(MI, TII, MRI, false);
166
165 if (TryToCommute && MI.isCommutable()) {
166 if (TII->commuteInstruction(MI)) {
167 if (foldImmediates(MI, TII, MRI, false))
168 return true;
169
170 // Commute back.
171 TII->commuteInstruction(MI);
172 }
173 }
174
175 return false;
167176 }
168177
169178 // Copy MachineOperand with all flags except setting it as implicit.
33 ; GCN-LABEL: {{^}}v_test_add_i16:
44 ; VI: flat_load_ushort [[A:v[0-9]+]]
55 ; VI: flat_load_ushort [[B:v[0-9]+]]
6 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
6 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
77 ; VI-NEXT: buffer_store_short [[ADD]]
88 define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
99 %tid = call i32 @llvm.amdgcn.workitem.id.x()
6666 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i32:
6767 ; VI: flat_load_ushort [[A:v[0-9]+]]
6868 ; VI: flat_load_ushort [[B:v[0-9]+]]
69 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
69 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
7070 ; VI-NEXT: buffer_store_dword [[ADD]]
7171 define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
7272 %tid = call i32 @llvm.amdgcn.workitem.id.x()
8585 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
8686 ; VI: flat_load_ushort [[A:v[0-9]+]]
8787 ; VI: flat_load_ushort [[B:v[0-9]+]]
88 ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
88 ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
8989 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:{{[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
9090 define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
9191 %tid = call i32 @llvm.amdgcn.workitem.id.x()
104104 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32:
105105 ; VI: flat_load_ushort [[A:v[0-9]+]]
106106 ; VI: flat_load_ushort [[B:v[0-9]+]]
107 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
107 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
108108 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
109109 ; VI-NEXT: buffer_store_dword [[SEXT]]
110110 define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
124124 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64:
125125 ; VI: flat_load_ushort [[A:v[0-9]+]]
126126 ; VI: flat_load_ushort [[B:v[0-9]+]]
127 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
127 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
128128 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
129129 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
130130 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
167167 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
168168 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
169169
170 ; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
170 ; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
171171 ; VI-NOT: and
172172 ; VI-NOT: shl
173 ; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
173 ; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
174174 ; VI-NOT: and
175175 ; VI-NOT: shl
176176 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
106106 ; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
107107 ; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
108108 ; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]]
109 ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
109 ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]], v[[VREG1_LO]]
110110 ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
111111 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
112112 define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
153153 ; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
154154 ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
155155 ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], vcc
156 ; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]
156 ; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
157157 ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
158158 ; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
159159 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI:[0-9]+]]{{\]}}
88 ; SI-LABEL: @simple_read2_f32
99 ; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
1010 ; SI: s_waitcnt lgkmcnt(0)
11 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
11 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
1212 ; SI: buffer_store_dword [[RESULT]]
1313 ; SI: s_endpgm
1414 define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
2727 ; SI-LABEL: @simple_read2_f32_max_offset
2828 ; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255
2929 ; SI: s_waitcnt lgkmcnt(0)
30 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
30 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
3131 ; SI: buffer_store_dword [[RESULT]]
3232 ; SI: s_endpgm
3333 define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
3737 ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4:
3838 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
3939 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
40 ; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
41 ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[REG_Y]]
42 ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
40 ; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]]
41 ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[REG_W]]
42 ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD0]], v[[ADD1]]
4343 ; CI: buffer_store_dword v[[ADD2]]
4444 ; CI: s_endpgm
4545 define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
6363 ; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4:
6464 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
6565 ; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}}
66 ; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
67 ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
66 ; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]]
67 ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[ADD0]], v[[REG_Y]]
6868 ; CI: buffer_store_dword v[[ADD1]]
6969 ; CI: s_endpgm
7070 define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
66 ; SI-LABEL: @simple_read2st64_f32_0_1
77 ; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
88 ; SI: s_waitcnt lgkmcnt(0)
9 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
9 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
1010 ; SI: buffer_store_dword [[RESULT]]
1111 ; SI: s_endpgm
1212 define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
2525 ; SI-LABEL: @simple_read2st64_f32_1_2
2626 ; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
2727 ; SI: s_waitcnt lgkmcnt(0)
28 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
28 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
2929 ; SI: buffer_store_dword [[RESULT]]
3030 ; SI: s_endpgm
3131 define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
4545 ; SI-LABEL: @simple_read2st64_f32_max_offset
4646 ; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
4747 ; SI: s_waitcnt lgkmcnt(0)
48 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
48 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
4949 ; SI: buffer_store_dword [[RESULT]]
5050 ; SI: s_endpgm
5151 define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
66 ; Test that the -enable-no-signed-zeros-fp-math flag works
77
88 ; GCN-LABEL: {{^}}fneg_fsub_f32:
9 ; GCN: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
9 ; GCN: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
1010 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
1111
1212 ; GCN-UNSAFE-NOT: xor
3838 ; VI: flat_load_ushort [[HI:v[0-9]+]]
3939 ; VI: flat_load_ushort [[LO:v[0-9]+]]
4040 ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
41 ; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]]
41 ; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[HI]], [[MASK]]
4242 ; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[LO]], [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
43 ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_HI]], [[FABS_LO]]
43 ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_LO]], [[FABS_HI]]
4444 ; VI: flat_store_dword
4545
4646 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
6161 ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
6262 ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
6363 ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
64 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
65 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
64 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
65 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
6666 ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
6767 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
6868
7979
8080 ; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]]
8181 ; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]|
82 ; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]]
82 ; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[ABS_CVT1]], [[CVT0]]
8383 ; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
8484 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
8585
134134 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
135135 define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
136136 %tid = call i32 @llvm.amdgcn.workitem.id.x()
137 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
137 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
138138 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
139139 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
140140 %fmul = fmul <2 x half> %fabs, %val
1515 ; GCN: buffer_load_dword [[U:v[0-9]+]]
1616 ; GCN: buffer_load_dword [[V:v[0-9]+]]
1717
18 ; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[V]], [[U]]
19 ; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[Y]], [[X]]
18 ; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[U]], [[V]]
19 ; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[X]], [[Y]]
2020 ; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]
2121
2222 ; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]]
4848 ; GCN: buffer_load_dword [[V:v[0-9]+]]
4949
5050 ; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]]], [[U]], [[V]], -[[Z]]
51 ; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[Y]], [[X]]
51 ; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[X]], [[Y]]
5252 ; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]
5353
5454 ; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
7474 ; GCN: buffer_load_dword [[U:v[0-9]+]]
7575 ; GCN: buffer_load_dword [[V:v[0-9]+]]
7676
77 ; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
78 ; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
77 ; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
78 ; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
79 ; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]]
80
81 ; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
82 ; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
83 ; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]
84
85 ; GCN-SLOWFMA: v_mul_f32_e32
86 ; GCN-SLOWFMA: v_mul_f32_e32
87 ; GCN-SLOWFMA: v_add_f32_e32
88 ; GCN-SLOWFMA: v_add_f32_e32
89 define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
90 %x = load volatile float, float addrspace(1)* undef
91 %y = load volatile float, float addrspace(1)* undef
92 %z = load volatile float, float addrspace(1)* undef
93 %u = load volatile float, float addrspace(1)* undef
94 %v = load volatile float, float addrspace(1)* undef
95 %mul.u.v = fmul fast float %u, %v
96 store volatile float %mul.u.v, float addrspace(1)* undef
97 %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
98 %add = fadd fast float %fma, %z
99 store volatile float %add, float addrspace(1)* undef
100 ret void
101 }
102
103 ; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul_commute:
104 ; GCN: buffer_load_dword [[X:v[0-9]+]]
105 ; GCN: buffer_load_dword [[Y:v[0-9]+]]
106 ; GCN: buffer_load_dword [[Z:v[0-9]+]]
107 ; GCN: buffer_load_dword [[U:v[0-9]+]]
108 ; GCN: buffer_load_dword [[V:v[0-9]+]]
109
110 ; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
111 ; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
79112 ; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]]
80113
81 ; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
114 ; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
82115 ; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
83116 ; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]]
84117
86119 ; GCN-SLOWFMA: v_mul_f32_e32
87120 ; GCN-SLOWFMA: v_add_f32_e32
88121 ; GCN-SLOWFMA: v_add_f32_e32
89 define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
90 %x = load volatile float, float addrspace(1)* undef
91 %y = load volatile float, float addrspace(1)* undef
92 %z = load volatile float, float addrspace(1)* undef
93 %u = load volatile float, float addrspace(1)* undef
94 %v = load volatile float, float addrspace(1)* undef
95 %mul.u.v = fmul fast float %u, %v
96 store volatile float %mul.u.v, float addrspace(1)* undef
97 %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
98 %add = fadd fast float %fma, %z
99 store volatile float %add, float addrspace(1)* undef
100 ret void
101 }
102
103 ; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul_commute:
104 ; GCN: buffer_load_dword [[X:v[0-9]+]]
105 ; GCN: buffer_load_dword [[Y:v[0-9]+]]
106 ; GCN: buffer_load_dword [[Z:v[0-9]+]]
107 ; GCN: buffer_load_dword [[U:v[0-9]+]]
108 ; GCN: buffer_load_dword [[V:v[0-9]+]]
109
110 ; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
111 ; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
112 ; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]]
113
114 ; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
115 ; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
116 ; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]
117
118 ; GCN-SLOWFMA: v_mul_f32_e32
119 ; GCN-SLOWFMA: v_mul_f32_e32
120 ; GCN-SLOWFMA: v_add_f32_e32
121 ; GCN-SLOWFMA: v_add_f32_e32
122122 define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
123123 %x = load volatile float, float addrspace(1)* undef
124124 %y = load volatile float, float addrspace(1)* undef
190190 ; GCN: buffer_load_dword [[U:v[0-9]+]]
191191 ; GCN: buffer_load_dword [[V:v[0-9]+]]
192192
193 ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
194
195 ; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[Y]], [[X]], [[MUL]]
196 ; GCN-FLUSH: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]]
193 ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
194
195 ; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
196 ; GCN-FLUSH: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]
197197
198198 ; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
199 ; GCN-FASTFMA: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]]
200
201 ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
202 ; GCN-SLOWFMA: v_add_f32_e32
203 ; GCN-SLOWFMA: v_subrev_f32_e32 [[MAD:v[0-9]+]]
199 ; GCN-FASTFMA: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]
200
201 ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
202 ; GCN-SLOWFMA: v_add_f32_e32
203 ; GCN-SLOWFMA: v_sub_f32_e32 [[MAD:v[0-9]+]]
204204
205205 ; GCN: buffer_store_dword [[MUL]]
206206 ; GCN: buffer_store_dword [[MAD]]
225225 ; GCN: buffer_load_dword [[U:v[0-9]+]]
226226 ; GCN: buffer_load_dword [[V:v[0-9]+]]
227227
228 ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
229
230 ; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
231 ; GCN-FLUSH-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]]
228 ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
229
230 ; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
231 ; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
232232 ; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
233233 ; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
234234
235235 ; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
236 ; GCN-FASTFMA-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
236 ; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
237237 ; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
238238 ; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
239239
240 ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
241 ; GCN-SLOWFMA: v_add_f32_e32
242 ; GCN-SLOWFMA: v_subrev_f32_e32
240 ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
241 ; GCN-SLOWFMA: v_add_f32_e32
242 ; GCN-SLOWFMA: v_sub_f32_e32
243243 define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
244244 %x = load volatile float, float addrspace(1)* undef
245245 %y = load volatile float, float addrspace(1)* undef
55 ; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
66 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
77 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
8 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
8 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
99 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
10 ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
10 ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
1111 ; GCN: buffer_store_short v[[R_F16]]
1212 ; GCN: s_endpgm
1313 define amdgpu_kernel void @fadd_f16(
6969
7070 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
7171 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
72 ; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
73 ; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
72 ; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
73 ; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
7474 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
7575 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
7676 ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
77 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
77 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
7878
79 ; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
79 ; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
8080 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
81 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
81 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
8282
8383 ; GCN: buffer_store_dword v[[R_V2_F16]]
8484 ; GCN: s_endpgm
107107 ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
108108 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
109109 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
110 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
110 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
111111
112112 ; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000
113113 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
114114 ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
115 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
115 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
116116
117117 ; GCN: buffer_store_dword v[[R_V2_F16]]
118118 ; GCN: s_endpgm
138138 ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
139139 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
140140 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
141 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
141 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
142142
143143 ; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
144144 ; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
145145 ; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]]
146 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
146 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
147147
148148 ; GCN: buffer_store_dword v[[R_V2_F16]]
149149 ; GCN: s_endpgm
2626 ; VI-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
2727
2828 ; VI-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
29 ; VI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[RCP_RHS]], [[CVT_LHS]]
29 ; VI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
3030 ; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
3131 ; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
3232 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
164164 ; VI: flat_load_ushort [[RHS:v[0-9]+]]
165165
166166 ; VI: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
167 ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
167 ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
168168
169169 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
170170 define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
186186 ; VI: flat_load_ushort [[RHS:v[0-9]+]]
187187
188188 ; VI: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
189 ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
189 ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
190190
191191 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
192192 define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
1919 ; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2020 ; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
2121 ; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
22 ; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
22 ; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
2323 ; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
2424 ; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
2525 ; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
4444 ; GCN-NOT: s_setreg
4545 ; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
4646 ; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
47 ; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
47 ; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
4848 ; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
4949 ; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
5050 ; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
386386
387387 ; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
388388 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
389 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
389 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
390390 ;
391391 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
392392 define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
402402
403403 ; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
404404 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
405 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
405 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
406406 ;
407407 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
408408 define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
418418
419419 ; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
420420 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
421 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
421 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
422422 ;
423423 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
424424 define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
434434
435435 ; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
436436 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
437 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
437 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
438438 ;
439439 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
440440 define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
450450
451451 ; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
452452 ; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
453 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
453 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
454454 ;
455455 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
456456 define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
466466
467467 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
468468 ; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
469 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
469 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
470470 ;
471471 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
472472 define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
482482
483483 ; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
484484 ; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
485 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
485 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
486486 ;
487487 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
488488 define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
498498
499499 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
500500 ; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
501 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
501 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
502502 ;
503503 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
504504 define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
514514
515515 ; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
516516 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
517 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
517 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
518518 ;
519519 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
520520 define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
530530
531531 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
532532 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
533 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
533 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
534534 ;
535535 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
536536 define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
546546
547547 ; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
548548 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
549 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
549 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
550550 ;
551551 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
552552 define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
562562
563563 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
564564 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
565 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
565 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
566566 ;
567567 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
568568 define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
582582
583583 ; FUNC-LABEL: {{^}}test_f32_interp:
584584 ; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
585 ; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VT1]], [[VY:v[0-9]]]
586 ; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VT]], [[VX:v[0-9]]]
585 ; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]]
586 ; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]]
587587 ;
588588 ; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
589589 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
99 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
1010 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
1111 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
12 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
12 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
1313
1414 ; EG: MAX
1515 define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
3030 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
3131 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
3232 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
33 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
33 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
3434 ; EG: MAX
3535 define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
3636 %tid = call i32 @llvm.r600.read.tidig.x() #1
5050 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
5151 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
5252 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
53 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
53 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
5454 ; EG: MAX
5555 define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
5656 %tid = call i32 @llvm.r600.read.tidig.x() #1
7070 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
7171 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
7272 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
73 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
73 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
7474 ; EG: MAX
7575 define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
7676 %tid = call i32 @llvm.r600.read.tidig.x() #1
9090 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
9191 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
9292 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
93 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
93 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
9494 ; EG: MAX
9595 define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
9696 %tid = call i32 @llvm.r600.read.tidig.x() #1
871871 ; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
872872 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
873873 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
874 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[B]], [[A]]
875 ; GCN: v_min_f32_e32 v{{[0-9]+}}, [[C]], [[MAX]]
874 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[A]], [[B]]
875 ; GCN: v_min_f32_e32 v{{[0-9]+}}, [[MAX]], [[C]]
876876 define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
877877 %tid = call i32 @llvm.amdgcn.workitem.id.x()
878878 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
4444 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
4545 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
4646 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
47 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
47 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
4848 define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
4949 %tid = call i32 @llvm.r600.read.tidig.x() #1
5050 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
6363 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
6464 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
6565 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
66 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
66 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
6767 define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
6868 %tid = call i32 @llvm.r600.read.tidig.x() #1
6969 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
8282 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
8383 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
8484 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
85 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
85 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
8686 define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
8787 %tid = call i32 @llvm.r600.read.tidig.x() #1
8888 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
101101 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
102102 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
103103 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
104 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
104 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
105105 define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
106106 %tid = call i32 @llvm.r600.read.tidig.x() #1
107107 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
120120 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
121121 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
122122 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
123 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
123 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
124124 define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
125125 %tid = call i32 @llvm.r600.read.tidig.x() #1
126126 %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
55 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
66 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
77 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
8 ; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
8 ; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
99 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
10 ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
10 ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
1111 ; GCN: buffer_store_short v[[R_F16]]
1212 ; GCN: s_endpgm
1313 define amdgpu_kernel void @fmul_f16(
6969 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
7070 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
7171 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
72 ; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
73 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
72 ; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
73 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
7474 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
7575 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
7676 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
77 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
77 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
7878
79 ; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
79 ; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
8080 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
81 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
81 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
8282
8383 ; GCN: buffer_store_dword v[[R_V2_F16]]
8484 ; GCN: s_endpgm
107107 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
108108 ; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
109109 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
110 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
110 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
111111 ; GCN: buffer_store_dword v[[R_V2_F16]]
112112 ; GCN: s_endpgm
113113 define amdgpu_kernel void @fmul_v2f16_imm_a(
133133 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
134134 ; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
135135 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
136 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
136 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
137137 ; GCN: buffer_store_dword v[[R_V2_F16]]
138138 ; GCN: s_endpgm
139139 define amdgpu_kernel void @fmul_v2f16_imm_b(
7878 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
7979
8080 ; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
81 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
81 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
8282
8383 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
8484 define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
107107 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
108108
109109 ; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
110 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
110 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
111111
112112 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
113113 define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
226226
227227 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
228228
229 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
230 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
229 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
230 ; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
231231
232232 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
233233 define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
256256
257257 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
258258
259 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
260 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
259 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
260 ; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
261261
262262 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
263263 define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
286286
287287 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
288288
289 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
289 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
290290 ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
291291
292292 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
318318
319319 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
320320
321 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
321 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
322322 ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
323323
324324 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
346346 ; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
347347 ; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
348348
349 ; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGB]], [[REGA]]
349 ; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
350350 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
351351
352352 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
353353
354 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
355 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
354 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
355 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
356356 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
357357 define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
358358 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
384384 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
385385
386386 ; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
387 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
387 ; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
388388
389389 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
390390 define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
415415 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
416416
417417 ; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
418 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
418 ; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
419419
420420 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
421421 define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
443443 ; VI-DENORM-CONTRACT: v_fma_f16 [[R2]], [[R1]], 2.0, -[[R2]]
444444
445445 ; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
446 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
446 ; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
447447
448448 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
449449 define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
6666 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
6767
6868 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
69 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
69 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
7070
7171 ; SI-DENORM buffer_store_dword [[RESULT]]
7272 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
9595 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
9696
9797 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
98 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
98 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
9999
100100 ; SI-DENORM: buffer_store_dword [[RESULT]]
101101 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
124124 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
125125
126126 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
127 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
127 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
128128
129129 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
130 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
130 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
131131
132132 ; SI-DENORM: buffer_store_dword [[RESULT]]
133133 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
159159 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
160160
161161 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
162 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
162 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
163163
164164 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
165 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
165 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
166166
167167 ; SI-DENORM: buffer_store_dword [[RESULT]]
168168 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
191191 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
192192
193193 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
194 ; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
194 ; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
195195
196196 ; SI-DENORM: buffer_store_dword [[RESULT]]
197197 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
220220 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]
221221
222222 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
223 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
223 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
224224
225225 ; SI-DENORM: buffer_store_dword [[RESULT]]
226226 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
251251 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]
252252
253253 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
254 ; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
254 ; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
255255
256256 ; SI-DENORM: buffer_store_dword [[RESULT]]
257257 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
281281 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
282282
283283 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
284 ; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
284 ; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
285285
286286 ; SI-DENORM: buffer_store_dword [[RESULT]]
287287 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
309309
310310 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
311311
312 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
313 ; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
314
315 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
316 ; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
312 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
313 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
314
315 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
316 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
317317
318318 ; SI: buffer_store_dword [[RESULT]]
319319 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
344344
345345 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
346346
347 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
348 ; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
349
350 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
351 ; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
347 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
348 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
349
350 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
351 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
352352
353353 ; SI: buffer_store_dword [[RESULT]]
354354 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
378378
379379 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
380380
381 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
381 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
382382 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
383383
384 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
384 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
385385 ; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
386386
387387 ; SI: buffer_store_dword [[RESULT]]
413413
414414 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
415415
416 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
416 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
417417 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
418418
419 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
419 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
420420 ; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
421421
422422 ; SI: buffer_store_dword [[RESULT]]
445445 ; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
446446 ; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
447447
448 ; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGB]], [[REGA]]
448 ; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]]
449449 ; SI-FLUSH: buffer_store_dword [[REGC]]
450450 ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
451451
452452 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
453453
454 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
455 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
456
457 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
458 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
454 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
455 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
456
457 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
458 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
459459
460460 ; SI-DENORM: buffer_store_dword [[RESULT]]
461461 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
488488 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
489489
490490 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
491 ; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
491 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
492492
493493 ; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
494 ; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
494 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
495495
496496 ; SI: buffer_store_dword [[RESULT]]
497497 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
524524 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
525525
526526 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
527 ; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
527 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
528528
529529 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
530 ; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
530 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
531531
532532 ; SI-DENORM: buffer_store_dword [[RESULT]]
533533 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
555555 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
556556
557557 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
558 ; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
558 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
559559
560560 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
561 ; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
561 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
562562
563563 ; SI: buffer_store_dword [[RESULT]]
564564 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
88 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
99 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1010
11 ; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
11 ; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
1212 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
1313
1414 ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
3030 ; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
3131 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
3232 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
33 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
33 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
3434 ; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
3535 ; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
3636 ; GCN-NEXT: buffer_store_dword [[ADD]]
5353 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
5454 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
5555
56 ; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
56 ; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
5757 ; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
5858 ; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
5959
8181 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
8282 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
8383
84 ; GCN-SAFE: v_subrev_f32_e32
84 ; GCN-SAFE: v_sub_f32_e32
8585 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000,
8686
87 ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
87 ; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
8888 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
8989 define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
9090 %tid = call i32 @llvm.amdgcn.workitem.id.x()
105105 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
106106 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
107107
108 ; GCN-SAFE: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
108 ; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
109109 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
110110
111 ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
111 ; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
112112 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
113113 define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
114114 %tid = call i32 @llvm.amdgcn.workitem.id.x()
132132 ; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
133133 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
134134
135 ; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
135 ; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
136136 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
137137 define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
138138 %tid = call i32 @llvm.amdgcn.workitem.id.x()
156156
157157 ; GCN-SAFE: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1{{$}}
158158 ; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[A]], [[SIGNBIT]]
159 ; GCN-SAFE: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
159 ; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
160160 ; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[ADD]], [[SIGNBIT]]
161161
162162 ; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
163 ; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
163 ; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
164164 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
165165 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
166166 define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
184184 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
185185
186186 ; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
187 ; GCN-SAFE-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
187 ; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
188188 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
189189
190 ; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
190 ; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
191191 ; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
192192 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
193193 ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
234234 ; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
235235 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
236236 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
237 ; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
237 ; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
238238 ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
239239 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
240240 ; GCN: buffer_store_dword [[ADD]]
279279 ; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
280280 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
281281 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
282 ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
282 ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
283283 ; GCN-NEXT: buffer_store_dword [[ADD]]
284284 define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
285285 %tid = call i32 @llvm.amdgcn.workitem.id.x()
299299 ; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
300300 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
301301 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
302 ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
302 ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
303303 ; GCN-NEXT: buffer_store_dword [[ADD]]
304304 define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
305305 %tid = call i32 @llvm.amdgcn.workitem.id.x()
341341 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
342342 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
343343 ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
344 ; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
344 ; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
345345 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
346346 ; GCN: buffer_store_dword [[NEG_A]]
347347 define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
363363 ; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
364364 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
365365 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
366 ; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
366 ; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
367367 ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
368368 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
369369 ; GCN: buffer_store_dword [[MUL]]
973973 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
974974 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
975975
976 ; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]]
976 ; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
977977 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]
978978
979979 ; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
999999 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
10001000 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
10011001
1002 ; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]]
1002 ; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
10031003 ; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
10041004 ; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
10051005
14481448 ; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
14491449 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
14501450 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1451 ; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
1451 ; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
14521452 ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
14531453 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
14541454 ; GCN: buffer_store_dword [[ADD]]
14931493 ; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
14941494 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
14951495 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1496 ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
1496 ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
14971497 ; GCN-NEXT: buffer_store_dword [[ADD]]
14981498 define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
14991499 %tid = call i32 @llvm.amdgcn.workitem.id.x()
15131513 ; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
15141514 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
15151515 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1516 ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
1516 ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
15171517 ; GCN-NEXT: buffer_store_dword [[ADD]]
15181518 define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
15191519 %tid = call i32 @llvm.amdgcn.workitem.id.x()
15551555 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
15561556 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
15571557 ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
1558 ; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
1558 ; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
15591559 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
15601560 ; GCN: buffer_store_dword [[NEG_A]]
15611561 define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
15771577 ; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
15781578 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
15791579 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1580 ; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
1580 ; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
15811581 ; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
15821582 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
15831583 ; GCN: buffer_store_dword [[MUL]]
16631663 ; GCN-LABEL: {{^}}v_fneg_round_f32:
16641664 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
16651665 ; GCN: v_trunc_f32_e32
1666 ; GCN: v_subrev_f32_e32
1666 ; GCN: v_sub_f32_e32
16671667 ; GCN: v_cndmask_b32
16681668
16691669 ; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
17811781 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
17821782 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
17831783 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1784 ; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
1784 ; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
17851785 ; GCN: s_cbranch_scc1
17861786
17871787 ; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
1788 ; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[XOR]]
1788 ; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]]
17891789 ; GCN: buffer_store_dword [[MUL1]]
17901790
17911791 ; GCN: buffer_store_dword [[MUL0]]
18501850 ; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
18511851 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
18521852 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1853 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
1853 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
18541854 ; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
18551855 ; GCN: ; use [[NEG]]
18561856 ; GCN: buffer_store_dword [[MUL]]
19831983 ; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]
19841984
19851985 ; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
1986 ; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[FMA0]]
1987 ; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[D]], [[FMA0]]
1986 ; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]]
1987 ; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]
19881988
19891989 ; GCN: buffer_store_dword [[MUL1]]
19901990 ; GCN-NEXT: buffer_store_dword [[MUL2]]
20832083 ; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
20842084 ; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
20852085 ; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
2086 ; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]]
2086 ; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]]
20872087 ; GCN: buffer_store_dword [[FMA0]]
20882088 ; GCN: buffer_store_dword [[MUL1]]
20892089 define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
44 ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16:
55 ; CI: v_cvt_f32_f16_e32
66 ; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}|
7 ; CI: v_subrev_f32_e32 v{{[0-9]+}}, [[CVT_ABS_X]], v{{[0-9]+}}
7 ; CI: v_sub_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_ABS_X]]
88
99 ; GFX89-NOT: _and
1010 ; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
1919 ; GCN-LABEL: {{^}}fneg_fabs_fmul_f16:
2020 ; CI-DAG: v_cvt_f32_f16_e32
2121 ; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}|
22 ; CI: v_mul_f32_e32 {{v[0-9]+}}, [[CVT_NEG_ABS_X]], {{v[0-9]+}}
22 ; CI: v_mul_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, [[CVT_NEG_ABS_X]]
2323 ; CI: v_cvt_f16_f32_e32
2424
2525 ; GFX89-NOT: _and
4545
4646 ; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]]
4747 ; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]]
48 ; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_VAL]], [[NEG_CVT0]]
48 ; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[NEG_CVT0]], [[CVT_VAL]]
4949 ; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]]
5050 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
5151
153153 ; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
154154 ; GCN-DAG: v_cvt_f32_f16_e64 [[CVTA_NEG:v[0-9]+]], -[[A]]
155155 ; SI-DAG: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
156 ; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA]], [[CVTA_NEG]]
156 ; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA_NEG]], [[CVTA]]
157157 ; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
158158
159159 ; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]]
5959 ; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
6060 ; SI: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
6161 ; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
62 ; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
62 ; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_LO]], v[[R_I16_HI]]
6363
6464 ; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
6565 ; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
5959 ; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
6060 ; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
6161 ; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
62 ; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]
62 ; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_HI]]
6363
6464 ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V2_F16]]
6565 ; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
3737 ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
3838 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
3939 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
40 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
40 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
4141
4242 ; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
43 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
43 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
4444
4545 ; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
4646 ; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
6767
6868 ; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
6969
70 ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
70 ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
7171
7272 ; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
7373 ; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
88
99 ; GCN-LABEL: {{^}}fract_f32:
1010 ; GCN-SAFE: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
11 ; GCN-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
11 ; GCN-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[INPUT]], [[FLR]]
1212
1313 ; GCN-UNSAFE: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
1414
2828 ; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
2929 ; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}}
3030 ; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
31 ; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
31 ; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[X]], [[INVY]]
3232 ; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
3333 ; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
3434 ; GCN: buffer_store_dword [[RESULT]]
66 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
77 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
88 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
9 ; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
9 ; SI: v_sub_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
1010 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
11 ; GFX89: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
11 ; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
1212 ; GCN: buffer_store_short v[[R_F16]]
1313 ; GCN: s_endpgm
1414 define amdgpu_kernel void @fsub_f16(
6969
7070 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
7171 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
72 ; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
73 ; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
72 ; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
73 ; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
7474 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
7575 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
7676 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
77 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
77 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
7878
79 ; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
79 ; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
8080 ; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
81 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
81 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
8282
8383 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
8484
108108 ; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
109109 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
110110 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
111 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
111 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
112112
113113 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
114114 ; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
115115 ; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
116 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
116 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
117117
118118 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00
119119 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] neg_lo:[1,0] neg_hi:[1,0]
142142 ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
143143 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
144144 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
145 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
145 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
146146
147147 ; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
148148 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
149149 ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
150 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
150 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
151151
152152 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000
153153 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]{{$}}
22 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
33
44 ; FUNC-LABEL: {{^}}v_fsub_f32:
5 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
5 ; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
66 define amdgpu_kernel void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
77 %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
88 %a = load float, float addrspace(1)* %in, align 4
4040 ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
4141 ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
4242
43 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
44 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
45 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
46 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
43 ; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
44 ; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
45 ; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
46 ; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
4747 define amdgpu_kernel void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
4848 %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
4949 %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
6666 }
6767
6868 ; FUNC-LABEL: {{^}}v_fneg_fsub_f32:
69 ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
69 ; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
7070 ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
7171 define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
7272 %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
7979 }
8080
8181 ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_f32:
82 ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
82 ; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
8383 ; SI-NOT: xor
8484 define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
8585 %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
9292 }
9393
9494 ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_attribute_f32:
95 ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
95 ; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
9696 ; SI-NOT: xor
9797 define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
9898 %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
108108 ; make sure it is disabled and the fneg is not folded if it is not
109109 ; "true".
110110 ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_false_attribute_f32:
111 ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
111 ; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
112112 ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
113113 define amdgpu_kernel void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
114114 %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
1616 ; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
1717 ; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
1818 ; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
19 ; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]]
19 ; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
2020 ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
2121 ; GCN: s_endpgm
2222 define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
470470
471471 ; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
472472 ; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
473 ; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
473 ; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]
474474
475475 ; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
476 ; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT1]], [[CVT0]]
476 ; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[CVT1]]
477477
478478 ; GCN-DAG: buffer_store_dword [[PACKED]]
479479 ; GCN: s_endpgm
304304 ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
305305 ; VI-DAG: buffer_load_dword
306306 ; VI-NOT: and
307 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
307 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
308308 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
309309 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
310310 ; VI: buffer_store_dword
3232 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
3333 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
3434 ; SI-NOT: and
35 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
35 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
3636
3737 ; VI-DAG: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
3838 ; VI-DAG: v_ceil_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
3939 ; VI-NOT: and
40 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
40 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
4141
4242 ; GCN: buffer_store_dword v[[R_V2_F16]]
4343 ; GCN: s_endpgm
2828 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
2929 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
3030 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
31 ; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
32 ; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
31 ; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]]
32 ; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]]
3333
3434 ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
3535 ; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
4747 ; GCN-NOT: and
4848
4949 ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
50 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
51 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
50 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
51 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
5252 ; GCN: buffer_store_dword v[[R_V2_F16]]
5353 ; GCN: s_endpgm
5454 define amdgpu_kernel void @cos_v2f16(
3232 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
3333 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
3434 ; SI-NOT: and
35 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
35 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
3636
3737 ; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
3838 ; VI-DAG: v_exp_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
3939 ; VI-NOT: and
40 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
40 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
4141
4242 ; GCN: buffer_store_dword v[[R_V2_F16]]
4343 ; GCN: s_endpgm
3232 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
3333 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
3434 ; SI-NOT: and
35 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
35 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
3636
3737 ; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
3838 ; VI-DAG: v_floor_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
3939 ; VI-NOT: and
40 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
40 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
4141
4242 ; GCN: buffer_store_dword v[[R_V2_F16]]
4343 ; GCN: s_endpgm
127127
128128 ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
129129 ; GCN-NOT: and
130 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
130 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
131131 ; GCN: buffer_store_dword v[[R_V2_F16]]
132132 ; GCN: s_endpgm
133133 define amdgpu_kernel void @fma_v2f16(
166166
167167 ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
168168 ; GCN-NOT: and
169 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
169 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
170170 ; GCN: buffer_store_dword v[[R_V2_F16]]
171171 ; GCN: s_endpgm
172172 define amdgpu_kernel void @fma_v2f16_imm_a(
209209
210210 ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
211211 ; GCN-NOT: and
212 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
212 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
213213 ; GCN: buffer_store_dword v[[R_V2_F16]]
214214 ; GCN: s_endpgm
215215 define amdgpu_kernel void @fma_v2f16_imm_b(
252252 ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
253253
254254 ; GCN-NOT: and
255 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
255 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
256256 ; GCN: buffer_store_dword v[[R_V2_F16]]
257257 ; GCN: s_endpgm
258258 define amdgpu_kernel void @fma_v2f16_imm_c(
1212 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
1313 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
1414 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
15 ; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]]
15 ; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
1616 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
1717 ; SI: buffer_store_short v[[R_F16]]
1818
19 ; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]]
19 ; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
2020 ; VI-FLUSH: buffer_store_short v[[C_F16]]
2121
2222 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
109109 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
110110 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
111111 ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
112 ; SI: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
113 ; SI: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
112 ; SI: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
113 ; SI: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
114114 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
115115 ; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
116116 ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
117 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
117 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
118118
119119 ; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
120120 ; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
121 ; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]]
121 ; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
122122 ; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
123123 ; VI-FLUSH-NOT: v_and_b32
124 ; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]]
124 ; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]]
125125
126126 ; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
127127 ; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
130130 ; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
131131 ; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
132132 ; VI-DENORM-NOT: v_and_b32
133 ; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[RES0]]
133 ; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
134134
135135 ; GCN: buffer_store_dword v[[R_V2_F16]]
136136 ; GCN: s_endpgm
3232 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
3333 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
3434 ; SI-NOT: and
35 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
35 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
3636
3737 ; VI-DAG: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
3838 ; VI-DAG: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
3939 ; VI-NOT: and
40 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
40 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
4141
4242 ; GCN: buffer_store_dword v[[R_V2_F16]]
4343 ; GCN: s_endpgm
88 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
99 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
1010 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
11 ; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
11 ; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
1212 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
13 ; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
13 ; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
1414 ; GCN: buffer_store_short v[[R_F16]]
1515 ; GCN: s_endpgm
1616 define amdgpu_kernel void @maxnum_f16(
7272
7373 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
7474 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
75 ; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
76 ; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
75 ; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
76 ; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
7777 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
7878 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
7979 ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
8080 ; SI-NOT: and
81 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
81 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
8282
83 ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
83 ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
8484 ; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
8585 ; VI-NOT: and
86 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
86 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
8787
8888 ; GCN: buffer_store_dword v[[R_V2_F16]]
8989 ; GCN: s_endpgm
114114
115115 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
116116 ; GCN-NOT: and
117 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
117 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
118118 ; GCN: buffer_store_dword v[[R_V2_F16]]
119119 ; GCN: s_endpgm
120120 define amdgpu_kernel void @maxnum_v2f16_imm_a(
142142
143143 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
144144 ; GCN-NOT: and
145 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
145 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
146146 ; GCN: buffer_store_dword v[[R_V2_F16]]
147147 ; GCN: s_endpgm
148148 define amdgpu_kernel void @maxnum_v2f16_imm_b(
88 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
99 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
1010 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
11 ; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
11 ; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
1212 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
13 ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
13 ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
1414 ; GCN: buffer_store_short v[[R_F16]]
1515 ; GCN: s_endpgm
1616 define amdgpu_kernel void @minnum_f16(
7171 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
7272 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
7373 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
74 ; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
75 ; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
74 ; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
75 ; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
7676 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
7777 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
7878 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
7979 ; SI-NOT: and
80 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
80 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
8181
82 ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
82 ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
8383 ; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
8484 ; VI-NOT: and
85 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
85 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
8686
8787 ; GCN: buffer_store_dword v[[R_V2_F16]]
8888 ; GCN: s_endpgm
115115
116116 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
117117 ; GCN-NOT: and
118 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
118 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
119119 ; GCN: buffer_store_dword v[[R_V2_F16]]
120120 ; GCN: s_endpgm
121121 define amdgpu_kernel void @minnum_v2f16_imm_a(
143143
144144 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
145145 ; GCN-NOT: and
146 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
146 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
147147 ; GCN: buffer_store_dword v[[R_V2_F16]]
148148 ; GCN: s_endpgm
149149 define amdgpu_kernel void @minnum_v2f16_imm_b(
3333 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
3434 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
3535 ; SI-NOT: v_and_b32
36 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
36 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
3737
3838 ; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
3939 ; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
4040 ; VI-NOT: v_and_b32
41 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
41 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
4242
4343 ; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
4444 ; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1111 ; GCN: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
1212 ; GCN: v_cmp_ge_f32_e64 vcc, |[[SUB]]|, 0.5
1313 ; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]]
14 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
14 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TRUNC]], [[SEL]]
1515 ; GCN: buffer_store_dword [[RESULT]]
1616
1717 ; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
6969 ; GFX89: v_sub_f16_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
7070 ; GFX89: v_cmp_ge_f16_e64 vcc, |[[SUB]]|, 0.5
7171 ; GFX89: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[COPYSIGN]]
72 ; GFX89: v_add_f16_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
72 ; GFX89: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TRUNC]], [[SEL]]
7373 ; GFX89: buffer_store_short [[RESULT]]
7474 define amdgpu_kernel void @round_f16(half addrspace(1)* %out, i32 %x.arg) #0 {
7575 %x.arg.trunc = trunc i32 %x.arg to i16
2828 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
2929 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
3030 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
31 ; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
31 ; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]]
3232 ; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
33 ; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
33 ; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]]
3434 ; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
3535
3636 ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
4646
4747 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
4848 ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
49 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
49 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
5050
5151 ; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
52 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
52 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
5353
5454 ; GCN: buffer_store_dword v[[R_V2_F16]]
5555 ; GCN: s_endpgm
3232 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
3333 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
3434 ; SI-NOT: v_and_b32
35 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
35 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
3636
3737 ; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
3838 ; VI-DAG: v_sqrt_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
3939 ; VI-NOT: v_and_b32
40 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
40 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
4141
4242 ; GCN: buffer_store_dword v[[R_V2_F16]]
4343 ; GCN: s_endpgm
3232 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
3333 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
3434 ; SI-NOT: v_and_b32
35 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
35 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
3636
3737 ; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
3838 ; VI-DAG: v_trunc_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
3939 ; VI-NOT: v_and_b32
40 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
40 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
4141
4242 ; GCN: buffer_store_dword v[[R_V2_F16]]
4343 ; GCN: s_endpgm
1818 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
1919 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
2020
21 ; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
21 ; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
2222
2323 ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
2424
2525 ; SI-DENORM-SLOWFMAF-NOT: v_fma
2626 ; SI-DENORM-SLOWFMAF-NOT: v_mad
2727
28 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
29 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
28 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
29 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
3030
3131 ; SI-DENORM: buffer_store_dword [[RESULT]]
3232 ; SI-STD: buffer_store_dword [[C]]
5454 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
5555 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
5656
57 ; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
58 ; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]]
57 ; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
58 ; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]
5959
6060 ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
6161 ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
6262
63 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
64 ; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
65 ; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
63 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
64 ; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
65 ; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
6666
6767 ; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
6868 ; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
9898 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
9999 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
100100
101 ; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
101 ; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
102102 ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
103103
104 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
105 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
104 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
105 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
106106
107107 ; SI-DENORM: buffer_store_dword [[RESULT]]
108108 ; SI-STD: buffer_store_dword [[C]]
132132 ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
133133 ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
134134
135 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
136 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
135 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
136 ; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
137137
138138 ; SI: buffer_store_dword [[RESULT]]
139139 define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
166166 ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
167167 ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
168168
169 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
170 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
171 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
169 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
170 ; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
171 ; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
172172
173173 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
174174 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
204204 ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
205205 ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
206206
207 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
208 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
207 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
208 ; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
209209
210210 ; SI: buffer_store_dword [[RESULT]]
211211 define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
237237 ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
238238 ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
239239
240 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
241 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
242 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
240 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
241 ; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
242 ; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
243243
244244 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
245245 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
277277 ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
278278
279279 ; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
280 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
280 ; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
281281
282282 ; SI: buffer_store_dword [[RESULT]]
283283 define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
312312 ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
313313
314314 ; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
315 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
316 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
315 ; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
316 ; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
317317
318318 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
319319 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
354354 ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
355355 ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
356356
357 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
357 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
358358 ; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
359 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
359 ; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
360360
361361 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
362362 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
394394 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
395395 ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
396396
397 ; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
397 ; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
398398 ; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
399 ; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
400
401 ; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
399 ; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]
400
401 ; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
402402 ; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
403 ; SI-DENORM: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]]
403 ; SI-DENORM: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP1]], [[C]]
404404
405405 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
406406 define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
436436 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
437437 ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
438438
439 ; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
439 ; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
440440 ; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
441 ; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
442
443 ; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
441 ; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]
442
443 ; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
444444 ; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
445 ; SI-DENORM: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
445 ; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]
446446
447447 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
448448 ; SI: s_endpgm
478478 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
479479 ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
480480
481 ; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
482 ; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[A]]
483 ; SI-STD-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP0]]
481 ; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
482 ; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
483 ; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[C]]
484484
485485 ; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
486 ; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[B]], [[A]]
487
488 ; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
486 ; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]]
487
488 ; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
489489 ; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
490 ; SI-DENORM-FASTFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
491
492 ; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
493 ; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
494 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
495 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
490 ; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]
491
492 ; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
493 ; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[A]], [[B]]
494 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
495 ; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[C]]
496496
497497 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
498498 ; SI: s_endpgm
529529 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
530530 ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
531531
532 ; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
533 ; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[C]], [[B]]
534 ; SI-STD-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[A]]
532 ; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
533 ; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
534 ; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP0]]
535535
536536 ; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
537537 ; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
538538
539 ; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
539 ; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
540540 ; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
541 ; SI-DENORM-FASTFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
542
543 ; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
544 ; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
545 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
546 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]
541 ; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]
542
543 ; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
544 ; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[C]]
545 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
546 ; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP2]]
547547
548548 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
549549 ; SI: s_endpgm
3333 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
3434 ; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
3535 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
36 ; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]]
37 ; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]]
36 ; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]]
37 ; GCN-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
3838 ; GCN: s_endpgm
3939 define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
4040 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
198198 ; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
199199 ; GCN: buffer_load_dword [[VGPR:v[0-9]+]]
200200 ; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
201 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]]
201 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
202202 ; GCN: buffer_store_dword [[MUL]]
203203 define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
204204 bb:
3131 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
3232 ; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
3333 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
34 ; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]]
35 ; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]]
34 ; GCN-DAG: v_mac_f32_e32 [[VB]], [[VA]], [[VK]]
35 ; GCN-DAG: v_mac_f32_e32 [[VC]], [[VA]], [[VK]]
3636 ; GCN: s_endpgm
3737 define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
3838 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
4747
4848 ; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]]
4949 ; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]]
50 ; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]]
51 ; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
50 ; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RSQA]], [[RCPB]]
51 ; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
5252 ; SI-UNSAFE: buffer_store_dword [[RESULT]]
5353
5454 ; SI-SAFE-NOT: v_rsq_f32
55 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
66 ; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]]
77 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]]
8 ; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]]
8 ; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHR]], [[SHL]]
99 ; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]]
1010 ; GCN: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}}
1111 define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
1313
1414 ; GCN-DAG: v_mov_b32_e32 [[C200:v[0-9]+]], 0x200
1515 ; GCN-DAG: v_mov_b32_e32 [[C400:v[0-9]+]], 0x400
16 ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[C200]], [[CLAMP_IDX]]
17 ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[C400]], [[CLAMP_IDX]]
16 ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[C200]]
17 ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[C400]]
1818
1919 ; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
2020 ; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
3434 ; GCN-LABEL: {{^}}mul_shr_i32:
3535 ; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
3636 ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
37 ; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST1]], v[[DST0]]
37 ; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST0]], v[[DST1]]
3838 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
3939
4040 ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
6767 ; GCN-LABEL: {{^}}mul_v2i16:
6868 ; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
6969 ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
70 ; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
70 ; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
7171 ; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
72 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
72 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
7373 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
7474
7575 ; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
167167 ; GCN-LABEL: {{^}}mul_v2half:
168168 ; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
169169 ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
170 ; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
170 ; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
171171 ; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
172 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
172 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
173173 ; NOSDWA-NOT: v_mul_f16_sdwa
174174
175175 ; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
176176 ; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
177 ; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]]
177 ; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]]
178178
179179 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
180180
361361 ; GCN-LABEL: {{^}}mac_v2half:
362362 ; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
363363 ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
364 ; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST1]], v[[DST0]]
364 ; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST0]], v[[DST1]]
365365 ; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
366 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
366 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
367367 ; NOSDWA-NOT: v_mac_f16_sdwa
368368
369369 ; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
490490 %tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32>
491491 %tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32>
492492 %tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32>
493
493
494494 %arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom
495495 store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8
496496 ret void
102102
103103 ; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
104104 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc
105 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
105 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
106106 define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 {
107107 %x = load volatile float, float addrspace(1)* undef
108108 %y = load volatile float, float addrspace(1)* undef
121121
122122 ; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
123123 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
124 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
124 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
125125 define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 {
126126 %x = load volatile float, float addrspace(1)* undef
127127 %y = load volatile float, float addrspace(1)* undef
153153 ; GCN: buffer_load_dword [[X:v[0-9]+]]
154154
155155 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s
156 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
156 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
157157 define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 {
158158 %x = load volatile float, float addrspace(1)* undef
159159 %cmp = icmp eq i32 %c, 0
170170 ; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
171171 ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
172172 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
173 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
173 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
174174 define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
175175 %x = load volatile float, float addrspace(1)* undef
176176 %y = load volatile float, float addrspace(1)* undef
190190 ; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
191191 ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
192192 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc
193 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
193 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
194194 define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
195195 %x = load volatile float, float addrspace(1)* undef
196196 %y = load volatile float, float addrspace(1)* undef
244244 ; GCN: buffer_load_dword [[Z:v[0-9]+]]
245245
246246 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
247 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
247 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
248248 define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 {
249249 %x = load volatile float, float addrspace(1)* undef
250250 %y = load volatile float, float addrspace(1)* undef
265265 ; GCN: buffer_load_dword [[W:v[0-9]+]]
266266
267267 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
268 ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
269 ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]]
268 ; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
269 ; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[W]], [[X]]
270270 define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
271271 %x = load volatile float, float addrspace(1)* undef
272272 %y = load volatile float, float addrspace(1)* undef
290290
291291 ; GCN-DAG: v_xor_b32_e32 [[NEG_X:v[0-9]+]], 0x80000000, [[X]]
292292 ; GCN-DAG: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
293 ; GCN-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[SELECT]], [[Z]]
293 ; GCN-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[Z]], [[SELECT]]
294294
295295 ; GCN: buffer_store_dword [[ADD]]
296296 ; GCN: buffer_store_dword [[NEG_X]]
315315 ; GCN: buffer_load_dword [[W:v[0-9]+]]
316316
317317 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
318 ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
319 ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]]
318 ; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
319 ; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[W]], [[Y]]
320320 define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
321321 %x = load volatile float, float addrspace(1)* undef
322322 %y = load volatile float, float addrspace(1)* undef
340340
341341 ; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
342342 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc
343 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
343 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
344344 define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 {
345345 %x = load volatile float, float addrspace(1)* undef
346346 %y = load volatile float, float addrspace(1)* undef
358358 ; GCN: buffer_load_dword [[Y:v[0-9]+]]
359359
360360 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
361 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
361 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
362362 define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 {
363363 %x = load volatile float, float addrspace(1)* undef
364364 %y = load volatile float, float addrspace(1)* undef
376376 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
377377
378378 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
379 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
379 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
380380 define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
381381 %x = load volatile float, float addrspace(1)* undef
382382 %y = load volatile float, float addrspace(1)* undef
396396 ; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
397397 ; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc
398398
399 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
399 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
400400 define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
401401 %x = load volatile float, float addrspace(1)* undef
402402 %y = load volatile float, float addrspace(1)* undef
413413
414414 ; GCN: v_cmp_eq_u32_e64
415415 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
416 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
416 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
417417 define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
418418 %x = load volatile float, float addrspace(1)* undef
419419 %cmp = icmp eq i32 %c, 0
430430
431431 ; GCN: v_cmp_eq_u32_e64
432432 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc
433 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
433 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
434434 define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
435435 %x = load volatile float, float addrspace(1)* undef
436436 %cmp = icmp eq i32 %c, 0
444444 ; GCN: buffer_load_dword [[X:v[0-9]+]]
445445
446446 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
447 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
447 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
448448 define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
449449 %x = load volatile float, float addrspace(1)* undef
450450 %cmp = icmp eq i32 %c, 0
461461
462462 ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
463463 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
464 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
464 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
465465 define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
466466 %x = load volatile float, float addrspace(1)* undef
467467 %y = load volatile float, float addrspace(1)* undef
478478 ; GCN: buffer_load_dword [[Y:v[0-9]+]]
479479
480480 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
481 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
481 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
482482 define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 {
483483 %x = load volatile float, float addrspace(1)* undef
484484 %y = load volatile float, float addrspace(1)* undef
496496
497497 ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
498498 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
499 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
499 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
500500 define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
501501 %x = load volatile float, float addrspace(1)* undef
502502 %y = load volatile float, float addrspace(1)* undef
516516 ; GCN-DAG: v_or_b32_e32 [[X_NEG_ABS:v[0-9]+]], 0x80000000, [[X]]
517517 ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
518518 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc
519 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
519 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
520520 define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 {
521521 %x = load volatile float, float addrspace(1)* undef
522522 %y = load volatile float, float addrspace(1)* undef
539539 ; GCN-DAG: v_or_b32_e32 [[Y_NEG_ABS:v[0-9]+]], 0x80000000, [[Y]]
540540 ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
541541 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc
542 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
542 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
543543 define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 {
544544 %x = load volatile float, float addrspace(1)* undef
545545 %y = load volatile float, float addrspace(1)* undef
562562 ; GCN-DAG: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
563563 ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
564564 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc
565 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
565 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
566566 define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 {
567567 %x = load volatile float, float addrspace(1)* undef
568568 %y = load volatile float, float addrspace(1)* undef
584584 ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
585585 ; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]]
586586 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc
587 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
587 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
588588 define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 {
589589 %x = load volatile float, float addrspace(1)* undef
590590 %y = load volatile float, float addrspace(1)* undef
605605
606606 ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
607607 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc
608 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
608 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
609609 define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 {
610610 %x = load volatile float, float addrspace(1)* undef
611611 %y = load volatile float, float addrspace(1)* undef
627627
628628 ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
629629 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc
630 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
630 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
631631 define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 {
632632 %x = load volatile float, float addrspace(1)* undef
633633 %y = load volatile float, float addrspace(1)* undef
66 ; GCN: buffer_load_dword [[B:v[0-9]+]]
77 ; GCN: buffer_load_dword [[C:v[0-9]+]]
88
9 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
9 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
1010 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]]
1111 ; GCN: buffer_store_dword [[MUL]]
1212 define amdgpu_kernel void @multi_use_fneg_src() #0 {
2929 ; GCN: buffer_load_dword [[B:v[0-9]+]]
3030 ; GCN: buffer_load_dword [[C:v[0-9]+]]
3131
32 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
32 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
3333 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[A]]
3434 ; GCN: v_mul_f32_e64 [[USE1:v[0-9]+]], [[MUL]], -[[MUL]]
3535 define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 {
7777 ; GCN: buffer_load_dword [[A:v[0-9]+]]
7878 ; GCN: buffer_load_dword [[B:v[0-9]+]]
7979
80 ; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
80 ; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
8181 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL0]]
8282 ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], [[MUL0]]
8383 ; GCN: buffer_store_dword [[MUL1]]
9797 ; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v{{[[0-9]+}}
9898 ; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}}
9999 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
100 ; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]]
100 ; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[ELT1PART]], v[[SHLLO]]
101101 ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
102102
103103 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
342342 ...
343343 ---
344344 # GCN-LABEL: name: shrink_addc_vop3{{$}}
345 # GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit %vcc, implicit %exec
345 # GCN: %29 = V_ADDC_U32_e32 %19, %17, implicit-def %vcc, implicit %vcc, implicit %exec
346346 # GCN %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
347347
348348 name: shrink_addc_vop3
428428
429429 ---
430430 # GCN-LABEL: name: shrink_addc_undef_vcc{{$}}
431 # GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit undef %vcc, implicit %exec
431 # GCN: %29 = V_ADDC_U32_e32 %19, %17, implicit-def %vcc, implicit undef %vcc, implicit %exec
432432 # GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
433433 name: shrink_addc_undef_vcc
434434 alignment: 0
1717
1818 ; FUNC-LABEL: {{^}}v_abs_i32:
1919 ; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
20 ; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG]], [[SRC]]
20 ; GCN: v_max_i32_e32 {{v[0-9]+}}, [[SRC]], [[NEG]]
2121 ; GCN: v_add_i32
2222
2323 ; EG: MAX_INT
3333
3434 ; GCN-LABEL: {{^}}v_abs_i32_repeat_user:
3535 ; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
36 ; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[NEG]], [[SRC]]
36 ; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[SRC]], [[NEG]]
3737 ; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MAX]], [[MAX]]
3838 define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
3939 %val = load i32, i32 addrspace(1)* %src, align 4
7070 ; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
7171 ; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
7272
73 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
74 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
73 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]]
74 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]]
7575
7676 ; GCN: v_add_i32
7777 ; GCN: v_add_i32
131131 ; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
132132 ; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
133133
134 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
135 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
136 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
137 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
134 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]]
135 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]]
136 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC2]], [[NEG2]]
137 ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC3]], [[NEG3]]
138138
139139 ; GCN: v_add_i32
140140 ; GCN: v_add_i32
183183 ; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]]
184184 ; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]]
185185
186 ; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
187 ; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
186 ; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]]
187 ; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]]
188188 define amdgpu_kernel void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
189189 %val0 = load volatile i32, i32 addrspace(1)* %ptr0
190190 %val1 = load volatile i32, i32 addrspace(1)* %ptr1
44 ; GCN-LABEL: {{^}}v_test_sub_i16:
55 ; VI: flat_load_ushort [[A:v[0-9]+]]
66 ; VI: flat_load_ushort [[B:v[0-9]+]]
7 ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
7 ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
88 ; VI-NEXT: buffer_store_short [[ADD]]
99 define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
1010 %tid = call i32 @llvm.amdgcn.workitem.id.x()
6767 ; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i32:
6868 ; VI: flat_load_ushort [[A:v[0-9]+]]
6969 ; VI: flat_load_ushort [[B:v[0-9]+]]
70 ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
70 ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
7171 ; VI-NEXT: buffer_store_dword [[ADD]]
7272 define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
7373 %tid = call i32 @llvm.amdgcn.workitem.id.x()
8787 ; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
8888 ; VI: flat_load_ushort [[A:v[0-9]+]]
8989 ; VI: flat_load_ushort [[B:v[0-9]+]]
90 ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
90 ; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
9191 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
9292 define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
9393 %tid = call i32 @llvm.amdgcn.workitem.id.x()
106106 ; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i32:
107107 ; VI: flat_load_ushort [[A:v[0-9]+]]
108108 ; VI: flat_load_ushort [[B:v[0-9]+]]
109 ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
109 ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
110110 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
111111 ; VI-NEXT: buffer_store_dword [[SEXT]]
112112 define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
126126 ; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i64:
127127 ; VI: flat_load_ushort [[A:v[0-9]+]]
128128 ; VI: flat_load_ushort [[B:v[0-9]+]]
129 ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
129 ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
130130 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
131131 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
132132 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
55 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
66
77 ; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
8 ; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
8 ; VI: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
99 define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
1010 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1111 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
164164 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
165165 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
166166
167 ; VI: v_subrev_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
167 ; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
168168 ; VI-NOT: and
169169 ; VI-NOT: shl
170 ; VI: v_subrev_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
170 ; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
171171 ; VI-NOT: and
172172 ; VI-NOT: shl
173173 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
200200 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
201201 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
202202
203 ; VI-DAG: v_subrev_u16_e32
204 ; VI-DAG: v_subrev_u16_e32
203 ; VI: v_sub_u16_e32
204 ; VI: v_sub_u16_e32
205205
206206 ; VI: buffer_store_dwordx4
207207 define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
227227 ; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
228228 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
229229
230 ; VI: v_subrev_u16_e32
231 ; VI: v_subrev_u16_e32
230 ; VI: v_sub_u16_e32
231 ; VI: v_sub_u16_e32
232232 ; VI: buffer_store_dwordx2
233233 define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
234234 %tid = call i32 @llvm.amdgcn.workitem.id.x()
252252 ; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
253253
254254 ; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
255 ; VI: v_subrev_u16_e32
255 ; VI: v_sub_u16_e32
256256
257257 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
258258 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
119119 }
120120
121121 ; FUNC-LABEL: {{^}}v_usubo_i16:
122 ; VI: v_subrev_u16_e32
122 ; VI: v_sub_u16_e32
123123 ; VI: v_cmp_gt_u16_e32
124124 define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
125125 %tid = call i32 @llvm.amdgcn.workitem.id.x()
55 ; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
66 ; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4
77 ; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
8 ; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]]
8 ; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]]
99 ; GCN: buffer_store_dword [[C]]
1010 define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
1111 entry:
134134
135135 ; GCN-LABEL: {{^}}safe_mad_sub0_src0:
136136 ; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
137 ; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]]
137 ; GCN: v_mac_f32_e32 v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}}
138138 define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
139139 entry:
140140 %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
77 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
88 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
99 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
10 ; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]]
10 ; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
1111 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
1212 ; SI: buffer_store_short v[[R_F16]]
13 ; VI: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]]
13 ; VI: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
1414 ; VI: buffer_store_short v[[C_F16]]
1515 ; GCN: s_endpgm
1616 define amdgpu_kernel void @mac_f16(
146146
147147 ; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math:
148148 ; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
149 ; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
150 ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
151 ; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
152 ; GCN: s_endpgm
153 define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math(
154 half addrspace(1)* %r,
155 half addrspace(1)* %a,
156 half addrspace(1)* %b,
157 half addrspace(1)* %c) #0 {
158 entry:
159 %a.val = load half, half addrspace(1)* %a
160 %b.val = load half, half addrspace(1)* %b
161 %c.val = load half, half addrspace(1)* %c
162
163 %a.neg = fsub half 0.0, %a.val
164 %t.val = fmul half %a.neg, %b.val
165 %r.val = fadd half %t.val, %c.val
166
167 store half %r.val, half addrspace(1)* %r
168 ret void
169 }
170
171 ; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math:
172 ; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
149173 ; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
150174 ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
151175 ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
152 ; GCN: s_endpgm
153 define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math(
154 half addrspace(1)* %r,
155 half addrspace(1)* %a,
156 half addrspace(1)* %b,
157 half addrspace(1)* %c) #0 {
158 entry:
159 %a.val = load half, half addrspace(1)* %a
160 %b.val = load half, half addrspace(1)* %b
161 %c.val = load half, half addrspace(1)* %c
162
163 %a.neg = fsub half 0.0, %a.val
164 %t.val = fmul half %a.neg, %b.val
165 %r.val = fadd half %t.val, %c.val
166
167 store half %r.val, half addrspace(1)* %r
168 ret void
169 }
170
171 ; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math:
172 ; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
173 ; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
174 ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
175 ; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
176176 ; GCN: s_endpgm
177177 define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math(
178178 half addrspace(1)* %r,
311311 ; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
312312 ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
313313 ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
314 ; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
314 ; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
315315 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
316 ; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
316 ; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
317317 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
318318 ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
319319 ; VI-NOT: and
320 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
320 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
321321
322322 ; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
323323 ; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
324 ; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
324 ; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
325325 ; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
326326 ; VI-NOT: and
327 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[C_V2_F16]]
327 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]]
328328
329329 ; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
330330 ; GCN: s_endpgm
480480
481481 ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
482482 ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
483 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
484 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
483 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
484 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
485485
486486 ; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
487487 ; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
488488 ; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
489489 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
490 ; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
490 ; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
491491
492492 ; GCN: s_endpgm
493493 define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math(
512512
513513 ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
514514 ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
515 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
516 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
515 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
516 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
517517
518518 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
519519 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
520520 ; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
521521 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
522 ; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
522 ; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
523523
524524 ; GCN: s_endpgm
525525 define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math(
0 # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s
1 ...
2 # GCN-LABEL: name: fold_imm_non_ssa{{$}}
3 # GCN: %0 = V_MOV_B32_e32 123, implicit %exec
4 # GCN: %2 = V_ADD_I32_e32 456, %0, implicit-def %vcc, implicit %exec
5
6 name: fold_imm_non_ssa
7 tracksRegLiveness: true
8 registers:
9 - { id: 0, class: vgpr_32 }
10 - { id: 1, class: vgpr_32 }
11 - { id: 2, class: vgpr_32 }
12 - { id: 3, class: sreg_64 }
13 body: |
14 bb.0:
15 %0 = COPY undef %0
16 %0 = V_MOV_B32_e32 123, implicit %exec
17 %1 = V_MOV_B32_e32 456, implicit %exec
18 %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
19 S_ENDPGM
20
21 ...
22 # GCN-LABEL: name: fold_partially_defined_superreg{{$}}
23 # GCN: %1 = V_MOV_B32_e32 456, implicit %exec
24 # GCN: %2 = V_ADD_I32_e32 123, %1, implicit-def %vcc, implicit %exec
25 name: fold_partially_defined_superreg
26 tracksRegLiveness: true
27 registers:
28 - { id: 0, class: vgpr_32 }
29 - { id: 1, class: vgpr_32 }
30 - { id: 2, class: vgpr_32 }
31 - { id: 3, class: vreg_64 }
32 body: |
33 bb.0:
34 undef %3.sub0 = V_MOV_B32_e32 123, implicit %exec, implicit-def %3
35 %1 = V_MOV_B32_e32 456, implicit %exec
36 %2, %vcc = V_ADD_I32_e64 %3.sub0, %1, implicit %exec
37 S_ENDPGM
38
39 ...
5959 ; FUNC-LABEL: {{^}}v_xor_i1:
6060 ; SI: buffer_load_ubyte [[B:v[0-9]+]]
6161 ; SI: buffer_load_ubyte [[A:v[0-9]+]]
62 ; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]]
62 ; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[B]], [[A]]
6363 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
6464 ; SI: buffer_store_byte [[RESULT]]
6565 define amdgpu_kernel void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
55 ; GCN-NOT: _or_
66 ; GCN-NOT: v[[HI]]
77 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
8 ; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
8 ; GCN: v_or_b32_e32 v[[LO]], v[[LO]], v[[LD32]]
99 ; GCN-NOT: _or_
1010 ; GCN-NOT: v[[HI]]
1111 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
2525 ; GCN-NOT: _or_
2626 ; GCN-NOT: v[[HI]]
2727 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
28 ; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
28 ; GCN: v_or_b32_e32 v[[LO]], v[[LO]], v[[LD32]]
2929 ; GCN-NOT: v[[HI]]
3030 ; GCN-NOT: _or_
3131 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
170170
171171 # CHECK-LABEL: name: add_f32_1.0_multi_f16_use
172172 # CHECK: %13 = V_MOV_B32_e32 1065353216, implicit %exec
173 # CHECK: %14 = V_ADD_F16_e32 %13, killed %11, implicit %exec
174 # CHECK: %15 = V_ADD_F16_e32 killed %13, killed %12, implicit %exec
173 # CHECK: %14 = V_ADD_F16_e32 killed %11, %13, implicit %exec
174 # CHECK: %15 = V_ADD_F16_e32 killed %12, killed %13, implicit %exec
175175
176176
177177 name: add_f32_1.0_multi_f16_use
306306
307307 # CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use
308308 # CHECK: %14 = V_MOV_B32_e32 1065353216, implicit %exec
309 # CHECK: %15 = V_ADD_F16_e32 %14, %11, implicit %exec
310 # CHECK: %16 = V_ADD_F16_e32 %14, %12, implicit %exec
309 # CHECK: %15 = V_ADD_F16_e32 %11, %14, implicit %exec
310 # CHECK: %16 = V_ADD_F16_e32 %12, %14, implicit %exec
311311 # CHECK: %17 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec
312312
313313 name: add_f32_1.0_one_f32_use_multi_f16_use
513513
514514 # CHECK-LABEL: name: add_f16_1.0_multi_f32_use
515515 # CHECK: %13 = V_MOV_B32_e32 15360, implicit %exec
516 # CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec
517 # CHECK: %15 = V_ADD_F32_e32 %13, %12, implicit %exec
516 # CHECK: %14 = V_ADD_F32_e32 %11, %13, implicit %exec
517 # CHECK: %15 = V_ADD_F32_e32 %12, %13, implicit %exec
518518
519519 name: add_f16_1.0_multi_f32_use
520520 alignment: 0
580580
581581 # CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use
582582 # CHECK: %13 = V_MOV_B32_e32 80886784, implicit %exec
583 # CHECK: %14 = V_ADD_F16_e32 %13, %11, implicit %exec
584 # CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec
583 # CHECK: %14 = V_ADD_F16_e32 %11, %13, implicit %exec
584 # CHECK: %15 = V_ADD_F16_e32 %12, %13, implicit %exec
585585
586586 name: add_f16_1.0_other_high_bits_multi_f16_use
587587 alignment: 0
647647
648648 # CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32
649649 # CHECK: %13 = V_MOV_B32_e32 305413120, implicit %exec
650 # CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec
651 # CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec
650 # CHECK: %14 = V_ADD_F32_e32 %11, %13, implicit %exec
651 # CHECK: %15 = V_ADD_F16_e32 %12, %13, implicit %exec
652652 name: add_f16_1.0_other_high_bits_use_f16_f32
653653 alignment: 0
654654 exposesReturnsTwice: false