llvm.org GIT mirror llvm / 4ed9917
R600: Relax some vector constraints on Dot4. Dot4 now uses 8 scalar operands instead of 2 vector ones, which allows the register coalescer to remove some unneeded COPYs. This patch also defines some structures/functions that can be used to handle every vector instruction (CUBE, Cayman special instructions...) in a similar fashion. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182126 91177308-0d34-0410-b5e6-96231b3b80d8 Vincent Lejeune 7 years ago
11 changed file(s) with 283 addition(s) and 29 deletion(s). Raw diff Collapse all Expand all
125125 SMIN,
126126 UMIN,
127127 URECIP,
128 DOT4,
128129 TEXTURE_FETCH,
129130 EXPORT,
130131 CONST_ADDRESS,
9797 {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17,18}
9898 };
9999
// Flattened operand-index names for "vector" ALU pseudo instructions
// (currently DOT_4): each of the four slots (X, Y, Z, W) carries the
// full set of per-slot ALU operands laid out contiguously, followed by
// the two shared literal operands.  This layout must stay in sync with
// the (ins ...) operand list of the R600_VEC2OP TableGen class; the
// getSlotedOps()/getVecOperandIdx() helpers in R600InstrInfo.cpp map a
// scalar R600Operands::Ops value plus a slot number onto these values.
100 enum VecOps {
  // Slot X
101 UPDATE_EXEC_MASK_X,
102 UPDATE_PREDICATE_X,
103 WRITE_X,
104 OMOD_X,
105 DST_REL_X,
106 CLAMP_X,
107 SRC0_X,
108 SRC0_NEG_X,
109 SRC0_REL_X,
110 SRC0_ABS_X,
111 SRC0_SEL_X,
112 SRC1_X,
113 SRC1_NEG_X,
114 SRC1_REL_X,
115 SRC1_ABS_X,
116 SRC1_SEL_X,
117 PRED_SEL_X,
  // Slot Y
118 UPDATE_EXEC_MASK_Y,
119 UPDATE_PREDICATE_Y,
120 WRITE_Y,
121 OMOD_Y,
122 DST_REL_Y,
123 CLAMP_Y,
124 SRC0_Y,
125 SRC0_NEG_Y,
126 SRC0_REL_Y,
127 SRC0_ABS_Y,
128 SRC0_SEL_Y,
129 SRC1_Y,
130 SRC1_NEG_Y,
131 SRC1_REL_Y,
132 SRC1_ABS_Y,
133 SRC1_SEL_Y,
134 PRED_SEL_Y,
  // Slot Z
135 UPDATE_EXEC_MASK_Z,
136 UPDATE_PREDICATE_Z,
137 WRITE_Z,
138 OMOD_Z,
139 DST_REL_Z,
140 CLAMP_Z,
141 SRC0_Z,
142 SRC0_NEG_Z,
143 SRC0_REL_Z,
144 SRC0_ABS_Z,
145 SRC0_SEL_Z,
146 SRC1_Z,
147 SRC1_NEG_Z,
148 SRC1_REL_Z,
149 SRC1_ABS_Z,
150 SRC1_SEL_Z,
151 PRED_SEL_Z,
  // Slot W
152 UPDATE_EXEC_MASK_W,
153 UPDATE_PREDICATE_W,
154 WRITE_W,
155 OMOD_W,
156 DST_REL_W,
157 CLAMP_W,
158 SRC0_W,
159 SRC0_NEG_W,
160 SRC0_REL_W,
161 SRC0_ABS_W,
162 SRC0_SEL_W,
163 SRC1_W,
164 SRC1_NEG_W,
165 SRC1_REL_W,
166 SRC1_ABS_W,
167 SRC1_SEL_W,
168 PRED_SEL_W,
  // Two literal constants shared by all four slots.
169 IMM_0,
170 IMM_1,
  // Number of operands following the destination register.
171 VEC_COUNT
172 };
173
100174 }
101175
102176 //===----------------------------------------------------------------------===//
3535 case AMDGPU::INTERP_PAIR_XY:
3636 case AMDGPU::INTERP_PAIR_ZW:
3737 case AMDGPU::INTERP_VEC_LOAD:
38 case AMDGPU::DOT4_eg_pseudo:
39 case AMDGPU::DOT4_r600_pseudo:
38 case AMDGPU::DOT_4:
4039 return 4;
4140 case AMDGPU::KILL:
4241 return 0;
7069 case AMDGPU::INTERP_PAIR_ZW:
7170 case AMDGPU::INTERP_VEC_LOAD:
7271 case AMDGPU::COPY:
73 case AMDGPU::DOT4_eg_pseudo:
74 case AMDGPU::DOT4_r600_pseudo:
72 case AMDGPU::DOT_4:
7573 return true;
7674 default:
7775 return false;
181181 MI.eraseFromParent();
182182 continue;
183183 }
184 case AMDGPU::DOT_4: {
185
186 const R600RegisterInfo &TRI = TII->getRegisterInfo();
187
188 unsigned DstReg = MI.getOperand(0).getReg();
189 unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
190
191 for (unsigned Chan = 0; Chan < 4; ++Chan) {
192 bool Mask = (Chan != TRI.getHWRegChan(DstReg));
193 unsigned SubDstReg =
194 AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
195 MachineInstr *BMI =
196 TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
197 if (Chan > 0) {
198 BMI->bundleWithPred();
199 }
200 if (Mask) {
201 TII->addFlag(BMI, 0, MO_FLAG_MASK);
202 }
203 if (Chan != 3)
204 TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
205 unsigned Opcode = BMI->getOpcode();
206 // While not strictly necessary from hw point of view, we force
207 // all src operands of a dot4 inst to belong to the same slot.
208 unsigned Src0 = BMI->getOperand(
209 TII->getOperandIdx(Opcode, R600Operands::SRC0))
210 .getReg();
211 unsigned Src1 = BMI->getOperand(
212 TII->getOperandIdx(Opcode, R600Operands::SRC1))
213 .getReg();
214 assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
215 }
216 MI.eraseFromParent();
217 continue;
218 }
184219 }
185220
186221 bool IsReduction = TII->isReductionOp(MI.getOpcode());
267302 case AMDGPU::CUBE_eg_pseudo:
268303 Opcode = AMDGPU::CUBE_eg_real;
269304 break;
270 case AMDGPU::DOT4_r600_pseudo:
271 Opcode = AMDGPU::DOT4_r600_real;
272 break;
273 case AMDGPU::DOT4_eg_pseudo:
274 Opcode = AMDGPU::DOT4_eg_real;
275 break;
276305 default:
277306 break;
278307 }
630630 };
631631 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
632632 }
633 case AMDGPUIntrinsic::AMDGPU_dp4: {
634 SDValue Args[8] = {
635 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
636 DAG.getConstant(0, MVT::i32)),
637 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
638 DAG.getConstant(0, MVT::i32)),
639 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
640 DAG.getConstant(1, MVT::i32)),
641 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
642 DAG.getConstant(1, MVT::i32)),
643 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
644 DAG.getConstant(2, MVT::i32)),
645 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
646 DAG.getConstant(2, MVT::i32)),
647 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
648 DAG.getConstant(3, MVT::i32)),
649 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
650 DAG.getConstant(3, MVT::i32))
651 };
652 return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
653 }
633654
634655 case r600_read_ngroups_x:
635656 return LowerImplicitParameter(DAG, VT, DL, 0);
// Returns true for opcodes that are expanded as a reduction over the
// four vector channels.  The DOT4 pseudos that used to live here are
// now modelled by the DOT_4 vector pseudo and handled separately (the
// old-numbered lines below show the removed cases), so after this
// change the switch always takes the default path and returns false.
115115 bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
116116 switch(Opcode) {
117117 default: return false;
118 case AMDGPU::DOT4_r600_pseudo:
119 case AMDGPU::DOT4_eg_pseudo:
120 return true;
121118 }
122119 }
123120
862859 .addImm(0) // $literal
863860 .addImm(0); // $bank_swizzle
864861
862 return MIB;
863 }
864
// Expands into one switch case that maps the scalar operand kind
// `Label` onto its per-slot (X/Y/Z/W) counterpart via a small local
// lookup table indexed by `Slot`.
865 #define OPERAND_CASE(Label) \
866 case Label: { \
867 static const R600Operands::VecOps Ops[] = \
868 { \
869 Label##_X, \
870 Label##_Y, \
871 Label##_Z, \
872 Label##_W \
873 }; \
874 return Ops[Slot]; \
875 }
876
// Returns the R600Operands::VecOps value corresponding to the scalar
// operand kind `Op` in vector slot `Slot` (0=X, 1=Y, 2=Z, 3=W).
// Aborts on operand kinds that have no per-slot counterpart.
877 static R600Operands::VecOps
878 getSlotedOps(R600Operands::Ops Op, unsigned Slot) {
879 switch (Op) {
880 OPERAND_CASE(R600Operands::UPDATE_EXEC_MASK)
881 OPERAND_CASE(R600Operands::UPDATE_PREDICATE)
882 OPERAND_CASE(R600Operands::WRITE)
883 OPERAND_CASE(R600Operands::OMOD)
884 OPERAND_CASE(R600Operands::DST_REL)
885 OPERAND_CASE(R600Operands::CLAMP)
886 OPERAND_CASE(R600Operands::SRC0)
887 OPERAND_CASE(R600Operands::SRC0_NEG)
888 OPERAND_CASE(R600Operands::SRC0_REL)
889 OPERAND_CASE(R600Operands::SRC0_ABS)
890 OPERAND_CASE(R600Operands::SRC0_SEL)
891 OPERAND_CASE(R600Operands::SRC1)
892 OPERAND_CASE(R600Operands::SRC1_NEG)
893 OPERAND_CASE(R600Operands::SRC1_REL)
894 OPERAND_CASE(R600Operands::SRC1_ABS)
895 OPERAND_CASE(R600Operands::SRC1_SEL)
896 OPERAND_CASE(R600Operands::PRED_SEL)
897 default:
898 llvm_unreachable("Wrong Operand");
899 }
900 }
901
902 #undef OPERAND_CASE
903
// Converts a VecOps enumerator into a machine-operand index on the
// vector pseudo instruction: operand 0 is the destination register, so
// the flattened per-slot operands start at index 1.
904 static int
905 getVecOperandIdx(R600Operands::VecOps Op) {
906 return 1 + Op;
907 }
908
909
// Builds the scalar machine instruction for one slot (0=X .. 3=W) of a
// vector pseudo instruction, writing DstReg.  Only DOT_4 is handled so
// far: the slot's two source registers and fourteen flag immediates are
// copied from the vector instruction onto a freshly built DOT4_r600 or
// DOT4_eg, chosen by hardware generation.
910 MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
911 MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg)
912 const {
913 assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
914 unsigned Opcode;
// NOTE(review): the template argument of getSubtarget appears to have
// been lost in extraction (likely getSubtarget<AMDGPUSubtarget>()) —
// confirm against the original R600InstrInfo.cpp.
915 const AMDGPUSubtarget &ST = TM.getSubtarget();
// Pre-Evergreen (R600-family) parts use the r600 encoding; newer parts
// use the eg encoding.
916 if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX)
917 Opcode = AMDGPU::DOT4_r600;
918 else
919 Opcode = AMDGPU::DOT4_eg;
920 MachineBasicBlock::iterator I = MI;
// Fetch this slot's two source-register operands from the flattened
// vector operand list.
921 MachineOperand &Src0 = MI->getOperand(
922 getVecOperandIdx(getSlotedOps(R600Operands::SRC0, Slot)));
923 MachineOperand &Src1 = MI->getOperand(
924 getVecOperandIdx(getSlotedOps(R600Operands::SRC1, Slot)));
925 MachineInstr *MIB = buildDefaultInstruction(
926 MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg());
// The immediate flag operands to forward from the vector instruction
// to the scalar slot instruction (everything except the registers and
// the predicate already set up by buildDefaultInstruction).
927 static const R600Operands::Ops Operands[14] = {
928 R600Operands::UPDATE_EXEC_MASK,
929 R600Operands::UPDATE_PREDICATE,
930 R600Operands::WRITE,
931 R600Operands::OMOD,
932 R600Operands::DST_REL,
933 R600Operands::CLAMP,
934 R600Operands::SRC0_NEG,
935 R600Operands::SRC0_REL,
936 R600Operands::SRC0_ABS,
937 R600Operands::SRC0_SEL,
938 R600Operands::SRC1_NEG,
939 R600Operands::SRC1_REL,
940 R600Operands::SRC1_ABS,
941 R600Operands::SRC1_SEL,
942 };
943
944 for (unsigned i = 0; i < 14; i++) {
945 MachineOperand &MO = MI->getOperand(
946 getVecOperandIdx(getSlotedOps(Operands[i], Slot)));
947 assert (MO.isImm());
948 setImmOperand(MIB, Operands[i], MO.getImm());
949 }
// NOTE(review): hard-coded operand index 20 is zeroed here — presumably
// one of the trailing default operands added by buildDefaultInstruction
// ($literal/$bank_swizzle region); confirm against that function's
// operand layout before relying on it.
950 MIB->getOperand(20).setImm(0);
865951 return MIB;
866952 }
867953
197197 unsigned Src0Reg,
198198 unsigned Src1Reg = 0) const;
199199
/// \brief Build the scalar instruction for one slot (0=X .. 3=W) of the
/// vector pseudo instruction \p MI, writing \p DstReg.  The slot's
/// source registers and flag immediates are copied onto the new
/// instruction; see the definition in R600InstrInfo.cpp.
200 MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB,
201 MachineInstr *MI,
202 unsigned Slot,
203 unsigned DstReg) const;
204
200205 MachineInstr *buildMovImm(MachineBasicBlock &BB,
201206 MachineBasicBlock::iterator I,
202207 unsigned DstReg,
592592 [SDNPVariadic]
593593 >;
594594
595 def DOT4 : SDNode<"AMDGPUISD::DOT4",
596 SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>,
597 SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>,
598 SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>,
599 []
600 >;
601
595602 def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>;
596603
597604 def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>;
12281235 [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GE))]
12291236 >;
12301237
1231 multiclass DOT4_Common inst> {
1232
1233 def _pseudo : R600_REDUCTION
1234 (ins R600_Reg128:$src0, R600_Reg128:$src1),
1235 "DOT4 $dst $src0, $src1",
1236 [(set f32:$dst, (int_AMDGPU_dp4 v4f32:$src0, v4f32:$src1))]
1237 >;
1238
1239 def _real : R600_2OP ;
1240 }
1238
// Pseudo "vector" two-source ALU instruction: one 32-bit destination
// plus, for each of the four slots (X/Y/Z/W), the complete set of
// per-slot ALU operand flags, followed by two shared literal operands.
// This operand ordering must stay in sync with R600Operands::VecOps.
// NOTE(review): the template parameter list reads "R600_VEC2OP pattern>";
// a "<list<dag>" fragment looks to have been lost in extraction —
// confirm against the original R600Instructions.td.
1239 let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
1240 class R600_VEC2OP pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
1241 // Slot X
1242 UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X,
1243 OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X,
1244 R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X,
1245 R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X,
1246 R600_Pred:$pred_sel_X,
1247 // Slot Y
1248 UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y,
1249 OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y,
1250 R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y,
1251 R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y,
1252 R600_Pred:$pred_sel_Y,
1253 // Slot Z
1254 UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z,
1255 OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z,
1256 R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z,
1257 R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z,
1258 R600_Pred:$pred_sel_Z,
1259 // Slot W
1260 UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W,
1261 OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W,
1262 R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W,
1263 R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W,
1264 R600_Pred:$pred_sel_W,
1265 LITERAL:$literal0, LITERAL:$literal1),
1266 "",
1267 pattern,
1268 AnyALU> {}
1269 }
1270
// DOT_4 vector pseudo: selects the DOT4 DAG node by feeding the eight
// per-slot scalar sources (src0/src1 for each of X, Y, Z, W) into one
// R600_VEC2OP instance; later expanded into four scalar slot
// instructions by buildSlotOfVectorInstruction.
1271 def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4
1272 R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X,
1273 R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y,
1274 R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z,
1275 R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>;
1276
1277
1278 class DOT4_Common inst> : R600_2OP ;
1279
12411280
12421281 let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
12431282 multiclass CUBE_Common inst> {
14111450 def CNDE_r600 : CNDE_Common<0x18>;
14121451 def CNDGT_r600 : CNDGT_Common<0x19>;
14131452 def CNDGE_r600 : CNDGE_Common<0x1A>;
1414 defm DOT4_r600 : DOT4_Common<0x50>;
1453 def DOT4_r600 : DOT4_Common<0x50>;
14151454 defm CUBE_r600 : CUBE_Common<0x52>;
14161455 def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
14171456 def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
16101649 def CNDGE_eg : CNDGE_Common<0x1B>;
16111650 def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
16121651 def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
1613 defm DOT4_eg : DOT4_Common<0xBE>;
1652 def DOT4_eg : DOT4_Common<0xBE>;
16141653 defm CUBE_eg : CUBE_Common<0xC0>;
16151654
16161655 let hasSideEffects = 1 in {
184184 case AMDGPU::INTERP_PAIR_XY:
185185 case AMDGPU::INTERP_PAIR_ZW:
186186 case AMDGPU::INTERP_VEC_LOAD:
187 case AMDGPU::DOT_4:
187188 return AluT_XYZW;
188189 case AMDGPU::COPY:
189190 if (TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) {
251252 case AMDGPU::INTERP_PAIR_XY:
252253 case AMDGPU::INTERP_PAIR_ZW:
253254 case AMDGPU::INTERP_VEC_LOAD:
254 case AMDGPU::DOT4_eg_pseudo:
255 case AMDGPU::DOT4_r600_pseudo:
255 case AMDGPU::DOT_4:
256256 return IDAlu;
257257 case AMDGPU::TEX_VTX_CONSTBUF:
258258 case AMDGPU::TEX_VTX_TEXBUF:
8585 if (BI->getOperand(OperandIdx).getImm() == 0)
8686 continue;
8787 unsigned Dst = BI->getOperand(0).getReg();
88 if (BI->getOpcode() == AMDGPU::DOT4_r600_real) {
88 if (BI->getOpcode() == AMDGPU::DOT4_r600 ||
89 BI->getOpcode() == AMDGPU::DOT4_eg) {
8990 Result[Dst] = AMDGPU::PV_X;
9091 continue;
9192 }
0 ; RUN: llc < %s -march=r600 | FileCheck %s
11
22 ;CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
3 ;CHECK-NEXT: CNDGE T{{[0-9].[XYZW]}}, PV.x
3 ;CHECK: CNDGE * T{{[0-9].[XYZW]}}, PV.x
44
55 define void @main() #0 {
66 main_body: