llvm.org GIT mirror llvm / 8d63149
AMDGPU: Fix handling of 16-bit immediates

Since 32-bit instructions with 32-bit input immediate behavior are used to materialize 16-bit constants in 32-bit registers for 16-bit instructions, determining the legality based on the size is incorrect. Change operands to have the size specified in the type.

Also adds a workaround for a disassembler bug that produces an immediate MCOperand for an operand that is supposed to be OPERAND_REGISTER.

The assembler appears to accept out of bounds immediates and truncates them, but this seems to be an issue for 32-bit already.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@289306 91177308-0d34-0410-b5e6-96231b3b80d8

Matt Arsenault, 3 years ago
35 changed file(s) with 2042 addition(s) and 272 deletion(s).
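The commit message turns on one point: whether an immediate can be encoded inline depends on the operand's declared type, not on the width of the register it happens to occupy, because 16-bit constants are materialized in 32-bit registers. The sketch below is illustrative only (plain C++, not the LLVM sources); the two predicates mirror AMDGPU::isInlinableLiteral16/isInlinableLiteral32 from this patch, with the magic values copied from the hunks further down, and show how the same bit pattern classifies differently for a 16-bit and a 32-bit operand — exactly what a register-size-based check gets wrong.

```cpp
// Illustrative sketch only: plain-C++ stand-ins for the inline-immediate
// checks this patch adds/uses (AMDGPU::isInlinableLiteral16/32).
#include <cstdint>
#include <cstdio>

// Small integers -16..64 are inline constants at every operand width.
static bool isInlinableInt(int64_t V) { return V >= -16 && V <= 64; }

// 16-bit operands: the inline FP constants are IEEE half bit patterns.
static bool isInlinableLiteral16(int16_t Literal) {
  if (isInlinableInt(Literal))
    return true;
  uint16_t Val = static_cast<uint16_t>(Literal);
  return Val == 0x3C00 || Val == 0xBC00 ||  // +/-1.0
         Val == 0x3800 || Val == 0xB800 ||  // +/-0.5
         Val == 0x4000 || Val == 0xC000 ||  // +/-2.0
         Val == 0x4400 || Val == 0xC400 ||  // +/-4.0
         Val == 0x3118;                     // 1/(2*pi), gated on Inv2Pi support
}

// 32-bit operands: the inline FP constants are IEEE single bit patterns.
static bool isInlinableLiteral32(int32_t Literal) {
  if (isInlinableInt(Literal))
    return true;
  uint32_t Val = static_cast<uint32_t>(Literal);
  return Val == 0x3F800000 || Val == 0xBF800000 ||  // +/-1.0f
         Val == 0x3F000000 || Val == 0xBF000000 ||  // +/-0.5f
         Val == 0x40000000 || Val == 0xC0000000 ||  // +/-2.0f
         Val == 0x40800000 || Val == 0xC0800000 ||  // +/-4.0f
         Val == 0x3E22F983;                         // 1/(2*pi), gated on Inv2Pi support
}

int main() {
  // 0x3C00 is 1.0 in half precision: inline for a 16-bit operand, but a check
  // keyed on the 32-bit register size would reject it and force a literal.
  std::printf("f16 operand: %d\n", isInlinableLiteral16(0x3C00)); // prints 1
  std::printf("f32 operand: %d\n", isInlinableLiteral32(0x3C00)); // prints 0
  return 0;
}
```

This is why the legality, folding, assembler, and MC-emitter changes below all key off the new OPERAND_REG_*_INT16/FP16 operand types rather than the register width.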
214214 return isRegKind() || isInlinableImm(type);
215215 }
216216
217 bool isRegOrImmWithInt16InputMods() const {
218 return isRegOrImmWithInputMods(MVT::i16);
219 }
220
217221 bool isRegOrImmWithInt32InputMods() const {
218222 return isRegOrImmWithInputMods(MVT::i32);
219223 }
220224
221225 bool isRegOrImmWithInt64InputMods() const {
222226 return isRegOrImmWithInputMods(MVT::i64);
227 }
228
229 bool isRegOrImmWithFP16InputMods() const {
230 return isRegOrImmWithInputMods(MVT::f16);
223231 }
224232
225233 bool isRegOrImmWithFP32InputMods() const {
281289
282290 bool isRegClass(unsigned RCID) const;
283291
292 bool isSCSrcB16() const {
293 return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i16);
294 }
295
284296 bool isSCSrcB32() const {
285297 return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i32);
286298 }
289301 return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::i64);
290302 }
291303
304 bool isSCSrcF16() const {
305 return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f16);
306 }
307
292308 bool isSCSrcF32() const {
293309 return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f32);
294310 }
299315
300316 bool isSSrcB32() const {
301317 return isSCSrcB32() || isLiteralImm(MVT::i32) || isExpr();
318 }
319
320 bool isSSrcB16() const {
321 return isSCSrcB16() || isLiteralImm(MVT::i16);
302322 }
303323
304324 bool isSSrcB64() const {
315335 return isSCSrcB64() || isLiteralImm(MVT::f64);
316336 }
317337
338 bool isSSrcF16() const {
339 return isSCSrcB16() || isLiteralImm(MVT::f16);
340 }
341
318342 bool isVCSrcB32() const {
319343 return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i32);
320344 }
323347 return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::i64);
324348 }
325349
350 bool isVCSrcB16() const {
351 return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i16);
352 }
353
326354 bool isVCSrcF32() const {
327355 return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f32);
328356 }
331359 return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::f64);
332360 }
333361
362 bool isVCSrcF16() const {
363 return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f16);
364 }
365
334366 bool isVSrcB32() const {
335367 return isVCSrcF32() || isLiteralImm(MVT::i32);
336368 }
339371 return isVCSrcF64() || isLiteralImm(MVT::i64);
340372 }
341373
374 bool isVSrcB16() const {
375 return isVCSrcF16() || isLiteralImm(MVT::i16);
376 }
377
342378 bool isVSrcF32() const {
343379 return isVCSrcF32() || isLiteralImm(MVT::f32);
344380 }
347383 return isVCSrcF64() || isLiteralImm(MVT::f64);
348384 }
349385
386 bool isVSrcF16() const {
387 return isVCSrcF16() || isLiteralImm(MVT::f16);
388 }
389
350390 bool isKImmFP32() const {
351391 return isLiteralImm(MVT::f32);
392 }
393
394 bool isKImmFP16() const {
395 return isLiteralImm(MVT::f16);
352396 }
353397
354398 bool isMem() const override {
438482
439483 void addLiteralImmOperand(MCInst &Inst, int64_t Val) const;
440484
441 void addKImmFP32Operands(MCInst &Inst, unsigned N) const;
485 template <unsigned Bitwidth>
486 void addKImmFPOperands(MCInst &Inst, unsigned N) const;
487
488 void addKImmFP16Operands(MCInst &Inst, unsigned N) const {
489 addKImmFPOperands<16>(Inst, N);
490 }
491
492 void addKImmFP32Operands(MCInst &Inst, unsigned N) const {
493 addKImmFPOperands<32>(Inst, N);
494 }
442495
443496 void addRegOperands(MCInst &Inst, unsigned N) const;
444497
825878 } // end anonymous namespace
826879
827880 // May be called with integer type with equivalent bitwidth.
828 static const fltSemantics *getFltSemantics(MVT VT) {
829 switch (VT.getSizeInBits()) {
830 case 32:
881 static const fltSemantics *getFltSemantics(unsigned Size) {
882 switch (Size) {
883 case 4:
831884 return &APFloat::IEEEsingle;
832 case 64:
885 case 8:
833886 return &APFloat::IEEEdouble;
834 case 16:
887 case 2:
835888 return &APFloat::IEEEhalf;
836889 default:
837890 llvm_unreachable("unsupported fp type");
838891 }
892 }
893
894 static const fltSemantics *getFltSemantics(MVT VT) {
895 return getFltSemantics(VT.getSizeInBits() / 8);
839896 }
840897
841898 //===----------------------------------------------------------------------===//
894951 AsmParser->hasInv2PiInlineImm());
895952 }
896953
954 if (type.getScalarSizeInBits() == 16) {
955 return AMDGPU::isInlinableLiteral16(
956 static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()),
957 AsmParser->hasInv2PiInlineImm());
958 }
959
897960 return AMDGPU::isInlinableLiteral32(
898961 static_cast<int32_t>(Literal.getLoBits(32).getZExtValue()),
899962 AsmParser->hasInv2PiInlineImm());
908971 if (!Imm.IsFPImm) {
909972 // We got int literal token.
910973
974 unsigned Size = type.getSizeInBits();
975 if (Size == 64)
976 Size = 32;
977
911978 // FIXME: 64-bit operands can zero extend, sign extend, or pad zeroes for FP
912979 // types.
913 return isUInt<32>(Imm.Val) || isInt<32>(Imm.Val);
980 return isUIntN(Size, Imm.Val) || isIntN(Size, Imm.Val);
914981 }
915982
916983 // We got fp literal token
9461013 }
9471014 }
9481015
949 if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()), Inst.getNumOperands())) {
1016 if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()),
1017 Inst.getNumOperands())) {
9501018 addLiteralImmOperand(Inst, Val);
9511019 } else {
9521020 Inst.addOperand(MCOperand::createImm(Val));
9591027 // Check that this operand accepts literals
9601028 assert(AMDGPU::isSISrcOperand(InstDesc, OpNum));
9611029
962 APInt Literal(64, Val);
963 auto OpSize = AMDGPU::getRegOperandSize(AsmParser->getMRI(), InstDesc, OpNum); // expected operand size
1030 auto OpSize = AMDGPU::getOperandSize(InstDesc, OpNum); // expected operand size
9641031
9651032 if (Imm.IsFPImm) { // We got fp literal token
966 if (OpSize == 8) { // Expected 64-bit operand
967 // Check if literal is inlinable
1033 APInt Literal(64, Val);
1034
1035 switch (OpSize) {
1036 case 8: {
9681037 if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
9691038 AsmParser->hasInv2PiInlineImm())) {
9701039 Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
971 } else if (AMDGPU::isSISrcFPOperand(InstDesc, OpNum)) { // Expected 64-bit fp operand
1040 return;
1041 }
1042
1043 // Non-inlineable
1044 if (AMDGPU::isSISrcFPOperand(InstDesc, OpNum)) { // Expected 64-bit fp operand
9721045 // For fp operands we check if low 32 bits are zeros
9731046 if (Literal.getLoBits(32) != 0) {
9741047 const_cast<AMDGPUAsmParser *>(AsmParser)->Warning(Inst.getLoc(),
975 "Can't encode literal as exact 64-bit"
976 " floating-point operand. Low 32-bits will be"
977 " set to zero");
1048 "Can't encode literal as exact 64-bit floating-point operand. "
1049 "Low 32-bits will be set to zero");
9781050 }
1051
9791052 Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue()));
980 } else {
981 // We don't allow fp literals in 64-bit integer instructions. It is
982 // unclear how we should encode them. This case should be checked earlier
983 // in predicate methods (isLiteralImm())
984 llvm_unreachable("fp literal in 64-bit integer instruction.");
1053 return;
9851054 }
986 } else { // Expected 32-bit operand
1055
1056 // We don't allow fp literals in 64-bit integer instructions. It is
1057 // unclear how we should encode them. This case should be checked earlier
1058 // in predicate methods (isLiteralImm())
1059 llvm_unreachable("fp literal in 64-bit integer instruction.");
1060 }
1061 case 4:
1062 case 2: {
9871063 bool lost;
9881064 APFloat FPLiteral(APFloat::IEEEdouble, Literal);
9891065 // Convert literal to single precision
990 FPLiteral.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &lost);
1066 FPLiteral.convert(*getFltSemantics(OpSize),
1067 APFloat::rmNearestTiesToEven, &lost);
9911068 // We allow precision lost but not overflow or underflow. This should be
9921069 // checked earlier in isLiteralImm()
9931070 Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
994 }
995 } else { // We got int literal token
996 if (OpSize == 8) { // Expected 64-bit operand
997 auto LiteralVal = Literal.getZExtValue();
998 if (AMDGPU::isInlinableLiteral64(LiteralVal,
999 AsmParser->hasInv2PiInlineImm())) {
1000 Inst.addOperand(MCOperand::createImm(LiteralVal));
1001 return;
1002 }
1003 } else { // Expected 32-bit operand
1004 auto LiteralVal = static_cast<int32_t>(Literal.getLoBits(32).getZExtValue());
1005 if (AMDGPU::isInlinableLiteral32(LiteralVal,
1006 AsmParser->hasInv2PiInlineImm())) {
1007 Inst.addOperand(MCOperand::createImm(LiteralVal));
1008 return;
1009 }
1010 }
1011 Inst.addOperand(MCOperand::createImm(Literal.getLoBits(32).getZExtValue()));
1012 }
1013 }
1014
1015 void AMDGPUOperand::addKImmFP32Operands(MCInst &Inst, unsigned N) const {
1071 return;
1072 }
1073 default:
1074 llvm_unreachable("invalid operand size");
1075 }
1076
1077 return;
1078 }
1079
1080 // We got int literal token.
1081 // Only sign extend inline immediates.
1082 // FIXME: No errors on truncation
1083 switch (OpSize) {
1084 case 4: {
1085 if (isInt<32>(Val) &&
1086 AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
1087 AsmParser->hasInv2PiInlineImm())) {
1088 Inst.addOperand(MCOperand::createImm(Val));
1089 return;
1090 }
1091
1092 Inst.addOperand(MCOperand::createImm(Val & 0xffffffff));
1093 return;
1094 }
1095 case 8: {
1096 if (AMDGPU::isInlinableLiteral64(Val,
1097 AsmParser->hasInv2PiInlineImm())) {
1098 Inst.addOperand(MCOperand::createImm(Val));
1099 return;
1100 }
1101
1102 Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
1103 return;
1104 }
1105 case 2: {
1106 if (isInt<16>(Val) &&
1107 AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
1108 AsmParser->hasInv2PiInlineImm())) {
1109 Inst.addOperand(MCOperand::createImm(Val));
1110 return;
1111 }
1112
1113 Inst.addOperand(MCOperand::createImm(Val & 0xffff));
1114 return;
1115 }
1116 default:
1117 llvm_unreachable("invalid operand size");
1118 }
1119 }
1120
1121 template <unsigned Bitwidth>
1122 void AMDGPUOperand::addKImmFPOperands(MCInst &Inst, unsigned N) const {
10161123 APInt Literal(64, Imm.Val);
1017 if (Imm.IsFPImm) { // We got fp literal
1018 bool lost;
1019 APFloat FPLiteral(APFloat::IEEEdouble, Literal);
1020 FPLiteral.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &lost);
1021 Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
1022 } else { // We got int literal token
1023 Inst.addOperand(MCOperand::createImm(Literal.getLoBits(32).getZExtValue()));
1024 }
1124
1125 if (!Imm.IsFPImm) {
1126 // We got int literal token.
1127 Inst.addOperand(MCOperand::createImm(Literal.getLoBits(Bitwidth).getZExtValue()));
1128 return;
1129 }
1130
1131 bool Lost;
1132 APFloat FPLiteral(APFloat::IEEEdouble, Literal);
1133 FPLiteral.convert(*getFltSemantics(Bitwidth / 8),
1134 APFloat::rmNearestTiesToEven, &Lost);
1135 Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
10251136 }
10261137
10271138 void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const {
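The 16-bit (case 2) branch of addLiteralImmOperand above shows the behavior the commit message flags: an in-range value that matches an inline constant is emitted as-is, anything else is masked to its low 16 bits with no diagnostic ("FIXME: No errors on truncation"). A minimal sketch of that decision follows; encode16BitImm is an illustrative name, not an LLVM API, and the predicate is only declared here (see the sketch after the commit message, or the real AMDGPU::isInlinableLiteral16 later in this diff).

```cpp
#include <cstdint>

// Stand-in for AMDGPU::isInlinableLiteral16 (declaration only, for brevity).
bool isInlinableLiteral16(int16_t Literal);

// Mirrors the 16-bit path of addLiteralImmOperand for integer literal tokens:
// inline-able values pass through unchanged; anything out of range is silently
// truncated to its low 16 bits, matching the existing 32-bit behavior.
int64_t encode16BitImm(int64_t Val) {
  if (Val >= INT16_MIN && Val <= INT16_MAX &&
      isInlinableLiteral16(static_cast<int16_t>(Val)))
    return Val;          // becomes an inline-constant operand
  return Val & 0xffff;   // out of range: accepted and truncated
}
```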
8686 DECODE_OPERAND(SReg_128)
8787 DECODE_OPERAND(SReg_256)
8888 DECODE_OPERAND(SReg_512)
89
90
91 static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
92 unsigned Imm,
93 uint64_t Addr,
94 const void *Decoder) {
95 auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
96 return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm));
97 }
8998
9099 #define GET_SUBTARGETINFO_ENUM
91100 #include "AMDGPUGenSubtargetInfo.inc"
249258 return decodeSrcOp(OPW64, Val);
250259 }
251260
261 MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const {
262 return decodeSrcOp(OPW16, Val);
263 }
264
252265 MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
253266 // Some instructions have operand restrictions beyond what the encoding
254267 // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra
323336 // Cast prevents negative overflow.
324337 }
325338
326 MCOperand AMDGPUDisassembler::decodeFPImmed(bool Is32, unsigned Imm) {
339 static int64_t getInlineImmVal32(unsigned Imm) {
340 switch (Imm) {
341 case 240:
342 return FloatToBits(0.5f);
343 case 241:
344 return FloatToBits(-0.5f);
345 case 242:
346 return FloatToBits(1.0f);
347 case 243:
348 return FloatToBits(-1.0f);
349 case 244:
350 return FloatToBits(2.0f);
351 case 245:
352 return FloatToBits(-2.0f);
353 case 246:
354 return FloatToBits(4.0f);
355 case 247:
356 return FloatToBits(-4.0f);
357 case 248: // 1 / (2 * PI)
358 return 0x3e22f983;
359 default:
360 llvm_unreachable("invalid fp inline imm");
361 }
362 }
363
364 static int64_t getInlineImmVal64(unsigned Imm) {
365 switch (Imm) {
366 case 240:
367 return DoubleToBits(0.5);
368 case 241:
369 return DoubleToBits(-0.5);
370 case 242:
371 return DoubleToBits(1.0);
372 case 243:
373 return DoubleToBits(-1.0);
374 case 244:
375 return DoubleToBits(2.0);
376 case 245:
377 return DoubleToBits(-2.0);
378 case 246:
379 return DoubleToBits(4.0);
380 case 247:
381 return DoubleToBits(-4.0);
382 case 248: // 1 / (2 * PI)
383 return 0x3fc45f306dc9c882;
384 default:
385 llvm_unreachable("invalid fp inline imm");
386 }
387 }
388
389 static int64_t getInlineImmVal16(unsigned Imm) {
390 switch (Imm) {
391 case 240:
392 return 0x3800;
393 case 241:
394 return 0xB800;
395 case 242:
396 return 0x3C00;
397 case 243:
398 return 0xBC00;
399 case 244:
400 return 0x4000;
401 case 245:
402 return 0xC000;
403 case 246:
404 return 0x4400;
405 case 247:
406 return 0xC400;
407 case 248: // 1 / (2 * PI)
408 return 0x3118;
409 default:
410 llvm_unreachable("invalid fp inline imm");
411 }
412 }
413
414 MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
327415 assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN
328416 && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);
417
329418 // ToDo: case 248: 1/(2*PI) - is allowed only on VI
330 // ToDo: AMDGPUInstPrinter does not support 1/(2*PI). It consider 1/(2*PI) as
331 // literal constant.
332 float V = 0.0f;
333 switch (Imm) {
334 case 240: V = 0.5f; break;
335 case 241: V = -0.5f; break;
336 case 242: V = 1.0f; break;
337 case 243: V = -1.0f; break;
338 case 244: V = 2.0f; break;
339 case 245: V = -2.0f; break;
340 case 246: V = 4.0f; break;
341 case 247: V = -4.0f; break;
342 case 248: return MCOperand::createImm(Is32 ? // 1/(2*PI)
343 0x3e22f983 :
344 0x3fc45f306dc9c882);
345 default: break;
346 }
347 return MCOperand::createImm(Is32? FloatToBits(V) : DoubleToBits(V));
419 switch (Width) {
420 case OPW32:
421 return MCOperand::createImm(getInlineImmVal32(Imm));
422 case OPW64:
423 return MCOperand::createImm(getInlineImmVal64(Imm));
424 case OPW16:
425 return MCOperand::createImm(getInlineImmVal16(Imm));
426 default:
427 llvm_unreachable("implement me");
428 }
348429 }
349430
350431 unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
352433 assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
353434 switch (Width) {
354435 default: // fall
355 case OPW32: return VGPR_32RegClassID;
436 case OPW32:
437 case OPW16:
438 return VGPR_32RegClassID;
356439 case OPW64: return VReg_64RegClassID;
357440 case OPW128: return VReg_128RegClassID;
358441 }
363446 assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
364447 switch (Width) {
365448 default: // fall
366 case OPW32: return SGPR_32RegClassID;
449 case OPW32:
450 case OPW16:
451 return SGPR_32RegClassID;
367452 case OPW64: return SGPR_64RegClassID;
368453 case OPW128: return SGPR_128RegClassID;
369454 }
374459 assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
375460 switch (Width) {
376461 default: // fall
377 case OPW32: return TTMP_32RegClassID;
462 case OPW32:
463 case OPW16:
464 return TTMP_32RegClassID;
378465 case OPW64: return TTMP_64RegClassID;
379466 case OPW128: return TTMP_128RegClassID;
380467 }
395482 return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN);
396483 }
397484
398 assert(Width == OPW32 || Width == OPW64);
399 const bool Is32 = (Width == OPW32);
485 assert(Width == OPW16 || Width == OPW32 || Width == OPW64);
400486
401487 if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
402488 return decodeIntImmed(Val);
403489
404490 if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
405 return decodeFPImmed(Is32, Val);
491 return decodeFPImmed(Width, Val);
406492
407493 if (Val == LITERAL_CONST)
408494 return decodeLiteralConstant();
409495
410 return Is32 ? decodeSpecialReg32(Val) : decodeSpecialReg64(Val);
496 switch (Width) {
497 case OPW32:
498 case OPW16:
499 return decodeSpecialReg32(Val);
500 case OPW64:
501 return decodeSpecialReg64(Val);
502 default:
503 llvm_unreachable("unexpected immediate type");
504 }
411505 }
412506
413507 MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
6565 MCOperand decodeOperand_VGPR_32(unsigned Val) const;
6666 MCOperand decodeOperand_VS_32(unsigned Val) const;
6767 MCOperand decodeOperand_VS_64(unsigned Val) const;
68 MCOperand decodeOperand_VSrc16(unsigned Val) const;
6869
6970 MCOperand decodeOperand_VReg_64(unsigned Val) const;
7071 MCOperand decodeOperand_VReg_96(unsigned Val) const;
8283 OPW32,
8384 OPW64,
8485 OPW128,
86 OPW16,
8587 OPW_LAST_,
8688 OPW_FIRST_ = OPW32
8789 };
9193 unsigned getTtmpClassId(const OpWidthTy Width) const;
9294
9395 static MCOperand decodeIntImmed(unsigned Imm);
94 static MCOperand decodeFPImmed(bool Is32, unsigned Imm);
96 static MCOperand decodeFPImmed(OpWidthTy Width, unsigned Imm);
9597 MCOperand decodeLiteralConstant() const;
9698
9799 MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
4646 void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
4747 const MCSubtargetInfo &STI,
4848 raw_ostream &O) {
49 O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff);
49 // It's possible to end up with a 32-bit literal used with a 16-bit operand
50 // with ignored high bits. Print as 32-bit anyway in that case.
51 int64_t Imm = MI->getOperand(OpNo).getImm();
52 if (isInt<16>(Imm) || isUInt<16>(Imm))
53 O << formatHex(static_cast<uint64_t>(Imm & 0xffff));
54 else
55 printU32ImmOperand(MI, OpNo, STI, O);
5056 }
5157
5258 void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
335341 printOperand(MI, OpNo, STI, O);
336342 }
337343
344 void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
345 const MCSubtargetInfo &STI,
346 raw_ostream &O) {
347 int16_t SImm = static_cast<int16_t>(Imm);
348 if (SImm >= -16 && SImm <= 64) {
349 O << SImm;
350 return;
351 }
352
353 if (Imm == 0x3C00)
354 O << "1.0";
355 else if (Imm == 0xBC00)
356 O << "-1.0";
357 else if (Imm == 0x3800)
358 O << "0.5";
359 else if (Imm == 0xB800)
360 O << "-0.5";
361 else if (Imm == 0x4000)
362 O << "2.0";
363 else if (Imm == 0xC000)
364 O << "-2.0";
365 else if (Imm == 0x4400)
366 O << "4.0";
367 else if (Imm == 0xC400)
368 O << "-4.0";
369 else if (Imm == 0x3118) {
370 assert(STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]);
371 O << "0.15915494";
372 } else
373 O << formatHex(static_cast<uint64_t>(Imm));
374 }
375
338376 void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
339377 const MCSubtargetInfo &STI,
340378 raw_ostream &O) {
430468 }
431469 } else if (Op.isImm()) {
432470 const MCInstrDesc &Desc = MII.get(MI->getOpcode());
433 int RCID = Desc.OpInfo[OpNo].RegClass;
434 if (RCID != -1) {
435 unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID));
436 if (RCBits == 32)
437 printImmediate32(Op.getImm(), STI, O);
438 else if (RCBits == 64)
439 printImmediate64(Op.getImm(), STI, O);
440 else
441 llvm_unreachable("Invalid register class size");
442 } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) {
471 switch (Desc.OpInfo[OpNo].OperandType) {
472 case AMDGPU::OPERAND_REG_IMM_INT32:
473 case AMDGPU::OPERAND_REG_IMM_FP32:
474 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
475 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
476 case MCOI::OPERAND_IMMEDIATE:
443477 printImmediate32(Op.getImm(), STI, O);
444 } else {
478 break;
479 case AMDGPU::OPERAND_REG_IMM_INT64:
480 case AMDGPU::OPERAND_REG_IMM_FP64:
481 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
482 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
483 printImmediate64(Op.getImm(), STI, O);
484 break;
485 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
486 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
487 case AMDGPU::OPERAND_REG_IMM_INT16:
488 case AMDGPU::OPERAND_REG_IMM_FP16:
489 printImmediate16(Op.getImm(), STI, O);
490 break;
491 case MCOI::OPERAND_UNKNOWN:
492 case MCOI::OPERAND_PCREL:
493 O << formatDec(Op.getImm());
494 break;
495 case MCOI::OPERAND_REGISTER:
496 // FIXME: This should be removed and handled somewhere else. Seems to come
497 // from a disassembler bug.
498 O << "/*invalid immediate*/";
499 break;
500 default:
445501 // We hit this for the immediate instruction bits that don't yet have a
446502 // custom printer.
447 // TODO: Eventually this should be unnecessary.
448 O << formatDec(Op.getImm());
503 llvm_unreachable("unexpected immediate operand type");
449504 }
450505 } else if (Op.isFPImm()) {
451506 // We special case 0.0 because otherwise it will be printed as an integer.
8787 void printRegOperand(unsigned RegNo, raw_ostream &O);
8888 void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
8989 raw_ostream &O);
90 void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
91 raw_ostream &O);
9092 void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
9193 raw_ostream &O);
9294 void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
3838 const MCRegisterInfo &MRI;
3939
4040 /// \brief Encode an fp or int literal
41 uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize,
41 uint32_t getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo,
4242 const MCSubtargetInfo &STI) const;
4343
4444 public:
8686 return 0;
8787 }
8888
89 static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) {
90 uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val));
91 if (IntImm != 0)
92 return IntImm;
93
94 if (Val == 0x3800) // 0.5
95 return 240;
96
97 if (Val == 0xB800) // -0.5
98 return 241;
99
100 if (Val == 0x3C00) // 1.0
101 return 242;
102
103 if (Val == 0xBC00) // -1.0
104 return 243;
105
106 if (Val == 0x4000) // 2.0
107 return 244;
108
109 if (Val == 0xC000) // -2.0
110 return 245;
111
112 if (Val == 0x4400) // 4.0
113 return 246;
114
115 if (Val == 0xC400) // -4.0
116 return 247;
117
118 if (Val == 0x3118 && // 1.0 / (2.0 * pi)
119 STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
120 return 248;
121
122 return 255;
123 }
124
89125 static uint32_t getLit32Encoding(uint32_t Val, const MCSubtargetInfo &STI) {
90126 uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val));
91127 if (IntImm != 0)
159195 }
160196
161197 uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
162 unsigned OpSize,
198 const MCOperandInfo &OpInfo,
163199 const MCSubtargetInfo &STI) const {
164200
165201 int64_t Imm;
179215 Imm = MO.getImm();
180216 }
181217
182 if (OpSize == 4)
218 switch (AMDGPU::getOperandSize(OpInfo)) {
219 case 4:
183220 return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
184
185 assert(OpSize == 8);
186
187 return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
221 case 8:
222 return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
223 case 2:
224 return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
225 default:
226 llvm_unreachable("invalid operand size");
227 }
188228 }
189229
190230 void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
211251 if (!AMDGPU::isSISrcOperand(Desc, i))
212252 continue;
213253
214 int RCID = Desc.OpInfo[i].RegClass;
215 const MCRegisterClass &RC = MRI.getRegClass(RCID);
216
217254 // Is this operand a literal immediate?
218255 const MCOperand &Op = MI.getOperand(i);
219 if (getLitEncoding(Op, AMDGPU::getRegBitWidth(RC) / 8, STI) != 255)
256 if (getLitEncoding(Op, Desc.OpInfo[i], STI) != 255)
220257 continue;
221258
222259 // Yes! Encode it
281318
282319 const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
283320 if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
284 uint32_t Enc = getLitEncoding(MO,
285 AMDGPU::getRegOperandSize(&MRI, Desc, OpNo),
286 STI);
321 uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
287322 if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
288323 return Enc;
289324
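The encoder added here (getLit16Encoding) and the disassembler table added earlier (getInlineImmVal16) are inverses: one maps a half bit pattern to the hardware source-operand encoding 240–248, with 255 meaning a separate 32-bit literal follows, and the other maps the encoding back to bits. Below is a standalone sketch of that FP round trip with the values taken from those two hunks; the function and array names are illustrative only, not LLVM API.

```cpp
#include <cassert>
#include <cstdint>

// FP16 inline constants in encoding order 240..248, per getLit16Encoding and
// getInlineImmVal16 in this patch.
static const uint16_t FP16Inline[9] = {
    0x3800, 0xB800,  // 240, 241: +/-0.5
    0x3C00, 0xBC00,  // 242, 243: +/-1.0
    0x4000, 0xC000,  // 244, 245: +/-2.0
    0x4400, 0xC400,  // 246, 247: +/-4.0
    0x3118,          // 248: 1/(2*pi), requires FeatureInv2PiInlineImm
};

// Encode: return the 240..248 source encoding, or 255 if the value has to be
// emitted as a literal constant instead.
unsigned encodeFP16Inline(uint16_t Bits) {
  for (unsigned I = 0; I != 9; ++I)
    if (FP16Inline[I] == Bits)
      return 240 + I;
  return 255;
}

// Decode: map a 240..248 source encoding back to the half bit pattern.
uint16_t decodeFP16Inline(unsigned Enc) {
  assert(Enc >= 240 && Enc <= 248 && "not an FP inline encoding");
  return FP16Inline[Enc - 240];
}
```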
8787 namespace AMDGPU {
8888 enum OperandType {
8989 /// Operands with register or 32-bit immediate
90 OPERAND_REG_IMM32_INT = MCOI::OPERAND_FIRST_TARGET,
91 OPERAND_REG_IMM32_FP,
90 OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
91 OPERAND_REG_IMM_INT64,
92 OPERAND_REG_IMM_INT16,
93 OPERAND_REG_IMM_FP32,
94 OPERAND_REG_IMM_FP64,
95 OPERAND_REG_IMM_FP16,
96
9297 /// Operands with register or inline constant
93 OPERAND_REG_INLINE_C_INT,
94 OPERAND_REG_INLINE_C_FP,
98 OPERAND_REG_INLINE_C_INT16,
99 OPERAND_REG_INLINE_C_INT32,
100 OPERAND_REG_INLINE_C_INT64,
101 OPERAND_REG_INLINE_C_FP16,
102 OPERAND_REG_INLINE_C_FP32,
103 OPERAND_REG_INLINE_C_FP64,
104
105 OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
106 OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16,
107
108 OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
109 OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64,
110
111 OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
112 OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
95113
96114 // Operand for source modifiers for VOP instructions
97115 OPERAND_INPUT_MODS,
98116
99117 /// Operand with 32-bit immediate that uses the constant bus.
100 OPERAND_KIMM32
118 OPERAND_KIMM32,
119 OPERAND_KIMM16
101120 };
102121 }
103122
314314 return;
315315 }
316316
317 APInt Imm(64, OpToFold.getImm());
318317
319318 const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
320319 const TargetRegisterClass *FoldRC =
321320 TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);
321
322 APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
323 OpToFold.getImm());
322324
323325 // Split 64-bit constants into 32-bits for folding.
324326 if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
327329 = TargetRegisterInfo::isVirtualRegister(UseReg) ?
328330 MRI.getRegClass(UseReg) :
329331 TRI.getPhysRegClass(UseReg);
332
333 assert(Imm.getBitWidth() == 64);
330334
331335 if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
332336 return;
504508 if (!isSafeToFold(MI))
505509 continue;
506510
507 unsigned OpSize = TII->getOpSize(MI, 1);
508511 MachineOperand &OpToFold = MI.getOperand(1);
509512 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
510513
558561 Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
559562 Use != E; ++Use) {
560563 MachineInstr *UseMI = Use->getParent();
561
562 if (TII->isInlineConstant(OpToFold, OpSize)) {
563 foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
564 unsigned OpNo = Use.getOperandNo();
565
566 if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) {
567 foldOperand(OpToFold, UseMI, OpNo, FoldList,
564568 CopiesToReplace, TII, TRI, MRI);
565569 } else {
566570 if (++NumLiteralUses == 1) {
567571 NonInlineUse = &*Use;
568 NonInlineUseOpNo = Use.getOperandNo();
572 NonInlineUseOpNo = OpNo;
569573 }
570574 }
571575 }
14141414 // If this is a free constant, there's no reason to do this.
14151415 // TODO: We could fold this here instead of letting SIFoldOperands do it
14161416 // later.
1417 if (isInlineConstant(ImmOp, 4))
1417 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
1418
1419 // Any src operand can be used for the legality check.
1420 if (isInlineConstant(UseMI, *Src0, ImmOp))
14181421 return false;
14191422
1420 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
14211423 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
14221424 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
14231425
16191621 case AMDGPU::V_MAC_F16_e32:
16201622 IsF16 = true;
16211623 case AMDGPU::V_MAC_F32_e32: {
1622 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
1623 if (Src0->isImm() && !isInlineConstant(*Src0, 4))
1624 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1625 AMDGPU::OpName::src0);
1626 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
1627 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
16241628 return nullptr;
16251629 break;
16261630 }
16811685 case 64:
16821686 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
16831687 ST.hasInv2PiInlineImm());
1688 case 16:
1689 return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
1690 ST.hasInv2PiInlineImm());
16841691 default:
16851692 llvm_unreachable("invalid bitwidth");
16861693 }
16871694 }
16881695
16891696 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
1690 unsigned OpSize) const {
1691 if (MO.isImm()) {
1692 // MachineOperand provides no way to tell the true operand size, since it
1693 // only records a 64-bit value. We need to know the size to determine if a
1694 // 32-bit floating point immediate bit pattern is legal for an integer
1695 // immediate. It would be for any 32-bit integer operand, but would not be
1696 // for a 64-bit one.
1697 switch (OpSize) {
1698 case 4:
1699 return AMDGPU::isInlinableLiteral32(static_cast<int32_t>(MO.getImm()),
1700 ST.hasInv2PiInlineImm());
1701 case 8:
1702 return AMDGPU::isInlinableLiteral64(MO.getImm(),
1703 ST.hasInv2PiInlineImm());
1704 default:
1705 llvm_unreachable("invalid bitwidth");
1706 }
1707 }
1708
1709 return false;
1710 }
1711
1712 bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
1713 unsigned OpSize) const {
1714 return MO.isImm() && !isInlineConstant(MO, OpSize);
1697 uint8_t OperandType) const {
1698 if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET)
1699 return false;
1700
1701 // MachineOperand provides no way to tell the true operand size, since it only
1702 // records a 64-bit value. We need to know the size to determine if a 32-bit
1703 // floating point immediate bit pattern is legal for an integer immediate. It
1704 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
1705
1706 int64_t Imm = MO.getImm();
1707 switch (operandBitWidth(OperandType)) {
1708 case 32: {
1709 int32_t Trunc = static_cast<int32_t>(Imm);
1710 return Trunc == Imm &&
1711 AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
1712 }
1713 case 64: {
1714 return AMDGPU::isInlinableLiteral64(MO.getImm(),
1715 ST.hasInv2PiInlineImm());
1716 }
1717 case 16: {
1718 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
1719 int16_t Trunc = static_cast<int16_t>(Imm);
1720 return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
1721 }
1722
1723 return false;
1724 }
1725 default:
1726 llvm_unreachable("invalid bitwidth");
1727 }
17151728 }
17161729
17171730 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
1718 unsigned OpSize) const {
1731 const MCOperandInfo &OpInfo) const {
17191732 switch (MO.getType()) {
17201733 case MachineOperand::MO_Register:
17211734 return false;
17221735 case MachineOperand::MO_Immediate:
1723 return !isInlineConstant(MO, OpSize);
1736 return !isInlineConstant(MO, OpInfo);
17241737 case MachineOperand::MO_FrameIndex:
17251738 case MachineOperand::MO_MachineBasicBlock:
17261739 case MachineOperand::MO_ExternalSymbol:
17591772 if (OpInfo.RegClass < 0)
17601773 return false;
17611774
1762 unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
1763 if (isLiteralConstant(MO, OpSize))
1764 return RI.opCanUseLiteralConstant(OpInfo.OperandType);
1765
1766 return RI.opCanUseInlineConstant(OpInfo.OperandType);
1775 if (MO.isImm() && isInlineConstant(MO, OpInfo))
1776 return RI.opCanUseInlineConstant(OpInfo.OperandType);
1777
1778 return RI.opCanUseLiteralConstant(OpInfo.OperandType);
17671779 }
17681780
17691781 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
17901802
17911803 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
17921804 const MachineOperand &MO,
1793 unsigned OpSize) const {
1805 const MCOperandInfo &OpInfo) const {
17941806 // Literal constants use the constant bus.
1795 if (isLiteralConstant(MO, OpSize))
1796 return true;
1797
1798 if (!MO.isReg() || !MO.isUse())
1807 //if (isLiteralConstantLike(MO, OpInfo))
1808 // return true;
1809 if (MO.isImm())
1810 return !isInlineConstant(MO, OpInfo);
1811
1812 if (!MO.isReg())
1813 return true; // Misc other operands like FrameIndex
1814
1815 if (!MO.isUse())
17991816 return false;
18001817
18011818 if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
19241941 return false;
19251942 }
19261943 break;
1927 case AMDGPU::OPERAND_REG_IMM32_INT:
1928 case AMDGPU::OPERAND_REG_IMM32_FP:
1944 case AMDGPU::OPERAND_REG_IMM_INT32:
1945 case AMDGPU::OPERAND_REG_IMM_FP32:
19291946 break;
1930 case AMDGPU::OPERAND_REG_INLINE_C_INT:
1931 case AMDGPU::OPERAND_REG_INLINE_C_FP:
1932 if (isLiteralConstant(MI.getOperand(i),
1933 RI.getRegClass(RegClass)->getSize())) {
1947 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
1948 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
1949 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
1950 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
1951 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
1952 case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
1953 const MachineOperand &MO = MI.getOperand(i);
1954 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
19341955 ErrInfo = "Illegal immediate value for operand.";
19351956 return false;
19361957 }
19371958 break;
1959 }
19381960 case MCOI::OPERAND_IMMEDIATE:
19391961 case AMDGPU::OPERAND_KIMM32:
19401962 // Check if this operand is an immediate.
19862008 if (OpIdx == -1)
19872009 break;
19882010 const MachineOperand &MO = MI.getOperand(OpIdx);
1989 if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
2011 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
19902012 if (MO.isReg()) {
19912013 if (MO.getReg() != SGPRUsed)
19922014 ++ConstantBusCount;
23292351 if (!MO)
23302352 MO = &MI.getOperand(OpIdx);
23312353
2332 if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
2354 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
23332355
23342356 RegSubRegPair SGPRUsed;
23352357 if (MO->isReg())
23412363 const MachineOperand &Op = MI.getOperand(i);
23422364 if (Op.isReg()) {
23432365 if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
2344 usesConstantBus(MRI, Op, getOpSize(MI, i))) {
2366 usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
23452367 return false;
23462368 }
23472369 } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
35383560 if (Src0Idx == -1)
35393561 return 4; // No operands.
35403562
3541 if (isLiteralConstantLike(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx)))
3563 if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
35423564 return 8;
35433565
35443566 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
35453567 if (Src1Idx == -1)
35463568 return 4;
35473569
3548 if (isLiteralConstantLike(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx)))
3570 if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
35493571 return 8;
35503572
35513573 return 4;
461461 return !RI.isSGPRReg(MRI, Dest);
462462 }
463463
464 static int operandBitWidth(uint8_t OperandType) {
465 switch (OperandType) {
466 case AMDGPU::OPERAND_REG_IMM_INT32:
467 case AMDGPU::OPERAND_REG_IMM_FP32:
468 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
469 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
470 return 32;
471 case AMDGPU::OPERAND_REG_IMM_INT64:
472 case AMDGPU::OPERAND_REG_IMM_FP64:
473 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
474 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
475 return 64;
476 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
477 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
478 case AMDGPU::OPERAND_REG_IMM_INT16:
479 case AMDGPU::OPERAND_REG_IMM_FP16:
480 return 16;
481 default:
482 llvm_unreachable("unexpected operand type");
483 }
484 }
485
464486 bool isInlineConstant(const APInt &Imm) const;
465 bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
466 bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
487
488 bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
489
490 bool isInlineConstant(const MachineOperand &MO,
491 const MCOperandInfo &OpInfo) const {
492 return isInlineConstant(MO, OpInfo.OperandType);
493 }
494
495 /// \returns true if, were \p UseMO substituted with \p DefMO in \p MI, the
496 /// operand would be an inline immediate.
497 bool isInlineConstant(const MachineInstr &MI,
498 const MachineOperand &UseMO,
499 const MachineOperand &DefMO) const {
500 assert(UseMO.getParent() == &MI);
501 int OpIdx = MI.getOperandNo(&UseMO);
502 if (!MI.getDesc().OpInfo || OpIdx > MI.getDesc().NumOperands) {
503 return false;
504 }
505
506 return isInlineConstant(DefMO, MI.getDesc().OpInfo[OpIdx]);
507 }
508
509 /// \returns true if the operand \p OpIdx in \p MI is a valid inline
510 /// immediate.
511 bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx) const {
512 const MachineOperand &MO = MI.getOperand(OpIdx);
513 return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType);
514 }
515
516 bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx,
517 const MachineOperand &MO) const {
518 if (!MI.getDesc().OpInfo || OpIdx > MI.getDesc().NumOperands)
519 return false;
520
521 if (MI.isCopy()) {
522 unsigned Size = getOpSize(MI, OpIdx);
523 assert(Size == 8 || Size == 4);
524
525 uint8_t OpType = (Size == 8) ?
526 AMDGPU::OPERAND_REG_IMM_INT64 : AMDGPU::OPERAND_REG_IMM_INT32;
527 return isInlineConstant(MO, OpType);
528 }
529
530 return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType);
531 }
532
533 bool isInlineConstant(const MachineOperand &MO) const {
534 const MachineInstr *Parent = MO.getParent();
535 return isInlineConstant(*Parent, Parent->getOperandNo(&MO));
536 }
537
538 bool isLiteralConstant(const MachineOperand &MO,
539 const MCOperandInfo &OpInfo) const {
540 return MO.isImm() && !isInlineConstant(MO, OpInfo.OperandType);
541 }
542
543 bool isLiteralConstant(const MachineInstr &MI, int OpIdx) const {
544 const MachineOperand &MO = MI.getOperand(OpIdx);
545 return MO.isImm() && !isInlineConstant(MI, OpIdx);
546 }
467547
468548 // Returns true if this operand could potentially require a 32-bit literal
469549 // operand, but not necessarily. A FrameIndex for example could resolve to an
470550 // inline immediate value that will not require an additional 4-bytes; this
471551 // assumes that it will.
472 bool isLiteralConstantLike(const MachineOperand &MO, unsigned OpSize) const;
552 bool isLiteralConstantLike(const MachineOperand &MO,
553 const MCOperandInfo &OpInfo) const;
473554
474555 bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
475556 const MachineOperand &MO) const;
481562 /// \brief Returns true if this operand uses the constant bus.
482563 bool usesConstantBus(const MachineRegisterInfo &MRI,
483564 const MachineOperand &MO,
484 unsigned OpSize) const;
565 const MCOperandInfo &OpInfo) const;
485566
486567 /// \brief Return true if this instruction has any modifiers.
487568 /// e.g. src[012]_mod, omod, clamp.
444444
445445 } // End OperandType = "OPERAND_IMMEDIATE"
446446
447 class KImmMatchClass<int size> : AsmOperandClass {
448 let Name = "KImmFP"#size;
449 let PredicateMethod = "isKImmFP"#size;
450 let ParserMethod = "parseImm";
451 let RenderMethod = "addKImmFP"#size#"Operands";
452 }
453
454 class kimmOperand<ValueType vt> : Operand<vt> {
455 let OperandNamespace = "AMDGPU";
456 let OperandType = "OPERAND_KIMM"#vt.Size;
457 let PrintMethod = "printU"#vt.Size#"ImmOperand";
458 let ParserMatchClass = !cast<AsmOperandClass>("KImmFP"#vt.Size#"MatchClass");
459 }
447460
448461 // 32-bit VALU immediate operand that uses the constant bus.
449 def KImmFP32MatchClass : AsmOperandClass {
450 let Name = "KImmFP32";
451 let PredicateMethod = "isKImmFP32";
452 let ParserMethod = "parseImm";
453 let RenderMethod = "addKImmFP32Operands";
454 }
455
456 def f32kimm : Operand<i32> {
457 let OperandNamespace = "AMDGPU";
458 let OperandType = "OPERAND_KIMM32";
459 let PrintMethod = "printU32ImmOperand";
460 let ParserMatchClass = KImmFP32MatchClass;
461 }
462 def KImmFP32MatchClass : KImmMatchClass<32>;
463 def f32kimm : kimmOperand<f32>;
464
465 // 32-bit VALU immediate operand with a 16-bit value that uses the
466 // constant bus.
467 def KImmFP16MatchClass : KImmMatchClass<16>;
468 def f16kimm : kimmOperand<f16>;
469
462470
463471 def VOPDstS64 : VOPDstOperand <SReg_64>;
464472
467475 let ParserMethod = "parseRegOrImmWithFPInputMods";
468476 let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods";
469477 }
478 def FP16InputModsMatchClass : FPInputModsMatchClass<16>;
470479 def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
471480 def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
472481
479488 class FPInputMods<FPInputModsMatchClass matchClass> : InputMods<matchClass> {
480489 let PrintMethod = "printOperandAndFPInputMods";
481490 }
491
492 def FP16InputMods : FPInputMods<FP16InputModsMatchClass>;
482493 def FP32InputMods : FPInputMods<FP32InputModsMatchClass>;
483494 def FP64InputMods : FPInputMods<FP64InputModsMatchClass>;
484495
628639 !if(!eq(VT.Value, f64.Value), 1,
629640 0)));
630641 RegisterOperand ret = !if(isFP,
631 !if(!eq(VT.Size, 64), VSrc_f64, VSrc_f32),
632 !if(!eq(VT.Size, 64), VSrc_b64, VSrc_b32));
642 !if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)),
643 !if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32)));
633644 }
634645
635646 // Returns the vreg register class to use for source operand given VT
656667 !if(!eq(VT.Value, i1.Value),
657668 SCSrc_b64,
658669 !if(isFP,
659 VCSrc_f32,
660 VCSrc_b32)
670 !if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32),
671 !if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32)
672 )
661673 )
662674 )
663675 );
690702 0)));
691703 Operand ret = !if(!eq(VT.Size, 64),
692704 !if(isFP, FP64InputMods, Int64InputMods),
693 !if(isFP, FP32InputMods, Int32InputMods));
705 !if(isFP,
706 !if(!eq(VT.Value, f16.Value),
707 FP16InputMods,
708 FP32InputMods
709 ),
710 Int32InputMods)
711 );
694712 }
695713
696714 // Returns the input arguments for VOP[12C] instructions for the given SrcVT.
106106
107107 // 64-bit vector move instruction. This is mainly used by the SIFoldOperands
108108 // pass to enable folding of inline immediates.
109 def V_MOV_B64_PSEUDO : PseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)> {
110 let VALU = 1;
111 }
109 def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
110 (ins VSrc_b64:$src0)>;
112111 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
113112
114113 let usesCustomInserter = 1, SALU = 1 in {
10821082
10831083 // Plain copy.
10841084 return getCommonSubClass(DefRC, SrcRC) != nullptr;
1085 }
1086
1087 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
1088 return OpType == AMDGPU::OPERAND_REG_IMM32_INT ||
1089 OpType == AMDGPU::OPERAND_REG_IMM32_FP;
1090 }
1091
1092 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
1093 if (opCanUseLiteralConstant(OpType))
1094 return true;
1095
1096 return OpType == AMDGPU::OPERAND_REG_INLINE_C_INT ||
1097 OpType == AMDGPU::OPERAND_REG_INLINE_C_FP;
10981085 }
10991086
11001087 // FIXME: Most of these are flexible with HSA and we don't need to reserve them
1515 #define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
1616
1717 #include "AMDGPURegisterInfo.h"
18 #include "SIDefines.h"
1819 #include "llvm/CodeGen/MachineRegisterInfo.h"
1920
2021 namespace llvm {
137138
138139 /// \returns True if operands defined with this operand type can accept
139140 /// a literal constant (i.e. any 32-bit immediate).
140 bool opCanUseLiteralConstant(unsigned OpType) const;
141 bool opCanUseLiteralConstant(unsigned OpType) const {
142 // TODO: 64-bit operands have extending behavior from 32-bit literal.
143 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
144 OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
145 }
141146
142147 /// \returns True if operands defined with this operand type can accept
143148 /// an inline constant. i.e. An integer value in the range (-16, 64) or
144149 /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
145 bool opCanUseInlineConstant(unsigned OpType) const;
150 bool opCanUseInlineConstant(unsigned OpType) const {
151 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
152 OpType <= AMDGPU::OPERAND_SRC_LAST;
153 }
146154
147155 enum PreloadedValue {
148156 // SGPRS:
383383
384384 multiclass SIRegOperand <string rc, string MatchName, string opType> {
385385 let OperandNamespace = "AMDGPU" in {
386 def _b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
387 let OperandType = opType#"_INT16";
388 let ParserMatchClass = RegImmMatcher<MatchName#"B16">;
389 let DecoderMethod = "decodeOperand_VSrc16";
390 }
391
392 def _f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
393 let OperandType = opType#"_FP16";
394 let ParserMatchClass = RegImmMatcher<MatchName#"F16">;
395 let DecoderMethod = "decodeOperand_VSrc16";
396 }
386397
387398 def _b32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
388 let OperandType = opType#"_INT";
399 let OperandType = opType#"_INT32";
389400 let ParserMatchClass = RegImmMatcher<MatchName#"B32">;
390401 }
391402
392403 def _f32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
393 let OperandType = opType#"_FP";
404 let OperandType = opType#"_FP32";
394405 let ParserMatchClass = RegImmMatcher<MatchName#"F32">;
395406 }
396407
397408 def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
398 let OperandType = opType#"_INT";
409 let OperandType = opType#"_INT64";
399410 let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
400411 }
401412
402413 def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
403 let OperandType = opType#"_FP";
414 let OperandType = opType#"_FP64";
404415 let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
405416 }
406417 }
407418 }
408419
420 // FIXME: 64-bit sources can sometimes use 32-bit constants.
409421 multiclass RegImmOperand <string rc, string MatchName>
410 : SIRegOperand <rc, MatchName, "OPERAND_REG_IMM32">;
422 : SIRegOperand <rc, MatchName, "OPERAND_REG_IMM">;
411423
412424 multiclass RegInlineOperand <string rc, string MatchName>
413425 : SIRegOperand <rc, MatchName, "OPERAND_REG_INLINE_C">;
133133 assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
134134
135135 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
136 MachineOperand &Src0 = MI.getOperand(Src0Idx);
137136
138137 // Only one literal constant is allowed per instruction, so if src0 is a
139138 // literal constant then we can't do any folding.
140 if (Src0.isImm() &&
141 TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
139 if (TII->isLiteralConstant(MI, Src0Idx))
142140 return;
143141
144142 // Try to fold Src0
143 MachineOperand &Src0 = MI.getOperand(Src0Idx);
145144 if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
146145 unsigned Reg = Src0.getReg();
147146 MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
183182 }
184183
185184 static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
186 return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
185 return isInt<16>(Src.getImm()) &&
186 !TII->isInlineConstant(*Src.getParent(),
187 Src.getParent()->getOperandNo(&Src));
187188 }
188189
189190 static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
190 return isUInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
191 return isUInt<16>(Src.getImm()) &&
192 !TII->isInlineConstant(*Src.getParent(),
193 Src.getParent()->getOperandNo(&Src));
191194 }
192195
193196 static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
195198 bool &IsUnsigned) {
196199 if (isInt<16>(Src.getImm())) {
197200 IsUnsigned = false;
198 return !TII->isInlineConstant(Src, 4);
201 return !TII->isInlineConstant(Src);
199202 }
200203
201204 if (isUInt<16>(Src.getImm())) {
202205 IsUnsigned = true;
203 return !TII->isInlineConstant(Src, 4);
206 return !TII->isInlineConstant(Src);
204207 }
205208
206209 return false;
211214 static bool isReverseInlineImm(const SIInstrInfo *TII,
212215 const MachineOperand &Src,
213216 int32_t &ReverseImm) {
214 if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src, 4))
217 if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
215218 return false;
216219
217220 ReverseImm = reverseBits(static_cast(Src.getImm()));
328328
329329 bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
330330 unsigned OpType = Desc.OpInfo[OpNo].OperandType;
331
332 return OpType == AMDGPU::OPERAND_REG_IMM32_INT ||
333 OpType == AMDGPU::OPERAND_REG_IMM32_FP ||
334 OpType == AMDGPU::OPERAND_REG_INLINE_C_INT ||
335 OpType == AMDGPU::OPERAND_REG_INLINE_C_FP;
331 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
332 OpType <= AMDGPU::OPERAND_SRC_LAST;
336333 }
337334
338335 bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
339336 unsigned OpType = Desc.OpInfo[OpNo].OperandType;
340
341 return OpType == AMDGPU::OPERAND_REG_IMM32_FP ||
342 OpType == AMDGPU::OPERAND_REG_INLINE_C_FP;
337 switch (OpType) {
338 case AMDGPU::OPERAND_REG_IMM_FP32:
339 case AMDGPU::OPERAND_REG_IMM_FP64:
340 case AMDGPU::OPERAND_REG_IMM_FP16:
341 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
342 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
343 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
344 return true;
345 default:
346 return false;
347 }
343348 }
344349
345350 bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
346351 unsigned OpType = Desc.OpInfo[OpNo].OperandType;
347
348 return OpType == AMDGPU::OPERAND_REG_INLINE_C_INT ||
349 OpType == AMDGPU::OPERAND_REG_INLINE_C_FP;
352 return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST &&
353 OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST;
350354 }
351355
352356 // Avoid using MCRegisterClass::getSize, since that function will go away
412416 if (Literal >= -16 && Literal <= 64)
413417 return true;
414418
419 // The actual type of the operand does not seem to matter as long
420 // as the bits match one of the inline immediate values. For example:
421 //
422 // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
423 // so it is a legal inline immediate.
424 //
425 // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
426 // floating-point, so it is a legal inline immediate.
427
415428 uint32_t Val = static_cast(Literal);
416429 return (Val == FloatToBits(0.0f)) ||
417430 (Val == FloatToBits(1.0f)) ||
425438 (Val == 0x3e22f983 && HasInv2Pi);
426439 }
427440
441 bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
442 assert(HasInv2Pi);
443
444 if (Literal >= -16 && Literal <= 64)
445 return true;
446
447 uint16_t Val = static_cast(Literal);
448 return Val == 0x3C00 || // 1.0
449 Val == 0xBC00 || // -1.0
450 Val == 0x3800 || // 0.5
451 Val == 0xB800 || // -0.5
452 Val == 0x4000 || // 2.0
453 Val == 0xC000 || // -2.0
454 Val == 0x4400 || // 4.0
455 Val == 0xC400 || // -4.0
456 Val == 0x3118; // 1/2pi
457 }
428458
429459 } // End namespace AMDGPU
430460 } // End namespace llvm
1111
1212 #include "AMDKernelCodeT.h"
1313 #include "llvm/IR/CallingConv.h"
14
15 #include "SIDefines.h"
1416
1517 #define GET_INSTRINFO_OPERAND_ENUM
1618 #include "AMDGPUGenInstrInfo.inc"
166168 unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
167169 unsigned OpNo);
168170
171 LLVM_READNONE
172 inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
173 switch (OpInfo.OperandType) {
174 case AMDGPU::OPERAND_REG_IMM_INT32:
175 case AMDGPU::OPERAND_REG_IMM_FP32:
176 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
177 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
178 return 4;
179
180 case AMDGPU::OPERAND_REG_IMM_INT64:
181 case AMDGPU::OPERAND_REG_IMM_FP64:
182 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
183 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
184 return 8;
185
186 case AMDGPU::OPERAND_REG_IMM_INT16:
187 case AMDGPU::OPERAND_REG_IMM_FP16:
188 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
189 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
190 return 2;
191
192 default:
193 llvm_unreachable("unhandled operand type");
194 }
195 }
196
197 LLVM_READNONE
198 inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
199 return getOperandSize(Desc.OpInfo[OpNo]);
200 }
201
169202 /// \brief Is this literal inlinable
170203 LLVM_READNONE
171204 bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
173206 LLVM_READNONE
174207 bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi);
175208
209 LLVM_READNONE
210 bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
176211
177212 } // end namespace AMDGPU
178213 } // end namespace llvm
133133 }
134134
135135 class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
136 field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, f32kimm:$imm);
136 field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
137 field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
137138 field string Asm32 = "$vdst, $src0, $src1, $imm";
138139 field bit HasExt = 0;
139140 }
142143 def VOP_MADAK_F32 : VOP_MADAK <f32>;
143144
144145 class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
145 field dag Ins32 = (ins VCSrc_f32:$src0, f32kimm:$imm, VGPR_32:$src1);
146 field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
147 field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1);
146148 field string Asm32 = "$vdst, $src0, $imm, $src1";
147149 field bit HasExt = 0;
148150 }
4040 }
4141
4242 ; GCN-LABEL: {{^}}br_cc_f16_imm_a
43 ; GCN: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x380{{0|1}}{{$}}
43 ; SI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}}
4444 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
4545 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
4646
4848 ; SI: v_cmp_ngt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
4949 ; SI: s_cbranch_vccz
5050
51 ; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
51 ; VI: v_cmp_nlt_f16_e32 vcc, 0.5, v[[B_F16]]
5252 ; VI: s_cbranch_vccnz
5353
5454 ; VI: one{{$}}
7979 }
8080
8181 ; GCN-LABEL: {{^}}br_cc_f16_imm_b
82 ; GCN: v_mov_b32_e32 v[[B_F16:[0-9]+]], {{0x37ff|0x3800}}{{$}}
82 ; SI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
8383 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
8484 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
8585
8686 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
8787 ; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
88 ; VI: v_cmp_ngt_f16_e32 vcc, v[[B_F16]], v[[A_F16]]
88 ; VI: v_cmp_ngt_f16_e32 vcc, 0.5, v[[A_F16]]
8989 ; GCN: s_cbranch_vccnz
9090
9191 ; GCN: one{{$}}
692692 ret void
693693 }
694694
695
696 ; FIXME: Should be able to fold this frameindex
695697 ; Without commuting the frame index in the pre-regalloc run of
696698 ; SIShrinkInstructions, this was using the VOP3 compare.
697699
698700 ; GCN-LABEL: {{^}}commute_frameindex:
699 ; GCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
701 ; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
702
703 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
704 ; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
700705 define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
701706 entry:
702707 %stack0 = alloca i32
2828 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
2929 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
3030 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
31 ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0x3c00, v[[B_F16]]
31 ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
3232 ; GCN: buffer_store_short v[[R_F16]]
3333 ; GCN: s_endpgm
3434 define void @fadd_f16_imm_a(
4747 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
4848 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
4949 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
50 ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0x4000, v[[A_F16]]
50 ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[A_F16]]
5151 ; GCN: buffer_store_short v[[R_F16]]
5252 ; GCN: s_endpgm
5353 define void @fadd_f16_imm_b(
103103 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
104104 ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
105105 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
106 ; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0x3c00, v[[B_V2_F16]]
107 ; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0x4000, v[[B_F16_1]]
106 ; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
107 ; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
108108 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
109109 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
110110 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
131131 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
132132 ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
133133 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
134 ; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0x4000, v[[A_V2_F16]]
135 ; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0x3c00, v[[A_F16_1]]
134 ; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 2.0, v[[A_V2_F16]]
135 ; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 1.0, v[[A_F16_1]]
136136 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
137137 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
138138 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
4747 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
4848 ; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
4949 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
50 ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]]
50 ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
5151 ; GCN: buffer_store_short v[[R_F16]]
5252 ; GCN: s_endpgm
5353 define void @fmul_f16_imm_b(
104104 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
105105 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
106106 ; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
107 ; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]]
107 ; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
108108 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
109109 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
110110 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
131131 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
132132 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
133133 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
134 ; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]]
134 ; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
135135 ; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
136136 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
137137 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
2828 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
2929 ; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
3030 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
31 ; VI: v_sub_f16_e32 v[[R_F16:[0-9]+]], 0x3c00, v[[B_F16]]
31 ; VI: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
3232 ; GCN: buffer_store_short v[[R_F16]]
3333 ; GCN: s_endpgm
3434 define void @fsub_f16_imm_a(
4747 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
4848 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
4949 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
50 ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0xc000, v[[A_F16]]
50 ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]]
5151 ; GCN: buffer_store_short v[[R_F16]]
5252 ; GCN: s_endpgm
5353 define void @fsub_f16_imm_b(
103103 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
104104 ; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
105105 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
106 ; VI: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 0x3c00, v[[B_V2_F16]]
107 ; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 0x4000, v[[B_F16_1]]
106 ; VI: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
107 ; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
108108 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
109109 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
110110 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
131131 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
132132 ; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
133133 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
134 ; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0xc000, v[[A_V2_F16]]
135 ; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0xbc00, v[[A_F16_1]]
134 ; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
135 ; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
136136 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
137137 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
138138 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
0 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
1 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
2
3 ; FIXME: Merge into imm.ll
4
5 ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_i16:
6 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
7 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
8 ; GCN: buffer_store_short [[REG]]
9 define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
10 store volatile i16 -32768, i16 addrspace(1)* %out
11 ret void
12 }
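The SI and VI patterns above differ only in the width of the printed constant; reading the VI form as the 16-bit value sign-extended into the 32-bit register reproduces every such pair in this file. That reading is an inference from the checks themselves, not taken from the compiler source:

// Reproduces the SI vs. VI constants in the checks in this test file.
#include <cstdint>
#include <cstdio>

int main() {
  const uint16_t halves[] = {0x8000, 0xb800, 0xbc00, 0xc000, 0xc400, 0xb118};
  for (uint16_t h : halves) {
    uint32_t viForm = (uint32_t)(int32_t)(int16_t)h; // sign-extend to 32 bits
    std::printf("SI 0x%04x -> VI 0x%08x\n", (unsigned)h, (unsigned)viForm);
  }
  return 0; // prints 0xffff8000, 0xffffb800, ..., matching the VI checks
}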
13
14 ; GCN-LABEL: {{^}}store_inline_imm_0.0_f16:
15 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
16 ; GCN: buffer_store_short [[REG]]
17 define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
18 store half 0.0, half addrspace(1)* %out
19 ret void
20 }
21
22 ; GCN-LABEL: {{^}}store_imm_neg_0.0_f16:
23 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
24 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
25 ; GCN: buffer_store_short [[REG]]
26 define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
27 store half -0.0, half addrspace(1)* %out
28 ret void
29 }
30
31 ; GCN-LABEL: {{^}}store_inline_imm_0.5_f16:
32 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3800{{$}}
33 ; GCN: buffer_store_short [[REG]]
34 define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
35 store half 0.5, half addrspace(1)* %out
36 ret void
37 }
38
39 ; GCN-LABEL: {{^}}store_inline_imm_m_0.5_f16:
40 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800{{$}}
41 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb800{{$}}
42 ; GCN: buffer_store_short [[REG]]
43 define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
44 store half -0.5, half addrspace(1)* %out
45 ret void
46 }
47
48 ; GCN-LABEL: {{^}}store_inline_imm_1.0_f16:
49 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
50 ; GCN: buffer_store_short [[REG]]
51 define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
52 store half 1.0, half addrspace(1)* %out
53 ret void
54 }
55
56 ; GCN-LABEL: {{^}}store_inline_imm_m_1.0_f16:
57 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00{{$}}
58 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
59 ; GCN: buffer_store_short [[REG]]
60 define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
61 store half -1.0, half addrspace(1)* %out
62 ret void
63 }
64
65 ; GCN-LABEL: {{^}}store_inline_imm_2.0_f16:
66 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
67 ; GCN: buffer_store_short [[REG]]
68 define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
69 store half 2.0, half addrspace(1)* %out
70 ret void
71 }
72
73 ; GCN-LABEL: {{^}}store_inline_imm_m_2.0_f16:
74 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000{{$}}
75 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc000{{$}}
76 ; GCN: buffer_store_short [[REG]]
77 define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
78 store half -2.0, half addrspace(1)* %out
79 ret void
80 }
81
82 ; GCN-LABEL: {{^}}store_inline_imm_4.0_f16:
83 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4400{{$}}
84 ; GCN: buffer_store_short [[REG]]
85 define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
86 store half 4.0, half addrspace(1)* %out
87 ret void
88 }
89
90 ; GCN-LABEL: {{^}}store_inline_imm_m_4.0_f16:
91 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400{{$}}
92 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc400{{$}}
93 ; GCN: buffer_store_short [[REG]]
94 define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
95 store half -4.0, half addrspace(1)* %out
96 ret void
97 }
98
99
100 ; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_f16:
101 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3118{{$}}
102 ; GCN: buffer_store_short [[REG]]
103 define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
104 store half 0xH3118, half addrspace(1)* %out
105 ret void
106 }
107
108 ; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_f16:
109 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118{{$}}
110 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb118{{$}}
111 ; GCN: buffer_store_short [[REG]]
112 define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
113 store half 0xHB118, half addrspace(1)* %out
114 ret void
115 }
116
117 ; GCN-LABEL: {{^}}store_literal_imm_f16:
118 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c00
119 ; GCN: buffer_store_short [[REG]]
120 define void @store_literal_imm_f16(half addrspace(1)* %out) {
121 store half 4096.0, half addrspace(1)* %out
122 ret void
123 }
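4096.0 is not one of the hardware inline constants, so the check above expects the raw half pattern; with standard binary16 arithmetic the encoding works out as:

\[
4096.0 = 2^{12}:\quad \text{sign}=0,\ \text{exponent}=12+15=27=11011_2,\ \text{mantissa}=0
\;\Rightarrow\; 0\,11011\,0000000000_2 = \mathtt{0x6C00}.
\]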
124
125 ; GCN-LABEL: {{^}}add_inline_imm_0.0_f16:
126 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
127 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}}
128 ; VI: buffer_store_short [[REG]]
129 define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
130 %y = fadd half %x, 0.0
131 store half %y, half addrspace(1)* %out
132 ret void
133 }
134
135 ; GCN-LABEL: {{^}}add_inline_imm_0.5_f16:
136 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
137 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
138 ; VI: buffer_store_short [[REG]]
139 define void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
140 %y = fadd half %x, 0.5
141 store half %y, half addrspace(1)* %out
142 ret void
143 }
144
145 ; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f16:
146 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
147 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}}
148 ; VI: buffer_store_short [[REG]]
149 define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
150 %y = fadd half %x, -0.5
151 store half %y, half addrspace(1)* %out
152 ret void
153 }
154
155 ; GCN-LABEL: {{^}}add_inline_imm_1.0_f16:
156 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
157 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}}
158 ; VI: buffer_store_short [[REG]]
159 define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
160 %y = fadd half %x, 1.0
161 store half %y, half addrspace(1)* %out
162 ret void
163 }
164
165 ; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f16:
166 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
167 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}}
168 ; VI: buffer_store_short [[REG]]
169 define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
170 %y = fadd half %x, -1.0
171 store half %y, half addrspace(1)* %out
172 ret void
173 }
174
175 ; GCN-LABEL: {{^}}add_inline_imm_2.0_f16:
176 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
177 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}}
178 ; VI: buffer_store_short [[REG]]
179 define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
180 %y = fadd half %x, 2.0
181 store half %y, half addrspace(1)* %out
182 ret void
183 }
184
185 ; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f16:
186 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
187 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}}
188 ; VI: buffer_store_short [[REG]]
189 define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
190 %y = fadd half %x, -2.0
191 store half %y, half addrspace(1)* %out
192 ret void
193 }
194
195 ; GCN-LABEL: {{^}}add_inline_imm_4.0_f16:
196 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
197 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}}
198 ; VI: buffer_store_short [[REG]]
199 define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
200 %y = fadd half %x, 4.0
201 store half %y, half addrspace(1)* %out
202 ret void
203 }
204
205 ; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f16:
206 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
207 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}}
208 ; VI: buffer_store_short [[REG]]
209 define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
210 %y = fadd half %x, -4.0
211 store half %y, half addrspace(1)* %out
212 ret void
213 }
214
215 ; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_f16:
216 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
217 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
218 ; VI: buffer_store_short [[REG]]
219 define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
220 %x = load half, half addrspace(1)* %in
221 %y = fadd half %x, 0.5
222 store half %y, half addrspace(1)* %out
223 ret void
224 }
225
226 ; GCN-LABEL: {{^}}commute_add_literal_f16:
227 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
228 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0x6400, [[VAL]]
229 ; VI: buffer_store_short [[REG]]
230 define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
231 %x = load half, half addrspace(1)* %in
232 %y = fadd half %x, 1024.0
233 store half %y, half addrspace(1)* %out
234 ret void
235 }
236
237 ; GCN-LABEL: {{^}}add_inline_imm_1_f16:
238 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
239 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}}
240 ; VI: buffer_store_short [[REG]]
241 define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
242 %y = fadd half %x, 0xH0001
243 store half %y, half addrspace(1)* %out
244 ret void
245 }
246
247 ; GCN-LABEL: {{^}}add_inline_imm_2_f16:
248 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
249 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}}
250 ; VI: buffer_store_short [[REG]]
251 define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
252 %y = fadd half %x, 0xH0002
253 store half %y, half addrspace(1)* %out
254 ret void
255 }
256
257 ; GCN-LABEL: {{^}}add_inline_imm_16_f16:
258 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
259 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}}
260 ; VI: buffer_store_short [[REG]]
261 define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
262 %y = fadd half %x, 0xH0010
263 store half %y, half addrspace(1)* %out
264 ret void
265 }
266
267 ; GCN-LABEL: {{^}}add_inline_imm_neg_1_f16:
268 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
269 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1, [[VAL]]{{$}}
270 ; VI: buffer_store_short [[REG]]
271 define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) {
272 %y = fadd half %x, 0xHFFFF
273 store half %y, half addrspace(1)* %out
274 ret void
275 }
276
277 ; GCN-LABEL: {{^}}add_inline_imm_neg_2_f16:
278 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
279 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2, [[VAL]]{{$}}
280 ; VI: buffer_store_short [[REG]]
281 define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) {
282 %y = fadd half %x, 0xHFFFE
283 store half %y, half addrspace(1)* %out
284 ret void
285 }
286
287 ; GCN-LABEL: {{^}}add_inline_imm_neg_16_f16:
288 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
289 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -16, [[VAL]]{{$}}
290 ; VI: buffer_store_short [[REG]]
291 define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) {
292 %y = fadd half %x, 0xHFFF0
293 store half %y, half addrspace(1)* %out
294 ret void
295 }
296
297 ; GCN-LABEL: {{^}}add_inline_imm_63_f16:
298 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
299 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]]
300 ; VI: buffer_store_short [[REG]]
301 define void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
302 %y = fadd half %x, 0xH003F
303 store half %y, half addrspace(1)* %out
304 ret void
305 }
306
307 ; GCN-LABEL: {{^}}add_inline_imm_64_f16:
308 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
309 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]]
310 ; VI: buffer_store_short [[REG]]
311 define void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) {
312 %y = fadd half %x, 0xH0040
313 store half %y, half addrspace(1)* %out
314 ret void
315 }
1919
2020 ; GCN-LABEL: {{^}}ldexp_f16_imm_a
2121 ; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
22 ; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 0x4000, v[[B_I32]]
22 ; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[B_I32]]
2323 ; GCN: buffer_store_short v[[R_F16]]
2424 define void @ldexp_f16_imm_a(
2525 half addrspace(1)* %r,
5050 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
5151 ; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
5252 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
53 ; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]]
53 ; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
5454 ; GCN: buffer_store_short v[[R_F16]]
5555 ; GCN: s_endpgm
5656 define void @maxnum_f16_imm_b(
107107 ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
108108 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
109109 ; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
110 ; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]]
110 ; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
111111 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
112112 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
113113 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
134134 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
135135 ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
136136 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
137 ; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]]
137 ; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
138138 ; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
139139 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
140140 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
5050 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
5151 ; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
5252 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
53 ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]]
53 ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
5454 ; GCN: buffer_store_short v[[R_F16]]
5555 ; GCN: s_endpgm
5656 define void @minnum_f16_imm_b(
107107 ; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
108108 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
109109 ; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
110 ; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]]
110 ; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
111111 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
112112 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
113113 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
134134 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
135135 ; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
136136 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
137 ; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]]
137 ; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
138138 ; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
139139 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
140140 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
4444 ; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
4545 ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
4646 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
47 ; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}}
48 ; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
47 ; VI: v_cmp_lt_f16_e32 vcc, 0.5, v[[B_F16]]
4948 ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
5049 ; GCN: buffer_store_short v[[R_F16]]
5150 ; GCN: s_endpgm
7574 ; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
7675 ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
7776 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
78 ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
79 ; VI: v_cmp_gt_f16_e32 vcc, v[[B_F16]], v[[A_F16]]
77 ; VI: v_cmp_gt_f16_e32 vcc, 0.5, v[[A_F16]]
8078 ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
8179 ; GCN: buffer_store_short v[[R_F16]]
8280 ; GCN: s_endpgm
0 # RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s
1 --- |
2 define void @add_f32_1.0_one_f16_use() #0 {
3 %f16.val0 = load volatile half, half addrspace(1)* undef
4 %f16.val1 = load volatile half, half addrspace(1)* undef
5 %f32.val = load volatile float, float addrspace(1)* undef
6 %f16.add0 = fadd half %f16.val0, 0xH3C00
7 %f32.add = fadd float %f32.val, 1.000000e+00
8 store volatile half %f16.add0, half addrspace(1)* undef
9 store volatile float %f32.add, float addrspace(1)* undef
10 ret void
11 }
12
13 define void @add_f32_1.0_multi_f16_use() #0 {
14 %f16.val0 = load volatile half, half addrspace(1)* undef
15 %f16.val1 = load volatile half, half addrspace(1)* undef
16 %f32.val = load volatile float, float addrspace(1)* undef
17 %f16.add0 = fadd half %f16.val0, 0xH3C00
18 %f32.add = fadd float %f32.val, 1.000000e+00
19 store volatile half %f16.add0, half addrspace(1)* undef
20 store volatile float %f32.add, float addrspace(1)* undef
21 ret void
22 }
23
24 define void @add_f32_1.0_one_f32_use_one_f16_use () #0 {
25 %f16.val0 = load volatile half, half addrspace(1)* undef
26 %f16.val1 = load volatile half, half addrspace(1)* undef
27 %f32.val = load volatile float, float addrspace(1)* undef
28 %f16.add0 = fadd half %f16.val0, 0xH3C00
29 %f32.add = fadd float %f32.val, 1.000000e+00
30 store volatile half %f16.add0, half addrspace(1)* undef
31 store volatile float %f32.add, float addrspace(1)* undef
32 ret void
33 }
34
35 define void @add_f32_1.0_one_f32_use_multi_f16_use () #0 {
36 %f16.val0 = load volatile half, half addrspace(1)* undef
37 %f16.val1 = load volatile half, half addrspace(1)* undef
38 %f32.val = load volatile float, float addrspace(1)* undef
39 %f16.add0 = fadd half %f16.val0, 0xH3C00
40 %f16.add1 = fadd half %f16.val1, 0xH3C00
41 %f32.add = fadd float %f32.val, 1.000000e+00
42 store volatile half %f16.add0, half addrspace(1)* undef
43 store volatile half %f16.add1, half addrspace(1)* undef
44 store volatile float %f32.add, float addrspace(1)* undef
45 ret void
46 }
47
48 define void @add_i32_1_multi_f16_use() #0 {
49 %f16.val0 = load volatile half, half addrspace(1)* undef
50 %f16.val1 = load volatile half, half addrspace(1)* undef
51 %f16.add0 = fadd half %f16.val0, 0xH0001
52 %f16.add1 = fadd half %f16.val1, 0xH0001
53 store volatile half %f16.add0, half addrspace(1)* undef
54 store volatile half %f16.add1, half addrspace(1)* undef
55 ret void
56 }
57
58 define void @add_i32_m2_one_f32_use_multi_f16_use () #0 {
59 %f16.val0 = load volatile half, half addrspace(1)* undef
60 %f16.val1 = load volatile half, half addrspace(1)* undef
61 %f32.val = load volatile float, float addrspace(1)* undef
62 %f16.add0 = fadd half %f16.val0, 0xHFFFE
63 %f16.add1 = fadd half %f16.val1, 0xHFFFE
64 %f32.add = fadd float %f32.val, 0xffffffffc0000000
65 store volatile half %f16.add0, half addrspace(1)* undef
66 store volatile half %f16.add1, half addrspace(1)* undef
67 store volatile float %f32.add, float addrspace(1)* undef
68 ret void
69 }
70
71 define void @add_f16_1.0_multi_f32_use() #0 {
72 %f32.val0 = load volatile float, float addrspace(1)* undef
73 %f32.val1 = load volatile float, float addrspace(1)* undef
74 %f32.val = load volatile float, float addrspace(1)* undef
75 %f32.add0 = fadd float %f32.val0, 1.0
76 %f32.add1 = fadd float %f32.val1, 1.0
77 store volatile float %f32.add0, float addrspace(1)* undef
78 store volatile float %f32.add1, float addrspace(1)* undef
79 ret void
80 }
81
82 define void @add_f16_1.0_other_high_bits_multi_f16_use() #0 {
83 %f16.val0 = load volatile half, half addrspace(1)* undef
84 %f16.val1 = load volatile half, half addrspace(1)* undef
85 %f32.val = load volatile half, half addrspace(1)* undef
86 %f16.add0 = fadd half %f16.val0, 0xH3C00
87 %f32.add = fadd half %f32.val, 1.000000e+00
88 store volatile half %f16.add0, half addrspace(1)* undef
89 store volatile half %f32.add, half addrspace(1)* undef
90 ret void
91 }
92
93 define void @add_f16_1.0_other_high_bits_use_f16_f32() #0 {
94 %f16.val0 = load volatile half, half addrspace(1)* undef
95 %f16.val1 = load volatile half, half addrspace(1)* undef
96 %f32.val = load volatile half, half addrspace(1)* undef
97 %f16.add0 = fadd half %f16.val0, 0xH3C00
98 %f32.add = fadd half %f32.val, 1.000000e+00
99 store volatile half %f16.add0, half addrspace(1)* undef
100 store volatile half %f32.add, half addrspace(1)* undef
101 ret void
102 }
103
104 attributes #0 = { nounwind }
105
106 ...
107 ---
108
109 # f32 1.0 with a single use should be folded as the low 32-bits of a
110 # literal constant.
111
112 # CHECK-LABEL: name: add_f32_1.0_one_f16_use
113 # CHECK: %13 = V_ADD_F16_e32 1065353216, killed %11, implicit %exec
114
115 name: add_f32_1.0_one_f16_use
116 alignment: 0
117 exposesReturnsTwice: false
118 legalized: false
119 regBankSelected: false
120 selected: false
121 tracksRegLiveness: true
122 registers:
123 - { id: 0, class: sreg_64 }
124 - { id: 1, class: sreg_32 }
125 - { id: 2, class: sgpr_32 }
126 - { id: 3, class: vgpr_32 }
127 - { id: 4, class: sreg_64 }
128 - { id: 5, class: sreg_32 }
129 - { id: 6, class: sreg_64 }
130 - { id: 7, class: sreg_32 }
131 - { id: 8, class: sreg_32 }
132 - { id: 9, class: sreg_32 }
133 - { id: 10, class: sreg_128 }
134 - { id: 11, class: vgpr_32 }
135 - { id: 12, class: vgpr_32 }
136 - { id: 13, class: vgpr_32 }
137 frameInfo:
138 isFrameAddressTaken: false
139 isReturnAddressTaken: false
140 hasStackMap: false
141 hasPatchPoint: false
142 stackSize: 0
143 offsetAdjustment: 0
144 maxAlignment: 0
145 adjustsStack: false
146 hasCalls: false
147 maxCallFrameSize: 0
148 hasOpaqueSPAdjustment: false
149 hasVAStart: false
150 hasMustTailInVarArgFunc: false
151 body: |
152 bb.0 (%ir-block.0):
153 %4 = IMPLICIT_DEF
154 %5 = COPY %4.sub1
155 %6 = IMPLICIT_DEF
156 %7 = COPY %6.sub0
157 %8 = S_MOV_B32 61440
158 %9 = S_MOV_B32 -1
159 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
160 %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
161 %12 = V_MOV_B32_e32 1065353216, implicit %exec
162 %13 = V_ADD_F16_e64 0, killed %11, 0, %12, 0, 0, implicit %exec
163 BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
164 S_ENDPGM
165
166 ...
167 ---
168 # Materialized f32 inline immediate should not be folded into the f16
169 # operands
170
171 # CHECK-LABEL: name: add_f32_1.0_multi_f16_use
172 # CHECK: %13 = V_MOV_B32_e32 1065353216, implicit %exec
173 # CHECK: %14 = V_ADD_F16_e32 %13, killed %11, implicit %exec
174 # CHECK: %15 = V_ADD_F16_e32 killed %13, killed %12, implicit %exec
175
176
177 name: add_f32_1.0_multi_f16_use
178 alignment: 0
179 exposesReturnsTwice: false
180 legalized: false
181 regBankSelected: false
182 selected: false
183 tracksRegLiveness: true
184 registers:
185 - { id: 0, class: sreg_64 }
186 - { id: 1, class: sreg_32 }
187 - { id: 2, class: sgpr_32 }
188 - { id: 3, class: vgpr_32 }
189 - { id: 4, class: sreg_64 }
190 - { id: 5, class: sreg_32 }
191 - { id: 6, class: sreg_64 }
192 - { id: 7, class: sreg_32 }
193 - { id: 8, class: sreg_32 }
194 - { id: 9, class: sreg_32 }
195 - { id: 10, class: sreg_128 }
196 - { id: 11, class: vgpr_32 }
197 - { id: 12, class: vgpr_32 }
198 - { id: 13, class: vgpr_32 }
199 - { id: 14, class: vgpr_32 }
200 - { id: 15, class: vgpr_32 }
201 frameInfo:
202 isFrameAddressTaken: false
203 isReturnAddressTaken: false
204 hasStackMap: false
205 hasPatchPoint: false
206 stackSize: 0
207 offsetAdjustment: 0
208 maxAlignment: 0
209 adjustsStack: false
210 hasCalls: false
211 maxCallFrameSize: 0
212 hasOpaqueSPAdjustment: false
213 hasVAStart: false
214 hasMustTailInVarArgFunc: false
215 body: |
216 bb.0 (%ir-block.0):
217 %4 = IMPLICIT_DEF
218 %5 = COPY %4.sub1
219 %6 = IMPLICIT_DEF
220 %7 = COPY %6.sub0
221 %8 = S_MOV_B32 61440
222 %9 = S_MOV_B32 -1
223 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
224 %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
225 %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
226 %13 = V_MOV_B32_e32 1065353216, implicit %exec
227 %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit %exec
228 %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit %exec
229 BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
230 BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
231 S_ENDPGM
232
233 ...
234 ---
235
236 # f32 1.0 should be folded into the single f32 use as an inline
237 # immediate, and folded into the single f16 use as a literal constant
238
239 # CHECK-LABEL: name: add_f32_1.0_one_f32_use_one_f16_use
240 # CHECK: %15 = V_ADD_F16_e32 1065353216, %11, implicit %exec
241 # CHECK: %16 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec
242
243 name: add_f32_1.0_one_f32_use_one_f16_use
244 alignment: 0
245 exposesReturnsTwice: false
246 legalized: false
247 regBankSelected: false
248 selected: false
249 tracksRegLiveness: true
250 registers:
251 - { id: 0, class: sreg_64 }
252 - { id: 1, class: sreg_32 }
253 - { id: 2, class: sgpr_32 }
254 - { id: 3, class: vgpr_32 }
255 - { id: 4, class: sreg_64 }
256 - { id: 5, class: sreg_32 }
257 - { id: 6, class: sreg_64 }
258 - { id: 7, class: sreg_32 }
259 - { id: 8, class: sreg_32 }
260 - { id: 9, class: sreg_32 }
261 - { id: 10, class: sreg_128 }
262 - { id: 11, class: vgpr_32 }
263 - { id: 12, class: vgpr_32 }
264 - { id: 13, class: vgpr_32 }
265 - { id: 14, class: vgpr_32 }
266 - { id: 15, class: vgpr_32 }
267 - { id: 16, class: vgpr_32 }
268 frameInfo:
269 isFrameAddressTaken: false
270 isReturnAddressTaken: false
271 hasStackMap: false
272 hasPatchPoint: false
273 stackSize: 0
274 offsetAdjustment: 0
275 maxAlignment: 0
276 adjustsStack: false
277 hasCalls: false
278 maxCallFrameSize: 0
279 hasOpaqueSPAdjustment: false
280 hasVAStart: false
281 hasMustTailInVarArgFunc: false
282 body: |
283 bb.0 (%ir-block.0):
284 %4 = IMPLICIT_DEF
285 %5 = COPY %4.sub1
286 %6 = IMPLICIT_DEF
287 %7 = COPY %6.sub0
288 %8 = S_MOV_B32 61440
289 %9 = S_MOV_B32 -1
290 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
291 %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
292 %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
293 %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
294 %14 = V_MOV_B32_e32 1065353216, implicit %exec
295 %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
296 %16 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
297 BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
298 BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
299 S_ENDPGM
300
301 ...
302 ---
303
304 # f32 1.0 should be folded for the single f32 use as an inline
305 # constant, and not folded as a multi-use literal for the f16 cases
306
307 # CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use
308 # CHECK: %14 = V_MOV_B32_e32 1065353216, implicit %exec
309 # CHECK: %15 = V_ADD_F16_e32 %14, %11, implicit %exec
310 # CHECK: %16 = V_ADD_F16_e32 %14, %12, implicit %exec
311 # CHECK: %17 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec
312
313 name: add_f32_1.0_one_f32_use_multi_f16_use
314 alignment: 0
315 exposesReturnsTwice: false
316 legalized: false
317 regBankSelected: false
318 selected: false
319 tracksRegLiveness: true
320 registers:
321 - { id: 0, class: sreg_64 }
322 - { id: 1, class: sreg_32 }
323 - { id: 2, class: sgpr_32 }
324 - { id: 3, class: vgpr_32 }
325 - { id: 4, class: sreg_64 }
326 - { id: 5, class: sreg_32 }
327 - { id: 6, class: sreg_64 }
328 - { id: 7, class: sreg_32 }
329 - { id: 8, class: sreg_32 }
330 - { id: 9, class: sreg_32 }
331 - { id: 10, class: sreg_128 }
332 - { id: 11, class: vgpr_32 }
333 - { id: 12, class: vgpr_32 }
334 - { id: 13, class: vgpr_32 }
335 - { id: 14, class: vgpr_32 }
336 - { id: 15, class: vgpr_32 }
337 - { id: 16, class: vgpr_32 }
338 - { id: 17, class: vgpr_32 }
339 frameInfo:
340 isFrameAddressTaken: false
341 isReturnAddressTaken: false
342 hasStackMap: false
343 hasPatchPoint: false
344 stackSize: 0
345 offsetAdjustment: 0
346 maxAlignment: 0
347 adjustsStack: false
348 hasCalls: false
349 maxCallFrameSize: 0
350 hasOpaqueSPAdjustment: false
351 hasVAStart: false
352 hasMustTailInVarArgFunc: false
353 body: |
354 bb.0 (%ir-block.0):
355 %4 = IMPLICIT_DEF
356 %5 = COPY %4.sub1
357 %6 = IMPLICIT_DEF
358 %7 = COPY %6.sub0
359 %8 = S_MOV_B32 61440
360 %9 = S_MOV_B32 -1
361 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
362 %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
363 %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
364 %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
365 %14 = V_MOV_B32_e32 1065353216, implicit %exec
366 %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
367 %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit %exec
368 %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
369 BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
370 BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
371 BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
372 S_ENDPGM
373
374 ...
375 ---
376 # CHECK-LABEL: name: add_i32_1_multi_f16_use
377 # CHECK: %13 = V_MOV_B32_e32 1, implicit %exec
378 # CHECK: %14 = V_ADD_F16_e32 1, killed %11, implicit %exec
379 # CHECK: %15 = V_ADD_F16_e32 1, killed %12, implicit %exec
380
381
382 name: add_i32_1_multi_f16_use
383 alignment: 0
384 exposesReturnsTwice: false
385 legalized: false
386 regBankSelected: false
387 selected: false
388 tracksRegLiveness: true
389 registers:
390 - { id: 0, class: sreg_64 }
391 - { id: 1, class: sreg_32 }
392 - { id: 2, class: sgpr_32 }
393 - { id: 3, class: vgpr_32 }
394 - { id: 4, class: sreg_64 }
395 - { id: 5, class: sreg_32 }
396 - { id: 6, class: sreg_64 }
397 - { id: 7, class: sreg_32 }
398 - { id: 8, class: sreg_32 }
399 - { id: 9, class: sreg_32 }
400 - { id: 10, class: sreg_128 }
401 - { id: 11, class: vgpr_32 }
402 - { id: 12, class: vgpr_32 }
403 - { id: 13, class: vgpr_32 }
404 - { id: 14, class: vgpr_32 }
405 - { id: 15, class: vgpr_32 }
406 frameInfo:
407 isFrameAddressTaken: false
408 isReturnAddressTaken: false
409 hasStackMap: false
410 hasPatchPoint: false
411 stackSize: 0
412 offsetAdjustment: 0
413 maxAlignment: 0
414 adjustsStack: false
415 hasCalls: false
416 maxCallFrameSize: 0
417 hasOpaqueSPAdjustment: false
418 hasVAStart: false
419 hasMustTailInVarArgFunc: false
420 body: |
421 bb.0 (%ir-block.0):
422 %4 = IMPLICIT_DEF
423 %5 = COPY %4.sub1
424 %6 = IMPLICIT_DEF
425 %7 = COPY %6.sub0
426 %8 = S_MOV_B32 61440
427 %9 = S_MOV_B32 -1
428 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
429 %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
430 %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
431 %13 = V_MOV_B32_e32 1, implicit %exec
432 %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit %exec
433 %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit %exec
434 BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
435 BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
436 S_ENDPGM
437
438 ...
439 ---
440
441 # CHECK-LABEL: name: add_i32_m2_one_f32_use_multi_f16_use
442 # CHECK: %14 = V_MOV_B32_e32 -2, implicit %exec
443 # CHECK: %15 = V_ADD_F16_e32 -2, %11, implicit %exec
444 # CHECK: %16 = V_ADD_F16_e32 -2, %12, implicit %exec
445 # CHECK: %17 = V_ADD_F32_e32 -2, killed %13, implicit %exec
446
447 name: add_i32_m2_one_f32_use_multi_f16_use
448 alignment: 0
449 exposesReturnsTwice: false
450 legalized: false
451 regBankSelected: false
452 selected: false
453 tracksRegLiveness: true
454 registers:
455 - { id: 0, class: sreg_64 }
456 - { id: 1, class: sreg_32 }
457 - { id: 2, class: sgpr_32 }
458 - { id: 3, class: vgpr_32 }
459 - { id: 4, class: sreg_64 }
460 - { id: 5, class: sreg_32 }
461 - { id: 6, class: sreg_64 }
462 - { id: 7, class: sreg_32 }
463 - { id: 8, class: sreg_32 }
464 - { id: 9, class: sreg_32 }
465 - { id: 10, class: sreg_128 }
466 - { id: 11, class: vgpr_32 }
467 - { id: 12, class: vgpr_32 }
468 - { id: 13, class: vgpr_32 }
469 - { id: 14, class: vgpr_32 }
470 - { id: 15, class: vgpr_32 }
471 - { id: 16, class: vgpr_32 }
472 - { id: 17, class: vgpr_32 }
473 frameInfo:
474 isFrameAddressTaken: false
475 isReturnAddressTaken: false
476 hasStackMap: false
477 hasPatchPoint: false
478 stackSize: 0
479 offsetAdjustment: 0
480 maxAlignment: 0
481 adjustsStack: false
482 hasCalls: false
483 maxCallFrameSize: 0
484 hasOpaqueSPAdjustment: false
485 hasVAStart: false
486 hasMustTailInVarArgFunc: false
487 body: |
488 bb.0 (%ir-block.0):
489 %4 = IMPLICIT_DEF
490 %5 = COPY %4.sub1
491 %6 = IMPLICIT_DEF
492 %7 = COPY %6.sub0
493 %8 = S_MOV_B32 61440
494 %9 = S_MOV_B32 -1
495 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
496 %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
497 %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
498 %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
499 %14 = V_MOV_B32_e32 -2, implicit %exec
500 %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
501 %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit %exec
502 %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
503 BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
504 BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
505 BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
506 S_ENDPGM
507
508 ...
509 ---
510
511 # f16 1.0 (0x3c00) is not an f32 inline immediate, so the materialized value
512 # should stay in the register and not be folded into the multiple f32 uses
513
514 # CHECK-LABEL: name: add_f16_1.0_multi_f32_use
515 # CHECK: %13 = V_MOV_B32_e32 15360, implicit %exec
516 # CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec
517 # CHECK: %15 = V_ADD_F32_e32 %13, %12, implicit %exec
518
519 name: add_f16_1.0_multi_f32_use
520 alignment: 0
521 exposesReturnsTwice: false
522 legalized: false
523 regBankSelected: false
524 selected: false
525 tracksRegLiveness: true
526 registers:
527 - { id: 0, class: sreg_64 }
528 - { id: 1, class: sreg_32 }
529 - { id: 2, class: sgpr_32 }
530 - { id: 3, class: vgpr_32 }
531 - { id: 4, class: sreg_64 }
532 - { id: 5, class: sreg_32 }
533 - { id: 6, class: sreg_64 }
534 - { id: 7, class: sreg_32 }
535 - { id: 8, class: sreg_32 }
536 - { id: 9, class: sreg_32 }
537 - { id: 10, class: sreg_128 }
538 - { id: 11, class: vgpr_32 }
539 - { id: 12, class: vgpr_32 }
540 - { id: 13, class: vgpr_32 }
541 - { id: 14, class: vgpr_32 }
542 - { id: 15, class: vgpr_32 }
543 frameInfo:
544 isFrameAddressTaken: false
545 isReturnAddressTaken: false
546 hasStackMap: false
547 hasPatchPoint: false
548 stackSize: 0
549 offsetAdjustment: 0
550 maxAlignment: 0
551 adjustsStack: false
552 hasCalls: false
553 maxCallFrameSize: 0
554 hasOpaqueSPAdjustment: false
555 hasVAStart: false
556 hasMustTailInVarArgFunc: false
557 body: |
558 bb.0 (%ir-block.0):
559 %4 = IMPLICIT_DEF
560 %5 = COPY %4.sub1
561 %6 = IMPLICIT_DEF
562 %7 = COPY %6.sub0
563 %8 = S_MOV_B32 61440
564 %9 = S_MOV_B32 -1
565 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
566 %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
567 %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
568 %13 = V_MOV_B32_e32 15360, implicit %exec
569 %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit %exec
570 %15 = V_ADD_F32_e64 0, %12, 0, %13, 0, 0, implicit %exec
571 BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
572 BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
573 S_ENDPGM
574
575 ...
576 ---
577
578 # The low 16 bits are an inline immediate, but the high bits are junk
579 # FIXME: Should be able to fold this
580
581 # CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use
582 # CHECK: %13 = V_MOV_B32_e32 80886784, implicit %exec
583 # CHECK: %14 = V_ADD_F16_e32 %13, %11, implicit %exec
584 # CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec
585
586 name: add_f16_1.0_other_high_bits_multi_f16_use
587 alignment: 0
588 exposesReturnsTwice: false
589 legalized: false
590 regBankSelected: false
591 selected: false
592 tracksRegLiveness: true
593 registers:
594 - { id: 0, class: sreg_64 }
595 - { id: 1, class: sreg_32 }
596 - { id: 2, class: sgpr_32 }
597 - { id: 3, class: vgpr_32 }
598 - { id: 4, class: sreg_64 }
599 - { id: 5, class: sreg_32 }
600 - { id: 6, class: sreg_64 }
601 - { id: 7, class: sreg_32 }
602 - { id: 8, class: sreg_32 }
603 - { id: 9, class: sreg_32 }
604 - { id: 10, class: sreg_128 }
605 - { id: 11, class: vgpr_32 }
606 - { id: 12, class: vgpr_32 }
607 - { id: 13, class: vgpr_32 }
608 - { id: 14, class: vgpr_32 }
609 - { id: 15, class: vgpr_32 }
610 frameInfo:
611 isFrameAddressTaken: false
612 isReturnAddressTaken: false
613 hasStackMap: false
614 hasPatchPoint: false
615 stackSize: 0
616 offsetAdjustment: 0
617 maxAlignment: 0
618 adjustsStack: false
619 hasCalls: false
620 maxCallFrameSize: 0
621 hasOpaqueSPAdjustment: false
622 hasVAStart: false
623 hasMustTailInVarArgFunc: false
624 body: |
625 bb.0 (%ir-block.0):
626 %4 = IMPLICIT_DEF
627 %5 = COPY %4.sub1
628 %6 = IMPLICIT_DEF
629 %7 = COPY %6.sub0
630 %8 = S_MOV_B32 61440
631 %9 = S_MOV_B32 -1
632 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
633 %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
634 %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
635 %13 = V_MOV_B32_e32 80886784, implicit %exec
636 %14 = V_ADD_F16_e64 0, %11, 0, %13, 0, 0, implicit %exec
637 %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit %exec
638 BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
639 BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
640 S_ENDPGM
641
642 ...
643 ---
644
645 # FIXME: Should fold inline immediate into f16 and literal use into
646 # f32 instruction.
647
648 # CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32
649 # CHECK: %13 = V_MOV_B32_e32 305413120, implicit %exec
650 # CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec
651 # CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec
652 name: add_f16_1.0_other_high_bits_use_f16_f32
653 alignment: 0
654 exposesReturnsTwice: false
655 legalized: false
656 regBankSelected: false
657 selected: false
658 tracksRegLiveness: true
659 registers:
660 - { id: 0, class: sreg_64 }
661 - { id: 1, class: sreg_32 }
662 - { id: 2, class: sgpr_32 }
663 - { id: 3, class: vgpr_32 }
664 - { id: 4, class: sreg_64 }
665 - { id: 5, class: sreg_32 }
666 - { id: 6, class: sreg_64 }
667 - { id: 7, class: sreg_32 }
668 - { id: 8, class: sreg_32 }
669 - { id: 9, class: sreg_32 }
670 - { id: 10, class: sreg_128 }
671 - { id: 11, class: vgpr_32 }
672 - { id: 12, class: vgpr_32 }
673 - { id: 13, class: vgpr_32 }
674 - { id: 14, class: vgpr_32 }
675 - { id: 15, class: vgpr_32 }
676 frameInfo:
677 isFrameAddressTaken: false
678 isReturnAddressTaken: false
679 hasStackMap: false
680 hasPatchPoint: false
681 stackSize: 0
682 offsetAdjustment: 0
683 maxAlignment: 0
684 adjustsStack: false
685 hasCalls: false
686 maxCallFrameSize: 0
687 hasOpaqueSPAdjustment: false
688 hasVAStart: false
689 hasMustTailInVarArgFunc: false
690 body: |
691 bb.0 (%ir-block.0):
692 %4 = IMPLICIT_DEF
693 %5 = COPY %4.sub1
694 %6 = IMPLICIT_DEF
695 %7 = COPY %6.sub0
696 %8 = S_MOV_B32 61440
697 %9 = S_MOV_B32 -1
698 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
699 %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
700 %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
701 %13 = V_MOV_B32_e32 305413120, implicit %exec
702 %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit %exec
703 %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit %exec
704 BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
705 BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
706 S_ENDPGM
707
708 ...
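The raw decimal immediates in the CHECK lines of this MIR test are easier to follow in hex. A minimal sketch (plain arithmetic, no LLVM APIs) confirming the bit patterns they correspond to:

#include <cstdio>

int main() {
  // f32 1.0, encoded directly as a 32-bit operand.
  static_assert(1065353216u == 0x3F800000u, "f32 1.0");
  // f16 1.0, materialized in the low half of a 32-bit register.
  static_assert(15360u == 0x3C00u, "f16 1.0");
  // f16 1.0 in the low half with junk high bits (the *_other_high_bits tests).
  static_assert(80886784u == 0x04D23C00u, "low half 0x3c00, high half 0x04d2");
  static_assert(305413120u == 0x12343C00u, "low half 0x3c00, high half 0x1234");
  std::puts("immediate bit patterns match");
  return 0;
}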
0 // XFAIL: *
1 // RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=NOVI %s
2
3 v_add_f16 v1, 0xfffff, v2
4 // NOVI: 19: error: invalid operand for instruction
5
6 v_add_f16 v1, 0x10000, v2
7 // NOVI: 19: error: invalid operand for instruction
8
9 v_add_f16 v1, v2, -0.0
10 v_add_f16 v1, v2, 1
11
12
13
14 // FIXME: Should give truncate error
15 v_add_f16 v1, -32769, v2
16 v_add_f16 v1, 65536, v2
17
18 v_add_f32 v1, 4294967296, v2
19 v_add_f32 v1, 0x0000000100000000, v2
20 v_and_b32 v1, 0x0000000100000000, v2
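The FIXME above notes that the out-of-range 16-bit literals are currently accepted rather than rejected. What the values would look like after truncation to 16 bits is sketched below; the truncation is an assumption about current assembler behavior, since the test only pins down that no error is emitted:

#include <cstdint>
#include <cstdio>

int main() {
  // Keep only the low 16 bits of the out-of-range literals from the FIXME cases.
  std::printf("-32769 -> 0x%04x\n", (unsigned)(uint16_t)(int32_t)-32769); // 0x7fff
  std::printf(" 65536 -> 0x%04x\n", (unsigned)(uint16_t)(uint32_t)65536); // 0x0000
  return 0;
}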
0 // RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI %s
1
2 v_add_f16 v1, 0, v2
3 // VI: v_add_f16_e32 v1, 0, v2 ; encoding: [0x80,0x04,0x02,0x3e]
4
5 v_add_f16 v1, 0.0, v2
6 // VI: v_add_f16_e32 v1, 0, v2 ; encoding: [0x80,0x04,0x02,0x3e]
7
8 v_add_f16 v1, v2, 0
9 // VI: v_add_f16_e64 v1, v2, 0 ; encoding: [0x01,0x00,0x1f,0xd1,0x02,0x01,0x01,0x00]
10
11 v_add_f16 v1, v2, 0.0
12 // VI: v_add_f16_e64 v1, v2, 0 ; encoding: [0x01,0x00,0x1f,0xd1,0x02,0x01,0x01,0x00]
13
14 v_add_f16 v1, -0.0, v2
15 // VI: v_add_f16_e32 v1, 0x8000, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x00,0x80,0x00,0x00]
16
17 v_add_f16 v1, 1.0, v2
18 // VI: v_add_f16_e32 v1, 1.0, v2 ; encoding: [0xf2,0x04,0x02,0x3e]
19
20 v_add_f16 v1, -1.0, v2
21 // VI: v_add_f16_e32 v1, -1.0, v2 ; encoding: [0xf3,0x04,0x02,0x3e]
22
23 v_add_f16 v1, -0.5, v2
24 // VI: v_add_f16_e32 v1, -0.5, v2 ; encoding: [0xf1,0x04,0x02,0x3e]
25
26 v_add_f16 v1, 0.5, v2
27 // VI: v_add_f16_e32 v1, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x3e]
28
29 v_add_f16 v1, 2.0, v2
30 // VI: v_add_f16_e32 v1, 2.0, v2 ; encoding: [0xf4,0x04,0x02,0x3e]
31
32 v_add_f16 v1, -2.0, v2
33 // VI: v_add_f16_e32 v1, -2.0, v2 ; encoding: [0xf5,0x04,0x02,0x3e]
34
35 v_add_f16 v1, 4.0, v2
36 // VI: v_add_f16_e32 v1, 4.0, v2 ; encoding: [0xf6,0x04,0x02,0x3e]
37
38 v_add_f16 v1, -4.0, v2
39 // VI: v_add_f16_e32 v1, -4.0, v2 ; encoding: [0xf7,0x04,0x02,0x3e]
40
41 v_add_f16 v1, 0.15915494, v2
42 // VI: v_add_f16_e32 v1, 0.15915494, v2 ; encoding: [0xf8,0x04,0x02,0x3e]
43
44 v_add_f16 v1, -0.15915494, v2
45 // VI: v_add_f16_e32 v1, 0xb118, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x18,0xb1,0x00,0x00]
46
47 v_add_f16 v1, -1, v2
48 // VI: v_add_f16_e32 v1, -1, v2 ; encoding: [0xc1,0x04,0x02,0x3e]
49
50
51 v_add_f16 v1, -2, v2
52 // VI: v_add_f16_e32 v1, -2, v2 ; encoding: [0xc2,0x04,0x02,0x3e]
53
54 v_add_f16 v1, -3, v2
55 // VI: v_add_f16_e32 v1, -3, v2 ; encoding: [0xc3,0x04,0x02,0x3e]
56
57 v_add_f16 v1, -16, v2
58 // VI: v_add_f16_e32 v1, -16, v2 ; encoding: [0xd0,0x04,0x02,0x3e]
59
60 v_add_f16 v1, 1, v2
61 // VI: v_add_f16_e32 v1, 1, v2 ; encoding: [0x81,0x04,0x02,0x3e]
62
63 v_add_f16 v1, 2, v2
64 // VI: v_add_f16_e32 v1, 2, v2 ; encoding: [0x82,0x04,0x02,0x3e]
65
66 v_add_f16 v1, 3, v2
67 // VI: v_add_f16_e32 v1, 3, v2 ; encoding: [0x83,0x04,0x02,0x3e]
68
69 v_add_f16 v1, 4, v2
70 // VI: v_add_f16_e32 v1, 4, v2 ; encoding: [0x84,0x04,0x02,0x3e]
71
72 v_add_f16 v1, 15, v2
73 // VI: v_add_f16_e32 v1, 15, v2 ; encoding: [0x8f,0x04,0x02,0x3e]
74
75 v_add_f16 v1, 16, v2
76 // VI: v_add_f16_e32 v1, 16, v2 ; encoding: [0x90,0x04,0x02,0x3e]
77
78 v_add_f16 v1, 63, v2
79 // VI: v_add_f16_e32 v1, 63, v2 ; encoding: [0xbf,0x04,0x02,0x3e]
80
81 v_add_f16 v1, 64, v2
82 // VI: v_add_f16_e32 v1, 64, v2 ; encoding: [0xc0,0x04,0x02,0x3e]
83
84 v_add_f16 v1, 0x0001, v2
85 // VI: v_add_f16_e32 v1, 1, v2 ; encoding: [0x81,0x04,0x02,0x3e]
86
87 v_add_f16 v1, 0xffff, v2
88 // VI: v_add_f16_e32 v1, -1, v2 ; encoding: [0xc1,0x04,0x02,0x3e]
89
90 v_add_f16 v1, -17, v2
91 // VI: v_add_f16_e32 v1, 0xffef, v2 ; encoding: [0xff,0x04,0x02,0x3e,0xef,0xff,0x00,0x00]
92
93 v_add_f16 v1, 65, v2
94 // VI: v_add_f16_e32 v1, 0x41, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x41,0x00,0x00,0x00]
95
96 v_add_f16 v1, 0x3c00, v2
97 // VI: v_add_f16_e32 v1, 1.0, v2 ; encoding: [0xf2,0x04,0x02,0x3e]
98
99 v_add_f16 v1, 0xbc00, v2
100 // VI: v_add_f16_e32 v1, -1.0, v2 ; encoding: [0xf3,0x04,0x02,0x3e]
101
102 v_add_f16 v1, 0x3800, v2
103 // VI: v_add_f16_e32 v1, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x3e]
104
105 v_add_f16 v1, 0xb800, v2
106 // VI: v_add_f16_e32 v1, -0.5, v2 ; encoding: [0xf1,0x04,0x02,0x3e]
107
108 v_add_f16 v1, 0x4000, v2
109 // VI: v_add_f16_e32 v1, 2.0, v2 ; encoding: [0xf4,0x04,0x02,0x3e]
110
111 v_add_f16 v1, 0xc000, v2
112 // VI: v_add_f16_e32 v1, -2.0, v2 ; encoding: [0xf5,0x04,0x02,0x3e]
113
114 v_add_f16 v1, 0x4400, v2
115 // VI: v_add_f16_e32 v1, 4.0, v2 ; encoding: [0xf6,0x04,0x02,0x3e]
116
117 v_add_f16 v1, 0xc400, v2
118 // VI: v_add_f16_e32 v1, -4.0, v2 ; encoding: [0xf7,0x04,0x02,0x3e]
119
120 v_add_f16 v1, 0x3118, v2
121 // VI: v_add_f16_e32 v1, 0.15915494, v2 ; encoding: [0xf8,0x04,0x02,0x3e]
122
123 v_add_f16 v1, -32768, v2
124 // VI: v_add_f16_e32 v1, 0x8000, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x00,0x80,0x00,0x00]
125
126 v_add_f16 v1, 32767, v2
127 // VI: v_add_f16_e32 v1, 0x7fff, v2 ; encoding: [0xff,0x04,0x02,0x3e,0xff,0x7f,0x00,0x00]
128
129 v_add_f16 v1, 65535, v2
130 // VI: v_add_f16_e32 v1, -1, v2 ; encoding: [0xc1,0x04,0x02,0x3e]
131
132
133 // K-constant
134 v_madmk_f16 v1, v2, 0x4280, v3
135 // VI: v_madmk_f16_e32 v1, v2, 0x4280, v3 ; encoding: [0x02,0x07,0x02,0x48,0x80,0x42,0x00,0x00]
136
137 v_madmk_f16 v1, v2, 1.0, v3
138 // VI: v_madmk_f16_e32 v1, v2, 0x3c00, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x3c,0x00,0x00]
139
140 v_madmk_f16 v1, v2, 1, v3
141 // VI: v_madmk_f16_e32 v1, v2, 0x1, v3 ; encoding: [0x02,0x07,0x02,0x48,0x01,0x00,0x00,0x00]
142
143 v_madmk_f16 v1, v2, 64.0, v3
144 // VI: v_madmk_f16_e32 v1, v2, 0x5400, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x54,0x00,0x00]
145
146
147 v_add_f16_e32 v1, 64.0, v2
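Taken together, the checks in this file pin down which 16-bit source operands encode as inline constants and which fall back to a trailing literal dword. The sketch below is inferred purely from those checks; the helper name isInlinable16 is made up and this is not the SIInstrInfo implementation:

#include <cassert>
#include <cstdint>

// Signed integers -16..64 and this fixed set of binary16 patterns are the
// operands the checks above encode without a literal dword.
static bool isInlinable16(uint16_t bits) {
  int16_t s = (int16_t)bits;
  if (s >= -16 && s <= 64)
    return true;
  switch (bits) {
  case 0x3800: case 0xB800: // +/-0.5
  case 0x3C00: case 0xBC00: // +/-1.0
  case 0x4000: case 0xC000: // +/-2.0
  case 0x4400: case 0xC400: // +/-4.0
  case 0x3118:              // 1/(2*pi)
    return true;
  default:
    return false;
  }
}

int main() {
  assert(isInlinable16(0x0040));  // 64 is inline
  assert(!isInlinable16(0x0041)); // 65 needs a literal
  assert(isInlinable16(0xFFF0));  // -16 is inline
  assert(!isInlinable16(0xFFEF)); // -17 needs a literal
  assert(isInlinable16(0x3118));  // 1/(2*pi) is inline
  assert(!isInlinable16(0xB118)); // -1/(2*pi) needs a literal
  return 0;
}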
421421
422422 // NOSICI: error: instruction not supported on this GPU
423423 // NOSICI: v_madmk_f16 v1, v2, 64.0, v3
424 // VI: v_madmk_f16_e32 v1, v2, 0x42800000, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x00,0x80,0x42]
424 // VI: v_madmk_f16_e32 v1, v2, 0x5400, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x54,0x00,0x00]
425425 v_madmk_f16 v1, v2, 64.0, v3
426426
427427 // NOSICI: error: instruction not supported on this GPU
428428 // NOSICI: v_madak_f16 v1, v2, v3, 64.0
429 // VI: v_madak_f16_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x00,0x80,0x42]
429 // VI: v_madak_f16_e32 v1, v2, v3, 0x5400 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x54,0x00,0x00]
430430 v_madak_f16 v1, v2, v3, 64.0
431431
432432 // NOSICI: error: instruction not supported on this GPU
0 # RUN: llvm-mc -arch=amdgcn -mcpu=tonga -disassemble -show-encoding %s | FileCheck -check-prefix=VI %s
1
2 # VI: v_add_f16_e32 v1, 0.5, v3 ; encoding: [0xf0,0x06,0x02,0x3e]
3 0xf0 0x06 0x02 0x3e
4
5 # VI: v_add_f16_e32 v1, -0.5, v3 ; encoding: [0xf1,0x06,0x02,0x3e]
6 0xf1 0x06 0x02 0x3e
7
8 # VI: v_add_f16_e32 v1, 1.0, v3 ; encoding: [0xf2,0x06,0x02,0x3e]
9 0xf2 0x06 0x02 0x3e
10
11 # VI: v_add_f16_e32 v1, -1.0, v3 ; encoding: [0xf3,0x06,0x02,0x3e]
12 0xf3 0x06 0x02 0x3e
13
14 # VI: v_add_f16_e32 v1, 2.0, v3 ; encoding: [0xf4,0x06,0x02,0x3e]
15 0xf4 0x06 0x02 0x3e
16
17 # VI: v_add_f16_e32 v1, -2.0, v3 ; encoding: [0xf5,0x06,0x02,0x3e]
18 0xf5 0x06 0x02 0x3e
19
20 # VI: v_add_f16_e32 v1, 4.0, v3 ; encoding: [0xf6,0x06,0x02,0x3e]
21 0xf6 0x06 0x02 0x3e
22
23 # VI: v_add_f16_e32 v1, -4.0, v3 ; encoding: [0xf7,0x06,0x02,0x3e]
24 0xf7 0x06 0x02 0x3e
25
26 # VI: v_add_f16_e32 v1, 0.15915494, v3 ; encoding: [0xf8,0x06,0x02,0x3e]
27 0xf8 0x06 0x02 0x3e
28
29 # VI: v_add_f16_e32 v1, 0x41, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x00]
30 0xff 0x06 0x02 0x3e 0x41 0x00 0x00 0x00
31
32 # VI: v_add_f16_e32 v1, 0x100, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x00,0x01,0x00,0x00]
33 0xff 0x06 0x02 0x3e 0x00 0x01 0x00 0x00
34
35 # non-zero unused bits in constant
36 # VI: v_add_f16_e32 v1, 0x10041, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x01,0x00]
37 0xff 0x06 0x02 0x3e 0x41 0x00 0x01 0x00
38
39 # VI: v_add_f16_e32 v1, 0x1000041, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x01]
40 0xff 0x06 0x02 0x3e 0x41 0x00 0x00 0x01
41
42 # FIXME: This should be able to round trip with literal after instruction
43 # VI: v_add_f16_e32 v1, 0, v3 ; encoding: [0x80,0x06,0x02,0x3e]
44 0xff 0x06 0x02 0x3e 0x00 0x00 0x00 0x00
45
46 # VI: v_madmk_f16_e32 v1, v2, 0x41, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x00,0x00]
47 0x02 0x07 0x02 0x48 0x41 0x00 0x00 0x00
48
49 # VI: v_madmk_f16_e32 v1, v2, 0x10041, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x01,0x00]
50 0x02 0x07 0x02 0x48 0x41 0x00 0x01 0x00
51
52 # VI: v_madmk_f16_e32 v1, v2, 0x1000041, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x00,0x01]
53 0x02 0x07 0x02 0x48 0x41 0x00 0x00 0x01
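In the 8-byte encodings above, the four bytes following the 32-bit instruction word are the literal operand in little-endian order; reassembling them gives the value the disassembler prints:

#include <cstdint>
#include <cstdio>

static uint32_t literalFromBytes(const uint8_t b[4]) {
  return (uint32_t)b[0] | (uint32_t)b[1] << 8 | (uint32_t)b[2] << 16 |
         (uint32_t)b[3] << 24;
}

int main() {
  // e.g. the trailing bytes 0x41 0x00 0x01 0x00 decode to 0x10041.
  const uint8_t lit[4] = {0x41, 0x00, 0x01, 0x00};
  std::printf("0x%x\n", literalFromBytes(lit));
  return 0;
}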
245245 # CHECK: v_cvt_f16_i16_e32 v123, 0x21c2 ; encoding: [0xff,0x74,0xf6,0x7e,0xc2,0x21,0x00,0x00]
246246 0xff 0x74 0xf6 0x7e 0xc2 0x21 0x00 0x00
247247
248 # CHECK: v_cvt_u16_f16_e32 v123, 0x3f200000 ; encoding: [0xff,0x76,0xf6,0x7e,0x00,0x00,0x20,0x3f]
249 0xff 0x76 0xf6 0x7e 0x00 0x00 0x20 0x3f
248 # CHECK: v_cvt_u16_f16_e32 v123, 0x3f20 ; encoding: [0xff,0x76,0xf6,0x7e,0x20,0x3f,0x00,0x00]
249 0xff 0x76 0xf6 0x7e 0x20 0x3f 0x00 0x00