llvm.org GIT mirror llvm / aa57603
AMDGPU: Check if users of fneg can fold mods In multi-use cases this can save a few instructions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293962 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 3 years ago
6 changed file(s) with 570 addition(s) and 84 deletion(s). Raw diff Collapse all Expand all
485485 // Target Information
486486 //===----------------------------------------------------------------------===//
487487
488 LLVM_READNONE
488489 static bool fnegFoldsIntoOp(unsigned Opc) {
489490 switch (Opc) {
490491 case ISD::FADD:
504505 default:
505506 return false;
506507 }
508 }
509
510 /// \p returns true if the operation will definitely need to use a 64-bit
511 /// encoding, and thus will use a VOP3 encoding regardless of the source
512 /// modifiers.
513 LLVM_READONLY
514 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
515 return N->getNumOperands() > 2 || VT == MVT::f64;
516 }
517
518 // Most FP instructions support source modifiers, but this could be refined
519 // slightly.
520 LLVM_READONLY
521 static bool hasSourceMods(const SDNode *N) {
522 if (isa(N))
523 return false;
524
525 switch (N->getOpcode()) {
526 case ISD::CopyToReg:
527 case ISD::SELECT:
528 case ISD::FDIV:
529 case ISD::FREM:
530 case ISD::INLINEASM:
531 case AMDGPUISD::INTERP_P1:
532 case AMDGPUISD::INTERP_P2:
533 case AMDGPUISD::DIV_SCALE:
534 return false;
535 default:
536 return true;
537 }
538 }
539
540 static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) {
541 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
542 // it is truly free to use a source modifier in all cases. If there are
543 // multiple users but for each one will necessitate using VOP3, there will be
544 // a code size increase. Try to avoid increasing code size unless we know it
545 // will save on the instruction count.
546 unsigned NumMayIncreaseSize = 0;
547 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
548
549 // XXX - Should this limit number of uses to check?
550 for (const SDNode *U : N->uses()) {
551 if (!hasSourceMods(U))
552 return false;
553
554 if (!opMustUseVOP3Encoding(U, VT)) {
555 if (++NumMayIncreaseSize > CostThreshold)
556 return false;
557 }
558 }
559
560 return true;
507561 }
508562
509563 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
28532907 // the other uses cannot, give up. This both prevents unprofitable
28542908 // transformations and infinite loops: we won't repeatedly try to fold around
28552909 // a negate that has no 'good' form.
2856 //
2857 // TODO: Check users can fold
2858 if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse())
2859 return SDValue();
2910 if (N0.hasOneUse()) {
2911 // This may be able to fold into the source, but at a code size cost. Don't
2912 // fold if the fold into the user is free.
2913 if (allUsesHaveSourceMods(N, 0))
2914 return SDValue();
2915 } else {
2916 if (fnegFoldsIntoOp(Opc) &&
2917 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
2918 return SDValue();
2919 }
28602920
28612921 SDLoc SL(N);
28622922 switch (Opc) {
2020 ; VI: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
2121 ; VI: v_cndmask_b32_e32
2222 ; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
23 ; VI: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
24 ; VI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
23 ; VI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
24 ; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
2525 define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
2626 %a11 = fadd fast float %y, -1.0
2727 %a12 = call float @llvm.fabs.f32(float %a11)
115115 ; VI: v_cmp_gt_f16_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
116116 ; VI: v_cndmask_b32_e32
117117 ; VI: v_add_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
118 ; VI: v_mul_f16_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
119 ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
120 ; VI-DENORM: v_fma_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
118 ; VI: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
119 ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
120 ; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
121121 define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
122122 %x = bitcast i16 %x.arg to half
123123 %y = bitcast i16 %y.arg to half
None ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s
22
33 ; --------------------------------------------------------------------------------
44 ; fadd tests
5252 ; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
5353 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
5454 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
55 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
56 ; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
57 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
58 ; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
55
56 ; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
57 ; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
58 ; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
59
60 ; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
61 ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[ADD]]
62 ; GCN: buffer_store_dword [[NEG_ADD]]
5963 ; GCN-NEXT: buffer_store_dword [[MUL]]
6064 define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
6165 %tid = call i32 @llvm.amdgcn.workitem.id.x()
252256 ; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
253257 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
254258 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
255 ; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
256 ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
257 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
258 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
259 ; GCN: buffer_store_dword [[MUL]]
259 ; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
260 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
261 ; GCN-NEXT: buffer_store_dword [[MUL0]]
262 ; GCN-NEXT: buffer_store_dword [[MUL1]]
260263 define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
261264 %tid = call i32 @llvm.amdgcn.workitem.id.x()
262265 %tid.ext = sext i32 %tid to i64
440443 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
441444 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
442445 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
443 ; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
444 ; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
445 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]
446
447 ; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
448 ; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
449 ; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]
450
451 ; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
452 ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]
453
446454 ; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
447455 ; GCN-NEXT: buffer_store_dword [[MUL]]
448456 define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
696704 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
697705 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
698706 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
699 ; GCN-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
700 ; GCN-DAG: v_xor_b32_e32 [[NEG_C:v[0-9]+]], 0x80000000, [[C]]
701 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
702 ; GCN-NEXT: buffer_store_dword [[NEG_C]]
707
708 ; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]]
709 ; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
710 ; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
711
712 ; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], -[[A]], [[B]], -[[C]]
713 ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]
714
715 ; GCN: buffer_store_dword [[NEG_MAD]]
703716 ; GCN-NEXT: buffer_store_dword [[MUL]]
704717 define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
705718 %tid = call i32 @llvm.amdgcn.workitem.id.x()
13601373 ; GCN: v_trunc_f32_e32
13611374 ; GCN: v_subrev_f32_e32
13621375 ; GCN: v_cndmask_b32
1376
1377 ; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
1378 ; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]]
1379
13631380 ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
1364 ; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, v{{[0-9]+}}
13651381 ; GCN: buffer_store_dword [[RESULT]]
13661382 define void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
13671383 %tid = call i32 @llvm.amdgcn.workitem.id.x()
14121428 %nearbyint = call float @llvm.nearbyint.f32(float %a)
14131429 %fneg = fsub float -0.0, %nearbyint
14141430 store float %fneg, float addrspace(1)* %out.gep
1431 ret void
1432 }
1433
1434 ; --------------------------------------------------------------------------------
1435 ; vintrp tests
1436 ; --------------------------------------------------------------------------------
1437
1438 ; GCN-LABEL: {{^}}v_fneg_interp_p1_f32:
1439 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1440 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1441 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
1442 ; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
1443 ; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
; fneg of the fmul feeding two interp.p1 calls: INTERP_P1 takes no source
; modifiers (per the CHECK lines above), so the neg folds back into the mul.
1444 define void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
1445 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1446 %tid.ext = sext i32 %tid to i64
1447 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1448 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1449 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1450 %a = load volatile float, float addrspace(1)* %a.gep
1451 %b = load volatile float, float addrspace(1)* %b.gep
1452 %mul = fmul float %a, %b
1453 %fneg = fsub float -0.0, %mul
1454 %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0)
1455 %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0)
1456 store volatile float %intrp0, float addrspace(1)* %out.gep
1457 store volatile float %intrp1, float addrspace(1)* %out.gep
1458 ret void
1459 }
1460
1461 ; GCN-LABEL: {{^}}v_fneg_interp_p2_f32:
1462 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1463 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1464 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
1465 ; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
1466 ; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
; Same as the p1 test but through interp.p2; the fneg again folds into the
; mul since the interp user cannot take a modifier.
1467 define void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
1468 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1469 %tid.ext = sext i32 %tid to i64
1470 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1471 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1472 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1473 %a = load volatile float, float addrspace(1)* %a.gep
1474 %b = load volatile float, float addrspace(1)* %b.gep
1475 %mul = fmul float %a, %b
1476 %fneg = fsub float -0.0, %mul
1477 %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0)
1478 %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0)
1479 store volatile float %intrp0, float addrspace(1)* %out.gep
1480 store volatile float %intrp1, float addrspace(1)* %out.gep
1481 ret void
1482 }
1483
1484 ; --------------------------------------------------------------------------------
1485 ; CopyToReg tests
1486 ; --------------------------------------------------------------------------------
1487
1488 ; GCN-LABEL: {{^}}v_fneg_copytoreg_f32:
1489 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1490 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1491 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1492 ; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
1493 ; GCN: s_cbranch_scc1
1494
1495 ; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
1496 ; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[XOR]]
1497 ; GCN: buffer_store_dword [[MUL1]]
1498
1499 ; GCN: buffer_store_dword [[MUL0]]
; The fneg value crosses a block boundary (CopyToReg user), so it cannot be
; folded into its user; the CHECK lines expect an explicit xor in the 'if'
; block while the unnegated %mul survives for the 'endif' store.
1500 define void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
1501 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1502 %tid.ext = sext i32 %tid to i64
1503 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1504 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1505 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1506 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1507 %a = load volatile float, float addrspace(1)* %a.gep
1508 %b = load volatile float, float addrspace(1)* %b.gep
1509 %c = load volatile float, float addrspace(1)* %c.gep
1510 %mul = fmul float %a, %b
1511 %fneg = fsub float -0.0, %mul
1512 %cmp0 = icmp eq i32 %d, 0
1513 br i1 %cmp0, label %if, label %endif
1514 
1515 if:
1516 %mul1 = fmul float %fneg, %c
1517 store volatile float %mul1, float addrspace(1)* %out.gep
1518 br label %endif
1519 
1520 endif:
1521 store volatile float %mul, float addrspace(1)* %out.gep
1522 ret void
1523 }
1524
1525 ; --------------------------------------------------------------------------------
1526 ; inlineasm tests
1527 ; --------------------------------------------------------------------------------
1528
1529 ; Can't fold into use, so should fold into source
1530 ; GCN-LABEL: {{^}}v_fneg_inlineasm_f32:
1531 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1532 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1533 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
1534 ; GCN: ; use [[MUL]]
1535 ; GCN: buffer_store_dword [[MUL]]
; Single use of the fneg is inline asm, which cannot absorb a modifier, so
; the neg is folded back into the fmul source (mul with -[[B]] above).
1536 define void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
1537 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1538 %tid.ext = sext i32 %tid to i64
1539 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1540 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1541 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1542 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1543 %a = load volatile float, float addrspace(1)* %a.gep
1544 %b = load volatile float, float addrspace(1)* %b.gep
1545 %c = load volatile float, float addrspace(1)* %c.gep
1546 %mul = fmul float %a, %b
1547 %fneg = fsub float -0.0, %mul
1548 call void asm sideeffect "; use $0", "v"(float %fneg) #0
1549 store volatile float %fneg, float addrspace(1)* %out.gep
1550 ret void
1551 }
1552
1553 ; --------------------------------------------------------------------------------
1554 ; inlineasm tests
1555 ; --------------------------------------------------------------------------------
1556
1557 ; Can't fold into the use, and the source has another use as well, so the
1557 ; fneg is materialized with a xor rather than folded anywhere.
1558 ; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
1559 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1560 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1561 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
1562 ; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
1563 ; GCN: ; use [[NEG]]
1564 ; GCN: buffer_store_dword [[MUL]]
; Here %mul has a second (unnegated) use, so per the CHECK lines the fneg is
; not folded into the mul: an explicit xor produces the negated value for the
; inline asm while %mul itself is stored.
1565 define void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
1566 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1567 %tid.ext = sext i32 %tid to i64
1568 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1569 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1570 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1571 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1572 %a = load volatile float, float addrspace(1)* %a.gep
1573 %b = load volatile float, float addrspace(1)* %b.gep
1574 %c = load volatile float, float addrspace(1)* %c.gep
1575 %mul = fmul float %a, %b
1576 %fneg = fsub float -0.0, %mul
1577 call void asm sideeffect "; use $0", "v"(float %fneg) #0
1578 store volatile float %mul, float addrspace(1)* %out.gep
1579 ret void
1580 }
1581
1582 ; --------------------------------------------------------------------------------
1583 ; code size regression tests
1584 ; --------------------------------------------------------------------------------
1585
1586 ; There are multiple users of the fneg that must use a VOP3
1587 ; instruction, so there is no penalty
1588 ; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32:
1589 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1590 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1591 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1592
1593 ; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]]
1594 ; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0
1595 ; GCN-NEXT: buffer_store_dword [[FMA0]]
1596 ; GCN-NEXT: buffer_store_dword [[FMA1]]
; Both users of %fneg.a are FMAs (VOP3-only), so folding the modifier into
; each user is free — the CHECK lines expect -[[A]] in both v_fma_f32s and no
; standalone negate.
1597 define void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1598 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1599 %tid.ext = sext i32 %tid to i64
1600 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1601 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1602 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1603 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1604 %a = load volatile float, float addrspace(1)* %a.gep
1605 %b = load volatile float, float addrspace(1)* %b.gep
1606 %c = load volatile float, float addrspace(1)* %c.gep
1607 
1608 %fneg.a = fsub float -0.0, %a
1609 %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
1610 %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0)
1611 
1612 store volatile float %fma0, float addrspace(1)* %out
1613 store volatile float %fma1, float addrspace(1)* %out
1614 ret void
1615 }
1616
1617 ; There are multiple users, but both require using a larger encoding
1618 ; for the modifier.
1619
1620 ; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32:
1621 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1622 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1623 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1624
1625 ; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
1626 ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
1627 ; GCN-NEXT: buffer_store_dword [[MUL0]]
1628 ; GCN-NEXT: buffer_store_dword [[MUL1]]
; Both users are plain fmuls that would otherwise use VOP2; folding the neg
; forces the larger _e64 encoding in both (see CHECKs), which the cost
; threshold still permits here.
1629 define void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1630 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1631 %tid.ext = sext i32 %tid to i64
1632 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1633 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1634 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1635 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1636 %a = load volatile float, float addrspace(1)* %a.gep
1637 %b = load volatile float, float addrspace(1)* %b.gep
1638 %c = load volatile float, float addrspace(1)* %c.gep
1639 
1640 %fneg.a = fsub float -0.0, %a
1641 %mul0 = fmul float %fneg.a, %b
1642 %mul1 = fmul float %fneg.a, %c
1643 
1644 store volatile float %mul0, float addrspace(1)* %out
1645 store volatile float %mul1, float addrspace(1)* %out
1646 ret void
1647 }
1648
1649 ; One user is VOP3 so has no cost to folding the modifier, the other does.
1650 ; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32:
1651 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1652 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1653 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1654
1655 ; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0
1656 ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
1657
1658 ; GCN: buffer_store_dword [[FMA0]]
1659 ; GCN-NEXT: buffer_store_dword [[MUL1]]
; Mixed users: one FMA (VOP3, free modifier) and one fmul (grows to _e64).
; The CHECKs show the neg folded into both users rather than a separate xor.
1660 define void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1661 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1662 %tid.ext = sext i32 %tid to i64
1663 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1664 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1665 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1666 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1667 %a = load volatile float, float addrspace(1)* %a.gep
1668 %b = load volatile float, float addrspace(1)* %b.gep
1669 %c = load volatile float, float addrspace(1)* %c.gep
1670 
1671 %fneg.a = fsub float -0.0, %a
1672 %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
1673 %mul1 = fmul float %fneg.a, %c
1674 
1675 store volatile float %fma0, float addrspace(1)* %out
1676 store volatile float %mul1, float addrspace(1)* %out
1677 ret void
1678 }
1679
1680 ; The use of the fneg requires a code size increase, but folding into
1681 ; the source does not
1682
1683 ; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32:
1684 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1685 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1686 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1687 ; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
1688
1689 ; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0
1690 ; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]]
1691 ; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]
1692
1693 ; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
1694 ; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[FMA0]]
1695 ; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[D]], [[FMA0]]
1696
1697 ; GCN: buffer_store_dword [[MUL1]]
1698 ; GCN-NEXT: buffer_store_dword [[MUL2]]
; fneg of an FMA result with two fmul uses: folding into the uses costs two
; _e64 encodings, but folding into the FMA source (VOP3 already) is free —
; GCN-NSZ expects the negated FMA and plain _e32 muls.
1699 define void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
1700 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1701 %tid.ext = sext i32 %tid to i64
1702 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1703 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1704 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1705 %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
1706 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1707 %a = load volatile float, float addrspace(1)* %a.gep
1708 %b = load volatile float, float addrspace(1)* %b.gep
1709 %c = load volatile float, float addrspace(1)* %c.gep
1710 %d = load volatile float, float addrspace(1)* %d.gep
1711 
1712 %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
1713 %fneg.fma0 = fsub float -0.0, %fma0
1714 %mul1 = fmul float %fneg.fma0, %c
1715 %mul2 = fmul float %fneg.fma0, %d
1716 
1717 store volatile float %mul1, float addrspace(1)* %out
1718 store volatile float %mul2, float addrspace(1)* %out
1719 ret void
1720 }
1721
1722 ; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
1723 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
1724 ; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
1725 ; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
1726 ; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]
1727
1728 ; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
1729 ; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
1730 ; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]
1731
1732 ; GCN: buffer_store_dwordx2 [[MUL0]]
1733 ; GCN: buffer_store_dwordx2 [[MUL1]]
; f64 variant of the test above. All f64 ops are VOP3-only, so per the CHECKs
; the negate folds into the fmul users for free (-[[FMA0]] operands).
1734 define void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
1735 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1736 %tid.ext = sext i32 %tid to i64
1737 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
1738 %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext
1739 %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext
1740 %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext
1741 %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
1742 %a = load volatile double, double addrspace(1)* %a.gep
1743 %b = load volatile double, double addrspace(1)* %b.gep
1744 %c = load volatile double, double addrspace(1)* %c.gep
1745 %d = load volatile double, double addrspace(1)* %d.gep
1746 
1747 %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
1748 %fneg.fma0 = fsub double -0.0, %fma0
1749 %mul1 = fmul double %fneg.fma0, %c
1750 %mul2 = fmul double %fneg.fma0, %d
1751 
1752 store volatile double %mul1, double addrspace(1)* %out
1753 store volatile double %mul2, double addrspace(1)* %out
1754 ret void
1755 }
1756
1757 ; %trunc.a has one fneg use, but it requires a code size increase and
1758 ; the fneg can instead be folded for free into the fma.
1759
1760 ; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32:
1761 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1762 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1763 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1764 ; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
1765 ; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
1766 ; GCN: buffer_store_dword [[FMA0]]
; Negating the trunc would require its larger encoding, while the single FMA
; user takes the modifier for free — CHECKs expect plain v_trunc plus
; v_fma with -[[TRUNC_A]].
1767 define void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
1768 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1769 %tid.ext = sext i32 %tid to i64
1770 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1771 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1772 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1773 %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
1774 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1775 %a = load volatile float, float addrspace(1)* %a.gep
1776 %b = load volatile float, float addrspace(1)* %b.gep
1777 %c = load volatile float, float addrspace(1)* %c.gep
1778 %d = load volatile float, float addrspace(1)* %d.gep
1779 
1780 %trunc.a = call float @llvm.trunc.f32(float %a)
1781 %trunc.fneg.a = fsub float -0.0, %trunc.a
1782 %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
1783 store volatile float %fma0, float addrspace(1)* %out
1784 ret void
1785 }
1786
1787 ; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src:
1788 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1789 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1790 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1791 ; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
1792 ; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
1793 ; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
1794 ; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]]
1795 ; GCN: buffer_store_dword [[FMA0]]
1796 ; GCN: buffer_store_dword [[MUL1]]
; %trunc.a has both a negated use (the FMA, which folds the modifier for
; free) and an unnegated fmul use, so the trunc itself stays unmodified —
; matching the CHECK-DAG pair above.
1797 define void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
1798 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1799 %tid.ext = sext i32 %tid to i64
1800 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1801 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1802 %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1803 %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
1804 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1805 %a = load volatile float, float addrspace(1)* %a.gep
1806 %b = load volatile float, float addrspace(1)* %b.gep
1807 %c = load volatile float, float addrspace(1)* %c.gep
1808 %d = load volatile float, float addrspace(1)* %d.gep
1809 
1810 %trunc.a = call float @llvm.trunc.f32(float %a)
1811 %trunc.fneg.a = fsub float -0.0, %trunc.a
1812 %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
1813 %mul1 = fmul float %trunc.a, %d
1814 store volatile float %fma0, float addrspace(1)* %out
1815 store volatile float %mul1, float addrspace(1)* %out
14151816 ret void
14161817 }
14171818
14241825 declare float @llvm.rint.f32(float) #1
14251826 declare float @llvm.nearbyint.f32(float) #1
14261827
1828 declare double @llvm.fma.f64(double, double, double) #1
1829
14271830 declare float @llvm.amdgcn.sin.f32(float) #1
14281831 declare float @llvm.amdgcn.rcp.f32(float) #1
14291832 declare float @llvm.amdgcn.rcp.legacy(float) #1
14301833 declare float @llvm.amdgcn.fmul.legacy(float, float) #1
1834 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
1835 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
14311836
14321837 attributes #0 = { nounwind }
14331838 attributes #1 = { nounwind readnone }
1111 ; GCN: v_mul_f32_e32
1212 ; GCN: v_div_fmas_f32
1313 ; GCN: v_div_fixup_f32
14 ; GCN: v_trunc_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}
15 ; GCN: v_mac_f32_e32
14 ; GCN: v_trunc_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
15 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
1616 ; GCN: s_endpgm
1717 define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
1818 float addrspace(1)* %in2) #0 {
2727 ; FUNC-LABEL: {{^}}unsafe_frem_f32:
2828 ; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
2929 ; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}}
30 ; GCN: v_rcp_f32_e64 [[INVY:v[0-9]+]], -[[Y]]
30 ; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
3131 ; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
3232 ; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
33 ; GCN: v_mac_f32_e32 [[X]], [[Y]], [[TRUNC]]
34 ; GCN: buffer_store_dword [[X]]
35 ; GCN: s_endpgm
33 ; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
34 ; GCN: buffer_store_dword [[RESULT]]
3635 define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
3736 float addrspace(1)* %in2) #1 {
3837 %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
5454 ; GCN: buffer_load_dword [[B:v[0-9]+]]
5555 ; GCN: buffer_load_dword [[C:v[0-9]+]]
5656
57 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
58 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]]
57 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
58 ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 4.0, [[MUL]]
59 ; GCN-NOT: xor
5960 ; GCN: buffer_store_dword [[MUL]]
6061 define void @multi_use_fneg() #0 {
6162 %a = load volatile float, float addrspace(1)* undef
6464 }
6565
6666 ; GCN-LABEL: {{^}}mac_f16_neg_a:
67 ; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
68 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
69 ; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
67 ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
68 ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
69 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
70 ; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
7071
7172 ; VI-NOT: v_mac_f16
7273 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
9091 }
9192
9293 ; GCN-LABEL: {{^}}mac_f16_neg_b:
93 ; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
94 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
95 ; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
94 ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
95 ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
96 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
97 ; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
98
9699 ; VI-NOT: v_mac_f16
97100 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
98101 ; GCN: s_endpgm
116119
117120 ; GCN-LABEL: {{^}}mac_f16_neg_c:
118121 ; SI: v_cvt_f32_f16_e32
119 ; SI-DAG: v_cvt_f32_f16_e32
120 ; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
121 ; SI: v_mac_f32_e32 [[CVT_NEG]], v{{[0-9]+}}, v{{[0-9]+}}
122 ; SI: v_cvt_f32_f16_e32
123 ; SI: v_cvt_f32_f16_e32
124 ; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
122125
123126 ; VI-NOT: v_mac_f16
124127 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
214217 }
215218
216219 ; GCN-LABEL: {{^}}mac_f16_neg_a_unsafe_fp_math:
217 ; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
218 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
219 ; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
220 ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
221 ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
222 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
223 ; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
220224
221225 ; VI-NOT: v_mac_f16
222226 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
240244 }
241245
242246 ; GCN-LABEL: {{^}}mac_f16_neg_b_unsafe_fp_math:
243 ; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
244 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
245 ; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
247 ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
248 ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
249 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
250 ; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
246251
247252 ; VI-NOT: v_mac_f16
248253 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
266271 }
267272
268273 ; GCN-LABEL: {{^}}mac_f16_neg_c_unsafe_fp_math:
269 ; SI: v_cvt_f32_f16_e32
270 ; SI: v_cvt_f32_f16_e32
271 ; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
272 ; SI: v_mac_f32_e32 [[CVT_NEG]], v{{[0-9]+}}, v{{[0-9]+}}
274 ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
275 ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
276 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
277 ; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]]
273278
274279 ; VI-NOT: v_mac_f16
275280 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
372377 }
373378
374379 ; GCN-LABEL: {{^}}mac_v2f16_neg_a:
375 ; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
376 ; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
377
378 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
379 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
380 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
381 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
382
383 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}}
384 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}}
380385
381386 ; VI-NOT: v_mac_f16
382387 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
401406 }
402407
403408 ; GCN-LABEL: {{^}}mac_v2f16_neg_b
404 ; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
405 ; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
406 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
407 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
409 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
410 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
411 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}}
412 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}}
408413
409414
410415 ; VI-NOT: v_mac_f16
430435 }
431436
432437 ; GCN-LABEL: {{^}}mac_v2f16_neg_c:
433 ; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
434 ; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
435
436 ; SI-DAG: v_mac_f32_e32 [[CVT_NEG0]], v{{[0-9]+}}, v{{[0-9]+}}
437 ; SI-DAG: v_mac_f32_e32 [[CVT_NEG1]], v{{[0-9]+}}, v{{[0-9]+}}
438 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
439 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
440 ; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
441 ; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
442 ; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
443 ; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
444
445 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT2]]
446 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT5]]
438447
439448 ; VI-NOT: v_mac_f16
440449 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
543552 }
544553
545554 ; GCN-LABEL: {{^}}mac_v2f16_neg_a_unsafe_fp_math:
546 ; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
547 ; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
548
549 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
550 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
555 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
556 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
557 ; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
558 ; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
559 ; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
560 ; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
561
562 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
563 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
551564
552565 ; VI-NOT: v_mac_f16
553566 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
572585 }
573586
574587 ; GCN-LABEL: {{^}}mac_v2f16_neg_b_unsafe_fp_math:
575 ; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
576 ; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
577
578 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
579 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
588 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
589 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
590 ; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
591 ; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
592 ; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
593 ; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
594
595 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
596 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
580597
581598 ; VI-NOT: v_mac_f16
582599 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
601618 }
602619
603620 ; GCN-LABEL: {{^}}mac_v2f16_neg_c_unsafe_fp_math:
604 ; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
605 ; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
606
607 ; SI-DAG: v_mac_f32_e32 [[CVT_NEG0]], v{{[0-9]+}}, v{{[0-9]+}}
608 ; SI-DAG: v_mac_f32_e32 [[CVT_NEG1]], v{{[0-9]+}}, v{{[0-9]+}}
621 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
622 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
623 ; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
624 ; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
625 ; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
626 ; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
627
628 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
629 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
609630
610631 ; VI-NOT: v_mac_f16
611632 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}