llvm.org GIT mirror llvm / 234b3a1
AMDGPU: Remove dx10-clamp from subtarget features Since this can be set with s_setreg*, it should not be a subtarget property. Set a default based on the calling convention, and introduce a new amdgpu-dx10-clamp attribute to override this if desired. Also introduce a new amdgpu-ieee attribute to match. The values need to match to allow inlining. I think it is OK for the caller's dx10-clamp attribute to override the callee, but there doesn't appear to be the infrastructure to do this currently without defining the attribute in the generic Attributes.td. Eventually the calling convention lowering will need to insert a mode switch somewhere for these. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@357302 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 1 year, 5 months ago
17 changed file(s) with 513 addition(s) and 34 deletion(s). Raw diff Collapse all Expand all
432432 "amdgpu-waves-per-eu"="m,n" Specify the minimum and maximum number of waves per
433433 execution unit. Generated by the ``amdgpu_waves_per_eu``
434434 CLANG attribute [CLANG-ATTR]_.
435
436 "amdgpu-ieee" true/false. Specify whether the function expects
437 the IEEE field of the mode register to be set on entry. Overrides
438 the default for the calling convention.
439 "amdgpu-dx10-clamp" true/false. Specify whether the function expects
440 the DX10_CLAMP field of the mode register to be set on entry. Overrides
441 the default for the calling convention.
442
435443 ======================================= ==========================================================
436444
437445 Code Object
891891 // register.
892892 ProgInfo.FloatMode = getFPMode(MF);
893893
894 ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
894 const SIModeRegisterDefaults Mode = MFI->getMode();
895 ProgInfo.IEEEMode = Mode.IEEE;
895896
896897 // Make the clamp modifier return 0 on NaN input.
897 ProgInfo.DX10Clamp = STM.enableDX10Clamp();
898 ProgInfo.DX10Clamp = Mode.DX10Clamp;
898899
899900 unsigned LDSAlignShift;
900901 if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
5353 SubtargetFeature
5454 Value#" GPU generation", Implies>;
5555
56 def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
57 "DX10Clamp",
58 "true",
59 "clamp modifier clamps NaNs to 0.0"
60 >;
61
6256 def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
6357 "EnablePromoteAlloca",
6458 "true",
4444 R600Subtarget &
4545 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
4646 StringRef GPU, StringRef FS) {
47 SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
47 SmallString<256> FullFS("+promote-alloca,");
4848 FullFS += FS;
4949 ParseSubtargetFeatures(GPU, FullFS);
5050
7676 // Similarly we want enable-prt-strict-null to be on by default and not to
7777 // unset everything else if it is disabled
7878
79 SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
79 SmallString<256> FullFS("+promote-alloca,+load-store-opt,");
8080
8181 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
8282 FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
163163 HalfRate64Ops(false),
164164
165165 FP64FP16Denormals(false),
166 DX10Clamp(false),
167166 FlatForGlobal(false),
168167 AutoWaitcntBeforeBarrier(false),
169168 CodeObjectV3(false),
460459 FMA(false),
461460 CaymanISA(false),
462461 CFALUBug(false),
463 DX10Clamp(false),
464462 HasVertexCache(false),
465463 R600ALUInst(false),
466464 FP64(false),
285285
286286 // Dynamically set bits that enable features.
287287 bool FP64FP16Denormals;
288 bool DX10Clamp;
289288 bool FlatForGlobal;
290289 bool AutoWaitcntBeforeBarrier;
291290 bool CodeObjectV3;
530529 return getGeneration() >= AMDGPUSubtarget::GFX9;
531530 }
532531
533 bool enableDX10Clamp() const {
534 return DX10Clamp;
535 }
536
537 bool enableIEEEBit(const MachineFunction &MF) const {
538 return AMDGPU::isCompute(MF.getFunction().getCallingConv());
539 }
540
541532 bool useFlatForGlobal() const {
542533 return FlatForGlobal;
543534 }
969960 bool FMA;
970961 bool CaymanISA;
971962 bool CFALUBug;
972 bool DX10Clamp;
973963 bool HasVertexCache;
974964 bool R600ALUInst;
975965 bool FP64;
610610 }
611611
612612 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
613 const Function *Callee) const {
613 const Function *Callee) const {
614614 const TargetMachine &TM = getTLI()->getTargetMachine();
615615 const FeatureBitset &CallerBits =
616616 TM.getSubtargetImpl(*Caller)->getFeatureBits();
619619
620620 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
621621 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
622 return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
622 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
623 return false;
624
625 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
626 // no way to support merge for backend defined attributes.
627 AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
628 AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
629 return CallerMode.isInlineCompatible(CalleeMode);
623630 }
624631
625632 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
11191119 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
11201120 // correctly handle signed zeros.
11211121 //
1122 bool IsIEEEMode = ST->enableIEEEBit(MF);
1122 // FIXME: Also need to check strictfp
1123 bool IsIEEEMode = MFI->getMode().IEEE;
11231124 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
11241125
11251126 for (MachineBasicBlock *MBB : depth_first(&MF)) {
41444144 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
41454145 SelectionDAG &DAG) const {
41464146 EVT VT = Op.getValueType();
4147 bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
4147 const MachineFunction &MF = DAG.getMachineFunction();
4148 const SIMachineFunctionInfo *Info = MF.getInfo();
4149 bool IsIEEEMode = Info->getMode().IEEE;
41484150
41494151 // FIXME: Assert during selection that this is only selected for
41504152 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
82998301 if (Cmp == APFloat::cmpGreaterThan)
83008302 return SDValue();
83018303
8304 const MachineFunction &MF = DAG.getMachineFunction();
8305 const SIMachineFunctionInfo *Info = MF.getInfo();
8306
83028307 // TODO: Check IEEE bit enabled?
83038308 EVT VT = Op0.getValueType();
8304 if (Subtarget->enableDX10Clamp()) {
8309 if (Info->getMode().DX10Clamp) {
83058310 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
83068311 // hardware fmed3 behavior converting to a min.
83078312 // FIXME: Should this be allowing -0.0?
84358440 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
84368441 }
84378442
8443 const MachineFunction &MF = DAG.getMachineFunction();
8444 const SIMachineFunctionInfo *Info = MF.getInfo();
8445
84388446 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
84398447 // handling no dx10-clamp?
8440 if (Subtarget->enableDX10Clamp()) {
8448 if (Info->getMode().DX10Clamp) {
84418449 // If NaNs are clamped to 0, we are free to reorder the inputs.
84428450
84438451 if (isa(Src0) && !isa(Src1))
91279135 if (!CSrc)
91289136 return SDValue();
91299137
9138 const MachineFunction &MF = DCI.DAG.getMachineFunction();
91309139 const APFloat &F = CSrc->getValueAPF();
91319140 APFloat Zero = APFloat::getZero(F.getSemantics());
91329141 APFloat::cmpResult Cmp0 = F.compare(Zero);
91339142 if (Cmp0 == APFloat::cmpLessThan ||
9134 (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
9143 (Cmp0 == APFloat::cmpUnordered &&
9144 MF.getInfo()->getMode().DX10Clamp)) {
91359145 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
91369146 }
91379147
99669976 bool SNaN,
99679977 unsigned Depth) const {
99689978 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
9969 if (Subtarget->enableDX10Clamp())
9979 const MachineFunction &MF = DAG.getMachineFunction();
9980 const SIMachineFunctionInfo *Info = MF.getInfo();
9981
9982 if (Info->getMode().DX10Clamp)
99709983 return true; // Clamped to 0.
99719984 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
99729985 }
2727
2828 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
2929 : AMDGPUMachineFunction(MF),
30 Mode(MF.getFunction()),
3031 PrivateSegmentBuffer(false),
3132 DispatchPtr(false),
3233 QueuePtr(false),
147147
148148 AMDGPUFunctionArgInfo ArgInfo;
149149
150 // State of MODE register, assumed FP mode.
151 AMDGPU::SIModeRegisterDefaults Mode;
152
150153 // Graphics info.
151154 unsigned PSInputAddr = 0;
152155 unsigned PSInputEnable = 0;
278281
279282 ArrayRef getSGPRSpillVGPRs() const {
280283 return SpillVGPRs;
284 }
285
286 AMDGPU::SIModeRegisterDefaults getMode() const {
287 return Mode;
281288 }
282289
283290 bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
10011001 return true;
10021002 }
10031003
1004 SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
1005 *this = getDefaultForCallingConv(F.getCallingConv());
1006
1007 StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
1008 if (!IEEEAttr.empty())
1009 IEEE = IEEEAttr == "true";
1010
1011 StringRef DX10ClampAttr
1012 = F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString();
1013 if (!DX10ClampAttr.empty())
1014 DX10Clamp = DX10ClampAttr == "true";
1015 }
1016
10041017 namespace {
10051018
10061019 struct SourceOfDivergence {
494494 /// \returns true if the intrinsic is divergent
495495 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
496496
497
498 // Track defaults for fields in the MODE register.
499 struct SIModeRegisterDefaults {
500 /// Floating point opcodes that support exception flag gathering quiet and
501 /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10
502 /// become IEEE 754-2008 compliant due to signaling NaN propagation and
503 /// quieting.
504 bool IEEE : 1;
505
506 /// Used by the vector ALU to force DX10-style treatment of NaNs: when set,
507 /// clamp NaN to zero; otherwise, pass NaN through.
508 bool DX10Clamp : 1;
509
510 // TODO: FP mode fields
511
512 SIModeRegisterDefaults() :
513 IEEE(true),
514 DX10Clamp(true) {}
515
516 SIModeRegisterDefaults(const Function &F);
517
518 static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
519 SIModeRegisterDefaults Mode;
520 Mode.DX10Clamp = true;
521 Mode.IEEE = AMDGPU::isCompute(CC);
522 return Mode;
523 }
524
525 bool operator ==(const SIModeRegisterDefaults Other) const {
526 return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp;
527 }
528
529 // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should
530 // be able to override.
531 bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const {
532 return *this == CalleeMode;
533 }
534 };
535
497536 } // end namespace AMDGPU
498537 } // end namespace llvm
499538
0 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1
2 ; GCN-LABEL: {{^}}kernel_ieee_mode_default:
3 ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
4 ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
5 ; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
6 ; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
7 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
8 ; GCN-NOT: v_mul_f32
9 define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
10 %val0 = load volatile float, float addrspace(1)* undef
11 %val1 = load volatile float, float addrspace(1)* undef
12 %min = call float @llvm.minnum.f32(float %val0, float %val1)
13 store volatile float %min, float addrspace(1)* undef
14 ret void
15 }
16
17 ; GCN-LABEL: {{^}}kernel_ieee_mode_on:
18 ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
19 ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
20 ; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
21 ; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
22 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
23 ; GCN-NOT: v_mul_f32
24 define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
25 %val0 = load volatile float, float addrspace(1)* undef
26 %val1 = load volatile float, float addrspace(1)* undef
27 %min = call float @llvm.minnum.f32(float %val0, float %val1)
28 store volatile float %min, float addrspace(1)* undef
29 ret void
30 }
31
32 ; GCN-LABEL: {{^}}kernel_ieee_mode_off:
33 ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
34 ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
35 ; GCN-NOT: [[VAL0]]
36 ; GCN-NOT: [[VAL1]]
37 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
38 ; GCN-NOT: v_mul_f32
39 define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
40 %val0 = load volatile float, float addrspace(1)* undef
41 %val1 = load volatile float, float addrspace(1)* undef
42 %min = call float @llvm.minnum.f32(float %val0, float %val1)
43 store volatile float %min, float addrspace(1)* undef
44 ret void
45 }
46
47 ; GCN-LABEL: {{^}}func_ieee_mode_default:
48 ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
49 ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
50 ; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
51 ; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
52 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
53 ; GCN-NOT: v_mul_f32
54 define void @func_ieee_mode_default() #0 {
55 %val0 = load volatile float, float addrspace(1)* undef
56 %val1 = load volatile float, float addrspace(1)* undef
57 %min = call float @llvm.minnum.f32(float %val0, float %val1)
58 store volatile float %min, float addrspace(1)* undef
59 ret void
60 }
61
62 ; GCN-LABEL: {{^}}func_ieee_mode_on:
63 ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
64 ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
65 ; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
66 ; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
67 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
68 ; GCN-NOT: v_mul_f32
69 define void @func_ieee_mode_on() #1 {
70 %val0 = load volatile float, float addrspace(1)* undef
71 %val1 = load volatile float, float addrspace(1)* undef
72 %min = call float @llvm.minnum.f32(float %val0, float %val1)
73 store volatile float %min, float addrspace(1)* undef
74 ret void
75 }
76
77 ; GCN-LABEL: {{^}}func_ieee_mode_off:
78 ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
79 ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
80 ; GCN-NOT: [[VAL0]]
81 ; GCN-NOT: [[VAL1]]
82 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
83 ; GCN-NOT: v_mul_f32
84 define void @func_ieee_mode_off() #2 {
85 %val0 = load volatile float, float addrspace(1)* undef
86 %val1 = load volatile float, float addrspace(1)* undef
87 %min = call float @llvm.minnum.f32(float %val0, float %val1)
88 store volatile float %min, float addrspace(1)* undef
89 ret void
90 }
91
92 ; GCN-LABEL: {{^}}cs_ieee_mode_default:
93 ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
94 ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
95 ; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
96 ; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
97 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
98 ; GCN-NOT: v_mul_f32
99 define amdgpu_cs void @cs_ieee_mode_default() #0 {
100 %val0 = load volatile float, float addrspace(1)* undef
101 %val1 = load volatile float, float addrspace(1)* undef
102 %min = call float @llvm.minnum.f32(float %val0, float %val1)
103 store volatile float %min, float addrspace(1)* undef
104 ret void
105 }
106
107 ; GCN-LABEL: {{^}}cs_ieee_mode_on:
108 ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
109 ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
110 ; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
111 ; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
112 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
113 ; GCN-NOT: v_mul_f32
114 define amdgpu_cs void @cs_ieee_mode_on() #1 {
115 %val0 = load volatile float, float addrspace(1)* undef
116 %val1 = load volatile float, float addrspace(1)* undef
117 %min = call float @llvm.minnum.f32(float %val0, float %val1)
118 store volatile float %min, float addrspace(1)* undef
119 ret void
120 }
121
122 ; GCN-LABEL: {{^}}cs_ieee_mode_off:
123 ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
124 ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
125 ; GCN-NOT: [[VAL0]]
126 ; GCN-NOT: [[VAL1]]
127 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
128 ; GCN-NOT: v_mul_f32
129 define amdgpu_cs void @cs_ieee_mode_off() #2 {
130 %val0 = load volatile float, float addrspace(1)* undef
131 %val1 = load volatile float, float addrspace(1)* undef
132 %min = call float @llvm.minnum.f32(float %val0, float %val1)
133 store volatile float %min, float addrspace(1)* undef
134 ret void
135 }
136
137 ; GCN-LABEL: {{^}}ps_ieee_mode_default:
138 ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
139 ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
140 ; GCN-NOT: [[VAL0]]
141 ; GCN-NOT: [[VAL1]]
142 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
143 ; GCN-NOT: v_mul_f32
144 define amdgpu_ps void @ps_ieee_mode_default() #0 {
145 %val0 = load volatile float, float addrspace(1)* undef
146 %val1 = load volatile float, float addrspace(1)* undef
147 %min = call float @llvm.minnum.f32(float %val0, float %val1)
148 store volatile float %min, float addrspace(1)* undef
149 ret void
150 }
151
152 ; GCN-LABEL: {{^}}ps_ieee_mode_on:
153 ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
154 ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
155 ; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
156 ; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
157 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
158 ; GCN-NOT: v_mul_f32
159 define amdgpu_ps void @ps_ieee_mode_on() #1 {
160 %val0 = load volatile float, float addrspace(1)* undef
161 %val1 = load volatile float, float addrspace(1)* undef
162 %min = call float @llvm.minnum.f32(float %val0, float %val1)
163 store volatile float %min, float addrspace(1)* undef
164 ret void
165 }
166
167 ; GCN-LABEL: {{^}}ps_ieee_mode_off:
168 ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
169 ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
170 ; GCN-NOT: [[VAL0]]
171 ; GCN-NOT: [[VAL1]]
172 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
173 ; GCN-NOT: v_mul_f32
174 define amdgpu_ps void @ps_ieee_mode_off() #2 {
175 %val0 = load volatile float, float addrspace(1)* undef
176 %val1 = load volatile float, float addrspace(1)* undef
177 %min = call float @llvm.minnum.f32(float %val0, float %val1)
178 store volatile float %min, float addrspace(1)* undef
179 ret void
180 }
181
182 declare float @llvm.minnum.f32(float, float) #3
183
184 attributes #0 = { nounwind }
185 attributes #1 = { nounwind "amdgpu-ieee"="true" }
186 attributes #2 = { nounwind "amdgpu-ieee"="false" }
187 attributes #3 = { nounwind readnone speculatable }
768768
769769 attributes #0 = { nounwind }
770770 attributes #1 = { nounwind readnone }
771 attributes #2 = { nounwind "target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" }
772 attributes #3 = { nounwind "target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
773 attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
771 attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="-fp-exceptions" "no-nans-fp-math"="false" }
772 attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "target-features"="+fp-exceptions" "no-nans-fp-math"="false" }
773 attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="+fp-exceptions" "no-nans-fp-math"="false" }
6969 ret void
7070 }
7171
72 ; GCN-LABEL: {{^}}test_no_ieee_mode_vi:
73 ; GCN: float_mode = 192
74 ; GCN: enable_dx10_clamp = 1
75 ; GCN: enable_ieee_mode = 0
76 define amdgpu_kernel void @test_no_ieee_mode_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #7 {
77 store float 0.0, float addrspace(1)* %out0
78 store double 0.0, double addrspace(1)* %out1
79 ret void
80 }
81
82 ; GCN-LABEL: {{^}}test_no_ieee_mode_no_dx10_clamp_vi:
83 ; GCN: float_mode = 192
84 ; GCN: enable_dx10_clamp = 0
85 ; GCN: enable_ieee_mode = 0
86 define amdgpu_kernel void @test_no_ieee_mode_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #8 {
87 store float 0.0, float addrspace(1)* %out0
88 store double 0.0, double addrspace(1)* %out1
89 ret void
90 }
91
7292 attributes #0 = { nounwind "target-cpu"="kaveri" "target-features"="-code-object-v3" }
7393 attributes #1 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3" }
7494 attributes #2 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,+fp64-fp16-denormals" }
7595 attributes #3 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,-fp64-fp16-denormals" }
7696 attributes #4 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,+fp64-fp16-denormals" }
7797 attributes #5 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,-fp64-fp16-denormals" }
78 attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3,-dx10-clamp" }
98 attributes #6 = { nounwind "amdgpu-dx10-clamp"="false" "target-cpu"="fiji" "target-features"="-code-object-v3" }
99 attributes #7 = { nounwind "amdgpu-ieee"="false" "target-cpu"="fiji" "target-features"="-code-object-v3" }
100 attributes #8 = { nounwind "amdgpu-dx10-clamp"="false" "amdgpu-ieee"="false" "target-cpu"="fiji" "target-features"="-code-object-v3" }
0 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -inline < %s | FileCheck %s
1 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes='cgscc(inline)' < %s | FileCheck %s
2
3 define i32 @func_default() #0 {
4 ret i32 0
5 }
6
7 define i32 @func_dx10_clamp_enabled() #1 {
8 ret i32 0
9 }
10
11 define i32 @func_dx10_clamp_disabled() #2 {
12 ret i32 0
13 }
14
15 ; CHECK-LABEL: @default_call_default(
16 ; CHECK-NEXT: ret i32 0
17 define i32 @default_call_default() #0 {
18 %call = call i32 @func_default()
19 ret i32 %call
20 }
21
22 ; CHECK-LABEL: @dx10_clamp_enabled_call_default(
23 ; CHECK-NEXT: ret i32 0
24 define i32 @dx10_clamp_enabled_call_default() #1 {
25 %call = call i32 @func_default()
26 ret i32 %call
27 }
28
29 ; CHECK-LABEL: @dx10_clamp_enabled_call_dx10_clamp_enabled(
30 ; CHECK-NEXT: ret i32 0
31 define i32 @dx10_clamp_enabled_call_dx10_clamp_enabled() #1 {
32 %call = call i32 @func_dx10_clamp_enabled()
33 ret i32 %call
34 }
35
36 ; CHECK-LABEL: @dx10_clamp_enabled_call_dx10_clamp_disabled(
37 ; CHECK-NEXT: call i32 @func_dx10_clamp_disabled()
38 define i32 @dx10_clamp_enabled_call_dx10_clamp_disabled() #1 {
39 %call = call i32 @func_dx10_clamp_disabled()
40 ret i32 %call
41 }
42
43 ; CHECK-LABEL: @dx10_clamp_disabled_call_default(
44 ; CHECK-NEXT: call i32 @func_default()
45 define i32 @dx10_clamp_disabled_call_default() #2 {
46 %call = call i32 @func_default()
47 ret i32 %call
48 }
49
50 ; CHECK-LABEL: @dx10_clamp_disabled_call_dx10_clamp_enabled(
51 ; CHECK-NEXT: call i32 @func_dx10_clamp_enabled()
52 define i32 @dx10_clamp_disabled_call_dx10_clamp_enabled() #2 {
53 %call = call i32 @func_dx10_clamp_enabled()
54 ret i32 %call
55 }
56
57 ; CHECK-LABEL: @dx10_clamp_disabled_call_dx10_clamp_disabled(
58 ; CHECK-NEXT: ret i32 0
59 define i32 @dx10_clamp_disabled_call_dx10_clamp_disabled() #2 {
60 %call = call i32 @func_dx10_clamp_disabled()
61 ret i32 %call
62 }
63
64 ; Shader calling a compute function
65 ; CHECK-LABEL: @amdgpu_ps_default_call_default(
66 ; CHECK-NEXT: call i32 @func_default()
67 define amdgpu_ps i32 @amdgpu_ps_default_call_default() #0 {
68 %call = call i32 @func_default()
69 ret i32 %call
70 }
71
72 ; Shader with dx10_clamp enabled calling a compute function. Default
73 ; also implies ieee_mode, so this isn't inlinable.
74 ; CHECK-LABEL: @amdgpu_ps_dx10_clamp_enabled_call_default(
75 ; CHECK-NEXT: call i32 @func_default()
76 define amdgpu_ps i32 @amdgpu_ps_dx10_clamp_enabled_call_default() #1 {
77 %call = call i32 @func_default()
78 ret i32 %call
79 }
80
81 ; CHECK-LABEL: @amdgpu_ps_dx10_clamp_disabled_call_default(
82 ; CHECK-NEXT: call i32 @func_default()
83 define amdgpu_ps i32 @amdgpu_ps_dx10_clamp_disabled_call_default() #2 {
84 %call = call i32 @func_default()
85 ret i32 %call
86 }
87
88 ; CHECK-LABEL: @amdgpu_ps_dx10_clamp_enabled_ieee_call_default(
89 ; CHECK-NEXT: ret i32 0
90 define amdgpu_ps i32 @amdgpu_ps_dx10_clamp_enabled_ieee_call_default() #3 {
91 %call = call i32 @func_default()
92 ret i32 %call
93 }
94
95 ; CHECK-LABEL: @amdgpu_ps_dx10_clamp_disabled_ieee_call_default(
96 ; CHECK-NEXT: call i32 @func_default()
97 define amdgpu_ps i32 @amdgpu_ps_dx10_clamp_disabled_ieee_call_default() #4 {
98 %call = call i32 @func_default()
99 ret i32 %call
100 }
101
102 attributes #0 = { nounwind }
103 attributes #1 = { nounwind "amdgpu-dx10-clamp"="true" }
104 attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" }
105 attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "amdgpu-ieee"="true" }
106 attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "amdgpu-ieee"="true" }
0 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -inline < %s | FileCheck %s
1 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes='cgscc(inline)' < %s | FileCheck %s
2
3 define i32 @func_default() #0 {
4 ret i32 0
5 }
6
7 define i32 @func_ieee_enabled() #1 {
8 ret i32 0
9 }
10
11 define i32 @func_ieee_disabled() #2 {
12 ret i32 0
13 }
14
15 ; CHECK-LABEL: @default_call_default(
16 ; CHECK-NEXT: ret i32 0
17 define i32 @default_call_default() #0 {
18 %call = call i32 @func_default()
19 ret i32 %call
20 }
21
22 ; CHECK-LABEL: @ieee_enabled_call_default(
23 ; CHECK-NEXT: ret i32 0
24 define i32 @ieee_enabled_call_default() #1 {
25 %call = call i32 @func_default()
26 ret i32 %call
27 }
28
29 ; CHECK-LABEL: @ieee_enabled_call_ieee_enabled(
30 ; CHECK-NEXT: ret i32 0
31 define i32 @ieee_enabled_call_ieee_enabled() #1 {
32 %call = call i32 @func_ieee_enabled()
33 ret i32 %call
34 }
35
36 ; CHECK-LABEL: @ieee_enabled_call_ieee_disabled(
37 ; CHECK-NEXT: call i32 @func_ieee_disabled()
38 define i32 @ieee_enabled_call_ieee_disabled() #1 {
39 %call = call i32 @func_ieee_disabled()
40 ret i32 %call
41 }
42
43 ; CHECK-LABEL: @ieee_disabled_call_default(
44 ; CHECK-NEXT: call i32 @func_default()
45 define i32 @ieee_disabled_call_default() #2 {
46 %call = call i32 @func_default()
47 ret i32 %call
48 }
49
50 ; CHECK-LABEL: @ieee_disabled_call_ieee_enabled(
51 ; CHECK-NEXT: call i32 @func_ieee_enabled()
52 define i32 @ieee_disabled_call_ieee_enabled() #2 {
53 %call = call i32 @func_ieee_enabled()
54 ret i32 %call
55 }
56
57 ; CHECK-LABEL: @ieee_disabled_call_ieee_disabled(
58 ; CHECK-NEXT: ret i32 0
59 define i32 @ieee_disabled_call_ieee_disabled() #2 {
60 %call = call i32 @func_ieee_disabled()
61 ret i32 %call
62 }
63
64 ; Shader calling a compute function
65 ; CHECK-LABEL: @amdgpu_ps_default_call_default(
66 ; CHECK-NEXT: call i32 @func_default()
67 define amdgpu_ps i32 @amdgpu_ps_default_call_default() #0 {
68 %call = call i32 @func_default()
69 ret i32 %call
70 }
71
72 ; Shader with ieee enabled calling a compute function
73 ; CHECK-LABEL: @amdgpu_ps_ieee_enabled_call_default(
74 ; CHECK-NEXT: ret i32 0
75 define amdgpu_ps i32 @amdgpu_ps_ieee_enabled_call_default() #1 {
76 %call = call i32 @func_default()
77 ret i32 %call
78 }
79
80 ; CHECK-LABEL: @amdgpu_ps_ieee_disabled_call_default(
81 ; CHECK-NEXT: call i32 @func_default()
82 define amdgpu_ps i32 @amdgpu_ps_ieee_disabled_call_default() #2 {
83 %call = call i32 @func_default()
84 ret i32 %call
85 }
86
87 attributes #0 = { nounwind }
88 attributes #1 = { nounwind "amdgpu-ieee"="true" }
89 attributes #2 = { nounwind "amdgpu-ieee"="false" }