llvm.org GIT mirror llvm / 953c681
R600 -> AMDGPU rename git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239657 91177308-0d34-0410-b5e6-96231b3b80d8 Tom Stellard 4 years ago
1087 changed file(s) with 92654 addition(s) and 92652 deletion(s).
175175
176176 set(LLVM_ALL_TARGETS
177177 AArch64
178 AMDGPU
178179 ARM
179180 BPF
180181 CppBackend
183184 MSP430
184185 NVPTX
185186 PowerPC
186 R600
187187 Sparc
188188 SystemZ
189189 X86
10961096 fi
10971097
10981098 dnl List all possible targets
1099 ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600 BPF"
1099 ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ AMDGPU BPF"
11001100 AC_SUBST(ALL_TARGETS,$ALL_TARGETS)
11011101
11021102 dnl Allow specific targets to be specified for building (or not)
11311131 hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
11321132 nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
11331133 systemz) TARGETS_TO_BUILD="SystemZ $TARGETS_TO_BUILD" ;;
1134 r600) TARGETS_TO_BUILD="R600 $TARGETS_TO_BUILD" ;;
1134 amdgpu) ;&
1135 r600) TARGETS_TO_BUILD="AMDGPU $TARGETS_TO_BUILD" ;;
11351136 host) case "$llvm_cv_target_arch" in
11361137 x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
11371138 x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
56275627
56285628 fi
56295629
5630 ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600 BPF"
5630 ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ AMDGPU BPF"
56315631 ALL_TARGETS=$ALL_TARGETS
56325632
56335633
56645664 hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
56655665 nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
56665666 systemz) TARGETS_TO_BUILD="SystemZ $TARGETS_TO_BUILD" ;;
5667 r600) TARGETS_TO_BUILD="R600 $TARGETS_TO_BUILD" ;;
5667 amdgpu) ;&
5668 r600) TARGETS_TO_BUILD="AMDGPU $TARGETS_TO_BUILD" ;;
56685669 host) case "$llvm_cv_target_arch" in
56695670 x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
56705671 x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
0 ==============================
1 User Guide for AMDGPU Back-end
2 ==============================
3
4 Introduction
5 ============
6
7 The AMDGPU back-end provides ISA code generation for AMD GPUs, starting with
8 the R600 family up until the current Volcanic Islands (GCN Gen 3).
9
10
11 Assembler
12 =========
13
14 The assembler is currently considered experimental.
15
16 For syntax examples look in test/MC/AMDGPU.
17
18 Below are some of the currently supported features (modulo bugs). These
19 all apply to the Southern Islands ISA. Sea Islands and Volcanic Islands
20 are also supported, but may be missing some instructions and have more bugs:
21
22 DS Instructions
23 ---------------
24 All DS instructions are supported.
25
26 FLAT Instructions
27 ------------------
28 These instructions are only present in the Sea Islands and Volcanic Islands
29 instruction set. All FLAT instructions are supported for these architectures.
30
31 MUBUF Instructions
32 ------------------
33 All non-atomic MUBUF instructions are supported.
34
35 SMRD Instructions
36 -----------------
37 Only the s_load_dword* SMRD instructions are supported.
38
39 SOP1 Instructions
40 -----------------
41 All SOP1 instructions are supported.
42
43 SOP2 Instructions
44 -----------------
45 All SOP2 instructions are supported.
46
47 SOPC Instructions
48 -----------------
49 All SOPC instructions are supported.
50
51 SOPP Instructions
52 -----------------
53
54 Unless otherwise mentioned, all SOPP instructions that have one or more
55 operands accept integer operands only. No verification is performed
56 on the operands, so it is up to the programmer to be familiar with the
57 range of acceptable values.
58
59 s_waitcnt
60 ^^^^^^^^^
61
62 s_waitcnt accepts named arguments to specify which memory counter(s) to
63 wait for.
64
65 .. code-block:: nasm
66
67 // Wait for all counters to be 0
68 s_waitcnt 0
69
70 // Equivalent to s_waitcnt 0. Counter names can also be delimited by
71 // '&' or ','.
72 s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73
74 // Wait for vmcnt counter to be 1.
75 s_waitcnt vmcnt(1)
76
77 VOP1, VOP2, VOP3, VOPC Instructions
78 -----------------------------------
79
80 All 32-bit and 64-bit encodings should work.
81
82 The assembler will automatically detect which encoding size to use for
83 VOP1, VOP2, and VOPC instructions based on the operands. If you want to force
84 a specific encoding size, you can add an _e32 (for 32-bit encoding) or
85 _e64 (for 64-bit encoding) suffix to the instruction. Most, but not all
86 instructions support an explicit suffix. These are all valid assembly
87 strings:
88
89 .. code-block:: nasm
90
91 v_mul_i32_i24 v1, v2, v3
92 v_mul_i32_i24_e32 v1, v2, v3
93 v_mul_i32_i24_e64 v1, v2, v3
6767 * `PowerPC64 alignment of long doubles (from GCC) `_
6868 * `Long branch stubs for powerpc64-linux (from binutils) `_
6969
70 R600
71 ----
70 AMDGPU
71 ------
7272
7373 * `AMD R6xx shader ISA `_
7474 * `AMD R7xx shader ISA `_
710710 | | as ``LLVM_ALL_TARGETS``, and can be set to include |
711711 | | out-of-tree targets. The default value includes: |
712712 | | ``AArch64, ARM, CppBackend, Hexagon, |
713 | | Mips, MSP430, NVPTX, PowerPC, R600, Sparc, |
713 | | Mips, MSP430, NVPTX, PowerPC, AMDGPU, Sparc, |
714714 | | SystemZ, X86, XCore``. |
715715 +-------------------------+----------------------------------------------------+
716716 | LLVM_ENABLE_DOXYGEN | Build doxygen-based documentation from the source |
+0
-94
docs/R600Usage.rst
None ============================
1 User Guide for R600 Back-end
2 ============================
3
4 Introduction
5 ============
6
7 The R600 back-end provides ISA code generation for AMD GPUs, starting with
8 the R600 family up until the current Volcanic Islands (GCN Gen 3).
9
10
11 Assembler
12 =========
13
14 The assembler is currently considered experimental.
15
16 For syntax examples look in test/MC/R600.
17
18 Below some of the currently supported features (modulo bugs). These
19 all apply to the Southern Islands ISA, Sea Islands and Volcanic Islands
20 are also supported but may be missing some instructions and have more bugs:
21
22 DS Instructions
23 ---------------
24 All DS instructions are supported.
25
26 FLAT Instructions
27 ------------------
28 These instructions are only present in the Sea Islands and Volcanic Islands
29 instruction set. All FLAT instructions are supported for these architectures
30
31 MUBUF Instructions
32 ------------------
33 All non-atomic MUBUF instructions are supported.
34
35 SMRD Instructions
36 -----------------
37 Only the s_load_dword* SMRD instructions are supported.
38
39 SOP1 Instructions
40 -----------------
41 All SOP1 instructions are supported.
42
43 SOP2 Instructions
44 -----------------
45 All SOP2 instructions are supported.
46
47 SOPC Instructions
48 -----------------
49 All SOPC instructions are supported.
50
51 SOPP Instructions
52 -----------------
53
54 Unless otherwise mentioned, all SOPP instructions that have one or more
55 operands accept integer operands only. No verification is performed
56 on the operands, so it is up to the programmer to be familiar with the
57 range or acceptable values.
58
59 s_waitcnt
60 ^^^^^^^^^
61
62 s_waitcnt accepts named arguments to specify which memory counter(s) to
63 wait for.
64
65 .. code-block:: nasm
66
67 // Wait for all counters to be 0
68 s_waitcnt 0
69
70 // Equivalent to s_waitcnt 0. Counter names can also be delimited by
71 // '&' or ','.
72 s_waitcnt vmcnt(0) expcnt(0) lgkcmt(0)
73
74 // Wait for vmcnt counter to be 1.
75 s_waitcnt vmcnt(1)
76
77 VOP1, VOP2, VOP3, VOPC Instructions
78 -----------------------------------
79
80 All 32-bit and 64-bit encodings should work.
81
82 The assembler will automatically detect which encoding size to use for
83 VOP1, VOP2, and VOPC instructions based on the operands. If you want to force
84 a specific encoding size, you can add an _e32 (for 32-bit encoding) or
85 _e64 (for 64-bit encoding) suffix to the instruction. Most, but not all
86 instructions support an explicit suffix. These are all valid assembly
87 strings:
88
89 .. code-block:: nasm
90
91 v_mul_i32_i24 v1, v2, v3
92 v_mul_i32_i24_e32 v1, v2, v3
93 v_mul_i32_i24_e64 v1, v2, v3
251251 WritingAnLLVMPass
252252 HowToUseAttributes
253253 NVPTXUsage
254 R600Usage
254 AMDGPUUsage
255255 StackMaps
256256 InAlloca
257257 BigEndianNEON
337337 :doc:`NVPTXUsage`
338338 This document describes using the NVPTX back-end to compile GPU kernels.
339339
340 :doc:`R600Usage`
341 This document describes how to use the R600 back-end.
340 :doc:`AMDGPUUsage`
341 This document describes how to use the AMDGPU back-end.
342342
343343 :doc:`StackMaps`
344344 LLVM support for mapping instruction addresses to the location of
0 //===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 /// \file
8 //===----------------------------------------------------------------------===//
9
10 #ifndef LLVM_LIB_TARGET_R600_AMDGPU_H
11 #define LLVM_LIB_TARGET_R600_AMDGPU_H
12
13 #include "llvm/Support/TargetRegistry.h"
14 #include "llvm/Target/TargetMachine.h"
15
16 namespace llvm {
17
18 class AMDGPUInstrPrinter;
19 class AMDGPUSubtarget;
20 class AMDGPUTargetMachine;
21 class FunctionPass;
22 class MCAsmInfo;
23 class raw_ostream;
24 class Target;
25 class TargetMachine;
26
27 // R600 Passes
28 FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
29 FunctionPass *createR600TextureIntrinsicsReplacer();
30 FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
31 FunctionPass *createR600EmitClauseMarkers();
32 FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
33 FunctionPass *createR600Packetizer(TargetMachine &tm);
34 FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
35 FunctionPass *createAMDGPUCFGStructurizerPass();
36
37 // SI Passes
38 FunctionPass *createSITypeRewriter();
39 FunctionPass *createSIAnnotateControlFlowPass();
40 FunctionPass *createSIFoldOperandsPass();
41 FunctionPass *createSILowerI1CopiesPass();
42 FunctionPass *createSIShrinkInstructionsPass();
43 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
44 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
45 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
46 FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
47 FunctionPass *createSIFixSGPRLiveRangesPass();
48 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
49 FunctionPass *createSIInsertWaits(TargetMachine &tm);
50 FunctionPass *createSIPrepareScratchRegs();
51
52 void initializeSIFoldOperandsPass(PassRegistry &);
53 extern char &SIFoldOperandsID;
54
55 void initializeSILowerI1CopiesPass(PassRegistry &);
56 extern char &SILowerI1CopiesID;
57
58 void initializeSILoadStoreOptimizerPass(PassRegistry &);
59 extern char &SILoadStoreOptimizerID;
60
61 // Passes common to R600 and SI
62 FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
63 Pass *createAMDGPUStructurizeCFGPass();
64 FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
65 ModulePass *createAMDGPUAlwaysInlinePass();
66
67 void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
68 extern char &SIFixControlFlowLiveIntervalsID;
69
70 void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
71 extern char &SIFixSGPRLiveRangesID;
72
73
74 extern Target TheAMDGPUTarget;
75 extern Target TheGCNTarget;
76
77 namespace AMDGPU {
78 enum TargetIndex {
79 TI_CONSTDATA_START,
80 TI_SCRATCH_RSRC_DWORD0,
81 TI_SCRATCH_RSRC_DWORD1,
82 TI_SCRATCH_RSRC_DWORD2,
83 TI_SCRATCH_RSRC_DWORD3
84 };
85 }
86
87 #define END_OF_TEXT_LABEL_NAME "EndOfTextLabel"
88
89 } // End namespace llvm
90
91 namespace ShaderType {
92 enum Type {
93 PIXEL = 0,
94 VERTEX = 1,
95 GEOMETRY = 2,
96 COMPUTE = 3
97 };
98 }
99
100 /// OpenCL uses address spaces to differentiate between
101 /// various memory regions on the hardware. On the CPU
102 /// all of the address spaces point to the same memory,
103 /// however on the GPU, each address space points to
104 /// a separate piece of memory that is unique from other
105 /// memory locations.
106 namespace AMDGPUAS {
107 enum AddressSpaces : unsigned {
108 PRIVATE_ADDRESS = 0, ///< Address space for private memory.
109 GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
110 CONSTANT_ADDRESS = 2, ///< Address space for constant memory
111 LOCAL_ADDRESS = 3, ///< Address space for local memory.
112 FLAT_ADDRESS = 4, ///< Address space for flat memory.
113 REGION_ADDRESS = 5, ///< Address space for region memory.
114 PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0)
115 PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1)
116
117 // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this
118 // order to be able to dynamically index a constant buffer, for example:
119 //
120 // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
121
122 CONSTANT_BUFFER_0 = 8,
123 CONSTANT_BUFFER_1 = 9,
124 CONSTANT_BUFFER_2 = 10,
125 CONSTANT_BUFFER_3 = 11,
126 CONSTANT_BUFFER_4 = 12,
127 CONSTANT_BUFFER_5 = 13,
128 CONSTANT_BUFFER_6 = 14,
129 CONSTANT_BUFFER_7 = 15,
130 CONSTANT_BUFFER_8 = 16,
131 CONSTANT_BUFFER_9 = 17,
132 CONSTANT_BUFFER_10 = 18,
133 CONSTANT_BUFFER_11 = 19,
134 CONSTANT_BUFFER_12 = 20,
135 CONSTANT_BUFFER_13 = 21,
136 CONSTANT_BUFFER_14 = 22,
137 CONSTANT_BUFFER_15 = 23,
138 ADDRESS_NONE = 24, ///< Address space for unknown memory.
139 LAST_ADDRESS = ADDRESS_NONE,
140
141 // Some places use this if the address space can't be determined.
142 UNKNOWN_ADDRESS_SPACE = ~0u
143 };
144
145 } // namespace AMDGPUAS
146
147 #endif
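The comment in the AMDGPUAS enum above is the whole contract: the CONSTANT_BUFFER_* values stay contiguous precisely so that a dynamic buffer index can be added to CONSTANT_BUFFER_0. Below is a minimal illustrative sketch of that pattern; the helper name is hypothetical, not part of this header, and it assumes it is compiled inside the backend where AMDGPU.h is visible.

#include "AMDGPU.h"   // AMDGPUAS::AddressSpaces (backend-private header)
#include <cassert>

// Hypothetical helper: map a dynamic constant-buffer index onto the
// contiguous CONSTANT_BUFFER_* address-space values declared above.
static unsigned getConstantBufferAddrSpace(unsigned CBIdx) {
  assert(CBIdx < 16 && "only CONSTANT_BUFFER_0..CONSTANT_BUFFER_15 are defined");
  return AMDGPUAS::CONSTANT_BUFFER_0 + CBIdx; // CBIdx = 2 -> address space 10
}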
0 //===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8
9 include "llvm/Target/Target.td"
10
11 //===----------------------------------------------------------------------===//
12 // Subtarget Features
13 //===----------------------------------------------------------------------===//
14
15 // Debugging Features
16
17 def FeatureDumpCode : SubtargetFeature <"DumpCode",
18 "DumpCode",
19 "true",
20 "Dump MachineInstrs in the CodeEmitter">;
21
22 def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
23 "DumpCode",
24 "true",
25 "Dump MachineInstrs in the CodeEmitter">;
26
27 def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer",
28 "EnableIRStructurizer",
29 "false",
30 "Disable IR Structurizer">;
31
32 def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
33 "EnablePromoteAlloca",
34 "true",
35 "Enable promote alloca pass">;
36
37 // Target features
38
39 def FeatureIfCvt : SubtargetFeature <"disable-ifcvt",
40 "EnableIfCvt",
41 "false",
42 "Disable the if conversion pass">;
43
44 def FeatureFP64 : SubtargetFeature<"fp64",
45 "FP64",
46 "true",
47 "Enable double precision operations">;
48
49 def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
50 "FP64Denormals",
51 "true",
52 "Enable double precision denormal handling",
53 [FeatureFP64]>;
54
55 def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
56 "FastFMAF32",
57 "true",
58 "Assuming f32 fma is at least as fast as mul + add",
59 []>;
60
61 // Some instructions do not support denormals despite this flag. Using
62 // fp32 denormals also causes instructions to run at the double
63 // precision rate for the device.
64 def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
65 "FP32Denormals",
66 "true",
67 "Enable single precision denormal handling">;
68
69 def Feature64BitPtr : SubtargetFeature<"64BitPtr",
70 "Is64bit",
71 "true",
72 "Specify if 64-bit addressing should be used">;
73
74 def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
75 "R600ALUInst",
76 "false",
77 "Older version of ALU instructions encoding">;
78
79 def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
80 "HasVertexCache",
81 "true",
82 "Specify use of dedicated vertex cache">;
83
84 def FeatureCaymanISA : SubtargetFeature<"caymanISA",
85 "CaymanISA",
86 "true",
87 "Use Cayman ISA">;
88
89 def FeatureCFALUBug : SubtargetFeature<"cfalubug",
90 "CFALUBug",
91 "true",
92 "GPU has CF_ALU bug">;
93
94 // XXX - This should probably be removed once enabled by default
95 def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
96 "EnableLoadStoreOpt",
97 "true",
98 "Enable SI load/store optimizer pass">;
99
100 def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
101 "FlatAddressSpace",
102 "true",
103 "Support flat address space">;
104
105 def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
106 "EnableVGPRSpilling",
107 "true",
108 "Enable spilling of VGPRs to scratch memory">;
109
110 def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
111 "SGPRInitBug",
112 "true",
113 "VI SGPR initialization bug requiring a fixed SGPR allocation size">;
114
115 class SubtargetFeatureFetchLimit <string Value> :
116 SubtargetFeature <"fetch"#Value,
117 "TexVTXClauseSize",
118 Value,
119 "Limit the maximum number of fetches in a clause to "#Value>;
120
121 def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
122 def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
123
124 class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
125 "wavefrontsize"#Value,
126 "WavefrontSize",
127 !cast<string>(Value),
128 "The number of threads per wavefront">;
129
130 def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
131 def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
132 def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
133
134 class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
135 "ldsbankcount"#Value,
136 "LDSBankCount",
137 !cast<string>(Value),
138 "The number of LDS banks per compute unit.">;
139
140 def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
141 def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
142
143 class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
144 "localmemorysize"#Value,
145 "LocalMemorySize",
146 !cast<string>(Value),
147 "The size of local memory in bytes">;
148
149 def FeatureGCN : SubtargetFeature<"gcn",
150 "IsGCN",
151 "true",
152 "GCN or newer GPU">;
153
154 def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding",
155 "GCN1Encoding",
156 "true",
157 "Encoding format for SI and CI">;
158
159 def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
160 "GCN3Encoding",
161 "true",
162 "Encoding format for VI">;
163
164 def FeatureCIInsts : SubtargetFeature<"ci-insts",
165 "CIInsts",
166 "true",
167 "Additional instructions for CI+">;
168
169 // Dummy feature used to disable assembler instructions.
170 def FeatureDisable : SubtargetFeature<"",
171 "FeatureDisable","true",
172 "Dummy feature to disable assembler"
173 " instructions">;
174
175 class SubtargetFeatureGeneration <string Value,
176 list<SubtargetFeature> Implies> :
177 SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
178 Value#" GPU generation", Implies>;
179
180 def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
181 def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
182 def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
183
184 def FeatureR600 : SubtargetFeatureGeneration<"R600",
185 [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>;
186
187 def FeatureR700 : SubtargetFeatureGeneration<"R700",
188 [FeatureFetchLimit16, FeatureLocalMemorySize0]>;
189
190 def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
191 [FeatureFetchLimit16, FeatureLocalMemorySize32768]>;
192
193 def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
194 [FeatureFetchLimit16, FeatureWavefrontSize64,
195 FeatureLocalMemorySize32768]
196 >;
197
198 def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
199 [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768,
200 FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding,
201 FeatureLDSBankCount32]>;
202
203 def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
204 [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
205 FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
206 FeatureGCN1Encoding, FeatureCIInsts]>;
207
208 def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
209 [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
210 FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
211 FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>;
212
213 //===----------------------------------------------------------------------===//
214
215 def AMDGPUInstrInfo : InstrInfo {
216 let guessInstructionProperties = 1;
217 let noNamedPositionallyEncodedOperands = 1;
218 }
219
220 def AMDGPUAsmParser : AsmParser {
221 // Some of the R600 registers have the same name, so this crashes.
222 // For example T0_XYZW and T0_XY both have the asm name T0.
223 let ShouldEmitMatchRegisterName = 0;
224 }
225
226 def AMDGPU : Target {
227 // Pull in Instruction Info:
228 let InstructionSet = AMDGPUInstrInfo;
229 let AssemblyParsers = [AMDGPUAsmParser];
230 }
231
232 // Dummy Instruction itineraries for pseudo instructions
233 def ALU_NULL : FuncUnit;
234 def NullALU : InstrItinClass;
235
236 //===----------------------------------------------------------------------===//
237 // Predicate helper class
238 //===----------------------------------------------------------------------===//
239
240 def TruePredicate : Predicate<"true">;
241 def isSICI : Predicate<
242 "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
243 "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
244 >, AssemblerPredicate<"FeatureGCN1Encoding">;
245
246 class PredicateControl {
247 Predicate SubtargetPredicate;
248 Predicate SIAssemblerPredicate = isSICI;
249 list<Predicate> AssemblerPredicates = [];
250 Predicate AssemblerPredicate = TruePredicate;
251 list<Predicate> OtherPredicates = [];
252 list<Predicate> Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate],
253 AssemblerPredicates,
254 OtherPredicates);
255 }
256
257 // Include AMDGPU TD files
258 include "R600Schedule.td"
259 include "SISchedule.td"
260 include "Processors.td"
261 include "AMDGPUInstrInfo.td"
262 include "AMDGPUIntrinsics.td"
263 include "AMDGPURegisterInfo.td"
264 include "AMDGPUInstructions.td"
265 include "AMDGPUCallingConv.td"
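The SubtargetFeature definitions above are what surface as feature names in a target feature string. As a hedged sketch of how that plugs into the rest of LLVM's 3.7-era C++ API, the code below looks up the GCN target registered by this backend and hands it a feature string built from those names. The triple spelling and the CPU name "bonaire" are assumptions for illustration, not something this file defines.

#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <memory>
#include <string>

using namespace llvm;

// Sketch: build a TargetMachine for the GCN target with two of the subtarget
// features defined above enabled ("flat-address-space", "load-store-opt").
static std::unique_ptr<TargetMachine> buildGCNTargetMachine() {
  InitializeAllTargetInfos();
  InitializeAllTargets();
  InitializeAllTargetMCs();

  std::string Error;
  const std::string TT = "amdgcn--";  // assumed triple spelling
  const Target *T = TargetRegistry::lookupTarget(TT, Error);
  if (!T)
    return nullptr;

  return std::unique_ptr<TargetMachine>(T->createTargetMachine(
      TT, /*CPU=*/"bonaire", "+flat-address-space,+load-store-opt",
      TargetOptions(), Reloc::Default));
}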
0 //===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass marks all internal functions as always_inline and creates
11 /// duplicates of all other functions and marks the duplicates as always_inline.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "AMDGPU.h"
16 #include "llvm/IR/Module.h"
17 #include "llvm/Transforms/Utils/Cloning.h"
18
19 using namespace llvm;
20
21 namespace {
22
23 class AMDGPUAlwaysInline : public ModulePass {
24
25 static char ID;
26
27 public:
28 AMDGPUAlwaysInline() : ModulePass(ID) { }
29 bool runOnModule(Module &M) override;
30 const char *getPassName() const override { return "AMDGPU Always Inline Pass"; }
31 };
32
33 } // End anonymous namespace
34
35 char AMDGPUAlwaysInline::ID = 0;
36
37 bool AMDGPUAlwaysInline::runOnModule(Module &M) {
38
39 std::vector<Function *> FuncsToClone;
40 for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
41 Function &F = *I;
42 if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
43 !F.hasFnAttribute(Attribute::NoInline))
44 FuncsToClone.push_back(&F);
45 }
46
47 for (Function *F : FuncsToClone) {
48 ValueToValueMapTy VMap;
49 Function *NewFunc = CloneFunction(F, VMap, false);
50 NewFunc->setLinkage(GlobalValue::InternalLinkage);
51 F->getParent()->getFunctionList().push_back(NewFunc);
52 F->replaceAllUsesWith(NewFunc);
53 }
54
55 for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
56 Function &F = *I;
57 if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) {
58 F.addFnAttr(Attribute::AlwaysInline);
59 }
60 }
61 return false;
62 }
63
64 ModulePass *llvm::createAMDGPUAlwaysInlinePass() {
65 return new AMDGPUAlwaysInline();
66 }
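createAMDGPUAlwaysInlinePass() above is the pass's only public entry point; the backend adds it from its own pass pipeline. Purely as an illustrative sketch (assuming it is built where the private AMDGPU.h header is visible), this is how the pass could be run standalone over a module with the legacy pass manager:

#include "AMDGPU.h"                       // declares createAMDGPUAlwaysInlinePass()
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Sketch: run only the always-inline pass over an already-parsed module.
static void runAlwaysInline(Module &M) {
  legacy::PassManager PM;
  PM.add(createAMDGPUAlwaysInlinePass());
  PM.run(M);                              // marks/clones functions as described above
}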
0 //===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer --------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 ///
11 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12 /// code. When passed an MCAsmStreamer it prints assembly and when passed
13 /// an MCObjectStreamer it outputs binary code.
14 //
15 //===----------------------------------------------------------------------===//
16 //
17
18 #include "AMDGPUAsmPrinter.h"
19 #include "InstPrinter/AMDGPUInstPrinter.h"
20 #include "AMDGPU.h"
21 #include "AMDKernelCodeT.h"
22 #include "AMDGPUSubtarget.h"
23 #include "R600Defines.h"
24 #include "R600MachineFunctionInfo.h"
25 #include "R600RegisterInfo.h"
26 #include "SIDefines.h"
27 #include "SIMachineFunctionInfo.h"
28 #include "SIRegisterInfo.h"
29 #include "llvm/CodeGen/MachineFrameInfo.h"
30 #include "llvm/MC/MCContext.h"
31 #include "llvm/MC/MCSectionELF.h"
32 #include "llvm/MC/MCStreamer.h"
33 #include "llvm/Support/ELF.h"
34 #include "llvm/Support/MathExtras.h"
35 #include "llvm/Support/TargetRegistry.h"
36 #include "llvm/Target/TargetLoweringObjectFile.h"
37
38 using namespace llvm;
39
40 // TODO: This should get the default rounding mode from the kernel. We just set
41 // the default here, but this could change if the OpenCL rounding mode pragmas
42 // are used.
43 //
44 // The denormal mode here should match what is reported by the OpenCL runtime
45 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
46 // can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
47 //
48 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
49 // precision, and leaves single precision to flush all and does not report
50 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
51 // CL_FP_DENORM for both.
52 //
53 // FIXME: It seems some instructions do not support single precision denormals
54 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
55 // and sin_f32, cos_f32 on most parts).
56
57 // We want to use these instructions, and using fp32 denormals also causes
58 // instructions to run at the double precision rate for the device so it's
59 // probably best to just report no single precision denormals.
60 static uint32_t getFPMode(const MachineFunction &F) {
61 const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>();
62 // TODO: Is there any real use for the flush in only / flush out only modes?
63
64 uint32_t FP32Denormals =
65 ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
66
67 uint32_t FP64Denormals =
68 ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
69
70 return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
71 FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
72 FP_DENORM_MODE_SP(FP32Denormals) |
73 FP_DENORM_MODE_DP(FP64Denormals);
74 }
75
76 static AsmPrinter *
77 createAMDGPUAsmPrinterPass(TargetMachine &tm,
78 std::unique_ptr<MCStreamer> &&Streamer) {
79 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
80 }
81
82 extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
83 TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
84 TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass);
85 }
86
87 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
88 std::unique_ptr<MCStreamer> Streamer)
89 : AsmPrinter(TM, std::move(Streamer)) {}
90
91 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
92
93 // This label is used to mark the end of the .text section.
94 const TargetLoweringObjectFile &TLOF = getObjFileLowering();
95 OutStreamer->SwitchSection(TLOF.getTextSection());
96 MCSymbol *EndOfTextLabel =
97 OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
98 OutStreamer->EmitLabel(EndOfTextLabel);
99 }
100
101 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
102
103 // The starting address of all shader programs must be 256 bytes aligned.
104 MF.setAlignment(8);
105
106 SetupMachineFunction(MF);
107
108 MCContext &Context = getObjFileLowering().getContext();
109 MCSectionELF *ConfigSection =
110 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
111 OutStreamer->SwitchSection(ConfigSection);
112
113 const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
114 SIProgramInfo KernelInfo;
115 if (STM.isAmdHsaOS()) {
116 getSIProgramInfo(KernelInfo, MF);
117 EmitAmdKernelCodeT(MF, KernelInfo);
118 OutStreamer->EmitCodeAlignment(2 << (MF.getAlignment() - 1));
119 } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
120 getSIProgramInfo(KernelInfo, MF);
121 EmitProgramInfoSI(MF, KernelInfo);
122 } else {
123 EmitProgramInfoR600(MF);
124 }
125
126 DisasmLines.clear();
127 HexLines.clear();
128 DisasmLineMaxLen = 0;
129
130 EmitFunctionBody();
131
132 if (isVerbose()) {
133 MCSectionELF *CommentSection =
134 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
135 OutStreamer->SwitchSection(CommentSection);
136
137 if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
138 OutStreamer->emitRawComment(" Kernel info:", false);
139 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
140 false);
141 OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
142 false);
143 OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
144 false);
145 OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
146 false);
147 OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
148 false);
149 OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
150 false);
151 } else {
152 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
153 OutStreamer->emitRawComment(
154 Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
155 }
156 }
157
158 if (STM.dumpCode()) {
159
160 OutStreamer->SwitchSection(
161 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
162
163 for (size_t i = 0; i < DisasmLines.size(); ++i) {
164 std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
165 Comment += " ; " + HexLines[i] + "\n";
166
167 OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
168 OutStreamer->EmitBytes(StringRef(Comment));
169 }
170 }
171
172 return false;
173 }
174
175 void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
176 unsigned MaxGPR = 0;
177 bool killPixel = false;
178 const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
179 const R600RegisterInfo *RI =
180 static_cast<const R600RegisterInfo *>(STM.getRegisterInfo());
181 const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
182
183 for (const MachineBasicBlock &MBB : MF) {
184 for (const MachineInstr &MI : MBB) {
185 if (MI.getOpcode() == AMDGPU::KILLGT)
186 killPixel = true;
187 unsigned numOperands = MI.getNumOperands();
188 for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
189 const MachineOperand &MO = MI.getOperand(op_idx);
190 if (!MO.isReg())
191 continue;
192 unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
193
194 // Registers with values > 127 aren't GPRs
195 if (HWReg > 127)
196 continue;
197 MaxGPR = std::max(MaxGPR, HWReg);
198 }
199 }
200 }
201
202 unsigned RsrcReg;
203 if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
204 // Evergreen / Northern Islands
205 switch (MFI->getShaderType()) {
206 default: // Fall through
207 case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
208 case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
209 case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
210 case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
211 }
212 } else {
213 // R600 / R700
214 switch (MFI->getShaderType()) {
215 default: // Fall through
216 case ShaderType::GEOMETRY: // Fall through
217 case ShaderType::COMPUTE: // Fall through
218 case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
219 case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
220 }
221 }
222
223 OutStreamer->EmitIntValue(RsrcReg, 4);
224 OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
225 S_STACK_SIZE(MFI->StackSize), 4);
226 OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
227 OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
228
229 if (MFI->getShaderType() == ShaderType::COMPUTE) {
230 OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
231 OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
232 }
233 }
234
235 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
236 const MachineFunction &MF) const {
237 const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
238 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
239 uint64_t CodeSize = 0;
240 unsigned MaxSGPR = 0;
241 unsigned MaxVGPR = 0;
242 bool VCCUsed = false;
243 bool FlatUsed = false;
244 const SIRegisterInfo *RI =
245 static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
246
247 for (const MachineBasicBlock &MBB : MF) {
248 for (const MachineInstr &MI : MBB) {
249 // TODO: CodeSize should account for multiple functions.
250 CodeSize += MI.getDesc().Size;
251
252 unsigned numOperands = MI.getNumOperands();
253 for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
254 const MachineOperand &MO = MI.getOperand(op_idx);
255 unsigned width = 0;
256 bool isSGPR = false;
257
258 if (!MO.isReg()) {
259 continue;
260 }
261 unsigned reg = MO.getReg();
262 if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO ||
263 reg == AMDGPU::VCC_HI) {
264 VCCUsed = true;
265 continue;
266 } else if (reg == AMDGPU::FLAT_SCR ||
267 reg == AMDGPU::FLAT_SCR_LO ||
268 reg == AMDGPU::FLAT_SCR_HI) {
269 FlatUsed = true;
270 continue;
271 }
272
273 switch (reg) {
274 default: break;
275 case AMDGPU::SCC:
276 case AMDGPU::EXEC:
277 case AMDGPU::M0:
278 continue;
279 }
280
281 if (AMDGPU::SReg_32RegClass.contains(reg)) {
282 isSGPR = true;
283 width = 1;
284 } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
285 isSGPR = false;
286 width = 1;
287 } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
288 isSGPR = true;
289 width = 2;
290 } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
291 isSGPR = false;
292 width = 2;
293 } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
294 isSGPR = false;
295 width = 3;
296 } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
297 isSGPR = true;
298 width = 4;
299 } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
300 isSGPR = false;
301 width = 4;
302 } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
303 isSGPR = true;
304 width = 8;
305 } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
306 isSGPR = false;
307 width = 8;
308 } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
309 isSGPR = true;
310 width = 16;
311 } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
312 isSGPR = false;
313 width = 16;
314 } else {
315 llvm_unreachable("Unknown register class");
316 }
317 unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
318 unsigned maxUsed = hwReg + width - 1;
319 if (isSGPR) {
320 MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
321 } else {
322 MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
323 }
324 }
325 }
326 }
327
328 if (VCCUsed)
329 MaxSGPR += 2;
330
331 if (FlatUsed)
332 MaxSGPR += 2;
333
334 // We found the maximum register index. They start at 0, so add one to get the
335 // number of registers.
336 ProgInfo.NumVGPR = MaxVGPR + 1;
337 ProgInfo.NumSGPR = MaxSGPR + 1;
338
339 if (STM.hasSGPRInitBug()) {
340 if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG)
341 llvm_unreachable("Too many SGPRs used with the SGPR init bug");
342
343 ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
344 }
345
346 ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
347 ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
348 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
349 // register.
350 ProgInfo.FloatMode = getFPMode(MF);
351
352 // XXX: Not quite sure what this does, but sc seems to unset this.
353 ProgInfo.IEEEMode = 0;
354
355 // Do not clamp NAN to 0.
356 ProgInfo.DX10Clamp = 0;
357
358 const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
359 ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
360
361 ProgInfo.FlatUsed = FlatUsed;
362 ProgInfo.VCCUsed = VCCUsed;
363 ProgInfo.CodeLen = CodeSize;
364
365 unsigned LDSAlignShift;
366 if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
367 // LDS is allocated in 64 dword blocks.
368 LDSAlignShift = 8;
369 } else {
370 // LDS is allocated in 128 dword blocks.
371 LDSAlignShift = 9;
372 }
373
374 unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
375 MFI->getMaximumWorkGroupSize(MF);
376
377 ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
378 ProgInfo.LDSBlocks =
379 RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
380
381 // Scratch is allocated in 256 dword blocks.
382 unsigned ScratchAlignShift = 10;
383 // We need to program the hardware with the amount of scratch memory that
384 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
385 // scratch memory used per thread.
386 ProgInfo.ScratchBlocks =
387 RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(),
388 1 << ScratchAlignShift) >> ScratchAlignShift;
389
390 ProgInfo.ComputePGMRSrc1 =
391 S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
392 S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
393 S_00B848_PRIORITY(ProgInfo.Priority) |
394 S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
395 S_00B848_PRIV(ProgInfo.Priv) |
396 S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
397 S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
398 S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
399
400 ProgInfo.ComputePGMRSrc2 =
401 S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
402 S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
403 S_00B84C_TGID_X_EN(1) |
404 S_00B84C_TGID_Y_EN(1) |
405 S_00B84C_TGID_Z_EN(1) |
406 S_00B84C_TG_SIZE_EN(1) |
407 S_00B84C_TIDIG_COMP_CNT(2) |
408 S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
409 }
410
411 static unsigned getRsrcReg(unsigned ShaderType) {
412 switch (ShaderType) {
413 default: // Fall through
414 case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1;
415 case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
416 case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
417 case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
418 }
419 }
420
421 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
422 const SIProgramInfo &KernelInfo) {
423 const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
424 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
425 unsigned RsrcReg = getRsrcReg(MFI->getShaderType());
426
427 if (MFI->getShaderType() == ShaderType::COMPUTE) {
428 OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
429
430 OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
431
432 OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
433 OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
434
435 OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
436 OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
437
438 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
439 // 0" comment but I don't see a corresponding field in the register spec.
440 } else {
441 OutStreamer->EmitIntValue(RsrcReg, 4);
442 OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
443 S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
444 if (STM.isVGPRSpillingEnabled(MFI)) {
445 OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
446 OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
447 }
448 }
449
450 if (MFI->getShaderType() == ShaderType::PIXEL) {
451 OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
452 OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
453 OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
454 OutStreamer->EmitIntValue(MFI->PSInputAddr, 4);
455 }
456 }
457
458 void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
459 const SIProgramInfo &KernelInfo) const {
460 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
461 const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
462 amd_kernel_code_t header;
463
464 memset(&header, 0, sizeof(header));
465
466 header.amd_code_version_major = AMD_CODE_VERSION_MAJOR;
467 header.amd_code_version_minor = AMD_CODE_VERSION_MINOR;
468
469 header.struct_byte_size = sizeof(amd_kernel_code_t);
470
471 header.target_chip = STM.getAmdKernelCodeChipID();
472
473 header.kernel_code_entry_byte_offset = (1ULL << MF.getAlignment());
474
475 header.compute_pgm_resource_registers =
476 KernelInfo.ComputePGMRSrc1 |
477 (KernelInfo.ComputePGMRSrc2 << 32);
478
479 // Code Properties:
480 header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
481 AMD_CODE_PROPERTY_IS_PTR64;
482
483 if (KernelInfo.FlatUsed)
484 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
485
486 if (KernelInfo.ScratchBlocks)
487 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
488
489 header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
490 header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
491
492 // MFI->ABIArgOffset is the number of bytes for the kernel arguments
493 // plus 36. 36 is the number of bytes reserved at the beginning of the
494 // input buffer to store work-group size information.
495 // FIXME: We should be adding the size of the implicit arguments
496 // to this value.
497 header.kernarg_segment_byte_size = MFI->ABIArgOffset;
498
499 header.wavefront_sgpr_count = KernelInfo.NumSGPR;
500 header.workitem_vgpr_count = KernelInfo.NumVGPR;
501
502 // FIXME: What values do I put for these alignments
503 header.kernarg_segment_alignment = 0;
504 header.group_segment_alignment = 0;
505 header.private_segment_alignment = 0;
506
507 header.code_type = 1; // HSA_EXT_CODE_KERNEL
508
509 header.wavefront_size = STM.getWavefrontSize();
510
511 MCSectionELF *VersionSection =
512 OutContext.getELFSection(".hsa.version", ELF::SHT_PROGBITS, 0);
513 OutStreamer->SwitchSection(VersionSection);
514 OutStreamer->EmitBytes(Twine("HSA Code Unit:" +
515 Twine(header.hsail_version_major) + "." +
516 Twine(header.hsail_version_minor) + ":" +
517 "AMD:" +
518 Twine(header.amd_code_version_major) + "." +
519 Twine(header.amd_code_version_minor) + ":" +
520 "GFX8.1:0").str());
521
522 OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
523
524 if (isVerbose()) {
525 OutStreamer->emitRawComment("amd_code_version_major = " +
526 Twine(header.amd_code_version_major), false);
527 OutStreamer->emitRawComment("amd_code_version_minor = " +
528 Twine(header.amd_code_version_minor), false);
529 OutStreamer->emitRawComment("struct_byte_size = " +
530 Twine(header.struct_byte_size), false);
531 OutStreamer->emitRawComment("target_chip = " +
532 Twine(header.target_chip), false);
533 OutStreamer->emitRawComment(" compute_pgm_rsrc1: " +
534 Twine::utohexstr(KernelInfo.ComputePGMRSrc1),
535 false);
536 OutStreamer->emitRawComment(" compute_pgm_rsrc2: " +
537 Twine::utohexstr(KernelInfo.ComputePGMRSrc2),
538 false);
539 OutStreamer->emitRawComment("enable_sgpr_private_segment_buffer = " +
540 Twine((bool)(header.code_properties &
541 AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false);
542 OutStreamer->emitRawComment("enable_sgpr_kernarg_segment_ptr = " +
543 Twine((bool)(header.code_properties &
544 AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false);
545 OutStreamer->emitRawComment("private_element_size = 2 ", false);
546 OutStreamer->emitRawComment("is_ptr64 = " +
547 Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false);
548 OutStreamer->emitRawComment("workitem_private_segment_byte_size = " +
549 Twine(header.workitem_private_segment_byte_size),
550 false);
551 OutStreamer->emitRawComment("workgroup_group_segment_byte_size = " +
552 Twine(header.workgroup_group_segment_byte_size),
553 false);
554 OutStreamer->emitRawComment("gds_segment_byte_size = " +
555 Twine(header.gds_segment_byte_size), false);
556 OutStreamer->emitRawComment("kernarg_segment_byte_size = " +
557 Twine(header.kernarg_segment_byte_size), false);
558 OutStreamer->emitRawComment("wavefront_sgpr_count = " +
559 Twine(header.wavefront_sgpr_count), false);
560 OutStreamer->emitRawComment("workitem_vgpr_count = " +
561 Twine(header.workitem_vgpr_count), false);
562 OutStreamer->emitRawComment("code_type = " + Twine(header.code_type), false);
563 OutStreamer->emitRawComment("wavefront_size = " +
564 Twine((int)header.wavefront_size), false);
565 OutStreamer->emitRawComment("optimization_level = " +
566 Twine(header.optimization_level), false);
567 OutStreamer->emitRawComment("hsail_profile = " +
568 Twine(header.hsail_profile), false);
569 OutStreamer->emitRawComment("hsail_machine_model = " +
570 Twine(header.hsail_machine_model), false);
571 OutStreamer->emitRawComment("hsail_version_major = " +
572 Twine(header.hsail_version_major), false);
573 OutStreamer->emitRawComment("hsail_version_minor = " +
574 Twine(header.hsail_version_minor), false);
575 }
576
577 OutStreamer->EmitBytes(StringRef((char*)&header, sizeof(header)));
578 }
579
580 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
581 unsigned AsmVariant,
582 const char *ExtraCode, raw_ostream &O) {
583 if (ExtraCode && ExtraCode[0]) {
584 if (ExtraCode[1] != 0)
585 return true; // Unknown modifier.
586
587 switch (ExtraCode[0]) {
588 default:
589 // See if this is a generic print operand
590 return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
591 case 'r':
592 break;
593 }
594 }
595
596 AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O,
597 *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
598 return false;
599 }
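getSIProgramInfo() above granulates the raw counts before they are packed into the PGM_RSRC values: VGPRs in blocks of 4, SGPRs in blocks of 8, and scratch in 256-dword (1024-byte) blocks per wave. A small worked sketch of just that arithmetic, with illustrative input numbers:

#include <cstdint>
#include <cstdio>

// Mirrors llvm::RoundUpToAlignment as used by getSIProgramInfo()
// (Align is a power of two).
static uint64_t roundUp(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  unsigned NumVGPR = 41, NumSGPR = 22;        // example register counts
  unsigned VGPRBlocks = (NumVGPR - 1) / 4;    // -> 10 (blocks of 4 VGPRs)
  unsigned SGPRBlocks = (NumSGPR - 1) / 8;    // -> 2  (blocks of 8 SGPRs)

  // Scratch is programmed per wave in 256-dword (1 << 10 byte) blocks.
  unsigned ScratchSize = 68;                  // example bytes per thread
  unsigned WavefrontSize = 64;
  unsigned ScratchBlocks =
      roundUp(ScratchSize * WavefrontSize, 1 << 10) >> 10;  // -> 5

  printf("VGPRBlocks=%u SGPRBlocks=%u ScratchBlocks=%u\n",
         VGPRBlocks, SGPRBlocks, ScratchBlocks);
  return 0;
}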
0 //===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief AMDGPU Assembly printer class.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
15 #define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
16
17 #include "llvm/CodeGen/AsmPrinter.h"
18 #include <vector>
19
20 namespace llvm {
21
22 class AMDGPUAsmPrinter : public AsmPrinter {
23 private:
24 struct SIProgramInfo {
25 SIProgramInfo() :
26 VGPRBlocks(0),
27 SGPRBlocks(0),
28 Priority(0),
29 FloatMode(0),
30 Priv(0),
31 DX10Clamp(0),
32 DebugMode(0),
33 IEEEMode(0),
34 ScratchSize(0),
35 ComputePGMRSrc1(0),
36 LDSBlocks(0),
37 ScratchBlocks(0),
38 ComputePGMRSrc2(0),
39 NumVGPR(0),
40 NumSGPR(0),
41 FlatUsed(false),
42 VCCUsed(false),
43 CodeLen(0) {}
44
45 // Fields set in PGM_RSRC1 pm4 packet.
46 uint32_t VGPRBlocks;
47 uint32_t SGPRBlocks;
48 uint32_t Priority;
49 uint32_t FloatMode;
50 uint32_t Priv;
51 uint32_t DX10Clamp;
52 uint32_t DebugMode;
53 uint32_t IEEEMode;
54 uint32_t ScratchSize;
55
56 uint64_t ComputePGMRSrc1;
57
58 // Fields set in PGM_RSRC2 pm4 packet.
59 uint32_t LDSBlocks;
60 uint32_t ScratchBlocks;
61
62 uint64_t ComputePGMRSrc2;
63
64 uint32_t NumVGPR;
65 uint32_t NumSGPR;
66 uint32_t LDSSize;
67 bool FlatUsed;
68
69 // Bonus information for debugging.
70 bool VCCUsed;
71 uint64_t CodeLen;
72 };
73
74 void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
75 void findNumUsedRegistersSI(const MachineFunction &MF,
76 unsigned &NumSGPR,
77 unsigned &NumVGPR) const;
78
79 /// \brief Emit register usage information so that the GPU driver
80 /// can correctly setup the GPU state.
81 void EmitProgramInfoR600(const MachineFunction &MF);
82 void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
83 void EmitAmdKernelCodeT(const MachineFunction &MF,
84 const SIProgramInfo &KernelInfo) const;
85
86 public:
87 explicit AMDGPUAsmPrinter(TargetMachine &TM,
88 std::unique_ptr<MCStreamer> Streamer);
89
90 bool runOnMachineFunction(MachineFunction &MF) override;
91
92 const char *getPassName() const override {
93 return "AMDGPU Assembly Printer";
94 }
95
96 /// Implemented in AMDGPUMCInstLower.cpp
97 void EmitInstruction(const MachineInstr *MI) override;
98
99 void EmitEndOfAsmFile(Module &M) override;
100
101 bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
102 unsigned AsmVariant, const char *ExtraCode,
103 raw_ostream &O) override;
104
105 protected:
106 std::vector<std::string> DisasmLines, HexLines;
107 size_t DisasmLineMaxLen;
108 };
109
110 } // End namespace llvm
111
112 #endif
0 //===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This describes the calling conventions for the AMD Radeon GPUs.
10 //
11 //===----------------------------------------------------------------------===//
12
13 // Inversion of CCIfInReg
14 class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
15
16 // Calling convention for SI
17 def CC_SI : CallingConv<[
18
19 CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
20 SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
21 SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
22 SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21
23 ]>>>,
24
25 CCIfInReg<CCIfType<[i64], CCAssignToRegWithShadow<
26 [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
27 [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
28 >>>,
29
30 CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
31 VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
32 VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
33 VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
34 VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
35 ]>>>,
36
37 CCIfByVal<CCIfType<[i64], CCAssignToRegWithShadow<
38 [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
39 [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
40 >>>
41
42 ]>;
43
44 // Calling convention for R600
45 def CC_R600 : CallingConv<[
46 CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
47 T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
48 T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
49 T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
50 T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
51 T30_XYZW, T31_XYZW, T32_XYZW
52 ]>>>
53 ]>;
54
55 // Calling convention for compute kernels
56 def CC_AMDGPU_Kernel : CallingConv<[
57 CCCustom<"allocateStack">
58 ]>;
59
60 def CC_AMDGPU : CallingConv<[
61 CCIf<"static_cast<const AMDGPUSubtarget&>"
62 "(State.getMachineFunction().getSubtarget()).getGeneration() >="
63 "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
64 "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()"
65 "->getShaderType() == ShaderType::COMPUTE",
66 CCDelegateTo<CC_AMDGPU_Kernel>>,
67 CCIf<"static_cast<const AMDGPUSubtarget&>"
68 "(State.getMachineFunction().getSubtarget()).getGeneration() < "
69 "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
70 "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()"
71 "->getShaderType() == ShaderType::COMPUTE",
72 CCDelegateTo<CC_AMDGPU_Kernel>>,
73 CCIf<"static_cast<const AMDGPUSubtarget&>"
74 "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
75 "AMDGPUSubtarget::SOUTHERN_ISLANDS",
76 CCDelegateTo<CC_SI>>,
77 CCIf<"static_cast<const AMDGPUSubtarget&>"
78 "(State.getMachineFunction().getSubtarget()).getGeneration() < "
79 "AMDGPUSubtarget::SOUTHERN_ISLANDS",
80 CCDelegateTo<CC_R600>>
81 ]>;
0 //===----------------------- AMDGPUFrameLowering.cpp ----------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 // Interface to describe a layout of a stack frame on an AMDIL target machine
10 //
11 //===----------------------------------------------------------------------===//
12 #include "AMDGPUFrameLowering.h"
13 #include "AMDGPURegisterInfo.h"
14 #include "R600MachineFunctionInfo.h"
15 #include "llvm/CodeGen/MachineFrameInfo.h"
16 #include "llvm/CodeGen/MachineRegisterInfo.h"
17 #include "llvm/IR/Instructions.h"
18
19 using namespace llvm;
20 AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
21 int LAO, unsigned TransAl)
22 : TargetFrameLowering(D, StackAl, LAO, TransAl) { }
23
24 AMDGPUFrameLowering::~AMDGPUFrameLowering() { }
25
26 unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
27
28 // XXX: Hardcoding to 1 for now.
29 //
30 // I think the StackWidth should stored as metadata associated with the
31 // MachineFunction. This metadata can either be added by a frontend, or
32 // calculated by a R600 specific LLVM IR pass.
33 //
34 // The StackWidth determines how stack objects are laid out in memory.
35 // For a vector stack variable, like: int4 stack[2], the data will be stored
36 // in the following ways depending on the StackWidth.
37 //
38 // StackWidth = 1:
39 //
40 // T0.X = stack[0].x
41 // T1.X = stack[0].y
42 // T2.X = stack[0].z
43 // T3.X = stack[0].w
44 // T4.X = stack[1].x
45 // T5.X = stack[1].y
46 // T6.X = stack[1].z
47 // T7.X = stack[1].w
48 //
49 // StackWidth = 2:
50 //
51 // T0.X = stack[0].x
52 // T0.Y = stack[0].y
53 // T1.X = stack[0].z
54 // T1.Y = stack[0].w
55 // T2.X = stack[1].x
56 // T2.Y = stack[1].y
57 // T3.X = stack[1].z
58 // T3.Y = stack[1].w
59 //
60 // StackWidth = 4:
61 // T0.X = stack[0].x
62 // T0.Y = stack[0].y
63 // T0.Z = stack[0].z
64 // T0.W = stack[0].w
65 // T1.X = stack[1].x
66 // T1.Y = stack[1].y
67 // T1.Z = stack[1].z
68 // T1.W = stack[1].w
69 return 1;
70 }
71
72 /// \returns The number of registers allocated for \p FI.
73 int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
74 int FI) const {
75 const MachineFrameInfo *MFI = MF.getFrameInfo();
76 // Start the offset at 2 so we don't overwrite work group information.
77 // XXX: We should only do this when the shader actually uses this
78 // information.
79 unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4);
80 int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
81
82 for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
83 OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i));
84 OffsetBytes += MFI->getObjectSize(i);
85 // Each register holds 4 bytes, so we must always align the offset to at
86 // least 4 bytes, so that 2 frame objects won't share the same register.
87 OffsetBytes = RoundUpToAlignment(OffsetBytes, 4);
88 }
89
90 if (FI != -1)
91 OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI));
92
93 return OffsetBytes / (getStackWidth(MF) * 4);
94 }
95
96 const TargetFrameLowering::SpillSlot *
97 AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
98 NumEntries = 0;
99 return nullptr;
100 }
101 void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF,
102 MachineBasicBlock &MBB) const {}
103 void
104 AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF,
105 MachineBasicBlock &MBB) const {
106 }
107
108 bool
109 AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const {
110 return false;
111 }
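getFrameIndexOffset() above reports offsets in register-sized (4-byte) units and reserves the first two registers for work-group information. A worked sketch of that loop for two 4-byte frame objects with StackWidth = 1 follows (the object sizes and alignments are made up for illustration); it prints register offsets 2 and 3:

#include <cstdio>

static unsigned roundUp(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  const unsigned StackWidth = 1;            // getStackWidth() is hardcoded to 1
  const unsigned Sizes[]  = {4, 4};         // two 4-byte frame objects (example)
  const unsigned Aligns[] = {4, 4};

  for (int FI = 0; FI < 2; ++FI) {
    // Start past the two registers holding work-group information.
    unsigned OffsetBytes = 2 * (StackWidth * 4);
    for (int i = 0; i < FI; ++i) {
      OffsetBytes = roundUp(OffsetBytes, Aligns[i]) + Sizes[i];
      OffsetBytes = roundUp(OffsetBytes, 4);  // objects never share a register
    }
    OffsetBytes = roundUp(OffsetBytes, Aligns[FI]);
    printf("FI %d -> register offset %u\n", FI, OffsetBytes / (StackWidth * 4));
  }
  return 0;
}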
0 //===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Interface to describe a layout of a stack frame on an AMDIL target
11 /// machine.
12 //
13 //===----------------------------------------------------------------------===//
14 #ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
15 #define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
16
17 #include "llvm/CodeGen/MachineFunction.h"
18 #include "llvm/Target/TargetFrameLowering.h"
19
20 namespace llvm {
21
22 /// \brief Information about the stack frame layout on the AMDGPU targets.
23 ///
24 /// It holds the direction of the stack growth, the known stack alignment on
25 /// entry to each function, and the offset to the locals area.
26 /// See TargetFrameInfo for more comments.
27 class AMDGPUFrameLowering : public TargetFrameLowering {
28 public:
29 AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO,
30 unsigned TransAl = 1);
31 virtual ~AMDGPUFrameLowering();
32
33 /// \returns The number of 32-bit sub-registers that are used when storing
34 /// values to the stack.
35 unsigned getStackWidth(const MachineFunction &MF) const;
36 int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
37 const SpillSlot *
38 getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
39 void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
40 void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
41 bool hasFP(const MachineFunction &MF) const override;
42 };
43 } // namespace llvm
44 #endif
0 //===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Defines an instruction selector for the AMDGPU target.
11 //
12 //===----------------------------------------------------------------------===//
13 #include "AMDGPUInstrInfo.h"
14 #include "AMDGPUISelLowering.h" // For AMDGPUISD
15 #include "AMDGPURegisterInfo.h"
16 #include "AMDGPUSubtarget.h"
17 #include "R600InstrInfo.h"
18 #include "SIDefines.h"
19 #include "SIISelLowering.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/CodeGen/FunctionLoweringInfo.h"
22 #include "llvm/CodeGen/PseudoSourceValue.h"
23 #include "llvm/CodeGen/MachineFrameInfo.h"
24 #include "llvm/CodeGen/MachineRegisterInfo.h"
25 #include "llvm/CodeGen/SelectionDAG.h"
26 #include "llvm/CodeGen/SelectionDAGISel.h"
27 #include "llvm/IR/Function.h"
28
29 using namespace llvm;
30
31 //===----------------------------------------------------------------------===//
32 // Instruction Selector Implementation
33 //===----------------------------------------------------------------------===//
34
35 namespace {
36 /// AMDGPU specific code to select AMDGPU machine instructions for
37 /// SelectionDAG operations.
38 class AMDGPUDAGToDAGISel : public SelectionDAGISel {
39 // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
40 // make the right decision when generating code for different targets.
41 const AMDGPUSubtarget *Subtarget;
42 public:
43 AMDGPUDAGToDAGISel(TargetMachine &TM);
44 virtual ~AMDGPUDAGToDAGISel();
45 bool runOnMachineFunction(MachineFunction &MF) override;
46 SDNode *Select(SDNode *N) override;
47 const char *getPassName() const override;
48 void PostprocessISelDAG() override;
49
50 private:
51 bool isInlineImmediate(SDNode *N) const;
52 bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
53 const R600InstrInfo *TII);
54 bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
55 bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
56
57 // Complex pattern selectors
58 bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
59 bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
60 bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
61
62 static bool checkType(const Value *ptr, unsigned int addrspace);
63 static bool checkPrivateAddress(const MachineMemOperand *Op);
64
65 static bool isGlobalStore(const StoreSDNode *N);
66 static bool isFlatStore(const StoreSDNode *N);
67 static bool isPrivateStore(const StoreSDNode *N);
68 static bool isLocalStore(const StoreSDNode *N);
69 static bool isRegionStore(const StoreSDNode *N);
70
71 bool isCPLoad(const LoadSDNode *N) const;
72 bool isConstantLoad(const LoadSDNode *N, int cbID) const;
73 bool isGlobalLoad(const LoadSDNode *N) const;
74 bool isFlatLoad(const LoadSDNode *N) const;
75 bool isParamLoad(const LoadSDNode *N) const;
76 bool isPrivateLoad(const LoadSDNode *N) const;
77 bool isLocalLoad(const LoadSDNode *N) const;
78 bool isRegionLoad(const LoadSDNode *N) const;
79
80 SDNode *glueCopyToM0(SDNode *N) const;
81
82 const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
83 bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
84 bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
85 SDValue& Offset);
86 bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
87 bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
88 bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
89 unsigned OffsetBits) const;
90 bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
91 bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
92 SDValue &Offset1) const;
93 void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
94 SDValue &SOffset, SDValue &Offset, SDValue &Offen,
95 SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
96 SDValue &TFE) const;
97 bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
98 SDValue &SOffset, SDValue &Offset, SDValue &GLC,
99 SDValue &SLC, SDValue &TFE) const;
100 bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
101 SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
102 SDValue &SLC) const;
103 bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
104 SDValue &SOffset, SDValue &ImmOffset) const;
105 bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
106 SDValue &Offset, SDValue &GLC, SDValue &SLC,
107 SDValue &TFE) const;
108 bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
109 SDValue &Offset, SDValue &GLC) const;
110 SDNode *SelectAddrSpaceCast(SDNode *N);
111 bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
112 bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
113 SDValue &Clamp, SDValue &Omod) const;
114
115 bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods,
116 SDValue &Omod) const;
117 bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods,
118 SDValue &Clamp,
119 SDValue &Omod) const;
120
121 SDNode *SelectADD_SUB_I64(SDNode *N);
122 SDNode *SelectDIV_SCALE(SDNode *N);
123
124 SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
125 uint32_t Offset, uint32_t Width);
126 SDNode *SelectS_BFEFromShifts(SDNode *N);
127 SDNode *SelectS_BFE(SDNode *N);
128
129 // Include the pieces autogenerated from the target description.
130 #include "AMDGPUGenDAGISel.inc"
131 };
132 } // end anonymous namespace
133
134 /// \brief This pass converts a legalized DAG into an AMDGPU-specific
135 // DAG, ready for instruction scheduling.
136 FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) {
137 return new AMDGPUDAGToDAGISel(TM);
138 }
139
140 AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
141 : SelectionDAGISel(TM) {}
142
143 bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
144 Subtarget = &static_cast<const AMDGPUSubtarget &>(MF.getSubtarget());
145 return SelectionDAGISel::runOnMachineFunction(MF);
146 }
147
148 AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
149 }
150
151 bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const {
152 const SITargetLowering *TL
153 = static_cast<const SITargetLowering *>(getTargetLowering());
154 return TL->analyzeImmediate(N) == 0;
155 }
156
157 /// \brief Determine the register class for \p OpNo
158 /// \returns The register class of the virtual register that will be used for
159 /// the given operand number \p OpNo or NULL if the register class cannot be
160 /// determined.
161 const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
162 unsigned OpNo) const {
163 if (!N->isMachineOpcode())
164 return nullptr;
165
166 switch (N->getMachineOpcode()) {
167 default: {
168 const MCInstrDesc &Desc =
169 Subtarget->getInstrInfo()->get(N->getMachineOpcode());
170 unsigned OpIdx = Desc.getNumDefs() + OpNo;
171 if (OpIdx >= Desc.getNumOperands())
172 return nullptr;
173 int RegClass = Desc.OpInfo[OpIdx].RegClass;
174 if (RegClass == -1)
175 return nullptr;
176
177 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
178 }
179 case AMDGPU::REG_SEQUENCE: {
180 unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
181 const TargetRegisterClass *SuperRC =
182 Subtarget->getRegisterInfo()->getRegClass(RCID);
183
184 SDValue SubRegOp = N->getOperand(OpNo + 1);
185 unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
186 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
187 SubRegIdx);
188 }
189 }
190 }
191
192 bool AMDGPUDAGToDAGISel::SelectADDRParam(
193 SDValue Addr, SDValue& R1, SDValue& R2) {
194
195 if (Addr.getOpcode() == ISD::FrameIndex) {
196 if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
197 R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
198 R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
199 } else {
200 R1 = Addr;
201 R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
202 }
203 } else if (Addr.getOpcode() == ISD::ADD) {
204 R1 = Addr.getOperand(0);
205 R2 = Addr.getOperand(1);
206 } else {
207 R1 = Addr;
208 R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
209 }
210 return true;
211 }
212
213 bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) {
214 if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
215 Addr.getOpcode() == ISD::TargetGlobalAddress) {
216 return false;
217 }
218 return SelectADDRParam(Addr, R1, R2);
219 }
220
221
222 bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
223 if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
224 Addr.getOpcode() == ISD::TargetGlobalAddress) {
225 return false;
226 }
227
228 if (Addr.getOpcode() == ISD::FrameIndex) {
229 if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
230 R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
231 R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
232 } else {
233 R1 = Addr;
234 R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
235 }
236 } else if (Addr.getOpcode() == ISD::ADD) {
237 R1 = Addr.getOperand(0);
238 R2 = Addr.getOperand(1);
239 } else {
240 R1 = Addr;
241 R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
242 }
243 return true;
244 }
245
246 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
247 if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
248 !checkType(cast<MemSDNode>(N)->getMemOperand()->getValue(),
249 AMDGPUAS::LOCAL_ADDRESS))
250 return N;
251
252 const SITargetLowering& Lowering =
253 *static_cast<const SITargetLowering *>(getTargetLowering());
254
255 // Write max value to m0 before each load operation
256
257 SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
258 CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
259
260 SDValue Glue = M0.getValue(1);
261
262 SmallVector<SDValue, 8> Ops;
263 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
264 Ops.push_back(N->getOperand(i));
265 }
266 Ops.push_back(Glue);
267 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
268
269 return N;
270 }
271
272 SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
273 unsigned int Opc = N->getOpcode();
274 if (N->isMachineOpcode()) {
275 N->setNodeId(-1);
276 return nullptr; // Already selected.
277 }
278
279 if (isa<MemSDNode>(N))
280 N = glueCopyToM0(N);
281
282 switch (Opc) {
283 default: break;
284 // We are selecting i64 ADD here instead of custom lower it during
285 // DAG legalization, so we can fold some i64 ADDs used for address
286 // calculation into the LOAD and STORE instructions.
287 case ISD::ADD:
288 case ISD::SUB: {
289 if (N->getValueType(0) != MVT::i64 ||
290 Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
291 break;
292
293 return SelectADD_SUB_I64(N);
294 }
295 case ISD::SCALAR_TO_VECTOR:
296 case AMDGPUISD::BUILD_VERTICAL_VECTOR:
297 case ISD::BUILD_VECTOR: {
298 unsigned RegClassID;
299 const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
300 EVT VT = N->getValueType(0);
301 unsigned NumVectorElts = VT.getVectorNumElements();
302 EVT EltVT = VT.getVectorElementType();
303 assert(EltVT.bitsEq(MVT::i32));
304 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
305 bool UseVReg = true;
306 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
307 U != E; ++U) {
308 if (!U->isMachineOpcode()) {
309 continue;
310 }
311 const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
312 if (!RC) {
313 continue;
314 }
315 if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) {
316 UseVReg = false;
317 }
318 }
319 switch(NumVectorElts) {
320 case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID :
321 AMDGPU::SReg_32RegClassID;
322 break;
323 case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
324 AMDGPU::SReg_64RegClassID;
325 break;
326 case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
327 AMDGPU::SReg_128RegClassID;
328 break;
329 case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
330 AMDGPU::SReg_256RegClassID;
331 break;
332 case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
333 AMDGPU::SReg_512RegClassID;
334 break;
335 default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
336 }
337 } else {
338 // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
339 // sequence, which adds a 128-bit register copy when going through the
340 // TwoAddressInstructions pass. We want to avoid 128-bit copies as much as
341 // possible because they can't be bundled by our scheduler.
342 switch(NumVectorElts) {
343 case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
344 case 4:
345 if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
346 RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
347 else
348 RegClassID = AMDGPU::R600_Reg128RegClassID;
349 break;
350 default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
351 }
352 }
353
354 SDLoc DL(N);
355 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
356
357 if (NumVectorElts == 1) {
358 return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT,
359 N->getOperand(0), RegClass);
360 }
361
362 assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
363 "supported yet");
364 // 16 = Max Num Vector Elements
365 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
366 // 1 = Vector Register Class
367 SmallVector RegSeqArgs(NumVectorElts * 2 + 1);
368
369 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
370 bool IsRegSeq = true;
371 unsigned NOps = N->getNumOperands();
372 for (unsigned i = 0; i < NOps; i++) {
373 // XXX: Why is this here?
374 if (isa<RegisterSDNode>(N->getOperand(i))) {
375 IsRegSeq = false;
376 break;
377 }
378 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
379 RegSeqArgs[1 + (2 * i) + 1] =
380 CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL,
381 MVT::i32);
382 }
383
384 if (NOps != NumVectorElts) {
385 // Fill in the missing undef elements if this was a scalar_to_vector.
386 assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
387
388 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
389 DL, EltVT);
390 for (unsigned i = NOps; i < NumVectorElts; ++i) {
391 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
392 RegSeqArgs[1 + (2 * i) + 1] =
393 CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32);
394 }
395 }
396
397 if (!IsRegSeq)
398 break;
399 return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
400 RegSeqArgs);
401 }
402 case ISD::BUILD_PAIR: {
403 SDValue RC, SubReg0, SubReg1;
404 if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
405 break;
406 }
407 SDLoc DL(N);
408 if (N->getValueType(0) == MVT::i128) {
409 RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
410 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
411 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
412 } else if (N->getValueType(0) == MVT::i64) {
413 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
414 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
415 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
416 } else {
417 llvm_unreachable("Unhandled value type for BUILD_PAIR");
418 }
419 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
420 N->getOperand(1), SubReg1 };
421 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
422 DL, N->getValueType(0), Ops);
423 }
424
425 case ISD::Constant:
426 case ISD::ConstantFP: {
427 if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
428 N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
429 break;
430
431 uint64_t Imm;
432 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
433 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
434 else {
435 ConstantSDNode *C = cast<ConstantSDNode>(N);
436 Imm = C->getZExtValue();
437 }
438
439 SDLoc DL(N);
440 SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
441 CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
442 MVT::i32));
443 SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
444 CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
445 const SDValue Ops[] = {
446 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
447 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
448 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
449 };
450
451 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
452 N->getValueType(0), Ops);
453 }
454
455 case ISD::LOAD: {
456 LoadSDNode *LD = cast<LoadSDNode>(N);
457 SDLoc SL(N);
458 EVT VT = N->getValueType(0);
459
460 if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) {
461 N = glueCopyToM0(N);
462 break;
463 }
464
465 // To simplify the TableGen patterns, we replace all i64 loads with
466 // v2i32 loads. Alternatively, we could promote i64 loads to v2i32
467 // during DAG legalization; however, some places in the DAG legalizer
468 // (e.g. ExpandUnalignedLoad) assume that if i64 is legal, then i64
469 // loads are too, so doing this promotion early can cause problems.
470
471 SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
472 LD->getBasePtr(), LD->getMemOperand());
473 SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
474 MVT::i64, NewLoad);
475 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
476 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
477 SDNode *Load = glueCopyToM0(NewLoad.getNode());
478 SelectCode(Load);
479 N = BitCast.getNode();
480 break;
481 }
482
483 case ISD::STORE: {
484 // Handle i64 stores here for the same reason mentioned above for loads.
485 StoreSDNode *ST = cast<StoreSDNode>(N);
486 SDValue Value = ST->getValue();
487 if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) {
488
489 SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
490 MVT::v2i32, Value);
491 SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
492 ST->getBasePtr(), ST->getMemOperand());
493
494 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
495
496 if (NewValue.getOpcode() == ISD::BITCAST) {
497 Select(NewStore.getNode());
498 return SelectCode(NewValue.getNode());
499 }
500
501 // getNode() may fold the bitcast if its input was another bitcast. If that
502 // happens we should only select the new store.
503 N = NewStore.getNode();
504 }
505
506 N = glueCopyToM0(N);
507 break;
508 }
509
510 case AMDGPUISD::REGISTER_LOAD: {
511 if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
512 break;
513 SDValue Addr, Offset;
514
515 SDLoc DL(N);
516 SelectADDRIndirect(N->getOperand(1), Addr, Offset);
517 const SDValue Ops[] = {
518 Addr,
519 Offset,
520 CurDAG->getTargetConstant(0, DL, MVT::i32),
521 N->getOperand(0),
522 };
523 return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL,
524 CurDAG->getVTList(MVT::i32, MVT::i64,
525 MVT::Other),
526 Ops);
527 }
528 case AMDGPUISD::REGISTER_STORE: {
529 if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
530 break;
531 SDValue Addr, Offset;
532 SelectADDRIndirect(N->getOperand(2), Addr, Offset);
533 SDLoc DL(N);
534 const SDValue Ops[] = {
535 N->getOperand(1),
536 Addr,
537 Offset,
538 CurDAG->getTargetConstant(0, DL, MVT::i32),
539 N->getOperand(0),
540 };
541 return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL,
542 CurDAG->getVTList(MVT::Other),
543 Ops);
544 }
545
546 case AMDGPUISD::BFE_I32:
547 case AMDGPUISD::BFE_U32: {
548 if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
549 break;
550
551 // There is a scalar version available, but unlike the vector version which
552 // has a separate operand for the offset and width, the scalar version packs
553 // the width and offset into a single operand. Try to move to the scalar
554 // version if the offsets are constant, so that we can try to keep extended
555 // loads of kernel arguments in SGPRs.
556
557 // TODO: Technically we could try to pattern match scalar bitshifts of
558 // dynamic values, but it's probably not useful.
559 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
560 if (!Offset)
561 break;
562
563 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
564 if (!Width)
565 break;
566
567 bool Signed = Opc == AMDGPUISD::BFE_I32;
568
569 uint32_t OffsetVal = Offset->getZExtValue();
570 uint32_t WidthVal = Width->getZExtValue();
571
572 return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N),
573 N->getOperand(0), OffsetVal, WidthVal);
574
575 }
576 case AMDGPUISD::DIV_SCALE: {
577 return SelectDIV_SCALE(N);
578 }
579 case ISD::CopyToReg: {
580 const SITargetLowering& Lowering =
581 *static_cast<const SITargetLowering *>(getTargetLowering());
582 Lowering.legalizeTargetIndependentNode(N, *CurDAG);
583 break;
584 }
585 case ISD::ADDRSPACECAST:
586 return SelectAddrSpaceCast(N);
587 case ISD::AND:
588 case ISD::SRL:
589 case ISD::SRA:
590 if (N->getValueType(0) != MVT::i32 ||
591 Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
592 break;
593
594 return SelectS_BFE(N);
595 }
596
597 return SelectCode(N);
598 }
599
600
601 bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) {
602 assert(AS != 0 && "Use checkPrivateAddress instead.");
603 if (!Ptr)
604 return false;
605
606 return Ptr->getType()->getPointerAddressSpace() == AS;
607 }
608
609 bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) {
610 if (Op->getPseudoValue())
611 return true;
612
613 if (PointerType *PT = dyn_cast<PointerType>(Op->getValue()->getType()))
614 return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
615
616 return false;
617 }
618
619 bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
620 return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
621 }
622
623 bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
624 const Value *MemVal = N->getMemOperand()->getValue();
625 return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
626 !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
627 !checkType(MemVal, AMDGPUAS::REGION_ADDRESS));
628 }
629
630 bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
631 return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
632 }
633
634 bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) {
635 return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
636 }
637
638 bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
639 return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
640 }
641
642 bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const {
643 const Value *MemVal = N->getMemOperand()->getValue();
644 if (CbId == -1)
645 return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS);
646
647 return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId);
648 }
649
650 bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
651 if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
652 if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
653 N->getMemoryVT().bitsLT(MVT::i32))
654 return true;
655
656 return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
657 }
658
659 bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const {
660 return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS);
661 }
662
663 bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const {
664 return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
665 }
666
667 bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const {
668 return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
669 }
670
671 bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const {
672 return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
673 }
674
675 bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
676 MachineMemOperand *MMO = N->getMemOperand();
677 if (checkPrivateAddress(N->getMemOperand())) {
678 if (MMO) {
679 const PseudoSourceValue *PSV = MMO->getPseudoValue();
680 if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
681 return true;
682 }
683 }
684 }
685 return false;
686 }
687
688 bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const {
689 if (checkPrivateAddress(N->getMemOperand())) {
690 // Check to make sure we are not a constant pool load or a constant load
691 // that is marked as a private load
692 if (isCPLoad(N) || isConstantLoad(N, -1)) {
693 return false;
694 }
695 }
696
697 const Value *MemVal = N->getMemOperand()->getValue();
698 if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
699 !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
700 !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) &&
701 !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) &&
702 !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) &&
703 !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) &&
704 !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) {
705 return true;
706 }
707 return false;
708 }
709
710 const char *AMDGPUDAGToDAGISel::getPassName() const {
711 return "AMDGPU DAG->DAG Pattern Instruction Selection";
712 }
713
714 #ifdef DEBUGTMP
715 #undef INT64_C
716 #endif
717 #undef DEBUGTMP
718
719 //===----------------------------------------------------------------------===//
720 // Complex Patterns
721 //===----------------------------------------------------------------------===//
722
723 bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
724 SDValue& IntPtr) {
725 if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
726 IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
727 true);
728 return true;
729 }
730 return false;
731 }
732
733 bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
734 SDValue& BaseReg, SDValue &Offset) {
735 if (!isa<ConstantSDNode>(Addr)) {
736 BaseReg = Addr;
737 Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
738 return true;
739 }
740 return false;
741 }
742
743 bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
744 SDValue &Offset) {
745 ConstantSDNode *IMMOffset;
746
747 if (Addr.getOpcode() == ISD::ADD
748 && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
749 && isInt<16>(IMMOffset->getZExtValue())) {
750
751 Base = Addr.getOperand(0);
752 Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
753 MVT::i32);
754 return true;
755 // If the pointer address is constant, we can move it to the offset field.
756 } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
757 && isInt<16>(IMMOffset->getZExtValue())) {
758 Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
759 SDLoc(CurDAG->getEntryNode()),
760 AMDGPU::ZERO, MVT::i32);
761 Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
762 MVT::i32);
763 return true;
764 }
765
766 // Default case, no offset
767 Base = Addr;
768 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
769 return true;
770 }
771
772 bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
773 SDValue &Offset) {
774 ConstantSDNode *C;
775 SDLoc DL(Addr);
776
777 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
778 Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
779 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
780 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
781 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
782 Base = Addr.getOperand(0);
783 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
784 } else {
785 Base = Addr;
786 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
787 }
788
789 return true;
790 }
791
792 SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
793 SDLoc DL(N);
794 SDValue LHS = N->getOperand(0);
795 SDValue RHS = N->getOperand(1);
796
797 bool IsAdd = (N->getOpcode() == ISD::ADD);
798
799 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
800 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
801
802 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
803 DL, MVT::i32, LHS, Sub0);
804 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
805 DL, MVT::i32, LHS, Sub1);
806
807 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
808 DL, MVT::i32, RHS, Sub0);
809 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
810 DL, MVT::i32, RHS, Sub1);
811
812 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
813 SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
814
815
816 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
817 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
818
819 SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs);
820 SDValue Carry(AddLo, 1);
821 SDNode *AddHi
822 = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32,
823 SDValue(Hi0, 0), SDValue(Hi1, 0), Carry);
824
825 SDValue Args[5] = {
826 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
827 SDValue(AddLo,0),
828 Sub0,
829 SDValue(AddHi,0),
830 Sub1,
831 };
832 return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
833 }
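A scalar model makes the carry chain built here easy to verify. The following is a plain C++ sketch of the add case only (the sub case with S_SUB_U32 / S_SUBB_U32 is analogous); the helper name add64ViaTwo32 is made up for illustration:

// Scalar model of the 64-bit add split performed by SelectADD_SUB_I64:
// the low halves are added with S_ADD_U32 (producing a carry in SCC) and
// the high halves with S_ADDC_U32 (consuming that carry).
#include <cassert>
#include <cstdint>

static uint64_t add64ViaTwo32(uint64_t LHS, uint64_t RHS) {
  uint32_t Lo0 = uint32_t(LHS), Hi0 = uint32_t(LHS >> 32);
  uint32_t Lo1 = uint32_t(RHS), Hi1 = uint32_t(RHS >> 32);

  uint32_t Lo = Lo0 + Lo1;              // S_ADD_U32
  uint32_t Carry = Lo < Lo0;            // SCC
  uint32_t Hi = Hi0 + Hi1 + Carry;      // S_ADDC_U32

  return (uint64_t(Hi) << 32) | Lo;     // REG_SEQUENCE sub0/sub1
}

int main() {
  assert(add64ViaTwo32(0xFFFFFFFFull, 1) == 0x100000000ull);
  assert(add64ViaTwo32(0x1234567890ull, 0x0FEDCBA980ull) ==
         0x1234567890ull + 0x0FEDCBA980ull);
  return 0;
}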
834
835 // We need to handle this here because tablegen doesn't support matching
836 // instructions with multiple outputs.
837 SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
838 SDLoc SL(N);
839 EVT VT = N->getValueType(0);
840
841 assert(VT == MVT::f32 || VT == MVT::f64);
842
843 unsigned Opc
844 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
845
846 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
847 SDValue Ops[8];
848
849 SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
850 SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
851 SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
852 return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
853 }
854
855 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
856 unsigned OffsetBits) const {
857 if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
858 (OffsetBits == 8 && !isUInt<8>(Offset)))
859 return false;
860
861 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
862 return true;
863
864 // On Southern Islands, instructions with a negative base value and an
865 // offset don't seem to work.
866 return CurDAG->SignBitIsZero(Base);
867 }
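The range check above reduces to a pair of unsigned-fit tests. A small sketch of just that check, with the Southern Islands sign-bit restriction on the base deliberately left out:

// Standalone check mirroring isDSOffsetLegal: DS instructions take an
// unsigned 16-bit byte offset (or two unsigned 8-bit dword offsets for
// read2/write2), so only offsets that fit are folded into the instruction.
#include <cstdint>
#include <cstdio>

static bool dsOffsetFits(uint64_t Offset, unsigned OffsetBits) {
  if (OffsetBits == 16)
    return Offset <= 0xFFFF;
  if (OffsetBits == 8)
    return Offset <= 0xFF;
  return false;
}

int main() {
  std::printf("%d\n", dsOffsetFits(65535, 16)); // 1: fits the offset field
  std::printf("%d\n", dsOffsetFits(65536, 16)); // 0: must stay in the base
  std::printf("%d\n", dsOffsetFits(255, 8));    // 1: fits a read2 offset
  return 0;
}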
868
869 bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
870 SDValue &Offset) const {
871 if (CurDAG->isBaseWithConstantOffset(Addr)) {
872 SDValue N0 = Addr.getOperand(0);
873 SDValue N1 = Addr.getOperand(1);
874 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
875 if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
876 // (add n0, c0)
877 Base = N0;
878 Offset = N1;
879 return true;
880 }
881 }
882
883 SDLoc DL(Addr);
884
885 // If we have a constant address, prefer to put the constant into the
886 // offset. This can save moves to load the constant address since multiple
887 // operations can share the zero base address register, and enables merging
888 // into read2 / write2 instructions.
889 if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
890 if (isUInt<16>(CAddr->getZExtValue())) {
891 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
892 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
893 DL, MVT::i32, Zero);
894 Base = SDValue(MovZero, 0);
895 Offset = Addr;
896 return true;
897 }
898 }
899
900 // default case
901 Base = Addr;
902 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
903 return true;
904 }
905
906 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
907 SDValue &Offset0,
908 SDValue &Offset1) const {
909 SDLoc DL(Addr);
910
911 if (CurDAG->isBaseWithConstantOffset(Addr)) {
912 SDValue N0 = Addr.getOperand(0);
913 SDValue N1 = Addr.getOperand(1);
914 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
915 unsigned DWordOffset0 = C1->getZExtValue() / 4;
916 unsigned DWordOffset1 = DWordOffset0 + 1;
917 // (add n0, c0)
918 if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
919 Base = N0;
920 Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
921 Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
922 return true;
923 }
924 }
925
926 if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
927 unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
928 unsigned DWordOffset1 = DWordOffset0 + 1;
929 assert(4 * DWordOffset0 == CAddr->getZExtValue());
930
931 if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
932 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
933 MachineSDNode *MovZero
934 = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
935 DL, MVT::i32, Zero);
936 Base = SDValue(MovZero, 0);
937 Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
938 Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
939 return true;
940 }
941 }
942
943 // default case
944 Base = Addr;
945 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
946 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
947 return true;
948 }
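The read2/write2 path above splits one byte offset into two consecutive dword offsets. Here is a standalone sketch of just that split; base-register selection and the non-constant cases are elided, and splitDS64Offset is a hypothetical helper:

// Sketch of the offset split in SelectDS64Bit4ByteAligned: a 4-byte-aligned
// constant byte offset becomes two consecutive dword offsets, each of which
// must fit in 8 bits.
#include <cstdint>
#include <cstdio>

static bool splitDS64Offset(uint64_t ByteOffset, uint8_t &Offset0,
                            uint8_t &Offset1) {
  uint64_t DWordOffset0 = ByteOffset / 4;
  uint64_t DWordOffset1 = DWordOffset0 + 1;
  if (DWordOffset1 > 0xFF) // both dword offsets must fit in 8 bits
    return false;
  Offset0 = uint8_t(DWordOffset0);
  Offset1 = uint8_t(DWordOffset1);
  return true;
}

int main() {
  uint8_t O0, O1;
  std::printf("%d\n", splitDS64Offset(1016, O0, O1)); // 1: offsets 254, 255
  std::printf("%d\n", splitDS64Offset(1020, O0, O1)); // 0: 256 overflows
  return 0;
}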
949
950 static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
951 return isUInt<12>(Imm->getZExtValue());
952 }
953
954 void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
955 SDValue &VAddr, SDValue &SOffset,
956 SDValue &Offset, SDValue &Offen,
957 SDValue &Idxen, SDValue &Addr64,
958 SDValue &GLC, SDValue &SLC,
959 SDValue &TFE) const {
960 SDLoc DL(Addr);
961
962 GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
963 SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
964 TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
965
966 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
967 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
968 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
969 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
970
971 if (CurDAG->isBaseWithConstantOffset(Addr)) {
972 SDValue N0 = Addr.getOperand(0);
973 SDValue N1 = Addr.getOperand(1);
974 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
975
976 if (N0.getOpcode() == ISD::ADD) {
977 // (add (add N2, N3), C1) -> addr64
978 SDValue N2 = N0.getOperand(0);
979 SDValue N3 = N0.getOperand(1);
980 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
981 Ptr = N2;
982 VAddr = N3;
983 } else {
984
985 // (add N0, C1) -> offset
986 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
987 Ptr = N0;
988 }
989
990 if (isLegalMUBUFImmOffset(C1)) {
991 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
992 return;
993 } else if (isUInt<32>(C1->getZExtValue())) {
994 // Illegal offset, store it in soffset.
995 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
996 SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
997 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
998 0);
999 return;
1000 }
1001 }
1002
1003 if (Addr.getOpcode() == ISD::ADD) {
1004 // (add N0, N1) -> addr64
1005 SDValue N0 = Addr.getOperand(0);
1006 SDValue N1 = Addr.getOperand(1);
1007 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1008 Ptr = N0;
1009 VAddr = N1;
1010 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1011 return;
1012 }
1013
1014 // default case -> offset
1015 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1016 Ptr = Addr;
1017 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1018
1019 }
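The constant-offset handling in SelectMUBUF boils down to a three-way classification. A simplified sketch follows; the addr64/vaddr selection and resource-descriptor setup are elided, and the enum and helper are made up for illustration:

// Sketch of the MUBUF offset split in SelectMUBUF / isLegalMUBUFImmOffset:
// a constant that fits the 12-bit immediate field is encoded directly,
// otherwise it is materialized into soffset with S_MOV_B32.
#include <cstdint>
#include <cstdio>

enum class MUBUFOffsetKind { Immediate, SOffsetRegister, Unsupported };

static MUBUFOffsetKind classifyOffset(uint64_t Offset) {
  if (Offset < (1u << 12))
    return MUBUFOffsetKind::Immediate;        // 12-bit offset field
  if (Offset <= 0xFFFFFFFFull)
    return MUBUFOffsetKind::SOffsetRegister;  // S_MOV_B32 into soffset
  return MUBUFOffsetKind::Unsupported;
}

int main() {
  std::printf("%d\n", (int)classifyOffset(4095));        // Immediate
  std::printf("%d\n", (int)classifyOffset(4096));        // SOffsetRegister
  std::printf("%d\n", (int)classifyOffset(1ull << 40));  // Unsupported
  return 0;
}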
1020
1021 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1022 SDValue &VAddr, SDValue &SOffset,
1023 SDValue &Offset, SDValue &GLC,
1024 SDValue &SLC, SDValue &TFE) const {
1025 SDValue Ptr, Offen, Idxen, Addr64;
1026
1027 SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
1028 GLC, SLC, TFE);
1029
1030 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1031 if (C->getSExtValue()) {
1032 SDLoc DL(Addr);
1033
1034 const SITargetLowering& Lowering =
1035 *static_cast<const SITargetLowering *>(getTargetLowering());
1036
1037 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1038 return true;
1039 }
1040
1041 return false;
1042 }
1043
1044 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1045 SDValue &VAddr, SDValue &SOffset,
1046 SDValue &Offset,
1047 SDValue &SLC) const {
1048 SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
1049 SDValue GLC, TFE;
1050
1051 return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
1052 }
1053
1054 bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
1055 SDValue &VAddr, SDValue &SOffset,
1056 SDValue &ImmOffset) const {
1057
1058 SDLoc DL(Addr);
1059 MachineFunction &MF = CurDAG->getMachineFunction();
1060 const SIRegisterInfo *TRI =
1061 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1062 MachineRegisterInfo &MRI = MF.getRegInfo();
1063 const SITargetLowering& Lowering =
1064 *static_cast<const SITargetLowering *>(getTargetLowering());
1065
1066 unsigned ScratchOffsetReg =
1067 TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
1068 Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
1069 ScratchOffsetReg, MVT::i32);
1070 SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32);
1071 SDValue ScratchRsrcDword0 =
1072 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0);
1073
1074 SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32);
1075 SDValue ScratchRsrcDword1 =
1076 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0);
1077
1078 const SDValue RsrcOps[] = {
1079 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
1080 ScratchRsrcDword0,
1081 CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
1082 ScratchRsrcDword1,
1083 CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
1084 };
1085 SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1086 MVT::v2i32, RsrcOps), 0);
1087 Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
1088 SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
1089 MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
1090
1091 // (add n0, c1)
1092 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1093 SDValue N1 = Addr.getOperand(1);
1094 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1095
1096 if (isLegalMUBUFImmOffset(C1)) {
1097 VAddr = Addr.getOperand(0);
1098 ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1099 return true;
1100 }
1101 }
1102
1103 // (node)
1104 VAddr = Addr;
1105 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1106 return true;
1107 }
1108
1109 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1110 SDValue &SOffset, SDValue &Offset,
1111 SDValue &GLC, SDValue &SLC,
1112 SDValue &TFE) const {
1113 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1114 const SIInstrInfo *TII =
1115 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
1116
1117 SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
1118 GLC, SLC, TFE);
1119
1120 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1121 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1122 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1123 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1124 APInt::getAllOnesValue(32).getZExtValue(); // Size
1125 SDLoc DL(Addr);
1126
1127 const SITargetLowering& Lowering =
1128 *static_cast<const SITargetLowering *>(getTargetLowering());
1129
1130 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1131 return true;
1132 }
1133 return false;
1134 }
1135
1136 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1137 SDValue &Soffset, SDValue &Offset,
1138 SDValue &GLC) const {
1139 SDValue SLC, TFE;
1140
1141 return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
1142 }
1143
1144 // FIXME: This is incorrect and only enough to be able to compile.
1145 SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
1146 AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
1147 SDLoc DL(N);
1148
1149 assert(Subtarget->hasFlatAddressSpace() &&
1150 "addrspacecast only supported with flat address space!");
1151
1152 assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
1153 ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) &&
1154 "Cannot cast address space to / from constant address!");
1155
1156 assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
1157 ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
1158 "Can only cast to / from flat address space!");
1159
1160 // The flat instructions read the address as the index of the VGPR holding
1161 // the address, so casting is just a reinterpretation of the base VGPR;
1162 // insert a trunc / bitcast / zext as needed.
1163
1164 SDValue Src = ASC->getOperand(0);
1165 EVT DestVT = ASC->getValueType(0);
1166 EVT SrcVT = Src.getValueType();
1167
1168 unsigned SrcSize = SrcVT.getSizeInBits();
1169 unsigned DestSize = DestVT.getSizeInBits();
1170
1171 if (SrcSize > DestSize) {
1172 assert(SrcSize == 64 && DestSize == 32);
1173 return CurDAG->getMachineNode(
1174 TargetOpcode::EXTRACT_SUBREG,
1175 DL,
1176 DestVT,
1177 Src,
1178 CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32));
1179 }
1180
1181
1182 if (DestSize > SrcSize) {
1183 assert(SrcSize == 32 && DestSize == 64);
1184
1185 // FIXME: This is probably wrong, we should never be defining
1186 // a register class with both VGPRs and SGPRs
1187 SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL,
1188 MVT::i32);
1189
1190 const SDValue Ops[] = {
1191 RC,
1192 Src,
1193 CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
1194 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
1195 CurDAG->getConstant(0, DL, MVT::i32)), 0),
1196 CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
1197 };
1198
1199 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
1200 DL, N->getValueType(0), Ops);
1201 }
1202
1203 assert(SrcSize == 64 && DestSize == 64);
1204 return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode();
1205 }
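On the value level, the three cases above behave like a truncation, a zero extension, and a no-op. A tiny scalar model of the first two (hypothetical helper names, no SelectionDAG involved):

// Scalar model of the addrspacecast handling in SelectAddrSpaceCast: a
// wider-to-narrower cast keeps the low 32 bits (EXTRACT_SUBREG sub0) and a
// narrower-to-wider cast zero-extends (REG_SEQUENCE with a zero high half).
#include <cassert>
#include <cstdint>

static uint32_t castTo32(uint64_t Addr64) { return uint32_t(Addr64); }
static uint64_t castTo64(uint32_t Addr32) { return uint64_t(Addr32); }

int main() {
  assert(castTo32(0x0000000112345678ull) == 0x12345678u); // keep sub0
  assert(castTo64(0x12345678u) == 0x0000000012345678ull); // zero high half
  return 0;
}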
1206
1207 SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
1208 uint32_t Offset, uint32_t Width) {
1209 // Transformation function, pack the offset and width of a BFE into
1210 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1211 // source, bits [5:0] contain the offset and bits [22:16] the width.
1212 uint32_t PackedVal = Offset | (Width << 16);
1213 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
1214
1215 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
1216 }
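A worked example of the packed operand helps when reading S_BFE disassembly. A minimal sketch of the packing described in the comment above (packBFE is a made-up helper name):

// Worked example of the S_BFE packed operand built in getS_BFE:
// bits [5:0] hold the offset and bits [22:16] the width.
#include <cassert>
#include <cstdint>

static uint32_t packBFE(uint32_t Offset, uint32_t Width) {
  return Offset | (Width << 16);
}

int main() {
  // Extract an 8-bit field starting at bit 3.
  uint32_t Packed = packBFE(3, 8);
  assert(Packed == 0x00080003u);
  // Round-trip the fields the way the hardware decodes them.
  assert((Packed & 0x3F) == 3);          // offset, bits [5:0]
  assert(((Packed >> 16) & 0x7F) == 8);  // width, bits [22:16]
  return 0;
}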
1217
1218 SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
1219 // "(a << b) srl c" ---> "BFE_U32 a, (c - b), (32 - c)"
1220 // "(a << b) sra c" ---> "BFE_I32 a, (c - b), (32 - c)"
1221 // Predicate: 0 < b <= c < 32
1222
1223 const SDValue &Shl = N->getOperand(0);
1224 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
1225 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
1226
1227 if (B && C) {
1228 uint32_t BVal = B->getZExtValue();
1229 uint32_t CVal = C->getZExtValue();
1230
1231 if (0 < BVal && BVal <= CVal && CVal < 32) {
1232 bool Signed = N->getOpcode() == ISD::SRA;
1233 unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
1234
1235 return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0),
1236 CVal - BVal, 32 - CVal);
1237 }
1238 }
1239 return SelectCode(N);
1240 }
1241
1242 SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
1243 switch (N->getOpcode()) {
1244 case ISD::AND:
1245 if (N->getOperand(0).getOpcode() == ISD::SRL) {
1246 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
1247 // Predicate: isMask(mask)
1248 const SDValue &Srl = N->getOperand(0);
1249 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
1250 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
1251
1252 if (Shift && Mask) {
1253 uint32_t ShiftVal = Shift->getZExtValue();
1254 uint32_t MaskVal = Mask->getZExtValue();
1255
1256 if (isMask_32(MaskVal)) {
1257 uint32_t WidthVal = countPopulation(MaskVal);
1258
1259 return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0),
1260 ShiftVal, WidthVal);
1261 }
1262 }
1263 }
1264 break;
1265 case ISD::SRL:
1266 if (N->getOperand(0).getOpcode() == ISD::AND) {
1267 // "(a & mask) srl b" ---> "BFE_U32 a, b, popcount(mask >> b)"
1268 // Predicate: isMask(mask >> b)
1269 const SDValue &And = N->getOperand(0);
1270 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
1271 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
1272
1273 if (Shift && Mask) {
1274 uint32_t ShiftVal = Shift->getZExtValue();
1275 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
1276
1277 if (isMask_32(MaskVal)) {
1278 uint32_t WidthVal = countPopulation(MaskVal);
1279
1280 return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0),
1281 ShiftVal, WidthVal);
1282 }
1283 }
1284 } else if (N->getOperand(0).getOpcode() == ISD::SHL)
1285 return SelectS_BFEFromShifts(N);
1286 break;
1287 case ISD::SRA:
1288 if (N->getOperand(0).getOpcode() == ISD::SHL)
1289 return SelectS_BFEFromShifts(N);
1290 break;
1291 }
1292
1293 return SelectCode(N);
1294 }
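The rewrites above preserve the value of the original shift-and-mask expressions. A scalar sketch that checks two of them against a reference bitfield extract; bfeU32 is a hypothetical model of the unsigned extract, with only a simple guard for the width-0 and full-width cases:

// Scalar model of the (srl + and) -> BFE_U32 rewrite in SelectS_BFE:
// "(a >> b) & mask" with a contiguous low mask becomes a bitfield extract
// of popcount(mask) bits starting at bit b.
#include <cassert>
#include <cstdint>

static uint32_t bfeU32(uint32_t Val, uint32_t Offset, uint32_t Width) {
  if (Width == 0)
    return 0;
  uint32_t Mask = (Width < 32) ? ((1u << Width) - 1) : ~0u;
  return (Val >> Offset) & Mask;
}

int main() {
  uint32_t A = 0xDEADBEEF;
  // (A >> 8) & 0xFF == BFE_U32 A, offset=8, width=popcount(0xFF)=8
  assert(((A >> 8) & 0xFF) == bfeU32(A, 8, 8));
  // (A & 0xFFF0) >> 4 == BFE_U32 A, offset=4, width=popcount(0xFFF0 >> 4)=12
  assert(((A & 0xFFF0) >> 4) == bfeU32(A, 4, 12));
  return 0;
}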
1295
1296 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
1297 SDValue &SrcMods) const {
1298
1299 unsigned Mods = 0;
1300
1301 Src = In;
1302
1303 if (Src.getOpcode() == ISD::FNEG) {
1304 Mods |= SISrcMods::NEG;
1305 Src = Src.getOperand(0);
1306 }
1307
1308 if (Src.getOpcode() == ISD::FABS) {
1309 Mods |= SISrcMods::ABS;
1310 Src = Src.getOperand(0);
1311 }
1312
1313 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
1314
1315 return true;
1316 }
1317
1318 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
1319 SDValue &SrcMods, SDValue &Clamp,
1320 SDValue &Omod) const {
1321 SDLoc DL(In);
1322 // FIXME: Handle Clamp and Omod
1323 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32);
1324 Omod = CurDAG->getTargetConstant(0, DL, MVT::i32);
1325
1326 return SelectVOP3Mods(In, Src, SrcMods);
1327 }
1328
1329 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src,
1330 SDValue &SrcMods,
1331 SDValue &Omod) const {
1332 // FIXME: Handle Omod
1333 Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
1334
1335 return SelectVOP3Mods(In, Src, SrcMods);
1336 }
1337
1338 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
1339 SDValue &SrcMods,
1340 SDValue &Clamp,
1341 SDValue &Omod) const {
1342 Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
1343 return SelectVOP3Mods(In, Src, SrcMods);
1344 }
1345
1346 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
1347 const AMDGPUTargetLowering& Lowering =
1348 *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
1349 bool IsModified = false;
1350 do {
1351 IsModified = false;
1352 // Go over all selected nodes and try to fold them a bit more
1353 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
1354 E = CurDAG->allnodes_end(); I != E; ++I) {
1355
1356 SDNode *Node = I;
1357
1358 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
1359 if (!MachineNode)
1360 continue;
1361
1362 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
1363 if (ResNode != Node) {
1364 ReplaceUses(Node, ResNode);
1365 IsModified = true;
1366 }
1367 }
1368 CurDAG->RemoveDeadNodes();
1369 } while (IsModified);
1370 }
0 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief This is the parent TargetLowering class for hardware code gen
11 /// targets.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "AMDGPUISelLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUFrameLowering.h"
18 #include "AMDGPUIntrinsicInfo.h"
19 #include "AMDGPURegisterInfo.h"
20 #include "AMDGPUSubtarget.h"
21 #include "R600MachineFunctionInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "llvm/CodeGen/CallingConvLower.h"
24 #include "llvm/CodeGen/MachineFunction.h"
25 #include "llvm/CodeGen/MachineRegisterInfo.h"
26 #include "llvm/CodeGen/SelectionDAG.h"
27 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
28 #include "llvm/IR/DataLayout.h"
29 #include "llvm/IR/DiagnosticInfo.h"
30 #include "llvm/IR/DiagnosticPrinter.h"
31
32 using namespace llvm;
33
34 namespace {
35
36 /// Diagnostic information for unimplemented or unsupported feature reporting.
37 class DiagnosticInfoUnsupported : public DiagnosticInfo {
38 private:
39 const Twine &Description;
40 const Function &Fn;
41
42 static int KindID;
43
44 static int getKindID() {
45 if (KindID == 0)
46 KindID = llvm::getNextAvailablePluginDiagnosticKind();
47 return KindID;
48 }
49
50 public:
51 DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
52 DiagnosticSeverity Severity = DS_Error)
53 : DiagnosticInfo(getKindID(), Severity),
54 Description(Desc),
55 Fn(Fn) { }
56
57 const Function &getFunction() const { return Fn; }
58 const Twine &getDescription() const { return Description; }
59
60 void print(DiagnosticPrinter &DP) const override {
61 DP << "unsupported " << getDescription() << " in " << Fn.getName();
62 }
63
64 static bool classof(const DiagnosticInfo *DI) {
65 return DI->getKind() == getKindID();
66 }
67 };
68
69 int DiagnosticInfoUnsupported::KindID = 0;
70 }
71
72
73 static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
74 CCValAssign::LocInfo LocInfo,
75 ISD::ArgFlagsTy ArgFlags, CCState &State) {
76 unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
77 ArgFlags.getOrigAlign());
78 State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
79
80 return true;
81 }
82
83 #include "AMDGPUGenCallingConv.inc"
84
85 // Find a larger type to do a load / store of a vector with.
86 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
87 unsigned StoreSize = VT.getStoreSizeInBits();
88 if (StoreSize <= 32)
89 return EVT::getIntegerVT(Ctx, StoreSize);
90
91 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
92 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
93 }
94
95 // Type for a vector that will be loaded to.
96 EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
97 unsigned StoreSize = VT.getStoreSizeInBits();
98 if (StoreSize <= 32)
99 return EVT::getIntegerVT(Ctx, 32);
100
101 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
102 }
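The effect of these two helpers is easiest to see on a few concrete store sizes. A sketch over plain bit widths (the EVT/LLVMContext machinery is elided; describeEquivalentMemType is a made-up name):

// Sketch of the mapping done by getEquivalentMemType: store sizes of at
// most 32 bits become a single integer of that width, larger sizes become
// a vector of i32. getEquivalentLoadRegType differs only in widening the
// small case to a full i32.
#include <cstdio>

static void describeEquivalentMemType(unsigned StoreSizeInBits) {
  if (StoreSizeInBits <= 32)
    std::printf("%u bits -> i%u\n", StoreSizeInBits, StoreSizeInBits);
  else
    std::printf("%u bits -> v%ui32\n", StoreSizeInBits, StoreSizeInBits / 32);
}

int main() {
  describeEquivalentMemType(16);   // e.g. an f16 store  -> i16
  describeEquivalentMemType(64);   // e.g. an f64 store  -> v2i32
  describeEquivalentMemType(128);  // e.g. a v4f32 store -> v4i32
  return 0;
}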
103
104 AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
105 const AMDGPUSubtarget &STI)
106 : TargetLowering(TM), Subtarget(&STI) {
107 setOperationAction(ISD::Constant, MVT::i32, Legal);
108 setOperationAction(ISD::Constant, MVT::i64, Legal);
109 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
110 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
111
112 setOperationAction(ISD::BR_JT, MVT::Other, Expand);
113 setOperationAction(ISD::BRIND, MVT::Other, Expand);
114
115 // We need to custom lower some of the intrinsics
116 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
117
118 // Library functions. These default to Expand, but we have instructions
119 // for them.
120 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
121 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
122 setOperationAction(ISD::FPOW, MVT::f32, Legal);
123 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
124 setOperationAction(ISD::FABS, MVT::f32, Legal);
125 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
126 setOperationAction(ISD::FRINT, MVT::f32, Legal);
127 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
128 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
129 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
130
131 setOperationAction(ISD::FROUND, MVT::f32, Custom);
132 setOperationAction(ISD::FROUND, MVT::f64, Custom);
133
134 setOperationAction(ISD::FREM, MVT::f32, Custom);
135 setOperationAction(ISD::FREM, MVT::f64, Custom);
136
137 // v_mad_f32 does not support denormals according to some sources.
138 if (!Subtarget->hasFP32Denormals())
139 setOperationAction(ISD::FMAD, MVT::f32, Legal);
140
141 // Expand to fneg + fadd.
142 setOperationAction(ISD::FSUB, MVT::f64, Expand);
143
144 // Lower floating point store/load to integer store/load to reduce the number
145 // of patterns in tablegen.
146 setOperationAction(ISD::STORE, MVT::f32, Promote);
147 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
148
149 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
150 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
151
152 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
153 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
154
155 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
156 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
157
158 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
159 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
160
161 setOperationAction(ISD::STORE, MVT::f64, Promote);
162 AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
163
164 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
165 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64);
166
167 // Custom lowering of vector stores is required for local address space
168 // stores.
169 setOperationAction(ISD::STORE, MVT::v4i32, Custom);
170
171 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
172 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
173 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
174
175 // XXX: This can be changed to Custom, once ExpandVectorStores can
176 // handle 64-bit stores.
177 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
178
179 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
180 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
181 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
182 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
183 setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);
184
185
186 setOperationAction(ISD::LOAD, MVT::f32, Promote);
187 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
188
189 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
190 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
191
192 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
193 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
194
195 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
196 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
197
198 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
199 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
200
201 setOperationAction(ISD::LOAD, MVT::f64, Promote);
202 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
203
204 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
205 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64);
206
207 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
208 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
209 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
210 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
211 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
212 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
213 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
214 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
215 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
216 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
217
218 // There are no 64-bit extloads. These should be done as a 32-bit extload and
219 // an extension to 64-bit.
220 for (MVT VT : MVT::integer_valuetypes()) {
221 setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
222 setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
223 setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
224 }
225
226 for (MVT VT : MVT::integer_vector_valuetypes()) {
227 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
228 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
229 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
230 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
231 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
232 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
233 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
234 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
235 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
237 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
238 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
239 }
240
241 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
242
243 if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
244 setOperationAction(ISD::FCEIL, MVT::f64, Custom);
245 setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
246 setOperationAction(ISD::FRINT, MVT::f64, Custom);
247 setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
248 }
249
250 if (!Subtarget->hasBFI()) {
251 // fcopysign can be done in a single instruction with BFI.
252 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
253 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
254 }
255
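// Half-precision (f16) values are only reached through conversions here:
// f16 extending loads, f16 truncating stores, and the f64 FP16_TO_FP
// conversion are all expanded below.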
256 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
257
258 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
259 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
260 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
261 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
262
263 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
264 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
265 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
266 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
267
268 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
269 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
270 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
271 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
272
273 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
274 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
275
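// Scalar integer division, remainder, and the wide-multiply and bit-count
// variants below are expanded or custom lowered rather than matched to
// single instructions.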
276 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
277 for (MVT VT : ScalarIntVTs) {
278 setOperationAction(ISD::SREM, VT, Expand);
279 setOperationAction(ISD::SDIV, VT, Expand);
280
281 // The GPU does not have a divrem instruction for signed or unsigned operands.
282 setOperationAction(ISD::SDIVREM, VT, Custom);
283 setOperationAction(ISD::UDIVREM, VT, Custom);
284
285 // The GPU does not have [S|U]MUL_LOHI as single instructions.
286 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
287 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
288
289 setOperationAction(ISD::BSWAP, VT, Expand);
290 setOperationAction(ISD::CTTZ, VT, Expand);
291 setOperationAction(ISD::CTLZ, VT, Expand);
292 }
293
294 if (!Subtarget->hasBCNT(32))
295 setOperationAction(ISD::CTPOP, MVT::i32, Expand);
296
297 if (!Subtarget->hasBCNT(64))
298 setOperationAction(ISD::CTPOP, MVT::i64, Expand);
299
300 // The hardware supports 32-bit ROTR, but not ROTL.
301 setOperationAction(ISD::ROTL, MVT::i32, Expand);
302 setOperationAction(ISD::ROTL, MVT::i64, Expand);
303 setOperationAction(ISD::ROTR, MVT::i64, Expand);
304
305 setOperationAction(ISD::MUL, MVT::i64, Expand);
306 setOperationAction(ISD::MULHU, MVT::i64, Expand);
307 setOperationAction(ISD::MULHS, MVT::i64, Expand);
308 setOperationAction(ISD::UDIV, MVT::i32, Expand);
309 setOperationAction(ISD::UREM, MVT::i32, Expand);
310 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
311 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
312 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
313 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
314 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
315
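// 32-bit signed and unsigned min/max are selected directly (Legal).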
316 setOperationAction(ISD::SMIN, MVT::i32, Legal);
317 setOperationAction(ISD::UMIN, MVT::i32, Legal);
318 setOperationAction(ISD::SMAX, MVT::i32, Legal);
319 setOperationAction(ISD::UMAX, MVT::i32, Legal);
320
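// The *_ZERO_UNDEF bit-count forms can use the FFBH/FFBL instructions when
// the subtarget provides them; otherwise they are expanded.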
321 if (!Subtarget->hasFFBH())
322 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
323
324 if (!Subtarget->hasFFBL())
325 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
326
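// Integer vector operations largely fall back to the scalar paths: Expand
// typically unrolls them element by element, while SDIVREM/UDIVREM again get
// custom lowering, as in the scalar case.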
327 static const MVT::SimpleValueType VectorIntTypes[] = {
328 MVT::v2i32, MVT::v4i32
329 };
330
331 for (MVT VT : VectorIntTypes) {
332 // Expand the following operations for the current type by default.
333 setOperationAction(ISD::ADD, VT, Expand);
334 setOperationAction(ISD::AND, VT, Expand);
335 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
336 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
337 setOperationAction(ISD::MUL, VT, Expand);
338 setOperationAction(ISD::OR, VT, Expand);
339 setOperationAction(ISD::SHL, VT, Expand);
340 setOperationAction(ISD::SRA, VT, Expand);
341 setOperationAction(ISD::SRL, VT, Expand);
342 setOperationAction(ISD::ROTL, VT, Expand);
343 setOperationAction(ISD::ROTR, VT, Expand);
344 setOperationAction(ISD::SUB, VT, Expand);
345 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
346 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
347 setOperationAction(ISD::SDIV, VT, Expand);
348 setOperationAction(ISD::UDIV, VT, Expand);
349 setOperationAction(ISD::SREM, VT, Expand);
350 setOperationAction(ISD::UREM, VT, Expand);
351 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
352 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
353 setOperationAction(ISD::SDIVREM, VT, Custom);
354 setOperationAction(ISD::UDIVREM, VT, Custom);
355 setOperationAction(ISD::ADDC, VT, Expand);
356 setOperationAction(ISD::SUBC, VT, Expand);
357 setOperationAction(ISD::ADDE, VT, Expand);
358 setOperationAction(ISD::SUBE, VT, Expand);
359 setOperationAction(ISD::SELECT, VT, Expand);
360 setOperationAction(ISD::VSELECT, VT, Expand);
361 setOperationAction(ISD::SELECT_CC, VT, Expand);
362 setOperationAction(ISD::XOR, VT, Expand);