llvm.org GIT mirror llvm / 737edaf
Merging r260651: ------------------------------------------------------------------------ r260651 | Matthew.Arsenault | 2016-02-11 18:40:47 -0800 (Thu, 11 Feb 2016) | 7 lines AMDGPU: Set element_size in private resource descriptor Introduce a subtarget feature for this, and leave the default with the current behavior which assumes up to 16-byte loads/stores can be used. The field also seems to have the ability to be set to 2 bytes, but I'm not sure what that would be used for. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@271679 91177308-0d34-0410-b5e6-96231b3b80d8 Tom Stellard 3 years ago
11 changed file(s) with 65 addition(s) and 11 deletion(s). Raw diff Collapse all Expand all
204204 "CIInsts",
205205 "true",
206206 "Additional intstructions for CI+">;
207
// Subtarget feature describing the maximum element size, in bytes, that may be
// used for private accesses. Instantiating it with <size> defines the feature
// string "max-private-element-size-<size>" and sets the subtarget's
// MaxPrivateElementSize field to <size>.
class FeatureMaxPrivateElementSize<int size> : SubtargetFeature<
  "max-private-element-size-"#size,
  "MaxPrivateElementSize",
  // SubtargetFeature takes the field value as a string, so convert the int.
  !cast<string>(size),
  "Maximum private access size may be "#size
>;

// Concrete features for the supported element sizes.
def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
207218
208219 // Dummy feature used to disable assembler instructions.
209220 def FeatureDisable : SubtargetFeature<"",
592592 }
593593 }
594594
595 // This is supposed to be log2(Size)
596 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
597 switch (Size) {
598 case 4:
599 return AMD_ELEMENT_4_BYTES;
600 case 8:
601 return AMD_ELEMENT_8_BYTES;
602 case 16:
603 return AMD_ELEMENT_16_BYTES;
604 default:
605 llvm_unreachable("invalid private_element_size");
606 }
607 }
608
595609 void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
596610 const SIProgramInfo &KernelInfo) const {
597611 const SIMachineFunctionInfo *MFI = MF.getInfo();
604618 KernelInfo.ComputePGMRSrc1 |
605619 (KernelInfo.ComputePGMRSrc2 << 32);
606620 header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
621
622
623 AMD_HSA_BITS_SET(header.code_properties,
624 AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
625 getElementByteSizeValue(STM.getMaxPrivateElementSize()));
607626
608627 if (MFI->hasPrivateSegmentBuffer()) {
609628 header.code_properties |=
5757 FP32Denormals = false;
5858 FP64Denormals = false;
5959 }
60
61 // Set defaults if needed.
62 if (MaxPrivateElementSize == 0)
63 MaxPrivateElementSize = 16;
64
6065 return *this;
6166 }
6267
7176 EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false),
7277 EnableXNACK(false),
7378 WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
79 MaxPrivateElementSize(0),
7480 EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
7581 GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
7682 IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
7878 unsigned WavefrontSize;
7979 bool CFALUBug;
8080 int LocalMemorySize;
81 unsigned MaxPrivateElementSize;
8182 bool EnableVGPRSpilling;
8283 bool SGPRInitBug;
8384 bool IsGCN;
242243 return LocalMemorySize;
243244 }
244245
246 unsigned getMaxPrivateElementSize() const {
247 return MaxPrivateElementSize;
248 }
249
245250 bool hasSGPRInitBug() const {
246251 return SGPRInitBug;
247252 }
4242 AMD_CODE_VERSION_MAJOR = 0,
4343 AMD_CODE_VERSION_MINOR = 1
4444 };
45
// Sets val bits for specified mask in specified dst packed instance.
//
// `mask` must be a plain identifier for which `mask##_SHIFT` is also defined.
// The expansion is wrapped in do { } while (0) so that both updates form one
// statement; this keeps the macro correct under an unbraced if/else.
// Note: `dst` is evaluated twice, so it must not have side effects.
#define AMD_HSA_BITS_SET(dst, mask, val)                                       \
  do {                                                                         \
    (dst) &= (~(1 << mask##_SHIFT) & ~(mask));                                 \
    (dst) |= (((val) << mask##_SHIFT) & (mask));                               \
  } while (0)
50
// Gets bits for specified mask from specified src packed instance.
//
// `mask` must be a plain identifier for which `mask##_SHIFT` is also defined.
// `src` is parenthesized so that compound expressions (e.g. `a | b`) are
// masked as a whole rather than binding to `&` by operator precedence.
#define AMD_HSA_BITS_GET(src, mask)                                            \
  (((src) & (mask)) >> mask##_SHIFT)
4554
4655 /// The values used to define the number of bytes to use for the
4756 /// swizzle element size.
31193119 AMDGPU::RSRC_TID_ENABLE |
31203120 0xffffffff; // Size;
31213121
3122 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
3123
3124 Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT);
3125
31223126 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
31233127 // Clear them unless we want a huge stride.
31243128 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
497497
498498 const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
499499 const uint64_t RSRC_TID_ENABLE = 1LL << 55;
500
500 const uint64_t RSRC_ELEMENT_SIZE_SHIFT = 51;
501501 } // End namespace AMDGPU
502502
503503 namespace SI {
99 ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1010 ; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1111 ; GCN: s_mov_b32 s10, -1
12 ; CI: s_mov_b32 s11, 0x80f000
13 ; VI: s_mov_b32 s11, 0x800000
12 ; CI: s_mov_b32 s11, 0x98f000
13 ; VI: s_mov_b32 s11, 0x980000
1414
1515
1616 ; GCNHSA: .amd_kernel_code_t
44 ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
55 ; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
66 ; GCN: s_mov_b32 s10, -1
7 ; CI: s_mov_b32 s11, 0x80f000
8 ; VI: s_mov_b32 s11, 0x800000
7 ; CI: s_mov_b32 s11, 0x98f000
8 ; VI: s_mov_b32 s11, 0x980000
99
1010 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
1111 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
2525 ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2626 ; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2727 ; GCN: s_mov_b32 s10, -1
28 ; CI: s_mov_b32 s11, 0x80f000
29 ; VI: s_mov_b32 s11, 0x800000
28 ; CI: s_mov_b32 s11, 0x98f000
29 ; VI: s_mov_b32 s11, 0x980000
3030
3131 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
3232 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
2020 ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2121 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2222 ; GCN-NEXT: s_mov_b32 s14, -1
23 ; SI-NEXT: s_mov_b32 s15, 0x80f000
24 ; VI-NEXT: s_mov_b32 s15, 0x800000
23 ; SI-NEXT: s_mov_b32 s15, 0x98f000
24 ; VI-NEXT: s_mov_b32 s15, 0x980000
2525
2626
2727 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
1313 ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1414 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1515 ; GCN-NEXT: s_mov_b32 s14, -1
16 ; SI-NEXT: s_mov_b32 s15, 0x80f000
17 ; VI-NEXT: s_mov_b32 s15, 0x800000
16 ; SI-NEXT: s_mov_b32 s15, 0x98f000
17 ; VI-NEXT: s_mov_b32 s15, 0x980000
1818
1919 ; s12 is offset user SGPR
2020 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill