llvm.org GIT mirror llvm / e06c315
[AMDGPU] use v32f32 for 3 mfma intrinsics
These should really use v32f32, but were defined as v32i32 due to the lack of the v32f32 type.
Differential Revision: https://reviews.llvm.org/D64667
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@365972 91177308-0d34-0410-b5e6-96231b3b80d8
Stanislav Mekhanoshin a month ago
11 changed file(s) with 95 addition(s) and 68 deletion(s). Raw diff Collapse all Expand all
260260 def llvm_v4f32_ty : LLVMType<v4f32>; // 4 x float
261261 def llvm_v8f32_ty : LLVMType<v8f32>; // 8 x float
262262 def llvm_v16f32_ty : LLVMType<v16f32>; // 16 x float
263 def llvm_v32f32_ty : LLVMType<v32f32>; // 32 x float
263264 def llvm_v1f64_ty : LLVMType<v1f64>; // 1 x double
264265 def llvm_v2f64_ty : LLVMType<v2f64>; // 2 x double
265266 def llvm_v4f64_ty : LLVMType<v4f64>; // 4 x double
16621662 def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicNoRtn;
16631663
16641664 // llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
1665 def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32i32_ty],
1666 [llvm_float_ty, llvm_float_ty, llvm_v32i32_ty,
1665 def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32f32_ty],
1666 [llvm_float_ty, llvm_float_ty, llvm_v32f32_ty,
16671667 llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
16681668
16691669 def int_amdgcn_mfma_f32_16x16x1f32 : Intrinsic<[llvm_v16f32_ty],
16821682 [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
16831683 llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
16841684
1685 def int_amdgcn_mfma_f32_32x32x4f16 : Intrinsic<[llvm_v32i32_ty],
1686 [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32i32_ty,
1685 def int_amdgcn_mfma_f32_32x32x4f16 : Intrinsic<[llvm_v32f32_ty],
1686 [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32f32_ty,
16871687 llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
16881688
16891689 def int_amdgcn_mfma_f32_16x16x4f16 : Intrinsic<[llvm_v16f32_ty],
17221722 [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
17231723 llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
17241724
1725 def int_amdgcn_mfma_f32_32x32x2bf16 : Intrinsic<[llvm_v32i32_ty],
1726 [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32i32_ty,
1725 def int_amdgcn_mfma_f32_32x32x2bf16 : Intrinsic<[llvm_v32f32_ty],
1726 [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32f32_ty,
17271727 llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
17281728
17291729 def int_amdgcn_mfma_f32_16x16x2bf16 : Intrinsic<[llvm_v16f32_ty],
164164 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
165165 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
166166
167 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
168 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
169
167170 setOperationAction(ISD::LOAD, MVT::i64, Promote);
168171 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
169172
254257
255258 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
256259 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
260
261 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
262 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
257263
258264 setOperationAction(ISD::STORE, MVT::i64, Promote);
259265 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
354360 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
355361 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
356362 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
363 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
357364 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
365 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
366 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
358367
359368 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
360369 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
152152
153153 if (Subtarget->hasMAIInsts()) {
154154 addRegisterClass(MVT::v32i32, &AMDGPU::AReg_1024RegClass);
155 addRegisterClass(MVT::v32f32, &AMDGPU::AReg_1024RegClass);
155156 }
156157
157158 computeRegisterProperties(Subtarget->getRegisterInfo());
262263
263264 // We only support LOAD/STORE and vector manipulation ops for vectors
264265 // with > 4 elements.
265 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
266 MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
266 for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
267 MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
268 MVT::v32i32, MVT::v32f32 }) {
267269 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
268270 switch (Op) {
269271 case ISD::LOAD:
21772177
21782178 def VOP_V4F32_F32_F32_V4F32 : VOPProfile <[v4f32, f32, f32, v4f32]>;
21792179 def VOP_V16F32_F32_F32_V16F32 : VOPProfile <[v16f32, f32, f32, v16f32]>;
2180 // TODO: define v32f32
2181 def VOP_V32F32_F32_F32_V32F32 : VOPProfile <[v32i32, f32, f32, v32i32]>;
2180 def VOP_V32F32_F32_F32_V32F32 : VOPProfile <[v32f32, f32, f32, v32f32]>;
21822181 def VOP_V4F32_V4F16_V4F16_V4F32 : VOPProfile <[v4f32, v4f16, v4f16, v4f32]>;
21832182 def VOP_V16F32_V4F16_V4F16_V16F32 : VOPProfile <[v16f32, v4f16, v4f16, v16f32]>;
2184 def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32i32, v4f16, v4f16, v32i32]>;
2183 def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32f32, v4f16, v4f16, v32f32]>;
21852184 def VOP_V4F32_V2I16_V2I16_V4F32 : VOPProfile <[v4f32, v2i16, v2i16, v4f32]>;
21862185 def VOP_V16F32_V2I16_V2I16_V16F32 : VOPProfile <[v16f32, v2i16, v2i16, v16f32]>;
2187 def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32i32, v2i16, v2i16, v32i32]>;
2186 def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32f32, v2i16, v2i16, v32f32]>;
21882187 def VOP_V4I32_I32_I32_V4I32 : VOPProfile <[v4i32, i32, i32, v4i32]>;
21892188 def VOP_V16I32_I32_I32_V16I32 : VOPProfile <[v16i32, i32, i32, v16i32]>;
21902189 def VOP_V32I32_I32_I32_V32I32 : VOPProfile <[v32i32, i32, i32, v32i32]>;
940940
941941 def Insert_Element_v32i32_#Index : Insert_Element <
942942 i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
943 >;
944
945 def Extract_Element_v32f32_#Index : Extract_Element <
946 f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
947 >;
948
949 def Insert_Element_v32f32_#Index : Insert_Element <
950 f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
943951 >;
944952 }
945953
10331041 def : BitConvert <v16i32, v16f32, VReg_512>;
10341042 def : BitConvert <v16f32, v16i32, VReg_512>;
10351043
1044 // 1024-bit bitcast
1045 def : BitConvert <v32i32, v32f32, VReg_1024>;
1046 def : BitConvert <v32f32, v32i32, VReg_1024>;
1047
10361048 /********** =================== **********/
10371049 /********** Src & Dst modifiers **********/
10381050 /********** =================== **********/
756756 let isAllocatable = 0;
757757 }
758758
759 def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add SGPR_1024Regs)> {
759 def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add SGPR_1024Regs)> {
760760 let AllocationPriority = 19;
761761 }
762762
763 def SReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32,
763 def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
764764 (add SGPR_1024)> {
765765 let CopyCost = 16;
766766 let AllocationPriority = 19;
811811 let AllocationPriority = 7;
812812 }
813813
814 def VReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add VGPR_1024)> {
814 def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add VGPR_1024)> {
815815 let Size = 1024;
816816 let CopyCost = 32;
817817 let AllocationPriority = 8;
839839 }
840840
841841 // TODO: add v32f32 value type
842 def AReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add AGPR_1024)> {
842 def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add AGPR_1024)> {
843843 let Size = 1024;
844844 let CopyCost = 65;
845845 let AllocationPriority = 8;
0 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11
2 declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
2 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
33
44 ; GCN-LABEL: {{^}}test_32_agprs:
55 ; GCN: v_mfma_f32_32x32x1f32 a[0:31], {{v[0-9]+}}, {{v[0-9]+}}, 0
66 ; GCN-NOT: v28
77 ; GCN: NumVgprs: 32
88 ; GCN: VGPRBlocks: 7
9 define amdgpu_kernel void @test_32_agprs(<32 x i32> addrspace(1)* %arg) {
9 define amdgpu_kernel void @test_32_agprs(<32 x float> addrspace(1)* %arg) {
1010 bb:
11 %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> , i32 0, i32 0, i32 0)
12 store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
11 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0)
12 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
1313 ret void
1414 }
0 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11
2 declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
2 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
33 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
44 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
55 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32)
66 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32)
7 declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x i32>, i32, i32, i32)
7 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32)
88 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
99 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
1010 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
1414 declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
1515 declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
1616 declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
17 declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x i32>, i32, i32, i32)
17 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
1818 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
1919 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
2020 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
9999 ; GCN-DAG: global_store_dwordx4
100100 ; GCN-DAG: global_store_dwordx4
101101 ; GCN-DAG: global_store_dwordx4
102 define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x i32> addrspace(1)* %arg) {
103 bb:
104 %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
105 %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 1, i32 2, i32 3)
106 store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
102 define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) {
103 bb:
104 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
105 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
106 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
107107 ret void
108108 }
109109
325325 ; GCN-DAG: global_store_dwordx4
326326 ; GCN-DAG: global_store_dwordx4
327327 ; GCN-DAG: global_store_dwordx4
328 define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x i32> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
329 bb:
330 %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
328 define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
329 bb:
330 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
331331 %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
332332 %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
333333 %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
334 %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x i32> %in.1, i32 1, i32 2, i32 3)
335 store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
334 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x float> %in.1, i32 1, i32 2, i32 3)
335 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
336336 ret void
337337 }
338338
793793 ; GCN-DAG: global_store_dwordx4
794794 ; GCN-DAG: global_store_dwordx4
795795 ; GCN-DAG: global_store_dwordx4
796 define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x i32> addrspace(1)* %arg) {
797 bb:
798 %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
796 define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x float> addrspace(1)* %arg) {
797 bb:
798 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
799799 %a = bitcast i32 1 to <2 x i16>
800800 %b = bitcast i32 2 to <2 x i16>
801 %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x i32> %in.1, i32 1, i32 2, i32 3)
802 store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
801 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
802 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
803803 ret void
804804 }
805805
956956 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc:
957957 ; GCN: v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
958958 ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
959 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x i32> addrspace(1)* %arg) {
960 bb:
961 %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
962 %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 0, i32 0, i32 0)
963 %mai.2 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %mai.1, i32 0, i32 0, i32 0)
964 store <32 x i32> %mai.2, <32 x i32> addrspace(1)* %arg
959 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x float> addrspace(1)* %arg) {
960 bb:
961 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
962 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
963 %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
964 store <32 x float> %mai.2, <32 x float> addrspace(1)* %arg
965965 ret void
966966 }
967967
11111111 ; GCN-DAG: global_store_dwordx4
11121112 ; GCN-DAG: global_store_dwordx4
11131113 ; GCN-DAG: global_store_dwordx4
1114 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x i32> addrspace(1)* %arg) {
1115 bb:
1116 %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> , i32 0, i32 0, i32 0)
1117 store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
1114 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x float> addrspace(1)* %arg) {
1115 bb:
1116 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0)
1117 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
11181118 ret void
11191119 }
11201120
11831183
11841184 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm:
11851185 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1186 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1
1186 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
11871187 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
11881188 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
11891189 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
12551255 ; GCN-DAG: global_store_dwordx4
12561256 ; GCN-DAG: global_store_dwordx4
12571257 ; GCN-DAG: global_store_dwordx4
1258 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x i32> addrspace(1)* %arg) {
1259 bb:
1260 %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> , i32 0, i32 0, i32 0)
1261 store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
1258 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x float> addrspace(1)* %arg) {
1259 bb:
1260 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0)
1261 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
12621262 ret void
12631263 }
12641264
13491349 ; GCN-DAG: global_store_dwordx4
13501350 ; GCN-DAG: global_store_dwordx4
13511351 ; GCN-DAG: global_store_dwordx4
1352 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x i32> addrspace(1)* %arg) {
1352 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x float> addrspace(1)* %arg) {
13531353 bb:
13541354 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1355 %gep = getelementptr inbounds <32 x i32>, <32 x i32> addrspace(1)* %arg, i32 %tid
1356 %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %gep
1357 %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 1, i32 2, i32 3)
1358 store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %gep
1359 ret void
1360 }
1355 %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
1356 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
1357 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
1358 store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
1359 ret void
1360 }
8383 ; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
8484 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
8585 ; A2V: ScratchSize: 0
86 define amdgpu_kernel void @max_32regs_mfma32(i32 addrspace(1)* %arg) #3 {
86 define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {
8787 bb:
8888 %v = call i32 asm sideeffect "", "=a"()
8989 br label %use
9090
9191 use:
92 %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x i32> >, i32 0, i32 0, i32 0)
92 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> >, i32 0, i32 0, i32 0)
9393 call void asm sideeffect "", "a"(i32 %v)
94 %elt1 = extractelement <32 x i32> %mai.1, i32 0
95 store i32 %elt1, i32 addrspace(1)* %arg
94 %elt1 = extractelement <32 x float> %mai.1, i32 0
95 store float %elt1, float addrspace(1)* %arg
9696 ret void
9797 }
9898
9999 declare i32 @llvm.amdgcn.workitem.id.x()
100100 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
101101 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
102 declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
102 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
103103
104104 attributes #0 = { nounwind "amdgpu-num-vgpr"="24" }
105105 attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
232232 ret void
233233 }
234234
235 ; FIXME: adding an AReg_1024 register class for v32f32 and v32i32
236 ; produces unnecessary copies and we still have some amount
237 ; of conventional spilling.
238
235239 ; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb:
236240 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
237241 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
238 ; GFX908-NOT: SCRATCH_RSRC
242 ; GFX908-FIXME-NOT: SCRATCH_RSRC
239243 ; GFX908-DAG: v_accvgpr_write_b32 a0, v
240244 ; GFX900: buffer_store_dword v
241245 ; GFX900: buffer_load_dword v
242 ; GFX908-NOT: buffer_
246 ; GFX908-FIXME-NOT: buffer_
243247 ; GFX908-DAG v_accvgpr_read_b32
244248
245249 ; GCN: NumVgprs: 256
246250 ; GFX900: ScratchSize: 580
247 ; GFX908: ScratchSize: 0
251 ; GFX908-FIXME: ScratchSize: 0
248252 ; GCN: VGPRBlocks: 63
249253 ; GCN: NumVGPRsForWavesPerEU: 256
250254 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) {