llvm.org GIT mirror llvm / 692ee10
R600: Add 64-bit float load/store support

* Added R600_Reg64 class
* Added T#Index#.XY registers definition
* Added v2i32 register reads from parameter and global space
* Added f32 and i32 elements extraction from v2f32 and v2i32
* Added v2i32 -> v2f32 conversions

Tom Stellard:
  - Mark vec2 operations as expand. The addition of a vec2 register class
    made them all legal.

Patch by: Dmitry Cherkassov
Signed-off-by: Dmitry Cherkassov <dcherkassov@gmail.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187582 91177308-0d34-0410-b5e6-96231b3b80d8

Tom Stellard authored 6 years ago
23 changed file(s) with 300 addition(s) and 66 deletion(s).
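For illustration, a minimal kernel of the kind this patch targets (a hypothetical example, not one of the commit's tests): on Evergreen the <2 x float> load and store below are now expected to select a single VTX_READ_64 and a single RAT_WRITE_CACHELESS_64_eg instead of pairs of 32-bit operations.

; Hypothetical example; function and value names are illustrative only.
define void @copy_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
entry:
  %val = load <2 x float> addrspace(1)* %in
  store <2 x float> %val, <2 x float> addrspace(1)* %out
  ret void
}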
3737
3838 // Calling convention for compute kernels
3939 def CC_AMDGPU_Kernel : CallingConv<[
40 CCIfType<[v4i32, v4f32], CCAssignToStack <16, 16>>,
41 CCIfType<[i64, f64], CCAssignToStack < 8, 8>>,
42 CCIfType<[i32, f32], CCAssignToStack < 4, 4>>,
43 CCIfType<[i16], CCAssignToStack < 2, 4>>,
44 CCIfType<[i8], CCAssignToStack < 1, 4>>
40 CCIfType<[v4i32, v4f32], CCAssignToStack <16, 16>>,
41 CCIfType<[i64, f64, v2f32, v2i32], CCAssignToStack < 8, 8>>,
42 CCIfType<[i32, f32], CCAssignToStack < 4, 4>>,
43 CCIfType<[i16], CCAssignToStack < 2, 4>>,
44 CCIfType<[i8], CCAssignToStack < 1, 4>>
4545 ]>;
4646
4747 def CC_AMDGPU : CallingConv<[
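The new CCIfType entry gives v2i32/v2f32 kernel arguments the same 8-byte, 8-byte-aligned slot as i64/f64. A hypothetical kernel signature, with the slot sizes CC_AMDGPU_Kernel would hand out (names illustrative only):

; %a -> 4-byte slot, 4-byte aligned
; %b -> 8-byte slot, 8-byte aligned (new: vec2 types handled like i64/f64)
; %c -> 16-byte slot, 16-byte aligned
define void @kernel_args(i32 %a, <2 x i32> %b, <4 x i32> %c) {
entry:
  ret void
}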
259259 if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
260260 break;
261261 }
262
263 unsigned RegClassID;
264 switch(N->getValueType(0).getVectorNumElements()) {
265 case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
266 case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
267 default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
268 }
262269 // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
263270 // that adds a 128 bits reg copy when going through TwoAddressInstructions
264271 // pass. We want to avoid 128 bits copies as much as possible because they
265272 // can't be bundled by our scheduler.
266273 SDValue RegSeqArgs[9] = {
267 CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
274 CurDAG->getTargetConstant(RegClassID, MVT::i32),
268275 SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
269276 SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
270277 SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
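Selecting the register class from the element count means a two-element build_vector is now selected into a REG_SEQUENCE over R600_Reg64 (one sub0/sub1 copy per channel) rather than a 128-bit sequence. A hypothetical IR fragment that produces such a node:

; Hypothetical example: the <2 x i32> build_vector formed by the two
; insertelements is selected as a REG_SEQUENCE over R600_Reg64 after this change.
define void @build_v2(<2 x i32> addrspace(1)* %out, i32 %x, i32 %y) {
entry:
  %v0 = insertelement <2 x i32> undef, i32 %x, i32 0
  %v1 = insertelement <2 x i32> %v0, i32 %y, i32 1
  store <2 x i32> %v1, <2 x i32> addrspace(1)* %out
  ret void
}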
7777
7878 setOperationAction(ISD::LOAD, MVT::f64, Promote);
7979 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
80
81 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Expand);
82 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Expand);
8083
8184 setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
8285 setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
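EXTRACT_SUBVECTOR on the new v2 types is marked Expand, so a two-element slice of a wider vector is legalized into per-element extracts plus a fresh build_vector. A hypothetical IR pattern that can produce such a node:

; Hypothetical example: the shufflevector below can form an EXTRACT_SUBVECTOR
; (v2i32 slice of a v4i32) in the DAG, which the Expand action then rewrites
; into element extracts and a build_vector.
define void @low_half(<2 x i32> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %lo = shufflevector <4 x i32> %in, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  store <2 x i32> %lo, <2 x i32> addrspace(1)* %out
  ret void
}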
377377 case AMDGPU::R600_ExportBuf:
378378 case AMDGPU::R600_ExportSwz:
379379 case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
380 case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
380381 case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
381 case AMDGPU::RAT_STORE_DWORD_cm:
382 case AMDGPU::RAT_STORE_DWORD32_cm:
383 case AMDGPU::RAT_STORE_DWORD64_cm:
382384 DEBUG(dbgs() << CfCount << ":"; MI->dump(););
383385 CfCount++;
384386 break;
3232 addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
3333 addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
3434 addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
35 addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
36 addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
37
3538 computeRegisterProperties();
3639
3740 setOperationAction(ISD::FADD, MVT::v4f32, Expand);
41 setOperationAction(ISD::FADD, MVT::v2f32, Expand);
3842 setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
43 setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
3944 setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
45 setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
4046 setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
47 setOperationAction(ISD::FSUB, MVT::v2f32, Expand);
4148
4249 setOperationAction(ISD::FCOS, MVT::f32, Custom);
4350 setOperationAction(ISD::FSIN, MVT::f32, Custom);
4451
4552 setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
53 setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
4654
4755 setOperationAction(ISD::BR_CC, MVT::i32, Expand);
4856 setOperationAction(ISD::BR_CC, MVT::f32, Expand);
6573
6674 // Legalize loads and stores to the private address space.
6775 setOperationAction(ISD::LOAD, MVT::i32, Custom);
68 setOperationAction(ISD::LOAD, MVT::v2i32, Expand);
76 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
6977 setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
7078 setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
7179 setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
7381 setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
7482 setOperationAction(ISD::STORE, MVT::i8, Custom);
7583 setOperationAction(ISD::STORE, MVT::i32, Custom);
76 setOperationAction(ISD::STORE, MVT::v2i32, Expand);
84 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
7785 setOperationAction(ISD::STORE, MVT::v4i32, Custom);
7886
7987 setOperationAction(ISD::LOAD, MVT::i32, Custom);
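v2i32 loads and stores are now custom-lowered like the i32 and v4i32 cases instead of being expanded, which matters for private (stack) accesses. A hypothetical kernel that round-trips a <2 x i32> through an alloca in the private address space:

; Hypothetical example; the alloca lives in the private address space, so the
; <2 x i32> store and load below go through the custom lowering enabled above.
define void @private_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %tmp = alloca <2 x i32>
  store <2 x i32> %in, <2 x i32>* %tmp
  %val = load <2 x i32>* %tmp
  store <2 x i32> %val, <2 x i32> addrspace(1)* %out
  ret void
}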
169177 }
170178
171179 case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
180 case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
172181 case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
173182 unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
174183
11281137 DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
11291138 Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
11301139 }
1131 Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
1140 EVT NewVT = MVT::v4i32;
1141 unsigned NumElements = 4;
1142 if (VT.isVector()) {
1143 NewVT = VT;
1144 NumElements = VT.getVectorNumElements();
1145 }
1146 Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
11321147 } else {
11331148 // A non-constant ptr can't be folded; keep it as a v4f32 load
11341149 Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
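With NewVT/NumElements taken from the load's own type, a vector load from the constant address space at a known offset is rebuilt with its original element count (two CONST_ADDRESS slots for v2i32) instead of always forming a v4i32. A hypothetical constant-buffer load (global, type, and names are illustrative only):

; Hypothetical example: a <2 x i32> load from the constant address space
; (addrspace(2)) at a compile-time offset can be folded into CONST_ADDRESS
; reads; after this change only two slots are used to rebuild the vector.
@cst = addrspace(2) constant [4 x <2 x i32>] zeroinitializer
define void @const_v2i32(<2 x i32> addrspace(1)* %out) {
entry:
  %ptr = getelementptr [4 x <2 x i32>] addrspace(2)* @cst, i32 0, i32 1
  %val = load <2 x i32> addrspace(2)* %ptr
  store <2 x i32> %val, <2 x i32> addrspace(1)* %out
  ret void
}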
5050 MachineBasicBlock::iterator MI, DebugLoc DL,
5151 unsigned DestReg, unsigned SrcReg,
5252 bool KillSrc) const {
53 if (AMDGPU::R600_Reg128RegClass.contains(DestReg)
54 && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
55 for (unsigned I = 0; I < 4; I++) {
53 unsigned VectorComponents = 0;
54 if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
55 AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
56 VectorComponents = 4;
57 } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
58 AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
59 VectorComponents = 2;
60 }
61
62 if (VectorComponents > 0) {
63 for (unsigned I = 0; I < VectorComponents; I++) {
5664 unsigned SubRegIndex = RI.getSubRegFromChannel(I);
5765 buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
5866 RI.getSubReg(DestReg, SubRegIndex),
6169 RegState::Define | RegState::Implicit);
6270 }
6371 } else {
64
65 // We can't copy vec4 registers
66 assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg)
67 && !AMDGPU::R600_Reg128RegClass.contains(SrcReg));
68
6972 MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
7073 DestReg, SrcReg);
7174 NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0))
12891289 [(global_store i32:$rw_gpr, i32:$index_gpr)]
12901290 >;
12911291
1292 // 64-bit store
1293 def RAT_WRITE_CACHELESS_64_eg : RAT_WRITE_CACHELESS_eg <
1294 (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
1295 0x3, "RAT_WRITE_CACHELESS_64_eg $rw_gpr.XY, $index_gpr, $eop",
1296 [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
1297 >;
1298
12921299 //128-bit store
12931300 def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
12941301 (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
13571364 let Constraints = "$src_gpr.ptr = $dst_gpr";
13581365 }
13591366
1367 class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
1368 : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id,
1369 (outs R600_Reg64:$dst_gpr), pattern> {
1370
1371 let MEGA_FETCH_COUNT = 8;
1372 let DST_SEL_X = 0;
1373 let DST_SEL_Y = 1;
1374 let DST_SEL_Z = 7;
1375 let DST_SEL_W = 7;
1376 let DATA_FORMAT = 0x1D; // COLOR_32_32
1377 }
1378
13601379 class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
13611380 : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
13621381 (outs R600_Reg128:$dst_gpr), pattern> {
13901409 [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
13911410 >;
13921411
1412 def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
1413 [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
1414 >;
1415
13931416 def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
13941417 [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
13951418 >;
14101433 // 32-bit reads
14111434 def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
14121435 [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
1436 >;
1437
1438 // 64-bit reads
1439 def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
1440 [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
14131441 >;
14141442
14151443 // 128-bit reads
17431771 def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
17441772
17451773
1746 def RAT_STORE_DWORD_cm : EG_CF_RAT <
1747 0x57, 0x14, 0x1, (outs),
1748 (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr),
1749 "EXPORT_RAT_INST_STORE_DWORD $rw_gpr, $index_gpr",
1750 [(global_store i32:$rw_gpr, i32:$index_gpr)]
1774 class RAT_STORE_DWORD_cm <bits<4> mask, dag ins, list<dag> pat> : EG_CF_RAT <
1775 0x57, 0x14, mask, (outs), ins,
1776 "EXPORT_RAT_INST_STORE_DWORD $rw_gpr, $index_gpr", pat
17511777 > {
17521778 let eop = 0; // This bit is not used on Cayman.
17531779 }
1780
1781 def RAT_STORE_DWORD32_cm : RAT_STORE_DWORD_cm <0x1,
1782 (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr),
1783 [(global_store i32:$rw_gpr, i32:$index_gpr)]
1784 >;
1785
1786 def RAT_STORE_DWORD64_cm : RAT_STORE_DWORD_cm <0x3,
1787 (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr),
1788 [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
1789 >;
17541790
17551791 class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
17561792 : VTX_WORD0_cm, VTX_READ <name, buffer_id, outs, pattern> {
18141850 let Constraints = "$src_gpr.ptr = $dst_gpr";
18151851 }
18161852
1853 class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern>
1854 : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id,
1855 (outs R600_Reg64:$dst_gpr), pattern> {
1856
1857 let DST_SEL_X = 0;
1858 let DST_SEL_Y = 1;
1859 let DST_SEL_Z = 7;
1860 let DST_SEL_W = 7;
1861 let DATA_FORMAT = 0x1D; // COLOR_32_32
1862 }
1863
18171864 class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern>
18181865 : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
18191866 (outs R600_Reg128:$dst_gpr), pattern> {
18451892 [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
18461893 >;
18471894
1895 def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0,
1896 [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
1897 >;
1898
18481899 def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0,
18491900 [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
18501901 >;
18651916 // 32-bit reads
18661917 def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1,
18671918 [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
1919 >;
1920
1921 // 64-bit reads
1922 def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1,
1923 [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
18681924 >;
18691925
18701926 // 128-bit reads
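Together with VTX_READ_64_cm above, Cayman now has a 64-bit path in both directions. A hypothetical kernel exercising it (mirroring the 32-bit global read/write tests):

; Hypothetical example for -mcpu=cayman: the <2 x i32> load is expected to
; select VTX_READ_64 and the store RAT_STORE_DWORD64_cm, i.e. a single
; EXPORT_RAT_INST_STORE_DWORD with a two-component mask.
define void @global_copy_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
entry:
  %val = load <2 x i32> addrspace(1)* %in
  store <2 x i32> %val, <2 x i32> addrspace(1)* %out
  ret void
}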
22962352 def : Vector4_Build ;
22972353 def : Vector4_Build ;
22982354
2355 def : Extract_Element <f32, v2f32, 0, sub0>;
2356 def : Extract_Element <f32, v2f32, 1, sub1>;
2357
2358 def : Insert_Element <f32, v2f32, 0, sub0>;
2359 def : Insert_Element <f32, v2f32, 1, sub1>;
2360
2361 def : Extract_Element <i32, v2i32, 0, sub0>;
2362 def : Extract_Element <i32, v2i32, 1, sub1>;
2363
2364 def : Insert_Element <i32, v2i32, 0, sub0>;
2365 def : Insert_Element <i32, v2i32, 1, sub1>;
2366
22992367 // bitconvert patterns
23002368
23012369 def : BitConvert <i32, f32, R600_Reg32>;
23022370 def : BitConvert <f32, i32, R600_Reg32>;
2371 def : BitConvert <v2f32, v2i32, R600_Reg64>;
2372 def : BitConvert <v2i32, v2f32, R600_Reg64>;
23032373 def : BitConvert <v4f32, v4i32, R600_Reg128>;
23042374 def : BitConvert <v4i32, v4f32, R600_Reg128>;
23052375
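The Extract_Element / Insert_Element / BitConvert patterns let single-channel accesses and bitcasts on the new vec2 types stay in registers. A hypothetical example (names illustrative only):

; Hypothetical example: the bitcast is a free reinterpretation of the T#n
; register pair, and the two extractelements are expected to be matched as
; sub0/sub1 subregister reads rather than memory round-trips.
define void @sum_channels(float addrspace(1)* %out, <2 x i32> %in) {
entry:
  %vf = bitcast <2 x i32> %in to <2 x float>
  %x = extractelement <2 x float> %vf, i32 0
  %y = extractelement <2 x float> %vf, i32 1
  %s = fadd float %x, %y
  store float %s, float addrspace(1)* %out
  ret void
}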
2121 let SubRegIndices = [sub0, sub1, sub2, sub3];
2222 let HWEncoding = encoding;
2323 }
24
25 class R600Reg_64 <string n, list<Register> subregs, bits<16> encoding> :
26     RegisterWithSubRegs<n, subregs> {
27 let Namespace = "AMDGPU";
28 let SubRegIndices = [sub0, sub1];
29 let HWEncoding = encoding;
30 }
31
2432
2533 foreach Index = 0-127 in {
2634 foreach Chan = [ "X", "Y", "Z", "W" ] in {
3947 !cast<Register>("T"#Index#"_Y"),
4048 !cast<Register>("T"#Index#"_Z"),
4149 !cast<Register>("T"#Index#"_W")],
50 Index>;
51
52 def T#Index#_XY : R600Reg_64 <"T"#Index#"",
53 [!cast<Register>("T"#Index#"_X"),
54 !cast<Register>("T"#Index#"_Y")],
4255 Index>;
4356 }
4457
185198 let CopyCost = -1;
186199 }
187200
201 def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
202 (add (sequence "T%u_XY", 0, 63))>;
203
188204 //===----------------------------------------------------------------------===//
189205 // Register classes for indirect addressing
190206 //===----------------------------------------------------------------------===//
None ; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
0 ; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s --check-prefix=SI-CHECK
11
22 ; SI-CHECK: @f64_kernel_arg
33 ; SI-CHECK-DAG: S_LOAD_DWORDX2 SGPR{{[0-9]}}_SGPR{{[0-9]}}, SGPR0_SGPR1, 9
0 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
1 ; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
2
3 ; R600-CHECK: @build_vector2
4 ; R600-CHECK: MOV
5 ; R600-CHECK: MOV
6 ; R600-CHECK-NOT: MOV
7 ; SI-CHECK: @build_vector2
8 ; SI-CHECK-DAG: V_MOV_B32_e32 [[X:VGPR[0-9]]], 5
9 ; SI-CHECK-DAG: V_MOV_B32_e32 [[Y:VGPR[0-9]]], 6
10 ; SI-CHECK: BUFFER_STORE_DWORDX2 [[X]]_[[Y]]
11 define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
12 entry:
13 store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
14 ret void
15 }
16
17 ; R600-CHECK: @build_vector4
18 ; R600-CHECK: MOV
19 ; R600-CHECK: MOV
20 ; R600-CHECK: MOV
21 ; R600-CHECK: MOV
22 ; R600-CHECK-NOT: MOV
23 ; SI-CHECK: @build_vector4
24 ; SI-CHECK-DAG: V_MOV_B32_e32 [[X:VGPR[0-9]]], 5
25 ; SI-CHECK-DAG: V_MOV_B32_e32 [[Y:VGPR[0-9]]], 6
26 ; SI-CHECK-DAG: V_MOV_B32_e32 [[Z:VGPR[0-9]]], 7
27 ; SI-CHECK-DAG: V_MOV_B32_e32 [[W:VGPR[0-9]]], 8
28 ; SI-CHECK: BUFFER_STORE_DWORDX4 [[X]]_[[Y]]_[[Z]]_[[W]]
29 define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
30 entry:
31 store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
32 ret void
33 }
1414
1515 declare void @llvm.AMDGPU.store.output(float, i32)
1616
17 ; CHECK: @fadd_v2f32
18 ; CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
19 ; CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
20 define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
21 entry:
22 %0 = fadd <2 x float> %a, %b
23 store <2 x float> %0, <2 x float> addrspace(1)* %out
24 ret void
25 }
26
1727 ; CHECK: @fadd_v4f32
1828 ; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
1929 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
None ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
0 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
11
2 ;CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}
3 ;CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
4 ;CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}
5 ;CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
6 ;CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}
7 ;CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
8 ;CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}
9 ;CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
2 ; These tests check that fdiv is expanded correctly and also test that the
3 ; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
4 ; instruction groups.
105
11 define void @test(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
6 ; CHECK: @fdiv_v2f32
7 ; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
8 ; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
9 ; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
10 ; CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
11 define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
1212 entry:
13 %0 = fdiv <4 x float> %a, %b
14 store <4 x float> %0, <4 x float> addrspace(1)* %out
13 %0 = fdiv <2 x float> %a, %b
14 store <2 x float> %0, <2 x float> addrspace(1)* %out
1515 ret void
1616 }
17
18 ; CHECK: @fdiv_v4f32
19 ; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
20 ; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
21 ; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22 ; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
23 ; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
24 ; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
25 ; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
26 ; CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
27
28 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
29 %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
30 %a = load <4 x float> addrspace(1) * %in
31 %b = load <4 x float> addrspace(1) * %b_ptr
32 %result = fdiv <4 x float> %a, %b
33 store <4 x float> %result, <4 x float> addrspace(1)* %out
34 ret void
35 }
1414
1515 declare void @llvm.AMDGPU.store.output(float, i32)
1616
17 ; CHECK: @fmul_v2f32
18 ; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
19 ; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
20 define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
21 entry:
22 %0 = fmul <2 x float> %a, %b
23 store <2 x float> %0, <2 x float> addrspace(1)* %out
24 ret void
25 }
26
1727 ; CHECK: @fmul_v4f32
1828 ; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
1929 ; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
0 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
11 ; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
2
3 ; R600-CHECK: @fp_to_sint_v2i32
4 ; R600-CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
5 ; R600-CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
6 ; SI-CHECK: @fp_to_sint_v2i32
7 ; SI-CHECK: V_CVT_I32_F32_e32
8 ; SI-CHECK: V_CVT_I32_F32_e32
9 define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
10 %result = fptosi <2 x float> %in to <2 x i32>
11 store <2 x i32> %result, <2 x i32> addrspace(1)* %out
12 ret void
13 }
214
315 ; R600-CHECK: @fp_to_sint_v4i32
416 ; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
1022 ; SI-CHECK: V_CVT_I32_F32_e32
1123 ; SI-CHECK: V_CVT_I32_F32_e32
1224 ; SI-CHECK: V_CVT_I32_F32_e32
13
1425 define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
1526 %value = load <4 x float> addrspace(1) * %in
1627 %result = fptosi <4 x float> %value to <4 x i32>
0 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
1
2 ; CHECK: @fp_to_uint_v2i32
3 ; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
4 ; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
5
6 define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
7 %result = fptoui <2 x float> %in to <2 x i32>
8 store <2 x i32> %result, <2 x i32> addrspace(1)* %out
9 ret void
10 }
111
212 ; CHECK: @fp_to_uint_v4i32
313 ; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
1414
1515 declare void @llvm.AMDGPU.store.output(float, i32)
1616
17 ; CHECK: @fsub_v2f32
18 ; CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
19 ; CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
20 define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
21 entry:
22 %0 = fsub <2 x float> %a, %b
23 store <2 x float> %0, <2 x float> addrspace(1)* %out
24 ret void
25 }
26
1727 ; CHECK: @fsub_v4f32
18 ; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
19 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
20 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
21 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22
28 ; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
29 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
30 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
31 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
2332 define void @fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
2433 %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
2534 %a = load <4 x float> addrspace(1) * %in
9191
9292 ; load a v2f32 value from the global address space
9393 ; R600-CHECK: @load_v2f32
94 ; R600-CHECK: VTX_READ_32
95 ; R600-CHECK: VTX_READ_32
94 ; R600-CHECK: VTX_READ_64
9695
9796 ; SI-CHECK: @load_v2f32
9897 ; SI-CHECK: BUFFER_LOAD_DWORDX2
22
33 ; load a v2i32 value from the global address space.
44 ; EG-CHECK: @load_v2i32
5 ; EG-CHECK-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4
6 ; EG-CHECK-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
5 ; EG-CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0
76 ; SI-CHECK: @load_v2i32
87 ; SI-CHECK: BUFFER_LOAD_DWORDX2 VGPR{{[0-9]+}}
98 define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
0 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
11
2 ;EG-CHECK: @test2
3 ;EG-CHECK: SETE_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
4 ;EG-CHECK: SETE_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2 ; EG-CHECK: @setcc_v2i32
3 ; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
4 ; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
55
6 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
7 %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
8 %a = load <2 x i32> addrspace(1) * %in
9 %b = load <2 x i32> addrspace(1) * %b_ptr
6 define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
107 %result = icmp eq <2 x i32> %a, %b
118 %sext = sext <2 x i1> %result to <2 x i32>
129 store <2 x i32> %sext, <2 x i32> addrspace(1)* %out
1310 ret void
1411 }
1512
16 ;EG-CHECK: @test4
17 ;EG-CHECK: SETE_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
18 ;EG-CHECK: SETE_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
19 ;EG-CHECK: SETE_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
20 ;EG-CHECK: SETE_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
13 ; EG-CHECK: @setcc_v4i32
14 ; EG-CHECK-DAG: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
15 ; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
16 ; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
17 ; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2118
22 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
19 define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
2320 %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
2421 %a = load <4 x i32> addrspace(1) * %in
2522 %b = load <4 x i32> addrspace(1) * %b_ptr
0 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
11 ; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
2
3 ; R600-CHECK: @sint_to_fp_v2i32
4 ; R600-CHECK-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
5 ; R600-CHECK-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
6 ; SI-CHECK: @sint_to_fp_v2i32
7 ; SI-CHECK: V_CVT_F32_I32_e32
8 ; SI-CHECK: V_CVT_F32_I32_e32
9 define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
10 %result = sitofp <2 x i32> %in to <2 x float>
11 store <2 x float> %result, <2 x float> addrspace(1)* %out
12 ret void
13 }
214
315 ; R600-CHECK: @sint_to_fp_v4i32
416 ; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
1616
1717 ; vec2 floating-point stores
1818 ; EG-CHECK: @store_v2f32
19 ; EG-CHECK: RAT_WRITE_CACHELESS_32_eg
20 ; EG-CHECK-NEXT: RAT_WRITE_CACHELESS_32_eg
19 ; EG-CHECK: RAT_WRITE_CACHELESS_64_eg
2120 ; CM-CHECK: @store_v2f32
2221 ; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD
23 ; CM-CHECK-NEXT: EXPORT_RAT_INST_STORE_DWORD
2422 ; SI-CHECK: @store_v2f32
2523 ; SI-CHECK: BUFFER_STORE_DWORDX2
2624
4038 ; be two 32-bit stores.
4139
4240 ; EG-CHECK: @vecload2
43 ; EG-CHECK: RAT_WRITE_CACHELESS_32_eg
44 ; EG-CHECK: RAT_WRITE_CACHELESS_32_eg
41 ; EG-CHECK: RAT_WRITE_CACHELESS_64_eg
4542 ; CM-CHECK: @vecload2
46 ; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD
4743 ; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD
4844 ; SI-CHECK: @vecload2
4945 ; SI-CHECK: BUFFER_STORE_DWORDX2
22
33 ;EG-CHECK: @test2
44 ;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
5 ;EG-CHECK: SUB_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
5 ;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
66
77 ;SI-CHECK: @test2
88 ;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
0 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
11 ; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
2
3 ; R600-CHECK: @uint_to_fp_v2i32
4 ; R600-CHECK-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
5 ; R600-CHECK-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
6 ; SI-CHECK: @uint_to_fp_v2i32
7 ; SI-CHECK: V_CVT_F32_U32_e32
8 ; SI-CHECK: V_CVT_F32_U32_e32
9 define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
10 %result = uitofp <2 x i32> %in to <2 x float>
11 store <2 x float> %result, <2 x float> addrspace(1)* %out
12 ret void
13 }
214
315 ; R600-CHECK: @uint_to_fp_v4i32
416 ; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}