llvm.org GIT mirror llvm / bd04b64
AMDGPU: Select v_mad_u64_u32 and v_mad_i64_i32 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317492 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 2 years ago
7 changed file(s) with 345 addition(s) and 83 deletion(s). Raw diff Collapse all Expand all
203203 void SelectADD_SUB_I64(SDNode *N);
204204 void SelectUADDO_USUBO(SDNode *N);
205205 void SelectDIV_SCALE(SDNode *N);
206 void SelectMAD_64_32(SDNode *N);
206207 void SelectFMA_W_CHAIN(SDNode *N);
207208 void SelectFMUL_W_CHAIN(SDNode *N);
208209
591592 }
592593 case AMDGPUISD::DIV_SCALE: {
593594 SelectDIV_SCALE(N);
595 return;
596 }
597 case AMDGPUISD::MAD_I64_I32:
598 case AMDGPUISD::MAD_U64_U32: {
599 SelectMAD_64_32(N);
594600 return;
595601 }
596602 case ISD::CopyToReg: {
810816 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
811817
812818 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
819 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
820 }
821
822 // We need to handle this here because tablegen doesn't support matching
823 // instructions with multiple outputs.
// Selects an AMDGPUISD::MAD_I64_I32 / MAD_U64_U32 node to the matching
// V_MAD_I64_I32 / V_MAD_U64_U32 machine opcode.
824 void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
825 SDLoc SL(N);
// Any opcode other than MAD_I64_I32 is treated as the unsigned form.
826 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
827 unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32;
828
// The node's three operands are forwarded unchanged, followed by a
// constant-zero i1 target constant (Clamp) as the fourth operand.
829 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
830 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
831 Clamp };
// The selected node reuses N's own value-type list.
813832 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
814833 }
815834
148148 }
149149
150150 return false;
151 }
152
// Returns the bit width of Op's value type minus the minimum number of
// leading zero bits reported by the known-bits analysis, i.e. an upper bound
// on how many low bits of Op can be non-zero.
153 unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
154 KnownBits Known;
155 EVT VT = Op.getValueType();
156 DAG.computeKnownBits(Op, Known);
157
158 return VT.getSizeInBits() - Known.countMinLeadingZeros();
159 }
160
// Returns the bit width of Op's value type minus DAG.ComputeNumSignBits(Op).
// NOTE(review): ComputeNumSignBits presumably counts the sign bit itself, so
// the result excludes it; callers visible here compare with a strict `<`
// (`< 24`, `< 32`) - confirm against the SelectionDAG documentation.
161 unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
162 EVT VT = Op.getValueType();
163
164 // Bits of the value type left over after removing the sign-bit copies
165 // reported by ComputeNumSignBits.
166 return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
151167 }
152168
153169 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
26142630 //===----------------------------------------------------------------------===//
26152631
// True when Op needs at most 24 bits according to the known leading zeros.
// NOTE(review): this is a diff rendering. The lines numbered 2617-2621 are the
// previous inline computation and the line numbered 2633 is its replacement,
// which delegates to AMDGPUTargetLowering::numBitsUnsigned; both spell the
// same `<= 24` test.
26162632 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2617 KnownBits Known;
2618 EVT VT = Op.getValueType();
2619 DAG.computeKnownBits(Op, Known);
2620
2621 return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24;
2633 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
26222634 }
26232635
// True when Op's type is at least 24 bits wide and the bits not covered by
// sign-bit copies number fewer than 24.
// NOTE(review): this is a diff rendering. The lines numbered 2626-2628 and
// 2631 are the previous form; the line numbered 2640 replaces 2631 with a call
// to AMDGPUTargetLowering::numBitsSigned, keeping the same `< 24` comparison.
26242636 static bool isI24(SDValue Op, SelectionDAG &DAG) {
26252637 EVT VT = Op.getValueType();
2626
2627 // In order for this to be a signed 24-bit value, bit 23, must
2628 // be a sign bit.
26292638 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
26302639 // as unsigned 24-bit values.
2631 (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
2640 AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
26322641 }
26332642
26342643 static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
39453954 NODE_NAME_CASE(MUL_LOHI_I24)
39463955 NODE_NAME_CASE(MAD_U24)
39473956 NODE_NAME_CASE(MAD_I24)
3957 NODE_NAME_CASE(MAD_I64_I32)
3958 NODE_NAME_CASE(MAD_U64_U32)
39483959 NODE_NAME_CASE(TEXTURE_FETCH)
39493960 NODE_NAME_CASE(EXPORT)
39503961 NODE_NAME_CASE(EXPORT_DONE)
3535
3636 public:
3737 static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op);
38 static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG);
39 static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);
3840
3941 protected:
4042 const AMDGPUSubtarget *Subtarget;
378380 MULHI_I24,
379381 MAD_U24,
380382 MAD_I24,
383 MAD_U64_U32,
384 MAD_I64_I32,
381385 MUL_LOHI_I24,
382386 MUL_LOHI_U24,
383387 TEXTURE_FETCH,
461461 return isAmdHsaOS() || isMesaKernel(MF);
462462 }
463463
// True when the subtarget generation is SEA_ISLANDS or later. Used by
// performAddCombine to gate forming the 64-bit MAD nodes.
464 bool hasMad64_32() const {
465 return getGeneration() >= SEA_ISLANDS;
466 }
467
464468 bool hasFminFmaxLegacy() const {
465469 return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
466470 }
59615961 return 0;
59625962 }
59635963
// Builds a MAD_I64_I32 (Signed) or MAD_U64_U32 node from N0, N1 and N2 and
// returns its first result truncated to VT.
//   VT         - type of the value returned to the caller.
//   N0, N1, N2 - operands passed to the MAD node in that order; the visible
//                caller passes i32, i32 and i64 values respectively.
//   Signed     - selects the signed opcode when true, unsigned otherwise.
5964 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
5965 EVT VT,
5966 SDValue N0, SDValue N1, SDValue N2,
5967 bool Signed) {
5968 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
// The node is created with two result types, i64 and i1.
5969 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
5970 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
// Only the value referenced by Mad is used; it is wrapped in a TRUNCATE to VT.
5971 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
5972 }
5973
59645974 SDValue SITargetLowering::performAddCombine(SDNode *N,
59655975 DAGCombinerInfo &DCI) const {
59665976 SelectionDAG &DAG = DCI.DAG;
59675977 EVT VT = N->getValueType(0);
5968
5969 if (VT != MVT::i32)
5970 return SDValue();
5971
59725978 SDLoc SL(N);
59735979 SDValue LHS = N->getOperand(0);
59745980 SDValue RHS = N->getOperand(1);
5981
5982 if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
5983 && Subtarget->hasMad64_32() &&
5984 !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
5985 VT.getScalarSizeInBits() <= 64) {
5986 if (LHS.getOpcode() != ISD::MUL)
5987 std::swap(LHS, RHS);
5988
5989 SDValue MulLHS = LHS.getOperand(0);
5990 SDValue MulRHS = LHS.getOperand(1);
5991 SDValue AddRHS = RHS;
5992
5993 // TODO: Maybe restrict if SGPR inputs.
5994 if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
5995 numBitsUnsigned(MulRHS, DAG) <= 32) {
5996 MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
5997 MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
5998 AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
5999 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
6000 }
6001
6002 if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
6003 MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
6004 MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
6005 AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
6006 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
6007 }
6008
6009 return SDValue();
6010 }
6011
6012 if (VT != MVT::i32)
6013 return SDValue();
59756014
59766015 // add x, zext (setcc) => addcarry x, 0, setcc
59776016 // add x, sext (setcc) => subcarry x, 0, setcc
0 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
1 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
2
3 ; GCN-LABEL: {{^}}mad_i64_i32_sextops:
4 ; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
5
6 ; SI: v_mul_lo_i32
7 ; SI: v_mul_hi_i32
8 ; SI: v_add_i32
9 ; SI: v_addc_u32
; Multiply-add on i64 where both multiplicands are sign-extended from i32.
; The product is the left operand of the add.
10 define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
11 %sext0 = sext i32 %arg0 to i64
12 %sext1 = sext i32 %arg1 to i64
13 %mul = mul i64 %sext0, %sext1
14 %mad = add i64 %mul, %arg2
15 ret i64 %mad
16 }
17
18 ; GCN-LABEL: {{^}}mad_i64_i32_sextops_commute:
19 ; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
20
21 ; SI-DAG: v_mul_lo_i32
22 ; SI-DAG: v_mul_hi_i32
23 ; SI: v_add_i32
24 ; SI: v_addc_u32
; Same sign-extended multiply-add as the non-commuted test, but with the
; product as the right operand of the add.
25 define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
26 %sext0 = sext i32 %arg0 to i64
27 %sext1 = sext i32 %arg1 to i64
28 %mul = mul i64 %sext0, %sext1
29 %mad = add i64 %arg2, %mul
30 ret i64 %mad
31 }
32
33 ; GCN-LABEL: {{^}}mad_u64_u32_zextops:
34 ; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
35
36 ; SI-DAG: v_mul_lo_i32
37 ; SI-DAG: v_mul_hi_u32
38 ; SI: v_add_i32
39 ; SI: v_addc_u32
; Multiply-add on i64 where both multiplicands are zero-extended from i32.
; Note that the locals are named %sext0 and %sext1 although they hold zext
; results.
40 define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
41 %sext0 = zext i32 %arg0 to i64
42 %sext1 = zext i32 %arg1 to i64
43 %mul = mul i64 %sext0, %sext1
44 %mad = add i64 %mul, %arg2
45 ret i64 %mad
46 }
47
48 ; GCN-LABEL: {{^}}mad_u64_u32_zextops_commute:
49 ; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
50
51 ; SI-DAG: v_mul_lo_i32
52 ; SI-DAG: v_mul_hi_u32
53 ; SI: v_add_i32
54 ; SI: v_addc_u32
; Zero-extended multiply-add with the product as the right operand of the add.
; Note that the locals are named %sext0 and %sext1 although they hold zext
; results.
55 define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
56 %sext0 = zext i32 %arg0 to i64
57 %sext1 = zext i32 %arg1 to i64
58 %mul = mul i64 %sext0, %sext1
59 %mad = add i64 %arg2, %mul
60 ret i64 %mad
61 }
62
63
64
65
66
67
68 ; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i128:
69 ; CI: v_mad_u64_u32
70 ; CI: v_mad_u64_u32
71 ; CI: v_mad_u64_u32
72 ; CI: v_mad_i64_i32
73
74 ; SI-NOT: v_mad_
; Sign-extended i32 multiplicands with the multiply and add carried out at
; i128 width.
75 define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
76 %sext0 = sext i32 %arg0 to i128
77 %sext1 = sext i32 %arg1 to i128
78 %mul = mul i128 %sext0, %sext1
79 %mad = add i128 %mul, %arg2
80 ret i128 %mad
81 }
82
83 ; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i63:
84 ; CI: v_lshl_b64
85 ; CI: v_ashr
86 ; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
87
88 ; SI-NOT: v_mad_u64_u32
; Sign-extended i32 multiplicands with the multiply and add carried out at
; the non-power-of-two width i63.
89 define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
90 %sext0 = sext i32 %arg0 to i63
91 %sext1 = sext i32 %arg1 to i63
92 %mul = mul i63 %sext0, %sext1
93 %mad = add i63 %mul, %arg2
94 ret i63 %mad
95 }
96
97 ; GCN-LABEL: {{^}}mad_i64_i32_sextops_i31_i63:
98 ; CI: v_lshl_b64
99 ; CI: v_ashr_i64
100 ; CI: v_bfe_i32 v1, v1, 0, 31
101 ; CI: v_bfe_i32 v0, v0, 0, 31
102 ; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
; Multiplicands are sign-extended from i31 and the multiply and add are
; carried out at i63.
103 define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
104 %sext0 = sext i31 %arg0 to i63
105 %sext1 = sext i31 %arg1 to i63
106 %mul = mul i63 %sext0, %sext1
107 %mad = add i63 %mul, %arg2
108 ret i63 %mad
109 }
110
111 ; GCN-LABEL: {{^}}mad_u64_u32_bitops:
112 ; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v2, v[4:5]
; Both i64 multiplicands are masked with 4294967295 (2^32 - 1), so only their
; low 32 bits can be set before the multiply and add.
113 define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
114 %trunc.lhs = and i64 %arg0, 4294967295
115 %trunc.rhs = and i64 %arg1, 4294967295
116 %mul = mul i64 %trunc.lhs, %trunc.rhs
117 %add = add i64 %mul, %arg2
118 ret i64 %add
119 }
120
121 ; GCN-LABEL: {{^}}mad_u64_u32_bitops_lhs_mask_small:
122 ; GCN-NOT: v_mad_
; The left multiplicand is masked with 8589934591 (2^33 - 1), leaving 33
; possibly-set bits, while the right one is masked to 32 bits.
123 define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
124 %trunc.lhs = and i64 %arg0, 8589934591
125 %trunc.rhs = and i64 %arg1, 4294967295
126 %mul = mul i64 %trunc.lhs, %trunc.rhs
127 %add = add i64 %mul, %arg2
128 ret i64 %add
129 }
130
131 ; GCN-LABEL: {{^}}mad_u64_u32_bitops_rhs_mask_small:
132 ; GCN-NOT: v_mad_
; The right multiplicand is masked with 8589934591 (2^33 - 1), leaving 33
; possibly-set bits, while the left one is masked to 32 bits.
133 define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
134 %trunc.lhs = and i64 %arg0, 4294967295
135 %trunc.rhs = and i64 %arg1, 8589934591
136 %mul = mul i64 %trunc.lhs, %trunc.rhs
137 %add = add i64 %mul, %arg2
138 ret i64 %add
139 }
140
141 ; GCN-LABEL: {{^}}mad_i64_i32_bitops:
142 ; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v2, v[4:5]
143 ; SI-NOT: v_mad_
; Each i64 multiplicand is shifted left by 32 and then arithmetically shifted
; right by 32, which sign-extends its low 32 bits, before the multiply and add.
144 define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
145 %shl.lhs = shl i64 %arg0, 32
146 %trunc.lhs = ashr i64 %shl.lhs, 32
147 %shl.rhs = shl i64 %arg1, 32
148 %trunc.rhs = ashr i64 %shl.rhs, 32
149 %mul = mul i64 %trunc.lhs, %trunc.rhs
150 %add = add i64 %mul, %arg2
151 ret i64 %add
152 }
153
154 ; Example from bug report
155 ; GCN-LABEL: {{^}}mad_i64_i32_unpack_i64ops:
156 ; CI: v_mad_u64_u32 v[0:1], s[6:7], v1, v0, v[0:1]
157 ; SI-NOT: v_mad_u64_u32
; Splits one i64 argument into its high 32 bits (lshr by 32) and low 32 bits
; (and with 4294967295), multiplies the halves with nuw, and adds the original
; argument back in.
158 define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
159 %tmp4 = lshr i64 %arg0, 32
160 %tmp5 = and i64 %arg0, 4294967295
161 %mul = mul nuw i64 %tmp4, %tmp5
162 %mad = add i64 %mul, %arg0
163 ret i64 %mad
164 }
165
166 attributes #0 = { nounwind }
167 attributes #1 = { nounwind readnone speculatable }
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
33
44 ; mul24 and mad24 are affected
55
77 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
88 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
99
10 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
11 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
10 ; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
11 ; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
1212
1313 define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
1414 %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
2525 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2626 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2727
28 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
29 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
30 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
31 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
28 ; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
29 ; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
30 ; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
31 ; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
3232
3333 define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
3434 %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
4040 }
4141
4242 ; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32:
43 ; SI: s_load_dword
44 ; SI: s_load_dword
45 ; SI: s_mul_i32
46 ; SI: buffer_store_dword
43 ; GCN: s_load_dword
44 ; GCN: s_load_dword
45 ; GCN: s_mul_i32
46 ; GCN: buffer_store_dword
4747 define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
4848 %mul = mul i64 %b, %a
4949 %trunc = trunc i64 %mul to i32
5252 }
5353
5454 ; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32:
55 ; SI: s_load_dword
56 ; SI: s_load_dword
57 ; SI: v_mul_lo_i32
58 ; SI: buffer_store_dword
55 ; GCN: s_load_dword
56 ; GCN: s_load_dword
57 ; GCN: v_mul_lo_i32
58 ; GCN: buffer_store_dword
5959 define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
6060 %a = load i64, i64 addrspace(1)* %aptr, align 8
6161 %b = load i64, i64 addrspace(1)* %bptr, align 8
7070 ; FUNC-LABEL: {{^}}mul64_sext_c:
7171 ; EG-DAG: MULLO_INT
7272 ; EG-DAG: MULHI_INT
73 ; SI-DAG: s_mul_i32
74 ; SI-DAG: v_mul_hi_i32
73 ; GCN-DAG: s_mul_i32
74 ; GCN-DAG: v_mul_hi_i32
7575 define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
7676 entry:
7777 %0 = sext i32 %in to i64
8383 ; FUNC-LABEL: {{^}}v_mul64_sext_c:
8484 ; EG-DAG: MULLO_INT
8585 ; EG-DAG: MULHI_INT
86 ; SI-DAG: v_mul_lo_i32
87 ; SI-DAG: v_mul_hi_i32
88 ; SI: s_endpgm
86 ; GCN-DAG: v_mul_lo_i32
87 ; GCN-DAG: v_mul_hi_i32
88 ; GCN: s_endpgm
8989 define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
9090 %val = load i32, i32 addrspace(1)* %in, align 4
9191 %ext = sext i32 %val to i64
9595 }
9696
9797 ; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm:
98 ; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
99 ; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
100 ; SI: s_endpgm
98 ; GCN-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
99 ; GCN-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
100 ; GCN: s_endpgm
101101 define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
102102 %val = load i32, i32 addrspace(1)* %in, align 4
103103 %ext = sext i32 %val to i64
107107 }
108108
109109 ; FUNC-LABEL: {{^}}s_mul_i32:
110 ; SI: s_load_dword [[SRC0:s[0-9]+]],
111 ; SI: s_load_dword [[SRC1:s[0-9]+]],
112 ; SI: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]]
113 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
114 ; SI: buffer_store_dword [[VRESULT]],
115 ; SI: s_endpgm
110 ; GCN: s_load_dword [[SRC0:s[0-9]+]],
111 ; GCN: s_load_dword [[SRC1:s[0-9]+]],
112 ; GCN: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]]
113 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
114 ; GCN: buffer_store_dword [[VRESULT]],
115 ; GCN: s_endpgm
116116 define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
117117 %mul = mul i32 %a, %b
118118 store i32 %mul, i32 addrspace(1)* %out, align 4
120120 }
121121
122122 ; FUNC-LABEL: {{^}}v_mul_i32:
123 ; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
123 ; GCN: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
124124 define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
125125 %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
126126 %a = load i32, i32 addrspace(1)* %in
145145 }
146146
147147 ; FUNC-LABEL: {{^}}v_mul_i64:
148 ; SI: v_mul_lo_i32
148 ; GCN: v_mul_lo_i32
149149 define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
150150 %a = load i64, i64 addrspace(1)* %aptr, align 8
151151 %b = load i64, i64 addrspace(1)* %bptr, align 8
155155 }
156156
157157 ; FUNC-LABEL: {{^}}mul32_in_branch:
158 ; SI: s_mul_i32
158 ; GCN: s_mul_i32
159159 define amdgpu_kernel void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
160160 entry:
161161 %0 = icmp eq i32 %a, 0
176176 }
177177
178178 ; FUNC-LABEL: {{^}}mul64_in_branch:
179 ; SI-DAG: s_mul_i32
180 ; SI-DAG: v_mul_hi_u32
181 ; SI: s_endpgm
179 ; GCN-DAG: s_mul_i32
180 ; GCN-DAG: v_mul_hi_u32
181 ; GCN: s_endpgm
182182 define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
183183 entry:
184184 %0 = icmp eq i64 %a, 0
200200
201201 ; FIXME: Load dwordx4
202202 ; FUNC-LABEL: {{^}}s_mul_i128:
203 ; SI: s_load_dwordx2
204 ; SI: s_load_dwordx2
205 ; SI: s_load_dwordx2
206 ; SI: s_load_dwordx2
203 ; GCN: s_load_dwordx2
204 ; GCN: s_load_dwordx2
205 ; GCN: s_load_dwordx2
206 ; GCN: s_load_dwordx2
207207
208208 ; SI: v_mul_hi_u32
209209 ; SI: v_mul_hi_u32
210210 ; SI: s_mul_i32
211211 ; SI: v_mul_hi_u32
212212 ; SI: s_mul_i32
213
213214 ; SI-DAG: s_mul_i32
214215 ; SI-DAG: v_mul_hi_u32
215216 ; SI-DAG: v_mul_hi_u32
216217 ; SI-DAG: s_mul_i32
217218 ; SI-DAG: s_mul_i32
218219 ; SI-DAG: v_mul_hi_u32
219 ; SI: s_mul_i32
220 ; SI: s_mul_i32
221 ; SI: s_mul_i32
222 ; SI: s_mul_i32
223 ; SI: s_mul_i32
224
225 ; SI: buffer_store_dwordx4
220
221 ; SI: s_mul_i32
222 ; SI: s_mul_i32
223 ; SI: s_mul_i32
224 ; SI: s_mul_i32
225 ; SI: s_mul_i32
226
227
228 ; VI: s_mul_i32
229 ; VI: s_mul_i32
230 ; VI: v_mul_hi_u32
231 ; VI: v_mul_hi_u32
232 ; VI: v_mad_u64_u32
233 ; VI: v_mad_u64_u32
234 ; VI: v_mad_u64_u32
235
236
237 ; GCN: buffer_store_dwordx4
226238 define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
227239 %mul = mul i128 %a, %b
228240 store i128 %mul, i128 addrspace(1)* %out
230242 }
231243
232244 ; FUNC-LABEL: {{^}}v_mul_i128:
233 ; SI: {{buffer|flat}}_load_dwordx4
234 ; SI: {{buffer|flat}}_load_dwordx4
235
236 ; SI-DAG: v_mul_lo_i32
237 ; SI-DAG: v_mul_hi_u32
238 ; SI-DAG: v_mul_hi_u32
239 ; SI-DAG: v_mul_lo_i32
240 ; SI-DAG: v_mul_hi_u32
241 ; SI-DAG: v_mul_hi_u32
242 ; SI-DAG: v_mul_lo_i32
243 ; SI-DAG: v_mul_lo_i32
244 ; SI: v_add_i32_e32
245 ; SI-DAG: v_mul_hi_u32
246 ; SI-DAG: v_mul_lo_i32
247 ; SI-DAG: v_mul_hi_u32
248 ; SI-DAG: v_mul_lo_i32
249 ; SI-DAG: v_mul_lo_i32
250 ; SI-DAG: v_mul_lo_i32
251 ; SI-DAG: v_mul_lo_i32
252 ; SI-DAG: v_mul_lo_i32
253
254 ; SI: {{buffer|flat}}_store_dwordx4
245 ; GCN: {{buffer|flat}}_load_dwordx4
246 ; GCN: {{buffer|flat}}_load_dwordx4
247
248 ; GCN-DAG: v_mul_lo_i32
249 ; GCN-DAG: v_mul_hi_u32
250 ; GCN-DAG: v_mul_hi_u32
251 ; GCN-DAG: v_mul_lo_i32
252 ; GCN-DAG: v_mul_hi_u32
253 ; GCN-DAG: v_mul_hi_u32
254 ; GCN-DAG: v_mul_lo_i32
255 ; GCN-DAG: v_mul_lo_i32
256 ; GCN: v_add_i32_e32
257
258 ; SI-DAG: v_mul_hi_u32
259 ; SI-DAG: v_mul_lo_i32
260 ; SI-DAG: v_mul_hi_u32
261 ; SI-DAG: v_mul_lo_i32
262 ; SI-DAG: v_mul_lo_i32
263 ; SI-DAG: v_mul_lo_i32
264 ; SI-DAG: v_mul_lo_i32
265 ; SI-DAG: v_mul_lo_i32
266
267 ; VI: v_mad_u64_u32
268 ; VI: v_mad_u64_u32
269 ; VI: v_mad_u64_u32
270
271 ; GCN: {{buffer|flat}}_store_dwordx4
255272 define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
256273 %tid = call i32 @llvm.r600.read.tidig.x()
257274 %gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid