llvm.org GIT mirror llvm / b508009
AMDGPU: Add 24-bit mul intrinsics Insert these during codegenprepare. This works around a DAG issue where generic combines eliminate the and asserting the high bits are zero, which then exposes an unknown read source to the mul combine. It isn't worth the hassle of trying to insert an AssertZext or something to try to deal with it. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@366094 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault a month ago
9 changed file(s) with 751 addition(s) and 11 deletion(s). Raw diff Collapse all Expand all
13491349 [IntrNoMem, IntrSpeculatable]
13501350 >;
13511351
// 24-bit signed multiply. Operands are treated as sign-extended from
// 24 bits; selects to v_mul_i32_i24 (see the llc test for this intrinsic).
def int_amdgcn_mul_i24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// 24-bit unsigned multiply. Operands are treated as zero-extended from
// 24 bits; selects to v_mul_u32_u24.
def int_amdgcn_mul_u24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;
1361
13521362 // llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
13531363 //
13541364 // bar_val is the total number of waves that will wait on this
6060 AssumptionCache *AC = nullptr;
6161 LegacyDivergenceAnalysis *DA = nullptr;
6262 Module *Mod = nullptr;
63 const DataLayout *DL = nullptr;
6364 bool HasUnsafeFPMath = false;
6465
6566 /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
132133 /// \returns True.
133134 bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
134135
136
137 unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
138 unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
139 bool isI24(Value *V, unsigned ScalarSize) const;
140 bool isU24(Value *V, unsigned ScalarSize) const;
141
/// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
/// SelectionDAG has an issue where an and asserting the bits are known to be
/// zero is eliminated by generic combines, which then exposes an unknown read
/// source to the mul combine and defeats mul24 formation.
bool replaceMulWithMul24(BinaryOperator &I) const;
145
135146 /// Expands 24 bit div or rem.
136147 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
137148 Value *Num, Value *Den,
391402 return true;
392403 }
393404
405 unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
406 unsigned ScalarSize) const {
407 KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
408 return ScalarSize - Known.countMinLeadingZeros();
409 }
410
411 unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
412 unsigned ScalarSize) const {
413 // In order for this to be a signed 24-bit value, bit 23, must
414 // be a sign bit.
415 return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
416 }
417
418 bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
419 return ScalarSize >= 24 && // Types less than 24-bit should be treated
420 // as unsigned 24-bit values.
421 numBitsSigned(V, ScalarSize) < 24;
422 }
423
424 bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
425 return numBitsUnsigned(V, ScalarSize) <= 24;
426 }
427
428 static void extractValues(IRBuilder<> &Builder,
429 SmallVectorImpl &Values, Value *V) {
430 VectorType *VT = dyn_cast(V->getType());
431 if (!VT) {
432 Values.push_back(V);
433 return;
434 }
435
436 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
437 Values.push_back(Builder.CreateExtractElement(V, I));
438 }
439
440 static Value *insertValues(IRBuilder<> &Builder,
441 Type *Ty,
442 SmallVectorImpl &Values) {
443 if (Values.size() == 1)
444 return Values[0];
445
446 Value *NewVal = UndefValue::get(Ty);
447 for (int I = 0, E = Values.size(); I != E; ++I)
448 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
449
450 return NewVal;
451 }
452
453 bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
454 if (I.getOpcode() != Instruction::Mul)
455 return false;
456
457 Type *Ty = I.getType();
458 unsigned Size = Ty->getScalarSizeInBits();
459 if (Size <= 16 && ST->has16BitInsts())
460 return false;
461
462 // Prefer scalar if this could be s_mul_i32
463 if (DA->isUniform(&I))
464 return false;
465
466 Value *LHS = I.getOperand(0);
467 Value *RHS = I.getOperand(1);
468 IRBuilder<> Builder(&I);
469 Builder.SetCurrentDebugLocation(I.getDebugLoc());
470
471 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
472
473 // TODO: Should this try to match mulhi24?
474 if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
475 IntrID = Intrinsic::amdgcn_mul_u24;
476 } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
477 IntrID = Intrinsic::amdgcn_mul_i24;
478 } else
479 return false;
480
481 SmallVector LHSVals;
482 SmallVector RHSVals;
483 SmallVector ResultVals;
484 extractValues(Builder, LHSVals, LHS);
485 extractValues(Builder, RHSVals, RHS);
486
487
488 IntegerType *I32Ty = Builder.getInt32Ty();
489 FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
490 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
491 Value *LHS, *RHS;
492 if (IntrID == Intrinsic::amdgcn_mul_u24) {
493 LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
494 RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
495 } else {
496 LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
497 RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
498 }
499
500 Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});
501
502 if (IntrID == Intrinsic::amdgcn_mul_u24) {
503 ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
504 LHSVals[I]->getType()));
505 } else {
506 ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
507 LHSVals[I]->getType()));
508 }
509 }
510
511 I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
512 I.eraseFromParent();
513
514 return true;
515 }
516
394517 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
395518 const ConstantFP *CNum = dyn_cast(Num);
396519 if (!CNum)
753876 bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
754877 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
755878 DA->isUniform(&I) && promoteUniformOpToI32(I))
879 return true;
880
881 if (replaceMulWithMul24(I))
756882 return true;
757883
758884 bool Changed = false;
8811007
8821008 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
8831009 Mod = &M;
1010 DL = &Mod->getDataLayout();
8841011 return false;
8851012 }
8861013
58345834
58355835 case Intrinsic::amdgcn_cos:
58365836 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
5837
5838 case Intrinsic::amdgcn_mul_u24:
5839 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
5840 case Intrinsic::amdgcn_mul_i24:
5841 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
58375842
58385843 case Intrinsic::amdgcn_log_clamp: {
58395844 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
0 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
1 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare %s | FileCheck -check-prefix=SI %s
2 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=fiji -amdgpu-codegenprepare %s | FileCheck -check-prefix=VI %s
3
; SI has no 16-bit instructions, so the i16 mul is converted to a 24-bit
; multiply intrinsic through i32 zext/trunc; VI keeps the native 16-bit mul.
define i16 @mul_i16(i16 %lhs, i16 %rhs) {
; SI-LABEL: @mul_i16(
; SI-NEXT: [[TMP1:%.*]] = zext i16 [[LHS:%.*]] to i32
; SI-NEXT: [[TMP2:%.*]] = zext i16 [[RHS:%.*]] to i32
; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
; SI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
; SI-NEXT: ret i16 [[TMP4]]
;
; VI-LABEL: @mul_i16(
; VI-NEXT: [[MUL:%.*]] = mul i16 [[LHS:%.*]], [[RHS:%.*]]
; VI-NEXT: ret i16 [[MUL]]
;
  %mul = mul i16 %lhs, %rhs
  ret i16 %mul
}
19
; Both operands are sign-extended from 24 bits, so the mul becomes
; llvm.amdgcn.mul.i24 on all targets.
;
; NOTE(review): fixed a copy-paste bug — %rhs24 was computed as
; "ashr i32 %lhs, 8", leaving %lshr.rhs dead and deriving both operands
; from %lhs. CHECK lines updated to match.
define i32 @smul24_i32(i32 %lhs, i32 %rhs) {
; SI-LABEL: @smul24_i32(
; SI-NEXT: [[SHL_LHS:%.*]] = shl i32 [[LHS:%.*]], 8
; SI-NEXT: [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 8
; SI-NEXT: [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 8
; SI-NEXT: [[RHS24:%.*]] = ashr i32 [[LSHR_RHS]], 8
; SI-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[LHS24]], i32 [[RHS24]])
; SI-NEXT: ret i32 [[TMP1]]
;
; VI-LABEL: @smul24_i32(
; VI-NEXT: [[SHL_LHS:%.*]] = shl i32 [[LHS:%.*]], 8
; VI-NEXT: [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 8
; VI-NEXT: [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 8
; VI-NEXT: [[RHS24:%.*]] = ashr i32 [[LSHR_RHS]], 8
; VI-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[LHS24]], i32 [[RHS24]])
; VI-NEXT: ret i32 [[TMP1]]
;
  %shl.lhs = shl i32 %lhs, 8
  %lhs24 = ashr i32 %shl.lhs, 8
  %lshr.rhs = shl i32 %rhs, 8
  %rhs24 = ashr i32 %lshr.rhs, 8
  %mul = mul i32 %lhs24, %rhs24
  ret i32 %mul
}
44
; Vector case: the pass scalarizes, calls mul.i24 per element, and
; reassembles the vector.
;
; NOTE(review): restored the vector splat shift amounts (<i32 8, i32 8>)
; that were dropped when this diff was scraped (the IR is invalid without
; them), and fixed the %rhs24 copy-paste bug (was "ashr ... %lhs").
define <2 x i32> @smul24_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; SI-LABEL: @smul24_v2i32(
; SI-NEXT: [[SHL_LHS:%.*]] = shl <2 x i32> [[LHS:%.*]], <i32 8, i32 8>
; SI-NEXT: [[LHS24:%.*]] = ashr <2 x i32> [[SHL_LHS]], <i32 8, i32 8>
; SI-NEXT: [[LSHR_RHS:%.*]] = shl <2 x i32> [[RHS:%.*]], <i32 8, i32 8>
; SI-NEXT: [[RHS24:%.*]] = ashr <2 x i32> [[LSHR_RHS]], <i32 8, i32 8>
; SI-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[LHS24]], i64 0
; SI-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
; SI-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
; SI-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
; SI-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP3]])
; SI-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP2]], i32 [[TMP4]])
; SI-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
; SI-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
; SI-NEXT: ret <2 x i32> [[TMP8]]
;
; VI-LABEL: @smul24_v2i32(
; VI-NEXT: [[SHL_LHS:%.*]] = shl <2 x i32> [[LHS:%.*]], <i32 8, i32 8>
; VI-NEXT: [[LHS24:%.*]] = ashr <2 x i32> [[SHL_LHS]], <i32 8, i32 8>
; VI-NEXT: [[LSHR_RHS:%.*]] = shl <2 x i32> [[RHS:%.*]], <i32 8, i32 8>
; VI-NEXT: [[RHS24:%.*]] = ashr <2 x i32> [[LSHR_RHS]], <i32 8, i32 8>
; VI-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[LHS24]], i64 0
; VI-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
; VI-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
; VI-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
; VI-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP3]])
; VI-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP2]], i32 [[TMP4]])
; VI-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
; VI-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
; VI-NEXT: ret <2 x i32> [[TMP8]]
;
  %shl.lhs = shl <2 x i32> %lhs, <i32 8, i32 8>
  %lhs24 = ashr <2 x i32> %shl.lhs, <i32 8, i32 8>
  %lshr.rhs = shl <2 x i32> %rhs, <i32 8, i32 8>
  %rhs24 = ashr <2 x i32> %lshr.rhs, <i32 8, i32 8>
  %mul = mul <2 x i32> %lhs24, %rhs24
  ret <2 x i32> %mul
}
83
; Operands masked to 24 bits: an unsigned 24-bit multiply intrinsic is
; formed on both targets.
define i32 @umul24_i32(i32 %lhs, i32 %rhs) {
; SI-LABEL: @umul24_i32(
; SI-NEXT: [[LHS24:%.*]] = and i32 [[LHS:%.*]], 16777215
; SI-NEXT: [[RHS24:%.*]] = and i32 [[RHS:%.*]], 16777215
; SI-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[LHS24]], i32 [[RHS24]])
; SI-NEXT: ret i32 [[TMP1]]
;
; VI-LABEL: @umul24_i32(
; VI-NEXT: [[LHS24:%.*]] = and i32 [[LHS:%.*]], 16777215
; VI-NEXT: [[RHS24:%.*]] = and i32 [[RHS:%.*]], 16777215
; VI-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[LHS24]], i32 [[RHS24]])
; VI-NEXT: ret i32 [[TMP1]]
;
  %lhs24 = and i32 %lhs, 16777215
  %rhs24 = and i32 %rhs, 16777215
  %mul = mul i32 %lhs24, %rhs24
  ret i32 %mul
}
102
; Unsigned vector case: scalarized into per-element mul.u24 calls.
;
; NOTE(review): restored the vector splat mask constants
; (<i32 16777215, i32 16777215>) dropped by the scrape — the IR is invalid
; without them.
define <2 x i32> @umul24_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; SI-LABEL: @umul24_v2i32(
; SI-NEXT: [[LHS24:%.*]] = and <2 x i32> [[LHS:%.*]], <i32 16777215, i32 16777215>
; SI-NEXT: [[RHS24:%.*]] = and <2 x i32> [[RHS:%.*]], <i32 16777215, i32 16777215>
; SI-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[LHS24]], i64 0
; SI-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
; SI-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
; SI-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
; SI-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP3]])
; SI-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP2]], i32 [[TMP4]])
; SI-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
; SI-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
; SI-NEXT: ret <2 x i32> [[TMP8]]
;
; VI-LABEL: @umul24_v2i32(
; VI-NEXT: [[LHS24:%.*]] = and <2 x i32> [[LHS:%.*]], <i32 16777215, i32 16777215>
; VI-NEXT: [[RHS24:%.*]] = and <2 x i32> [[RHS:%.*]], <i32 16777215, i32 16777215>
; VI-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[LHS24]], i64 0
; VI-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
; VI-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
; VI-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
; VI-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP3]])
; VI-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP2]], i32 [[TMP4]])
; VI-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
; VI-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
; VI-NEXT: ret <2 x i32> [[TMP8]]
;
  %lhs24 = and <2 x i32> %lhs, <i32 16777215, i32 16777215>
  %rhs24 = and <2 x i32> %rhs, <i32 16777215, i32 16777215>
  %mul = mul <2 x i32> %lhs24, %rhs24
  ret <2 x i32> %mul
}
135
; i64 operands sign-extended from 24 bits: truncated to i32, multiplied with
; mul.i24, and the result sign-extended back to i64.
;
; NOTE(review): fixed the %rhs24 copy-paste bug (was "ashr i64 %lhs, 40",
; leaving %lshr.rhs dead). CHECK lines updated to match.
define i64 @smul24_i64(i64 %lhs, i64 %rhs) {
; SI-LABEL: @smul24_i64(
; SI-NEXT: [[SHL_LHS:%.*]] = shl i64 [[LHS:%.*]], 40
; SI-NEXT: [[LHS24:%.*]] = ashr i64 [[SHL_LHS]], 40
; SI-NEXT: [[LSHR_RHS:%.*]] = shl i64 [[RHS:%.*]], 40
; SI-NEXT: [[RHS24:%.*]] = ashr i64 [[LSHR_RHS]], 40
; SI-NEXT: [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
; SI-NEXT: [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
; SI-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
; SI-NEXT: ret i64 [[TMP4]]
;
; VI-LABEL: @smul24_i64(
; VI-NEXT: [[SHL_LHS:%.*]] = shl i64 [[LHS:%.*]], 40
; VI-NEXT: [[LHS24:%.*]] = ashr i64 [[SHL_LHS]], 40
; VI-NEXT: [[LSHR_RHS:%.*]] = shl i64 [[RHS:%.*]], 40
; VI-NEXT: [[RHS24:%.*]] = ashr i64 [[LSHR_RHS]], 40
; VI-NEXT: [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
; VI-NEXT: [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
; VI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
; VI-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
; VI-NEXT: ret i64 [[TMP4]]
;
  %shl.lhs = shl i64 %lhs, 40
  %lhs24 = ashr i64 %shl.lhs, 40
  %lshr.rhs = shl i64 %rhs, 40
  %rhs24 = ashr i64 %lshr.rhs, 40
  %mul = mul i64 %lhs24, %rhs24
  ret i64 %mul
}
166
167 define i64 @umul24_i64(i64 %lhs, i64 %rhs) {
168 ; SI-LABEL: @umul24_i64(
169 ; SI-NEXT: [[LHS24:%.*]] = and i64 [[LHS:%.*]], 16777215
170 ; SI-NEXT: [[RHS24:%.*]] = and i64 [[RHS:%.*]], 16777215
171 ; SI-NEXT: [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
172 ; SI-NEXT: [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
173 ; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
174 ; SI-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
175 ; SI-NEXT: ret i64 [[TMP4]]
176 ;
177 ; VI-LABEL: @umul24_i64(
178 ; VI-NEXT: [[LHS24:%.*]] = and i64 [[LHS:%.*]], 16777215
179 ; VI-NEXT: [[RHS24:%.*]] = and i64 [[RHS:%.*]], 16777215
180 ; VI-NEXT: [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
181 ; VI-NEXT: [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
182 ; VI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
183 ; VI-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
184 ; VI-NEXT: ret i64 [[TMP4]]
185 ;
186 %lhs24 = and i64 %lhs, 16777215
187 %rhs24 = and i64 %rhs, 16777215
188 %mul = mul i64 %lhs24, %rhs24
189 ret i64 %mul
190 }
191
192 define i31 @smul24_i31(i31 %lhs, i31 %rhs) {
193 ; SI-LABEL: @smul24_i31(
194 ; SI-NEXT: [[SHL_LHS:%.*]] = shl i31 [[LHS:%.*]], 7
195 ; SI-NEXT: [[LHS24:%.*]] = ashr i31 [[SHL_LHS]], 7
196 ; SI-NEXT: [[LSHR_RHS:%.*]] = shl i31 [[RHS:%.*]], 7
197 ; SI-NEXT: [[RHS24:%.*]] = ashr i31 [[LHS]], 7
198 ; SI-NEXT: [[TMP1:%.*]] = sext i31 [[LHS24]] to i32
199 ; SI-NEXT: [[TMP2:%.*]] = sext i31 [[RHS24]] to i32
200 ; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
201 ; SI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i31
202 ; SI-NEXT: ret i31 [[TMP4]]
203 ;
204 ; VI-LABEL: @smul24_i31(
205 ; VI-NEXT: [[SHL_LHS:%.*]] = shl i31 [[LHS:%.*]], 7
206 ; VI-NEXT: [[LHS24:%.*]] = ashr i31 [[SHL_LHS]], 7
207 ; VI-NEXT: [[LSHR_RHS:%.*]] = shl i31 [[RHS:%.*]], 7
208 ; VI-NEXT: [[RHS24:%.*]] = ashr i31 [[LHS]], 7
209 ; VI-NEXT: [[TMP1:%.*]] = sext i31 [[LHS24]] to i32
210 ; VI-NEXT: [[TMP2:%.*]] = sext i31 [[RHS24]] to i32
211 ; VI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
212 ; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i31
213 ; VI-NEXT: ret i31 [[TMP4]]
214 ;
215 %shl.lhs = shl i31 %lhs, 7
216 %lhs24 = ashr i31 %shl.lhs, 7
217 %lshr.rhs = shl i31 %rhs, 7
218 %rhs24 = ashr i31 %lhs, 7
219 %mul = mul i31 %lhs24, %rhs24
220 ret i31 %mul
221 }
222
223 define i31 @umul24_i31(i31 %lhs, i31 %rhs) {
224 ; SI-LABEL: @umul24_i31(
225 ; SI-NEXT: [[LHS24:%.*]] = and i31 [[LHS:%.*]], 16777215
226 ; SI-NEXT: [[RHS24:%.*]] = and i31 [[RHS:%.*]], 16777215
227 ; SI-NEXT: [[TMP1:%.*]] = zext i31 [[LHS24]] to i32
228 ; SI-NEXT: [[TMP2:%.*]] = zext i31 [[RHS24]] to i32
229 ; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
230 ; SI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i31
231 ; SI-NEXT: ret i31 [[TMP4]]
232 ;
233 ; VI-LABEL: @umul24_i31(
234 ; VI-NEXT: [[LHS24:%.*]] = and i31 [[LHS:%.*]], 16777215
235 ; VI-NEXT: [[RHS24:%.*]] = and i31 [[RHS:%.*]], 16777215
236 ; VI-NEXT: [[TMP1:%.*]] = zext i31 [[LHS24]] to i32
237 ; VI-NEXT: [[TMP2:%.*]] = zext i31 [[RHS24]] to i32
238 ; VI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
239 ; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i31
240 ; VI-NEXT: ret i31 [[TMP4]]
241 ;
242 %lhs24 = and i31 %lhs, 16777215
243 %rhs24 = and i31 %rhs, 16777215
244 %mul = mul i31 %lhs24, %rhs24
245 ret i31 %mul
246 }
247
248 define <2 x i31> @umul24_v2i31(<2 x i31> %lhs, <2 x i31> %rhs) {
249 ; SI-LABEL: @umul24_v2i31(
250 ; SI-NEXT: [[LHS24:%.*]] = and <2 x i31> [[LHS:%.*]],
251 ; SI-NEXT: [[RHS24:%.*]] = and <2 x i31> [[RHS:%.*]],
252 ; SI-NEXT: [[TMP1:%.*]] = extractelement <2 x i31> [[LHS24]], i64 0
253 ; SI-NEXT: [[TMP2:%.*]] = extractelement <2 x i31> [[LHS24]], i64 1
254 ; SI-NEXT: [[TMP3:%.*]] = extractelement <2 x i31> [[RHS24]], i64 0
255 ; SI-NEXT: [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
256 ; SI-NEXT: [[TMP5:%.*]] = zext i31 [[TMP1]] to i32
257 ; SI-NEXT: [[TMP6:%.*]] = zext i31 [[TMP3]] to i32
258 ; SI-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP5]], i32 [[TMP6]])
259 ; SI-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
260 ; SI-NEXT: [[TMP9:%.*]] = zext i31 [[TMP2]] to i32
261 ; SI-NEXT: [[TMP10:%.*]] = zext i31 [[TMP4]] to i32
262 ; SI-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP9]], i32 [[TMP10]])
263 ; SI-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
264 ; SI-NEXT: [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
265 ; SI-NEXT: [[TMP14:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
266 ; SI-NEXT: ret <2 x i31> [[TMP14]]
267 ;
268 ; VI-LABEL: @umul24_v2i31(
269 ; VI-NEXT: [[LHS24:%.*]] = and <2 x i31> [[LHS:%.*]],
270 ; VI-NEXT: [[RHS24:%.*]] = and <2 x i31> [[RHS:%.*]],
271 ; VI-NEXT: [[TMP1:%.*]] = extractelement <2 x i31> [[LHS24]], i64 0
272 ; VI-NEXT: [[TMP2:%.*]] = extractelement <2 x i31> [[LHS24]], i64 1
273 ; VI-NEXT: [[TMP3:%.*]] = extractelement <2 x i31> [[RHS24]], i64 0
274 ; VI-NEXT: [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
275 ; VI-NEXT: [[TMP5:%.*]] = zext i31 [[TMP1]] to i32
276 ; VI-NEXT: [[TMP6:%.*]] = zext i31 [[TMP3]] to i32
277 ; VI-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP5]], i32 [[TMP6]])
278 ; VI-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
279 ; VI-NEXT: [[TMP9:%.*]] = zext i31 [[TMP2]] to i32
280 ; VI-NEXT: [[TMP10:%.*]] = zext i31 [[TMP4]] to i32
281 ; VI-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP9]], i32 [[TMP10]])
282 ; VI-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
283 ; VI-NEXT: [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
284 ; VI-NEXT: [[TMP14:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
285 ; VI-NEXT: ret <2 x i31> [[TMP14]]
286 ;
287 %lhs24 = and <2 x i31> %lhs,
288 %rhs24 = and <2 x i31> %rhs,
289 %mul = mul <2 x i31> %lhs24, %rhs24
290 ret <2 x i31> %mul
291 }
292
293 define <2 x i31> @smul24_v2i31(<2 x i31> %lhs, <2 x i31> %rhs) {
294 ; SI-LABEL: @smul24_v2i31(
295 ; SI-NEXT: [[SHL_LHS:%.*]] = shl <2 x i31> [[LHS:%.*]],
296 ; SI-NEXT: [[LHS24:%.*]] = ashr <2 x i31> [[SHL_LHS]],
297 ; SI-NEXT: [[LSHR_RHS:%.*]] = shl <2 x i31> [[RHS:%.*]],
298 ; SI-NEXT: [[RHS24:%.*]] = ashr <2 x i31> [[LHS]],
299 ; SI-NEXT: [[TMP1:%.*]] = extractelement <2 x i31> [[LHS24]], i64 0
300 ; SI-NEXT: [[TMP2:%.*]] = extractelement <2 x i31> [[LHS24]], i64 1
301 ; SI-NEXT: [[TMP3:%.*]] = extractelement <2 x i31> [[RHS24]], i64 0
302 ; SI-NEXT: [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
303 ; SI-NEXT: [[TMP5:%.*]] = sext i31 [[TMP1]] to i32
304 ; SI-NEXT: [[TMP6:%.*]] = sext i31 [[TMP3]] to i32
305 ; SI-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
306 ; SI-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
307 ; SI-NEXT: [[TMP9:%.*]] = sext i31 [[TMP2]] to i32
308 ; SI-NEXT: [[TMP10:%.*]] = sext i31 [[TMP4]] to i32
309 ; SI-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
310 ; SI-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
311 ; SI-NEXT: [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
312 ; SI-NEXT: [[TMP14:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
313 ; SI-NEXT: ret <2 x i31> [[TMP14]]
314 ;
315 ; VI-LABEL: @smul24_v2i31(
316 ; VI-NEXT: [[SHL_LHS:%.*]] = shl <2 x i31> [[LHS:%.*]],
317 ; VI-NEXT: [[LHS24:%.*]] = ashr <2 x i31> [[SHL_LHS]],
318 ; VI-NEXT: [[LSHR_RHS:%.*]] = shl <2 x i31> [[RHS:%.*]],
319 ; VI-NEXT: [[RHS24:%.*]] = ashr <2 x i31> [[LHS]],
320 ; VI-NEXT: [[TMP1:%.*]] = extractelement <2 x i31> [[LHS24]], i64 0
321 ; VI-NEXT: [[TMP2:%.*]] = extractelement <2 x i31> [[LHS24]], i64 1
322 ; VI-NEXT: [[TMP3:%.*]] = extractelement <2 x i31> [[RHS24]], i64 0
323 ; VI-NEXT: [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
324 ; VI-NEXT: [[TMP5:%.*]] = sext i31 [[TMP1]] to i32
325 ; VI-NEXT: [[TMP6:%.*]] = sext i31 [[TMP3]] to i32
326 ; VI-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
327 ; VI-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
328 ; VI-NEXT: [[TMP9:%.*]] = sext i31 [[TMP2]] to i32
329 ; VI-NEXT: [[TMP10:%.*]] = sext i31 [[TMP4]] to i32
330 ; VI-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
331 ; VI-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
332 ; VI-NEXT: [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
333 ; VI-NEXT: [[TMP14:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
334 ; VI-NEXT: ret <2 x i31> [[TMP14]]
335 ;
336 %shl.lhs = shl <2 x i31> %lhs,
337 %lhs24 = ashr <2 x i31> %shl.lhs,
338 %lshr.rhs = shl <2 x i31> %rhs,
339 %rhs24 = ashr <2 x i31> %lhs,
340 %mul = mul <2 x i31> %lhs24, %rhs24
341 ret <2 x i31> %mul
342 }
343
344 define i33 @smul24_i33(i33 %lhs, i33 %rhs) {
345 ; SI-LABEL: @smul24_i33(
346 ; SI-NEXT: [[SHL_LHS:%.*]] = shl i33 [[LHS:%.*]], 9
347 ; SI-NEXT: [[LHS24:%.*]] = ashr i33 [[SHL_LHS]], 9
348 ; SI-NEXT: [[LSHR_RHS:%.*]] = shl i33 [[RHS:%.*]], 9
349 ; SI-NEXT: [[RHS24:%.*]] = ashr i33 [[LHS]], 9
350 ; SI-NEXT: [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
351 ; SI-NEXT: [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
352 ; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
353 ; SI-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i33
354 ; SI-NEXT: ret i33 [[TMP4]]
355 ;
356 ; VI-LABEL: @smul24_i33(
357 ; VI-NEXT: [[SHL_LHS:%.*]] = shl i33 [[LHS:%.*]], 9
358 ; VI-NEXT: [[LHS24:%.*]] = ashr i33 [[SHL_LHS]], 9
359 ; VI-NEXT: [[LSHR_RHS:%.*]] = shl i33 [[RHS:%.*]], 9
360 ; VI-NEXT: [[RHS24:%.*]] = ashr i33 [[LHS]], 9
361 ; VI-NEXT: [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
362 ; VI-NEXT: [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
363 ; VI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
364 ; VI-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i33
365 ; VI-NEXT: ret i33 [[TMP4]]
366 ;
367 %shl.lhs = shl i33 %lhs, 9
368 %lhs24 = ashr i33 %shl.lhs, 9
369 %lshr.rhs = shl i33 %rhs, 9
370 %rhs24 = ashr i33 %lhs, 9
371 %mul = mul i33 %lhs24, %rhs24
372 ret i33 %mul
373 }
374
375 define i33 @umul24_i33(i33 %lhs, i33 %rhs) {
376 ; SI-LABEL: @umul24_i33(
377 ; SI-NEXT: [[LHS24:%.*]] = and i33 [[LHS:%.*]], 16777215
378 ; SI-NEXT: [[RHS24:%.*]] = and i33 [[RHS:%.*]], 16777215
379 ; SI-NEXT: [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
380 ; SI-NEXT: [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
381 ; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
382 ; SI-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i33
383 ; SI-NEXT: ret i33 [[TMP4]]
384 ;
385 ; VI-LABEL: @umul24_i33(
386 ; VI-NEXT: [[LHS24:%.*]] = and i33 [[LHS:%.*]], 16777215
387 ; VI-NEXT: [[RHS24:%.*]] = and i33 [[RHS:%.*]], 16777215
388 ; VI-NEXT: [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
389 ; VI-NEXT: [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
390 ; VI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
391 ; VI-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i33
392 ; VI-NEXT: ret i33 [[TMP4]]
393 ;
394 %lhs24 = and i33 %lhs, 16777215
395 %rhs24 = and i33 %rhs, 16777215
396 %mul = mul i33 %lhs24, %rhs24
397 ret i33 %mul
398 }
399
400 define i32 @smul25_i32(i32 %lhs, i32 %rhs) {
401 ; SI-LABEL: @smul25_i32(
402 ; SI-NEXT: [[SHL_LHS:%.*]] = shl i32 [[LHS:%.*]], 7
403 ; SI-NEXT: [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 7
404 ; SI-NEXT: [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 7
405 ; SI-NEXT: [[RHS24:%.*]] = ashr i32 [[LHS]], 7
406 ; SI-NEXT: [[MUL:%.*]] = mul i32 [[LHS24]], [[RHS24]]
407 ; SI-NEXT: ret i32 [[MUL]]
408 ;
409 ; VI-LABEL: @smul25_i32(
410 ; VI-NEXT: [[SHL_LHS:%.*]] = shl i32 [[LHS:%.*]], 7
411 ; VI-NEXT: [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 7
412 ; VI-NEXT: [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 7
413 ; VI-NEXT: [[RHS24:%.*]] = ashr i32 [[LHS]], 7
414 ; VI-NEXT: [[MUL:%.*]] = mul i32 [[LHS24]], [[RHS24]]
415 ; VI-NEXT: ret i32 [[MUL]]
416 ;
417 %shl.lhs = shl i32 %lhs, 7
418 %lhs24 = ashr i32 %shl.lhs, 7
419 %lshr.rhs = shl i32 %rhs, 7
420 %rhs24 = ashr i32 %lhs, 7
421 %mul = mul i32 %lhs24, %rhs24
422 ret i32 %mul
423 }
424
425 define i32 @umul25_i32(i32 %lhs, i32 %rhs) {
426 ; SI-LABEL: @umul25_i32(
427 ; SI-NEXT: [[LHS24:%.*]] = and i32 [[LHS:%.*]], 33554431
428 ; SI-NEXT: [[RHS24:%.*]] = and i32 [[RHS:%.*]], 33554431
429 ; SI-NEXT: [[MUL:%.*]] = mul i32 [[LHS24]], [[RHS24]]
430 ; SI-NEXT: ret i32 [[MUL]]
431 ;
432 ; VI-LABEL: @umul25_i32(
433 ; VI-NEXT: [[LHS24:%.*]] = and i32 [[LHS:%.*]], 33554431
434 ; VI-NEXT: [[RHS24:%.*]] = and i32 [[RHS:%.*]], 33554431
435 ; VI-NEXT: [[MUL:%.*]] = mul i32 [[LHS24]], [[RHS24]]
436 ; VI-NEXT: ret i32 [[MUL]]
437 ;
438 %lhs24 = and i32 %lhs, 33554431
439 %rhs24 = and i32 %rhs, 33554431
440 %mul = mul i32 %lhs24, %rhs24
441 ret i32 %mul
442 }
443
444 define <2 x i33> @smul24_v2i33(<2 x i33> %lhs, <2 x i33> %rhs) {
445 ; SI-LABEL: @smul24_v2i33(
446 ; SI-NEXT: [[SHL_LHS:%.*]] = shl <2 x i33> [[LHS:%.*]],
447 ; SI-NEXT: [[LHS24:%.*]] = ashr <2 x i33> [[SHL_LHS]],
448 ; SI-NEXT: [[LSHR_RHS:%.*]] = shl <2 x i33> [[RHS:%.*]],
449 ; SI-NEXT: [[RHS24:%.*]] = ashr <2 x i33> [[LHS]],
450 ; SI-NEXT: [[TMP1:%.*]] = extractelement <2 x i33> [[LHS24]], i64 0
451 ; SI-NEXT: [[TMP2:%.*]] = extractelement <2 x i33> [[LHS24]], i64 1
452 ; SI-NEXT: [[TMP3:%.*]] = extractelement <2 x i33> [[RHS24]], i64 0
453 ; SI-NEXT: [[TMP4:%.*]] = extractelement <2 x i33> [[RHS24]], i64 1
454 ; SI-NEXT: [[TMP5:%.*]] = trunc i33 [[TMP1]] to i32
455 ; SI-NEXT: [[TMP6:%.*]] = trunc i33 [[TMP3]] to i32
456 ; SI-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
457 ; SI-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i33
458 ; SI-NEXT: [[TMP9:%.*]] = trunc i33 [[TMP2]] to i32
459 ; SI-NEXT: [[TMP10:%.*]] = trunc i33 [[TMP4]] to i32
460 ; SI-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
461 ; SI-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i33
462 ; SI-NEXT: [[TMP13:%.*]] = insertelement <2 x i33> undef, i33 [[TMP8]], i64 0
463 ; SI-NEXT: [[TMP14:%.*]] = insertelement <2 x i33> [[TMP13]], i33 [[TMP12]], i64 1
464 ; SI-NEXT: ret <2 x i33> [[TMP14]]
465 ;
466 ; VI-LABEL: @smul24_v2i33(
467 ; VI-NEXT: [[SHL_LHS:%.*]] = shl <2 x i33> [[LHS:%.*]],
468 ; VI-NEXT: [[LHS24:%.*]] = ashr <2 x i33> [[SHL_LHS]],
469 ; VI-NEXT: [[LSHR_RHS:%.*]] = shl <2 x i33> [[RHS:%.*]],
470 ; VI-NEXT: [[RHS24:%.*]] = ashr <2 x i33> [[LHS]],
471 ; VI-NEXT: [[TMP1:%.*]] = extractelement <2 x i33> [[LHS24]], i64 0
472 ; VI-NEXT: [[TMP2:%.*]] = extractelement <2 x i33> [[LHS24]], i64 1
473 ; VI-NEXT: [[TMP3:%.*]] = extractelement <2 x i33> [[RHS24]], i64 0
474 ; VI-NEXT: [[TMP4:%.*]] = extractelement <2 x i33> [[RHS24]], i64 1
475 ; VI-NEXT: [[TMP5:%.*]] = trunc i33 [[TMP1]] to i32
476 ; VI-NEXT: [[TMP6:%.*]] = trunc i33 [[TMP3]] to i32
477 ; VI-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
478 ; VI-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i33
479 ; VI-NEXT: [[TMP9:%.*]] = trunc i33 [[TMP2]] to i32
480 ; VI-NEXT: [[TMP10:%.*]] = trunc i33 [[TMP4]] to i32
481 ; VI-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
482 ; VI-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i33
483 ; VI-NEXT: [[TMP13:%.*]] = insertelement <2 x i33> undef, i33 [[TMP8]], i64 0
484 ; VI-NEXT: [[TMP14:%.*]] = insertelement <2 x i33> [[TMP13]], i33 [[TMP12]], i64 1
485 ; VI-NEXT: ret <2 x i33> [[TMP14]]
486 ;
487 %shl.lhs = shl <2 x i33> %lhs,
488 %lhs24 = ashr <2 x i33> %shl.lhs,
489 %lshr.rhs = shl <2 x i33> %rhs,
490 %rhs24 = ashr <2 x i33> %lhs,
491 %mul = mul <2 x i33> %lhs24, %rhs24
492 ret <2 x i33> %mul
493 }
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Check that the llvm.amdgcn.mul.i24 intrinsic selects directly to the
; hardware 24-bit signed multiply.
; GCN-LABEL: {{^}}test_mul_i24:
; GCN: v_mul_i32_i24
define amdgpu_kernel void @test_mul_i24(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #1 {
  %val = call i32 @llvm.amdgcn.mul.i24(i32 %src1, i32 %src2) #0
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.mul.i24(i32, i32) #0

attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind }
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Check that the llvm.amdgcn.mul.u24 intrinsic selects directly to the
; hardware 24-bit unsigned multiply.
; GCN-LABEL: {{^}}test_mul_u24:
; GCN: v_mul_u32_u24
define amdgpu_kernel void @test_mul_u24(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #1 {
  %val = call i32 @llvm.amdgcn.mul.u24(i32 %src1, i32 %src2) #0
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.mul.u24(i32, i32) #0

attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind }
232232 store i64 %mad_ext, i64 addrspace(1)* %out
233233 ret void
234234 }
235
236 ; The ands are asserting the high bits are 0. SimplifyDemandedBits on
237 ; the adds would remove the ands before the target combine on the mul
238 ; had a chance to form mul24. The mul combine would then see
239 ; extractelement with no known bits and fail. All of the mul/add
240 ; combos in this loop should form v_mad_u32_u24.
241
242 ; FUNC-LABEL: {{^}}mad24_known_bits_destroyed:
243 ; GCN: v_mad_u32_u24
244 ; GCN: v_mad_u32_u24
245 ; GCN: v_mad_u32_u24
246 ; GCN: v_mad_u32_u24
247 ; GCN: v_mad_u32_u24
248 ; GCN: v_mad_u32_u24
249 ; GCN: v_mad_u32_u24
250 ; GCN: v_mad_u32_u24
251 define void @mad24_known_bits_destroyed(i32 %arg, <4 x i32> %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 addrspace(1)* %arg7, <4 x i32> addrspace(1)* %arg8) #0 {
252 bb:
253 %tmp = and i32 %arg4, 16777215
254 %tmp9 = extractelement <4 x i32> %arg1, i64 1
255 %tmp10 = extractelement <4 x i32> %arg3, i64 1
256 %tmp11 = and i32 %tmp9, 16777215
257 %tmp12 = extractelement <4 x i32> %arg1, i64 2
258 %tmp13 = extractelement <4 x i32> %arg3, i64 2
259 %tmp14 = and i32 %tmp12, 16777215
260 %tmp15 = extractelement <4 x i32> %arg1, i64 3
261 %tmp16 = extractelement <4 x i32> %arg3, i64 3
262 %tmp17 = and i32 %tmp15, 16777215
263 br label %bb19
264
265 bb18: ; preds = %bb19
266 ret void
267
268 bb19: ; preds = %bb19, %bb
269 %tmp20 = phi i32 [ %arg, %bb ], [ %tmp40, %bb19 ]
270 %tmp21 = phi i32 [ 0, %bb ], [ %tmp54, %bb19 ]
271 %tmp22 = phi <4 x i32> [ %arg2, %bb ], [ %tmp53, %bb19 ]
272 %tmp23 = and i32 %tmp20, 16777215
273 %tmp24 = mul i32 %tmp23, %tmp
274 %tmp25 = add i32 %tmp24, %arg5
275 %tmp26 = extractelement <4 x i32> %tmp22, i64 1
276 %tmp27 = and i32 %tmp26, 16777215
277 %tmp28 = mul i32 %tmp27, %tmp11
278 %tmp29 = add i32 %tmp28, %tmp10
279 %tmp30 = extractelement <4 x i32> %tmp22, i64 2
280 %tmp31 = and i32 %tmp30, 16777215
281 %tmp32 = mul i32 %tmp31, %tmp14
282 %tmp33 = add i32 %tmp32, %tmp13
283 %tmp34 = extractelement <4 x i32> %tmp22, i64 3
284 %tmp35 = and i32 %tmp34, 16777215
285 %tmp36 = mul i32 %tmp35, %tmp17
286 %tmp37 = add i32 %tmp36, %tmp16
287 %tmp38 = and i32 %tmp25, 16777215
288 %tmp39 = mul i32 %tmp38, %tmp
289 %tmp40 = add i32 %tmp39, %arg5
290 store i32 %tmp40, i32 addrspace(1)* %arg7
291 %tmp41 = insertelement <4 x i32> undef, i32 %tmp40, i32 0
292 %tmp42 = and i32 %tmp29, 16777215
293 %tmp43 = mul i32 %tmp42, %tmp11
294 %tmp44 = add i32 %tmp43, %tmp10
295 %tmp45 = insertelement <4 x i32> %tmp41, i32 %tmp44, i32 1
296 %tmp46 = and i32 %tmp33, 16777215
297 %tmp47 = mul i32 %tmp46, %tmp14
298 %tmp48 = add i32 %tmp47, %tmp13
299 %tmp49 = insertelement <4 x i32> %tmp45, i32 %tmp48, i32 2
300 %tmp50 = and i32 %tmp37, 16777215
301 %tmp51 = mul i32 %tmp50, %tmp17
302 %tmp52 = add i32 %tmp51, %tmp16
303 %tmp53 = insertelement <4 x i32> %tmp49, i32 %tmp52, i32 3
304 store <4 x i32> %tmp53, <4 x i32> addrspace(1)* %arg8
305 %tmp54 = add nuw nsw i32 %tmp21, 1
306 %tmp55 = icmp eq i32 %tmp54, %arg6
307 br i1 %tmp55, label %bb18, label %bb19
308 }
309
310 attributes #0 = { norecurse nounwind }
4040 }
4141
4242 ; GCN-LABEL: {{^}}v_mul_v2i16:
43 ; SI: v_mul_lo_u32
44 ; SI: v_mul_lo_u32
43 ; SI: v_mul_u32_u24
44 ; SI: v_mul_u32_u24
4545
4646 ; VI: v_mul_lo_u16_sdwa
4747 ; VI: v_mul_lo_u16_e32
5858
5959 ; FIXME: Unpack garbage on gfx9
6060 ; GCN-LABEL: {{^}}v_mul_v3i16:
61 ; SI: v_mul_lo_u32
62 ; SI: v_mul_lo_u32
63 ; SI: v_mul_lo_u32
61 ; SI: v_mul_u32_u24
62 ; SI: v_mul_u32_u24
63 ; SI: v_mul_u32_u24
6464
6565 ; VI: v_mul_lo_u16
6666 ; VI: v_mul_lo_u16
7676 }
7777
7878 ; GCN-LABEL: {{^}}v_mul_v4i16:
79 ; SI: v_mul_lo_u32
80 ; SI: v_mul_lo_u32
81 ; SI: v_mul_lo_u32
82 ; SI: v_mul_lo_u32
79 ; SI: v_mul_u32_u24
80 ; SI: v_mul_u32_u24
81 ; SI: v_mul_u32_u24
82 ; SI: v_mul_u32_u24
8383
8484 ; VI: v_mul_lo_u16_sdwa
8585 ; VI: v_mul_lo_u16_e32
248248 ; GCN-DAG: v_and_b32_e32 v1, [[U23_MASK]], v1
249249 ; GCN-DAG: v_mul_u32_u24_e32 v0, 0xea, v0
250250 ; GCN-DAG: v_mul_u32_u24_e32 v1, 0x39b, v1
251 ; GCN: v_and_b32_e32 v1, s4, v1
252 ; GCN: v_and_b32_e32 v0, 0x7ffffe, v0
251 ; GCN-DAG: v_and_b32_e32 v1, s4, v1
252 ; GCN-DAG: v_and_b32_e32 v0, 0x7ffffe, v0
253253 ; GCN: v_mul_u32_u24_e32 v0, v0, v1
254254 ; GCN: v_and_b32_e32 v0, 0x1fffe, v0
255255 ; GCN: v_mul_u32_u24_e32 v0, 0x63, v0