llvm.org GIT mirror llvm / 4dd96ff
[ARM][MVE] Remove old tail predicates Remove any predicate that we replace with a vctp intrinsic, and try to remove their operands too. Also look into the exit block to see if there's any duplicates of the predicates that we've replaced and clone the vctp to be used there instead. Differential Revision: https://reviews.llvm.org/D67709 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@372567 91177308-0d34-0410-b5e6-96231b3b80d8 Sam Parker 1 year, 1 day ago
6 changed file(s) with 672 addition(s) and 12 deletion(s). Raw diff Collapse all Expand all
39853985 def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>;
39863986 def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>;
39873987
3988 let hasSideEffects = 1 in
39883989 class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]>
39893990 : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix,
39903991 "$Rn", vpred_n, "", pattern> {
8383
8484 /// Is the icmp that generates an i1 vector, based upon a loop counter
8585 /// and a limit that is defined outside the loop.
86 bool isTailPredicate(Value *Predicate, Value *NumElements);
86 bool isTailPredicate(Instruction *Predicate, Value *NumElements);
8787 };
8888
8989 } // end namespace
177177 return Changed;
178178 }
179179
180 bool MVETailPredication::isTailPredicate(Value *V, Value *NumElements) {
180 bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
181181 // Look for the following:
182182
183183 // %trip.count.minus.1 = add i32 %N, -1
205205 Instruction *Induction = nullptr;
206206
207207 // The vector icmp
208 if (!match(V, m_ICmp(Pred, m_Instruction(Induction),
208 if (!match(I, m_ICmp(Pred, m_Instruction(Induction),
209209 m_Instruction(Shuffle))) ||
210210 Pred != ICmpInst::ICMP_ULE || !L->isLoopInvariant(Shuffle))
211211 return false;
389389 return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt);
390390 }
391391
392 // Look through the exit block to see whether there's a duplicate predicate
393 // instruction. This can happen when we need to perform a select on values
394 // from the last and previous iteration. Instead of doing a straight
395 // replacement of that predicate with the vctp, clone the vctp and place it
396 // in the block. This means that the VPR doesn't have to be live into the
397 // exit block which should make it easier to convert this loop into a proper
398 // tail predicated loop.
399 static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
400 SetVector<Instruction*> &MaybeDead, Loop *L) {
401 if (BasicBlock *Exit = L->getUniqueExitBlock()) {
402 for (auto &Pair : NewPredicates) {
403 Instruction *OldPred = Pair.first;
404 Instruction *NewPred = Pair.second;
405
406 for (auto &I : *Exit) {
407 if (I.isSameOperationAs(OldPred)) {
408 Instruction *PredClone = NewPred->clone();
409 PredClone->insertBefore(&I);
410 I.replaceAllUsesWith(PredClone);
411 MaybeDead.insert(&I);
412 break;
413 }
414 }
415 }
416 }
417
418 // Drop references and add operands to check for dead.
419 SmallPtrSet<Instruction*, 4> Dead;
420 while (!MaybeDead.empty()) {
421 auto *I = MaybeDead.front();
422 MaybeDead.remove(I);
423 if (I->hasNUsesOrMore(1))
424 continue;
425
426 for (auto &U : I->operands()) {
427 if (auto *OpI = dyn_cast<Instruction>(U))
428 MaybeDead.insert(OpI);
429 }
430 I->dropAllReferences();
431 Dead.insert(I);
432 }
433
434 for (auto *I : Dead)
435 I->eraseFromParent();
436
437 for (auto I : L->blocks())
438 DeleteDeadPHIs(I);
439 }
440
392441 bool MVETailPredication::TryConvert(Value *TripCount) {
393442 if (!IsPredicatedVectorLoop())
394443 return false;
399448 // operand is generated from an induction variable.
400449 Module *M = L->getHeader()->getModule();
401450 Type *Ty = IntegerType::get(M->getContext(), 32);
402 SmallPtrSet<Value*, 4> Predicates;
451 SetVector<Instruction*> Predicates;
452 DenseMap<Instruction*, Instruction*> NewPredicates;
403453
404454 for (auto *I : MaskedInsts) {
405455 Intrinsic::ID ID = I->getIntrinsicID();
406456 unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3;
407 Value *Predicate = I->getArgOperand(PredOp);
408 if (Predicates.count(Predicate))
457 auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp));
458 if (!Predicate || Predicates.count(Predicate))
409459 continue;
410460
411461 VectorType *VecTy = getVectorType(I);
444494 Value *Remaining = Builder.CreateSub(Processed, Factor);
445495 Value *TailPredicate = Builder.CreateCall(VCTP, Remaining);
446496 Predicate->replaceAllUsesWith(TailPredicate);
497 NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
447498
448499 // Add the incoming value to the new phi.
449500 Processed->addIncoming(Remaining, L->getLoopLatch());
452503 << "TP: Inserted VCTP: " << *TailPredicate << "\n");
453504 }
454505
455 for (auto I : L->blocks())
456 DeleteDeadPHIs(I);
457
506 // Now clean up.
507 Cleanup(NewPredicates, Predicates, L);
458508 return true;
459509 }
460510
0 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -enable-arm-maskedldst=true -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s
1
2 ; CHECK-LABEL: vpsel_mul_reduce_add
3 ; CHECK: dls lr, lr
4 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
5 ; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
6 ; CHECK: vctp.32 [[ELEMS]]
7 ; CHECK: vstr p0, [sp
8 ; CHECK: vpstt
9 ; CHECK-NEXT: vldrwt.u32
10 ; CHECK-NEXT: vldrwt.u32
11 ; CHECK: vcmp.i32
12 ; CHECK: vpsel
13 ; CHECK: vldr p0, [sp
14 ; CHECK: vpst
15 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
16 ; CHECK: le lr, [[LOOP]]
17 ; CHECK: vctp.32 [[ELEMS]]
18 ; CHECK-NEXT: vpsel
19 ; CHECK-NEXT: vaddv.u32
20 define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) {
21 entry:
22 %cmp8 = icmp eq i32 %N, 0
23 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
24
25 vector.ph: ; preds = %entry
26 %n.rnd.up = add i32 %N, 3
27 %n.vec = and i32 %n.rnd.up, -4
28 %trip.count.minus.1 = add i32 %N, -1
29 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
30 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
31 br label %vector.body
32
33 vector.body: ; preds = %vector.body, %vector.ph
34 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
35 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
36 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
37 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
38 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
39 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
40 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
41 %tmp2 = bitcast i32* %tmp to <4 x i32>*
42 %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
43 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
44 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
45 %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
46 %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
47 %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
48 %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
49 %rem = urem i32 %index, 16
50 %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
51 %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
52 %cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0>
53 %wide.masked.load = select <4 x i1> %cmp, <4 x i32> %wide.masked.load.b, <4 x i32> %wide.masked.load.c
54 %mul = mul nsw <4 x i32> %wide.masked.load, %wide.masked.load.a
55 %add = add nsw <4 x i32> %mul, %vec.phi
56 %index.next = add i32 %index, 4
57 %tmp7 = icmp eq i32 %index.next, %n.vec
58 br i1 %tmp7, label %middle.block, label %vector.body
59
60 middle.block: ; preds = %vector.body
61 %tmp8 = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
62 %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8)
63 br label %for.cond.cleanup
64
65 for.cond.cleanup: ; preds = %middle.block, %entry
66 %res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp9, %middle.block ]
67 ret i32 %res.0.lcssa
68 }
69
70 ; CHECK-LABEL: vpsel_mul_reduce_add_2
71 ; CHECK: dls lr, lr
72 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
73 ; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
74 ; CHECK: vctp.32 [[ELEMS]]
75 ; CHECK: vstr p0, [sp
76 ; CHECK: vpstt
77 ; CHECK-NEXT: vldrwt.u32
78 ; CHECK-NEXT: vldrwt.u32
79 ; CHECK: vsub
80 ; CHECK: vpst
81 ; CHECK-NEXT: vldrwt.u32
82 ; CHECK: vcmp.i32
83 ; CHECK: vpsel
84 ; CHECK: vldr p0, [sp
85 ; CHECK: vpst
86 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
87 ; CHECK: le lr, [[LOOP]]
88 ; CHECK: vctp.32 [[ELEMS]]
89 ; CHECK-NEXT: vpsel
90 ; CHECK-NEXT: vaddv.u32
91 define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
92 i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
93 entry:
94 %cmp8 = icmp eq i32 %N, 0
95 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
96
97 vector.ph: ; preds = %entry
98 %n.rnd.up = add i32 %N, 3
99 %n.vec = and i32 %n.rnd.up, -4
100 %trip.count.minus.1 = add i32 %N, -1
101 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
102 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
103 br label %vector.body
104
105 vector.body: ; preds = %vector.body, %vector.ph
106 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
107 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
108 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
109 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
110 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
111 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
112 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
113 %tmp2 = bitcast i32* %tmp to <4 x i32>*
114 %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
115 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
116 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
117 %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
118 %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
119 %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
120 %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
121 %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
122 %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
123 %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
124 %sub = sub <4 x i32> %wide.masked.load.c, %wide.masked.load.d
125 %rem = urem i32 %index, 16
126 %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
127 %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
128 %cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0>
129 %sel = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %wide.masked.load.b
130 %mul = mul <4 x i32> %sel, %wide.masked.load.a
131 %add = add <4 x i32> %mul, %vec.phi
132 %index.next = add i32 %index, 4
133 %cmp.exit = icmp eq i32 %index.next, %n.vec
134 br i1 %cmp.exit, label %middle.block, label %vector.body
135
136 middle.block: ; preds = %vector.body
137 %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
138 %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
139 br label %for.cond.cleanup
140
141 for.cond.cleanup: ; preds = %middle.block, %entry
142 %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
143 ret i32 %res.0.lcssa
144 }
145
146 ; CHECK-LABEL: and_mul_reduce_add
147 ; CHECK: dls lr, lr
148 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
149 ; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
150 ; CHECK: vctp.32 [[ELEMS]]
151 ; CHECK: vpstt
152 ; CHECK-NEXT: vldrwt.u32
153 ; CHECK-NEXT: vldrwt.u32
154 ; CHECK: vpsttt
155 ; CHECK-NEXT: vcmpt.i32 eq, {{.*}}, zr
156 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
157 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
158 ; CHECK: le lr, [[LOOP]]
159 ; CHECK: vctp.32 [[ELEMS]]
160 ; CHECK: vpsel
161 define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
162 i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
163 entry:
164 %cmp8 = icmp eq i32 %N, 0
165 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
166
167 vector.ph: ; preds = %entry
168 %n.rnd.up = add i32 %N, 3
169 %n.vec = and i32 %n.rnd.up, -4
170 %trip.count.minus.1 = add i32 %N, -1
171 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
172 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
173 br label %vector.body
174
175 vector.body: ; preds = %vector.body, %vector.ph
176 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
177 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
178 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
179 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
180 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
181 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
182 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
183 %tmp2 = bitcast i32* %tmp to <4 x i32>*
184 %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
185 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
186 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
187 %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
188 %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
189 %cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
190 %mask = and <4 x i1> %cmp, %tmp1
191 %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
192 %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
193 %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
194 %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
195 %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
196 %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
197 %mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d
198 %add = add <4 x i32> %mul, %vec.phi
199 %index.next = add i32 %index, 4
200 %cmp.exit = icmp eq i32 %index.next, %n.vec
201 br i1 %cmp.exit, label %middle.block, label %vector.body
202
203 middle.block: ; preds = %vector.body
204 %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
205 %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
206 br label %for.cond.cleanup
207
208 for.cond.cleanup: ; preds = %middle.block, %entry
209 %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
210 ret i32 %res.0.lcssa
211 }
212
213 ; TODO: Why does p0 get reloaded from the stack into p0, just to be vmrs'd?
214 ; CHECK-LABEL: or_mul_reduce_add
215 ; CHECK: dls lr, lr
216 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
217 ; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
218 ; CHECK: vctp.32 [[ELEMS]]
219 ; CHECK: vstr p0, [sp
220 ; CHECK: vpstt
221 ; CHECK-NEXT: vldrwt.u32
222 ; CHECK-NEXT: vldrwt.u32
223 ; CHECK: vcmp.i32 eq, {{.*}}, zr
224 ; CHECK: vmrs [[VCMP:r[0-9]+]], p0
225 ; CHECK: vldr p0, [sp
226 ; CHECK: vmrs [[VCTP:r[0-9]+]], p0
227 ; CHECK: orr{{.*}} [[VCMP]], [[VCTP]]
228 ; CHECK-NEXT: vmsr p0
229 ; CHECK-NEXT: vpstt
230 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
231 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
232 ; CHECK: le lr, [[LOOP]]
233 ; CHECK: vctp.32 [[ELEMS]]
234 ; CHECK: vpsel
235 define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
236 i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
237 entry:
238 %cmp8 = icmp eq i32 %N, 0
239 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
240
241 vector.ph: ; preds = %entry
242 %n.rnd.up = add i32 %N, 3
243 %n.vec = and i32 %n.rnd.up, -4
244 %trip.count.minus.1 = add i32 %N, -1
245 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
246 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
247 br label %vector.body
248
249 vector.body: ; preds = %vector.body, %vector.ph
250 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
251 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
252 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
253 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
254 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
255 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
256 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
257 %tmp2 = bitcast i32* %tmp to <4 x i32>*
258 %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
259 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
260 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
261 %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
262 %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
263 %cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
264 %mask = or <4 x i1> %cmp, %tmp1
265 %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
266 %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
267 %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
268 %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
269 %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
270 %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
271 %mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d
272 %add = add <4 x i32> %mul, %vec.phi
273 %index.next = add i32 %index, 4
274 %cmp.exit = icmp eq i32 %index.next, %n.vec
275 br i1 %cmp.exit, label %middle.block, label %vector.body
276
277 middle.block: ; preds = %vector.body
278 %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
279 %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
280 br label %for.cond.cleanup
281
282 for.cond.cleanup: ; preds = %middle.block, %entry
283 %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
284 ret i32 %res.0.lcssa
285 }
286
287 ; Function Attrs: argmemonly nounwind readonly willreturn
288 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
289
290 ; Function Attrs: nounwind readnone willreturn
291 declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
0 ; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -enable-arm-maskedldst=true -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s
1
2 ; CHECK-LABEL: mul_reduce_add
3 ; CHECK: dls lr,
4 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
5 ; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
6 ; CHECK: vctp.32 [[ELEMS]]
7 ; CHECK: vpstt
8 ; CHECK-NEXT: vldrwt.u32
9 ; CHECK-NEXT: vldrwt.u32
10 ; CHECK: le lr, [[LOOP]]
11 ; CHECK: vctp.32 [[ELEMS]]
12 ; CHECK: vpsel
13 ; CHECK: vaddv.u32 r0
14 define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
15 entry:
16 %cmp8 = icmp eq i32 %N, 0
17 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
18
19 vector.ph: ; preds = %entry
20 %n.rnd.up = add i32 %N, 3
21 %n.vec = and i32 %n.rnd.up, -4
22 %trip.count.minus.1 = add i32 %N, -1
23 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
24 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
25 br label %vector.body
26
27 vector.body: ; preds = %vector.body, %vector.ph
28 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
29 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
30 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
31 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
32 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
33 %0 = getelementptr inbounds i32, i32* %a, i32 %index
34 %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
35 %2 = bitcast i32* %0 to <4 x i32>*
36 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
37 %3 = getelementptr inbounds i32, i32* %b, i32 %index
38 %4 = bitcast i32* %3 to <4 x i32>*
39 %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef)
40 %5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
41 %6 = add nsw <4 x i32> %5, %vec.phi
42 %index.next = add i32 %index, 4
43 %7 = icmp eq i32 %index.next, %n.vec
44 br i1 %7, label %middle.block, label %vector.body
45
46 middle.block: ; preds = %vector.body
47 %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi
48 %9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %8)
49 br label %for.cond.cleanup
50
51 for.cond.cleanup: ; preds = %middle.block, %entry
52 %res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block ]
53 ret i32 %res.0.lcssa
54 }
55
56 ; Function Attrs: norecurse nounwind readonly
57 define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
58 entry:
59 %cmp6 = icmp eq i32 %N, 0
60 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
61
62 vector.ph: ; preds = %entry
63 %n.rnd.up = add i32 %N, 3
64 %n.vec = and i32 %n.rnd.up, -4
65 %trip.count.minus.1 = add i32 %N, -1
66 %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
67 %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
68 br label %vector.body
69
70 vector.body: ; preds = %vector.body, %vector.ph
71 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
72 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
73 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
74 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
75 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
76 %0 = getelementptr inbounds i32, i32* %a, i32 %index
77 %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
78 %2 = bitcast i32* %0 to <4 x i32>*
79 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
80 %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
81 %index.next = add i32 %index, 4
82 %4 = icmp eq i32 %index.next, %n.vec
83 br i1 %4, label %middle.block, label %vector.body
84
85 middle.block: ; preds = %vector.body
86 %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
87 %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5)
88 br label %for.cond.cleanup
89
90 for.cond.cleanup: ; preds = %middle.block, %entry
91 %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
92 ret i32 %res.0.lcssa
93 }
94
95 ; CHECK-LABEL: add_reduce_add_const
96 ; CHECK: dls lr, lr
97 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
98 ; CHECK: subs [[ELEMS:r[0-9]+]], #4
99 ; CHECK: vctp.32 [[ELEMS]]
100 ; CHECK: vpst
101 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
102 ; CHECK: vadd.i32
103 ; CHECK: le lr, [[LOOP]]
104 ; CHECK: vctp.32 [[ELEMS]]
105 ; CHECK: vpsel
106 define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
107 entry:
108 %cmp6 = icmp eq i32 %N, 0
109 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
110
111 vector.ph: ; preds = %entry
112 %n.rnd.up = add i32 %N, 3
113 %n.vec = and i32 %n.rnd.up, -4
114 %trip.count.minus.1 = add i32 %N, -1
115 %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
116 %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
117 br label %vector.body
118
119 vector.body: ; preds = %vector.body, %vector.ph
120 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
121 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
122 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
123 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
124 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
125 %0 = getelementptr inbounds i32, i32* %a, i32 %index
126 %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
127 %2 = bitcast i32* %0 to <4 x i32>*
128 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
129 %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
130 %index.next = add i32 %index, 4
131 %4 = icmp eq i32 %index.next, %n.vec
132 br i1 %4, label %middle.block, label %vector.body
133
134 middle.block: ; preds = %vector.body
135 %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
136 %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5)
137 br label %for.cond.cleanup
138
139 for.cond.cleanup: ; preds = %middle.block, %entry
140 %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
141 ret i32 %res.0.lcssa
142 }
143
144 ; CHECK-LABEL: vector_mul_const
145 ; CHECK: dls lr, lr
146 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
147 ; CHECK: subs [[ELEMS:r[0-9]+]], #4
148 ; CHECK: vctp.32 [[ELEMS]]
149 ; CHECK: vpst
150 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
151 ; CHECK: vmul.i32
152 ; CHECK: vpst
153 ; CHECK-NEXT: vstrwt.32 q{{.*}}, [r0]
154 ; CHECK: le lr, [[LOOP]]
155 define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
156 entry:
157 %cmp6 = icmp eq i32 %N, 0
158 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
159
160 vector.ph: ; preds = %entry
161 %n.rnd.up = add i32 %N, 3
162 %n.vec = and i32 %n.rnd.up, -4
163 %trip.count.minus.1 = add i32 %N, -1
164 %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
165 %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
166 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
167 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
168 br label %vector.body
169
170 vector.body: ; preds = %vector.body, %vector.ph
171 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
172 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
173 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
174 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
175 %0 = getelementptr inbounds i32, i32* %b, i32 %index
176 %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
177 %2 = bitcast i32* %0 to <4 x i32>*
178 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
179 %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
180 %4 = getelementptr inbounds i32, i32* %a, i32 %index
181 %5 = bitcast i32* %4 to <4 x i32>*
182 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
183 %index.next = add i32 %index, 4
184 %6 = icmp eq i32 %index.next, %n.vec
185 br i1 %6, label %for.cond.cleanup, label %vector.body
186
187 for.cond.cleanup: ; preds = %vector.body, %entry
188 ret void
189 }
190
191 ; CHECK-LABEL: vector_add_const
192 ; CHECK: dls lr, lr
193 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
194 ; CHECK: subs [[ELEMS:r[0-9]+]], #4
195 ; CHECK: vctp.32 [[ELEMS]]
196 ; CHECK: vpst
197 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
198 ; CHECK: vadd.i32
199 ; CHECK: vpst
200 ; CHECK-NEXT: vstrwt.32 q{{.*}}, [r0]
201 ; CHECK: le lr, [[LOOP]]
202 define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
203 entry:
204 %cmp6 = icmp eq i32 %N, 0
205 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
206
207 vector.ph: ; preds = %entry
208 %n.rnd.up = add i32 %N, 3
209 %n.vec = and i32 %n.rnd.up, -4
210 %trip.count.minus.1 = add i32 %N, -1
211 %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
212 %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
213 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
214 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
215 br label %vector.body
216
217 vector.body: ; preds = %vector.body, %vector.ph
218 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
219 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
220 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
221 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
222 %0 = getelementptr inbounds i32, i32* %b, i32 %index
223 %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
224 %2 = bitcast i32* %0 to <4 x i32>*
225 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
226 %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
227 %4 = getelementptr inbounds i32, i32* %a, i32 %index
228 %5 = bitcast i32* %4 to <4 x i32>*
229 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
230 %index.next = add i32 %index, 4
231 %6 = icmp eq i32 %index.next, %n.vec
232 br i1 %6, label %for.cond.cleanup, label %vector.body
233
234 for.cond.cleanup: ; preds = %vector.body, %entry
235 ret void
236 }
237
238 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
239 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #4
240 declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
241
0
1 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s
2
3 ; CHECK-LABEL: vec_mul_reduce_add
4
5 ; CHECK: vector.body:
6 ; CHECK-NOT: phi i32 [ 0, %vector.ph ]
7 ; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
8 ; CHECK: [[SUB]] = sub i32 [[ELTS]], 4
9 ; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
10 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]]
11 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
12
13 ; CHECK: middle.block:
14 ; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
15 ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
16 ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
17
18 define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
19 entry:
20 %cmp8 = icmp eq i32 %N, 0
21 %0 = add i32 %N, 3
22 %1 = lshr i32 %0, 2
23 %2 = shl nuw i32 %1, 2
24 %3 = add i32 %2, -4
25 %4 = lshr i32 %3, 2
26 %5 = add nuw nsw i32 %4, 1
27 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
28
29 vector.ph: ; preds = %entry
30 %trip.count.minus.1 = add i32 %N, -1
31 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
32 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
33 call void @llvm.set.loop.iterations.i32(i32 %5)
34 br label %vector.body
35
36 vector.body: ; preds = %vector.body, %vector.ph
37 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
38 %lsr.iv2 = phi i32* [ %scevgep3, %vector.body ], [ %a, %vector.ph ]
39 %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %b, %vector.ph ]
40 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ]
41 %6 = phi i32 [ %5, %vector.ph ], [ %10, %vector.body ]
42 %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
43 %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
44 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
45 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
46 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
47 %7 = icmp ule <4 x i32> %induction, %broadcast.splat12
48 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef)
49 %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef)
50 %8 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
51 %9 = add nsw <4 x i32> %8, %vec.phi
52 %index.next = add i32 %index, 4
53 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
54 %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4
55 %10 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
56 %11 = icmp ne i32 %10, 0
57 br i1 %11, label %vector.body, label %middle.block
58
59 middle.block: ; preds = %vector.body
60 %12 = icmp ule <4 x i32> %induction, %broadcast.splat12
61 %13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi
62 %14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13)
63 br label %for.cond.cleanup
64
65 for.cond.cleanup: ; preds = %middle.block, %entry
66 %res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ]
67 ret i32 %res.0.lcssa
68 }
69
70 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
71 declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
72 declare void @llvm.set.loop.iterations.i32(i32)
73 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
74
33 define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) {
44 ; CHECK-LABEL: vctp8:
55 ; CHECK: @ %bb.0:
6 ; CHECK-NEXT: vctp.8 r0
67 ; CHECK-NEXT: vldrw.u32 q1, [r1]
7 ; CHECK-NEXT: vctp.8 r0
88 ; CHECK-NEXT: vmov.i32 q0, #0x0
99 ; CHECK-NEXT: vpsel q0, q1, q0
1010 ; CHECK-NEXT: vstrw.32 q0, [r2]
1919 define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) {
2020 ; CHECK-LABEL: vctp16:
2121 ; CHECK: @ %bb.0:
22 ; CHECK-NEXT: vctp.16 r0
2223 ; CHECK-NEXT: vldrw.u32 q1, [r1]
23 ; CHECK-NEXT: vctp.16 r0
2424 ; CHECK-NEXT: vmov.i32 q0, #0x0
2525 ; CHECK-NEXT: vpsel q0, q1, q0
2626 ; CHECK-NEXT: vstrw.32 q0, [r2]
3535 define void @vctp32(i32 %arg, <4 x i32> *%in, <4 x i32>* %out) {
3636 ; CHECK-LABEL: vctp32:
3737 ; CHECK: @ %bb.0:
38 ; CHECK-NEXT: vctp.32 r0
3839 ; CHECK-NEXT: vldrw.u32 q1, [r1]
39 ; CHECK-NEXT: vctp.32 r0
4040 ; CHECK-NEXT: vmov.i32 q0, #0x0
4141 ; CHECK-NEXT: vpsel q0, q1, q0
4242 ; CHECK-NEXT: vstrw.32 q0, [r2]