llvm.org GIT mirror llvm / 1e77c70
[ARM] Implement isExtractSubvectorCheap. See https://reviews.llvm.org/D6678 for the history of isExtractSubvectorCheap. Essentially the same considerations apply to ARM. This temporarily breaks the formation of vpadd/vpaddl in certain cases; AddCombineToVPADDL essentially assumes that we won't form VUZP shuffles. See https://reviews.llvm.org/D27779 for followup fix. Differential Revision: https://reviews.llvm.org/D27774 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@290198 91177308-0d34-0410-b5e6-96231b3b80d8 Eli Friedman 3 years ago
6 changed file(s) with 86 addition(s) and 53 deletion(s). Raw diff Collapse all Expand all
1292012920 return true;
1292112921 }
1292212922
12923 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT,
12924 unsigned Index) const {
12925 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
12926 return false;
12927
12928 return (Index == 0 || Index == ResVT.getVectorNumElements());
12929 }
12930
1292312931 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
1292412932 ARM_MB::MemBOpt Domain) const {
1292512933 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
429429 /// to just the constant itself.
430430 bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
431431 Type *Ty) const override;
432
433 /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
434 /// with this index.
435 bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
432436
433437 /// \brief Returns true if an argument of type Ty needs to be passed in a
434438 /// contiguous block of registers in calling convention CallConv.
216216 ; CHECK-LABEL: test_multisource:
217217 ; CHECK: @ BB#0:
218218 ; CHECK-NEXT: mov r1, r0
219 ; CHECK-NEXT: add r2, r0, #48
220 ; CHECK-NEXT: add r0, r0, #32
219 ; CHECK-NEXT: add r2, r0, #32
220 ; CHECK-NEXT: add r0, r0, #48
221221 ; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]!
222222 ; CHECK-NEXT: vld1.64 {d20, d21}, [r2:128]
223 ; CHECK-NEXT: vld1.64 {d18, d19}, [r1:128]
224 ; CHECK-NEXT: vmov.u16 r1, d16[0]
225 ; CHECK-NEXT: vld1.64 {d16, d17}, [r0:128]
226 ; CHECK-NEXT: vmov.16 d22[0], r1
227 ; CHECK-NEXT: vmov.u16 r0, d18[0]
228 ; CHECK-NEXT: vmov.u16 r1, d16[0]
229 ; CHECK-NEXT: vmov.16 d22[1], r0
230 ; CHECK-NEXT: vmov.u16 r0, d20[0]
231 ; CHECK-NEXT: vmov.16 d22[2], r1
232 ; CHECK-NEXT: vmov.16 d22[3], r0
233 ; CHECK-NEXT: vmov r0, r1, d22
223 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128]
224 ; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128]
225 ; CHECK-NEXT: vorr d24, d20, d20
226 ; CHECK-NEXT: vzip.16 d24, d18
227 ; CHECK-NEXT: vext.16 d18, d20, d24, #2
228 ; CHECK-NEXT: vtrn.16 q8, q11
229 ; CHECK-NEXT: vext.16 d16, d18, d16, #2
230 ; CHECK-NEXT: vext.16 d16, d16, d16, #2
231 ; CHECK-NEXT: vmov r0, r1, d16
234232 ; CHECK-NEXT: mov pc, lr
235233 %tmp1 = load <32 x i16>, <32 x i16>* %B
236234 %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32>
243241 ; CHECK-LABEL: test_largespan:
244242 ; CHECK: @ BB#0:
245243 ; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
246 ; CHECK-NEXT: vmov.u16 r1, d16[0]
247 ; CHECK-NEXT: vmov.u16 r0, d16[2]
248 ; CHECK-NEXT: vmov.16 d18[0], r1
249 ; CHECK-NEXT: vmov.u16 r1, d17[0]
250 ; CHECK-NEXT: vmov.16 d18[1], r0
251 ; CHECK-NEXT: vmov.u16 r0, d17[2]
252 ; CHECK-NEXT: vmov.16 d18[2], r1
253 ; CHECK-NEXT: vmov.16 d18[3], r0
244 ; CHECK-NEXT: vorr d18, d16, d16
245 ; CHECK-NEXT: vuzp.16 d18, d17
254246 ; CHECK-NEXT: vmov r0, r1, d18
255247 ; CHECK-NEXT: mov pc, lr
256248 %tmp1 = load <8 x i16>, <8 x i16>* %B
212212 ret <2 x i64> %tmp2
213213 }
214214
215 ; Test AddCombine optimization that generates a vpaddl.s
216 define void @addCombineToVPADDL() nounwind ssp {
217 ; CHECK-LABEL: addCombineToVPADDL:
218 ; CHECK: @ BB#0:
219 ; CHECK-NEXT: .save {r11}
220 ; CHECK-NEXT: push {r11}
221 ; CHECK-NEXT: .setfp r11, sp
222 ; CHECK-NEXT: mov r11, sp
223 ; CHECK-NEXT: .pad #44
224 ; CHECK-NEXT: sub sp, sp, #44
225 ; CHECK-NEXT: bic sp, sp, #15
226 ; CHECK-NEXT: add r0, sp, #16
227 ; CHECK-NEXT: vld1.64 {d16, d17}, [r0:128]
228 ; CHECK-NEXT: vpaddl.s8 q8, q8
229 ; CHECK-NEXT: vmovn.i16 d16, q8
230 ; CHECK-NEXT: vstr d16, [sp, #8]
231 ; CHECK-NEXT: mov sp, r11
232 ; CHECK-NEXT: pop {r11}
233 ; CHECK-NEXT: mov pc, lr
234 %cbcr = alloca <16 x i8>, align 16
235 %X = alloca <8 x i8>, align 8
215 ; Combine vuzp+vadd->vpadd.
216 ; FIXME: Implement this optimization
217 define void @addCombineToVPADD(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
218 ; CHECK-LABEL: addCombineToVPADD:
219 ; CHECK: @ BB#0:
220 ; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
221 ; CHECK-NEXT: vorr d18, d17, d17
222 ; CHECK-NEXT: vuzp.8 d16, d18
223 ; CHECK-NEXT: vadd.i8 d16, d18, d16
224 ; CHECK-NEXT: vstr d16, [r1]
225 ; CHECK-NEXT: mov pc, lr
236226 %tmp = load <16 x i8>, <16 x i8>* %cbcr
237227 %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32>
238 %tmp2 = load <16 x i8>, <16 x i8>* %cbcr
239 %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32>
228 %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32>
229
240230 %add = add <8 x i8> %tmp3, %tmp1
241231 store <8 x i8> %add, <8 x i8>* %X, align 8
232 ret void
233 }
234
235 ; Combine vuzp+vaddl->vpaddl
236 ; FIXME: Implement this optimization.
; NOTE(review): per the commit message, AddCombineToVPADDL does not yet
; recognize VUZP shuffles, so the CHECK lines below pin the current (vuzp +
; vaddl.s8) codegen rather than the desired single vpaddl; the follow-up fix
; is tracked in https://reviews.llvm.org/D27779.
; NOTE(review): the shufflevector masks appear to have been lost in this diff
; rendering (the <8 x i32> literals are empty) — presumably they select the
; even and odd lanes of %tmp; confirm against the checked-in test file.
237 define void @addCombineToVPADDL_sext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
238 ; CHECK-LABEL: addCombineToVPADDL_sext:
239 ; CHECK: @ BB#0:
240 ; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
241 ; CHECK-NEXT: vorr d18, d17, d17
242 ; CHECK-NEXT: vuzp.8 d16, d18
243 ; CHECK-NEXT: vaddl.s8 q8, d18, d16
244 ; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
245 ; CHECK-NEXT: mov pc, lr
246 %tmp = load <16 x i8>, <16 x i8>* %cbcr
247 %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32>
248 %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32>
249 %tmp4 = sext <8 x i8> %tmp3 to <8 x i16>
250 %tmp5 = sext <8 x i8> %tmp1 to <8 x i16>
251 %add = add <8 x i16> %tmp4, %tmp5
252 store <8 x i16> %add, <8 x i16>* %X, align 8
242253 ret void
243254 }
244255
66 ; CHECK-NEXT: vldr d16, [r1]
77 ; CHECK-NEXT: vldr d17, [r0]
88 ; CHECK-NEXT: vuzp.8 d17, d16
9 ; CHECK-NEXT: vadd.i8 d16, d17, d16
9 ; CHECK-NEXT: vmul.i8 d16, d17, d16
1010 ; CHECK-NEXT: vmov r0, r1, d16
1111 ; CHECK-NEXT: mov pc, lr
1212 %tmp1 = load <8 x i8>, <8 x i8>* %A
1313 %tmp2 = load <8 x i8>, <8 x i8>* %B
1414 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32>
1515 %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32>
16 %tmp5 = add <8 x i8> %tmp3, %tmp4
16 %tmp5 = mul <8 x i8> %tmp3, %tmp4
1717 ret <8 x i8> %tmp5
1818 }
1919
3838 ; CHECK-NEXT: vldr d16, [r1]
3939 ; CHECK-NEXT: vldr d17, [r0]
4040 ; CHECK-NEXT: vuzp.16 d17, d16
41 ; CHECK-NEXT: vadd.i16 d16, d17, d16
41 ; CHECK-NEXT: vmul.i16 d16, d17, d16
4242 ; CHECK-NEXT: vmov r0, r1, d16
4343 ; CHECK-NEXT: mov pc, lr
4444 %tmp1 = load <4 x i16>, <4 x i16>* %A
4545 %tmp2 = load <4 x i16>, <4 x i16>* %B
4646 %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32>
4747 %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32>
48 %tmp5 = add <4 x i16> %tmp3, %tmp4
48 %tmp5 = mul <4 x i16> %tmp3, %tmp4
4949 ret <4 x i16> %tmp5
5050 }
5151
206206 ; CHECK-NEXT: vldr d16, [r1]
207207 ; CHECK-NEXT: vldr d17, [r0]
208208 ; CHECK-NEXT: vuzp.8 d17, d16
209 ; CHECK-NEXT: vadd.i8 d16, d17, d16
209 ; CHECK-NEXT: vmul.i8 d16, d17, d16
210210 ; CHECK-NEXT: vmov r0, r1, d16
211211 ; CHECK-NEXT: mov pc, lr
212212 %tmp1 = load <8 x i8>, <8 x i8>* %A
213213 %tmp2 = load <8 x i8>, <8 x i8>* %B
214214 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32>
215215 %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32>
216 %tmp5 = add <8 x i8> %tmp3, %tmp4
216 %tmp5 = mul <8 x i8> %tmp3, %tmp4
217217 ret <8 x i8> %tmp5
218218 }
219219
549549 %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
550550 ret <10 x i8> %rv
551551 }
552
553 %struct.uint8x8x2_t = type { [2 x <8 x i8>] }
; Test added with the isExtractSubvectorCheap change: both halves of a
; <16 x i8> shuffle pair are returned, and the CHECK lines pin codegen to a
; single vuzp.8 on the two D halves of the incoming Q register (plus one vorr
; copy) instead of lane-by-lane extraction.
; NOTE(review): the shufflevector masks are missing from this diff rendering
; (empty <8 x i32> literals) — presumably even lanes for %vuzp.i and odd
; lanes for %vuzp1.i; confirm against the committed test file.
554 define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
555 ; CHECK-LABEL: vuzp_extract_subvector:
556 ; CHECK: @ BB#0:
557 ; CHECK-NEXT: vmov d17, r2, r3
558 ; CHECK-NEXT: vmov d16, r0, r1
559 ; CHECK-NEXT: vorr d18, d17, d17
560 ; CHECK-NEXT: vuzp.8 d16, d18
561 ; CHECK-NEXT: vmov r0, r1, d16
562 ; CHECK-NEXT: vmov r2, r3, d18
563 ; CHECK-NEXT: mov pc, lr
564
565 %vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32>
566 %vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32>
567 %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
568 %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
569 ret %struct.uint8x8x2_t %.fca.0.1.insert
570 }
331331 ; CHECK-LABEL: vzip_vext_factor:
332332 ; CHECK: @ BB#0: @ %entry
333333 ; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
334 ; CHECK-NEXT: vext.16 d16, d16, d17, #3
335 ; CHECK-NEXT: vext.16 d17, d16, d16, #1
336 ; CHECK-NEXT: vzip.16 d16, d17
334 ; CHECK-NEXT: vext.16 d18, d16, d17, #1
335 ; CHECK-NEXT: vext.16 d16, d18, d17, #2
337336 ; CHECK-NEXT: vext.16 d16, d16, d16, #1
338337 ; CHECK-NEXT: vstr d16, [r1]
339338 ; CHECK-NEXT: mov pc, lr