llvm.org GIT mirror llvm / 29ab6ce
[AArch64] Improve single vector lane stores
When storing the 0th lane of a vector, use a simpler and usually more efficient scalar store instead.
Differential revision: https://reviews.llvm.org/D46655
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@332251 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Evandro Menezes (1 year, 4 months ago)
5 changed file(s) with 414 addition(s) and 22 deletion(s). Raw diff Collapse all Expand all
22482248
22492249 let AddedComplexity = 19 in {
22502250 defm : VecROStoreLane0Pat;
2251 defm : VecROStoreLane0Pat;
2252 defm : VecROStoreLane0Pat;
2253 defm : VecROStoreLane0Pat;
2254 defm : VecROStoreLane0Pat;
2255 defm : VecROStoreLane0Pat;
2256 defm : VecROStoreLane0Pat<ro64, store , v2f64, f64, dsub, STRDroW, STRDroX>;
2251 defm : VecROStoreLane0Pat<ro16, store, v8f16, f16, hsub, STRHroW, STRHroX>;
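2252 defm : VecROStoreLane0Pat<ro32, store, v4i32, i32, ssub, STRSroW, STRSroX>;
2253 defm : VecROStoreLane0Pat<ro32, store, v4f32, f32, ssub, STRSroW, STRSroX>;
2254 defm : VecROStoreLane0Pat<ro64, store, v2i64, i64, dsub, STRDroW, STRDroX>;
2255 defm : VecROStoreLane0Pat<ro64, store, v2f64, f64, dsub, STRDroW, STRDroX>;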
22572256 }
22582257
22592258 //---
22872286 (am_indexed8 GPR64sp:$Rn,
22882287 uimm12s1:$offset))]>;
22892288
2289 let AddedComplexity = 10 in {
2290
22902291 // Match all store 64 bits width whose type is compatible with FPR64
2291 let AddedComplexity = 10 in {
2292 def : Pat<(store (v1i64 FPR64:$Rt),
2293 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
2294 (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
2295 def : Pat<(store (v1f64 FPR64:$Rt),
2296 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
2297 (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
2298
22922299 let Predicates = [IsLE] in {
22932300 // We must use ST1 to store vectors in big-endian.
22942301 def : Pat<(store (v2f32 FPR64:$Rt),
23072314 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
23082315 (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
23092316 }
2310 def : Pat<(store (v1f64 FPR64:$Rt),
2311 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
2312 (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
2313 def : Pat<(store (v1i64 FPR64:$Rt),
2314 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
2315 (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
23162317
23172318 // Match all store 128 bits width whose type is compatible with FPR128
2319 def : Pat<(store (f128 FPR128:$Rt),
2320 (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
2321 (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
2322
23182323 let Predicates = [IsLE] in {
23192324 // We must use ST1 to store vectors in big-endian.
23202325 def : Pat<(store (v4f32 FPR128:$Rt),
23392344 (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
23402345 (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
23412346 }
2342 def : Pat<(store (f128 FPR128:$Rt),
2343 (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
2344 (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
23452347
23462348 // truncstore i64
23472349 def : Pat<(truncstorei32 GPR64:$Rt,
23542356 (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>;
23552357
23562358 } // AddedComplexity = 10
2359
2360 // Match stores from lane 0 to the appropriate subreg's store.
2361 multiclass VecStoreLane0Pat<Operand UIAddrMode, SDPatternOperator storeop,
2362 ValueType VTy, ValueType STy,
2363 SubRegIndex SubRegIdx, Operand IndexType,
2364 Instruction STR> {
2365 def : Pat<(storeop (STy (vector_extract (VTy VecListOne128:$Vt), 0)),
2366 (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
2367 (STR (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
2368 GPR64sp:$Rn, IndexType:$offset)>;
2369 }
2370
2371 let AddedComplexity = 19 in {
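2372 defm : VecStoreLane0Pat<am_indexed16, truncstorei16, v8i16, i32, hsub, uimm12s2, STRHui>;
2373 defm : VecStoreLane0Pat<am_indexed16, store, v8f16, f16, hsub, uimm12s2, STRHui>;
2374 defm : VecStoreLane0Pat<am_indexed32, store, v4i32, i32, ssub, uimm12s4, STRSui>;
2375 defm : VecStoreLane0Pat<am_indexed32, store, v4f32, f32, ssub, uimm12s4, STRSui>;
2376 defm : VecStoreLane0Pat<am_indexed64, store, v2i64, i64, dsub, uimm12s8, STRDui>;
2377 defm : VecStoreLane0Pat<am_indexed64, store, v2f64, f64, dsub, uimm12s8, STRDui>;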
2378 }
23572379
23582380 //---
23592381 // (unscaled immediate)
23862408 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
23872409
23882410 // Match all store 64 bits width whose type is compatible with FPR64
2411 def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
2412 (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
2413 def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
2414 (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
2415
2416 let AddedComplexity = 10 in {
2417
23892418 let Predicates = [IsLE] in {
23902419 // We must use ST1 to store vectors in big-endian.
23912420 def : Pat<(store (v2f32 FPR64:$Rt),
24042433 (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
24052434 (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
24062435 }
2407 def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
2408 (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
2409 def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
2410 (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
24112436
24122437 // Match all store 128 bits width whose type is compatible with FPR128
2438 def : Pat<(store (f128 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
2439 (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
2440
24132441 let Predicates = [IsLE] in {
24142442 // We must use ST1 to store vectors in big-endian.
24152443 def : Pat<(store (v4f32 FPR128:$Rt),
24372465 (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
24382466 (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
24392467 }
2468
2469 } // AddedComplexity = 10
24402470
24412471 // unscaled i64 truncating stores
24422472 def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
41504180 (SUBREG_TO_REG (i32 0),
41514181 (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
41524182
4183 def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
4184 (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
4185 def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
4186 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
4187
41534188 def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
41544189 (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
41554190 (i32 FPR32:$Rn), ssub))>;
41564191 def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
41574192 (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
41584193 (i32 FPR32:$Rn), ssub))>;
4194
41594195 def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
41604196 (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
41614197 (i64 FPR64:$Rn), dsub))>;
41694205 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
41704206 def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
41714207 (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
4208
41724209 def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
41734210 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
41744211
924924 ; CHECK-LABEL: test_extracts_inserts_varidx_insert:
925925 ; CHECK: and [[MASKED_IDX:x[0-9]+]], x0, #0x3
926926 ; CHECK: bfi x9, [[MASKED_IDX]], #1, #2
927 ; CHECK: st1 { v0.h }[0], [x9]
927 ; CHECK: str h0, [x9]
928928 ; CHECK-DAG: ldr d[[R:[0-9]+]]
929929 ; CHECK-DAG: mov v[[R]].h[1], v0.h[1]
930930 ; CHECK-DAG: mov v[[R]].h[2], v0.h[2]
390390 ret void
391391 }
392392
393 define void @test_vst1q_lane0_s16(i16* %a, <8 x i16> %b) {
394 ; CHECK-LABEL: test_vst1q_lane0_s16:
395 ; CHECK: str {{h[0-9]+}}, [x0]
396 entry:
397 %0 = extractelement <8 x i16> %b, i32 0
398 store i16 %0, i16* %a, align 2
399 ret void
400 }
401
393402 define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
394403 ; CHECK-LABEL: test_vst1q_lane_s32:
395404 ; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
399408 ret void
400409 }
401410
411 define void @test_vst1q_lane0_s32(i32* %a, <4 x i32> %b) {
412 ; CHECK-LABEL: test_vst1q_lane0_s32:
413 ; CHECK: str {{s[0-9]+}}, [x0]
414 entry:
415 %0 = extractelement <4 x i32> %b, i32 0
416 store i32 %0, i32* %a, align 4
417 ret void
418 }
419
402420 define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
403421 ; CHECK-LABEL: test_vst1q_lane_s64:
404422 ; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
408426 ret void
409427 }
410428
429 define void @test_vst1q_lane0_s64(i64* %a, <2 x i64> %b) {
430 ; CHECK-LABEL: test_vst1q_lane0_s64:
431 ; CHECK: str {{d[0-9]+}}, [x0]
432 entry:
433 %0 = extractelement <2 x i64> %b, i32 0
434 store i64 %0, i64* %a, align 8
435 ret void
436 }
437
411438 define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
412439 ; CHECK-LABEL: test_vst1q_lane_f32:
413440 ; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
417444 ret void
418445 }
419446
447 define void @test_vst1q_lane0_f32(float* %a, <4 x float> %b) {
448 ; CHECK-LABEL: test_vst1q_lane0_f32:
449 ; CHECK: str {{s[0-9]+}}, [x0]
450 entry:
451 %0 = extractelement <4 x float> %b, i32 0
452 store float %0, float* %a, align 4
453 ret void
454 }
455
420456 define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
421457 ; CHECK-LABEL: test_vst1q_lane_f64:
422458 ; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
426462 ret void
427463 }
428464
465 define void @test_vst1q_lane0_f64(double* %a, <2 x double> %b) {
466 ; CHECK-LABEL: test_vst1q_lane0_f64:
467 ; CHECK: str {{d[0-9]+}}, [x0]
468 entry:
469 %0 = extractelement <2 x double> %b, i32 0
470 store double %0, double* %a, align 8
471 ret void
472 }
473
429474 define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
430475 ; CHECK-LABEL: test_vst1_lane_s8:
431476 ; CHECK: st1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
444489 ret void
445490 }
446491
492 define void @test_vst1_lane0_s16(i16* %a, <4 x i16> %b) {
493 ; CHECK-LABEL: test_vst1_lane0_s16:
494 ; CHECK: str {{h[0-9]+}}, [x0]
495 entry:
496 %0 = extractelement <4 x i16> %b, i32 0
497 store i16 %0, i16* %a, align 2
498 ret void
499 }
500
447501 define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
448502 ; CHECK-LABEL: test_vst1_lane_s32:
449503 ; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
453507 ret void
454508 }
455509
510 define void @test_vst1_lane0_s32(i32* %a, <2 x i32> %b) {
511 ; CHECK-LABEL: test_vst1_lane0_s32:
512 ; CHECK: str {{s[0-9]+}}, [x0]
513 entry:
514 %0 = extractelement <2 x i32> %b, i32 0
515 store i32 %0, i32* %a, align 4
516 ret void
517 }
518
456519 define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
457520 ; CHECK-LABEL: test_vst1_lane_s64:
458 ; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
521 ; CHECK: str {{d[0-9]+}}, [x0]
459522 entry:
460523 %0 = extractelement <1 x i64> %b, i32 0
461524 store i64 %0, i64* %a, align 8
471534 ret void
472535 }
473536
537 define void @test_vst1_lane0_f32(float* %a, <2 x float> %b) {
538 ; CHECK-LABEL: test_vst1_lane0_f32:
539 ; CHECK: str {{s[0-9]+}}, [x0]
540 entry:
541 %0 = extractelement <2 x float> %b, i32 0
542 store float %0, float* %a, align 4
543 ret void
544 }
545
474546 define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
475547 ; CHECK-LABEL: test_vst1_lane_f64:
476548 ; CHECK: str {{d[0-9]+}}, [x0]
66 ; CHECK: st1.b
77 %tmp = extractelement <16 x i8> %A, i32 1
88 store i8 %tmp, i8* %D
9 ret void
10 }
11
12 define void @st1lane0_16b(<16 x i8> %A, i8* %D) {
13 ; CHECK-LABEL: st1lane0_16b
14 ; CHECK: st1.b
15 %tmp = extractelement <16 x i8> %A, i32 0
16 store i8 %tmp, i8* %D
17 ret void
18 }
19
20 define void @st1lane0u_16b(<16 x i8> %A, i8* %D) {
21 ; CHECK-LABEL: st1lane0u_16b
22 ; CHECK: st1.b
23 %ptr = getelementptr i8, i8* %D, i64 -1
24 %tmp = extractelement <16 x i8> %A, i32 0
25 store i8 %tmp, i8* %ptr
926 ret void
1027 }
1128
3754 ret void
3855 }
3956
57 define void @st1lane0_8h(<8 x i16> %A, i16* %D) {
58 ; CHECK-LABEL: st1lane0_8h
59 ; CHECK: str
60 %tmp = extractelement <8 x i16> %A, i32 0
61 store i16 %tmp, i16* %D
62 ret void
63 }
64
65 define void @st1lane0u_8h(<8 x i16> %A, i16* %D) {
66 ; CHECK-LABEL: st1lane0u_8h
67 ; CHECK: st1.h
68 %ptr = getelementptr i16, i16* %D, i64 -1
69 %tmp = extractelement <8 x i16> %A, i32 0
70 store i16 %tmp, i16* %ptr
71 ret void
72 }
73
4074 define void @st1lane_ro_8h(<8 x i16> %A, i16* %D, i64 %offset) {
4175 ; CHECK-LABEL: st1lane_ro_8h
4276 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
6498 ret void
6599 }
66100
101 define void @st1lane0_4s(<4 x i32> %A, i32* %D) {
102 ; CHECK-LABEL: st1lane0_4s
103 ; CHECK: str
104 %tmp = extractelement <4 x i32> %A, i32 0
105 store i32 %tmp, i32* %D
106 ret void
107 }
108
109 define void @st1lane0u_4s(<4 x i32> %A, i32* %D) {
110 ; CHECK-LABEL: st1lane0u_4s
111 ; CHECK: st1.s
112 %ptr = getelementptr i32, i32* %D, i64 -1
113 %tmp = extractelement <4 x i32> %A, i32 0
114 store i32 %tmp, i32* %ptr
115 ret void
116 }
117
67118 define void @st1lane_ro_4s(<4 x i32> %A, i32* %D, i64 %offset) {
68119 ; CHECK-LABEL: st1lane_ro_4s
69120 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
91142 ret void
92143 }
93144
145 define void @st1lane0_4s_float(<4 x float> %A, float* %D) {
146 ; CHECK-LABEL: st1lane0_4s_float
147 ; CHECK: str
148 %tmp = extractelement <4 x float> %A, i32 0
149 store float %tmp, float* %D
150 ret void
151 }
152
153 define void @st1lane0u_4s_float(<4 x float> %A, float* %D) {
154 ; CHECK-LABEL: st1lane0u_4s_float
155 ; CHECK: st1.s
156 %ptr = getelementptr float, float* %D, i64 -1
157 %tmp = extractelement <4 x float> %A, i32 0
158 store float %tmp, float* %ptr
159 ret void
160 }
161
94162 define void @st1lane_ro_4s_float(<4 x float> %A, float* %D, i64 %offset) {
95163 ; CHECK-LABEL: st1lane_ro_4s_float
96164 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
118186 ret void
119187 }
120188
189 define void @st1lane0_2d(<2 x i64> %A, i64* %D) {
190 ; CHECK-LABEL: st1lane0_2d
191 ; CHECK: str
192 %tmp = extractelement <2 x i64> %A, i32 0
193 store i64 %tmp, i64* %D
194 ret void
195 }
196
197 define void @st1lane0u_2d(<2 x i64> %A, i64* %D) {
198 ; CHECK-LABEL: st1lane0u_2d
199 ; CHECK: st1.d
200 %ptr = getelementptr i64, i64* %D, i64 -1
201 %tmp = extractelement <2 x i64> %A, i32 0
202 store i64 %tmp, i64* %ptr
203 ret void
204 }
205
121206 define void @st1lane_ro_2d(<2 x i64> %A, i64* %D, i64 %offset) {
122207 ; CHECK-LABEL: st1lane_ro_2d
123208 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
145230 ret void
146231 }
147232
233 define void @st1lane0_2d_double(<2 x double> %A, double* %D) {
234 ; CHECK-LABEL: st1lane0_2d_double
235 ; CHECK: str
236 %tmp = extractelement <2 x double> %A, i32 0
237 store double %tmp, double* %D
238 ret void
239 }
240
241 define void @st1lane0u_2d_double(<2 x double> %A, double* %D) {
242 ; CHECK-LABEL: st1lane0u_2d_double
243 ; CHECK: st1.d
244 %ptr = getelementptr double, double* %D, i64 -1
245 %tmp = extractelement <2 x double> %A, i32 0
246 store double %tmp, double* %ptr
247 ret void
248 }
249
148250 define void @st1lane_ro_2d_double(<2 x double> %A, double* %D, i64 %offset) {
149251 ; CHECK-LABEL: st1lane_ro_2d_double
150252 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
200302 ret void
201303 }
202304
305 define void @st1lane0_4h(<4 x i16> %A, i16* %D) {
306 ; CHECK-LABEL: st1lane0_4h
307 ; CHECK: str
308 %tmp = extractelement <4 x i16> %A, i32 0
309 store i16 %tmp, i16* %D
310 ret void
311 }
312
313 define void @st1lane0u_4h(<4 x i16> %A, i16* %D) {
314 ; CHECK-LABEL: st1lane0u_4h
315 ; CHECK: st1.h
316 %ptr = getelementptr i16, i16* %D, i64 -1
317 %tmp = extractelement <4 x i16> %A, i32 0
318 store i16 %tmp, i16* %ptr
319 ret void
320 }
321
203322 define void @st1lane_ro_4h(<4 x i16> %A, i16* %D, i64 %offset) {
204323 ; CHECK-LABEL: st1lane_ro_4h
205324 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
227346 ret void
228347 }
229348
349 define void @st1lane0_2s(<2 x i32> %A, i32* %D) {
350 ; CHECK-LABEL: st1lane0_2s
351 ; CHECK: str
352 %tmp = extractelement <2 x i32> %A, i32 0
353 store i32 %tmp, i32* %D
354 ret void
355 }
356
357 define void @st1lane0u_2s(<2 x i32> %A, i32* %D) {
358 ; CHECK-LABEL: st1lane0u_2s
359 ; CHECK: st1.s
360 %ptr = getelementptr i32, i32* %D, i64 -1
361 %tmp = extractelement <2 x i32> %A, i32 0
362 store i32 %tmp, i32* %ptr
363 ret void
364 }
365
230366 define void @st1lane_ro_2s(<2 x i32> %A, i32* %D, i64 %offset) {
231367 ; CHECK-LABEL: st1lane_ro_2s
232368 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
254390 ret void
255391 }
256392
393 define void @st1lane0_2s_float(<2 x float> %A, float* %D) {
394 ; CHECK-LABEL: st1lane0_2s_float
395 ; CHECK: str
396 %tmp = extractelement <2 x float> %A, i32 0
397 store float %tmp, float* %D
398 ret void
399 }
400
401 define void @st1lane0u_2s_float(<2 x float> %A, float* %D) {
402 ; CHECK-LABEL: st1lane0u_2s_float
403 ; CHECK: st1.s
404 %ptr = getelementptr float, float* %D, i64 -1
405 %tmp = extractelement <2 x float> %A, i32 0
406 store float %tmp, float* %ptr
407 ret void
408 }
409
257410 define void @st1lane_ro_2s_float(<2 x float> %A, float* %D, i64 %offset) {
258411 ; CHECK-LABEL: st1lane_ro_2s_float
259412 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
270423 %ptr = getelementptr float, float* %D, i64 %offset
271424 %tmp = extractelement <2 x float> %A, i32 0
272425 store float %tmp, float* %ptr
426 ret void
427 }
428
429 define void @st1lane0_1d(<1 x i64> %A, i64* %D) {
430 ; CHECK-LABEL: st1lane0_1d
431 ; CHECK: str
432 %tmp = extractelement <1 x i64> %A, i32 0
433 store i64 %tmp, i64* %D
434 ret void
435 }
436
437 define void @st1lane0u_1d(<1 x i64> %A, i64* %D) {
438 ; CHECK-LABEL: st1lane0u_1d
439 ; CHECK: st1.d
440 %ptr = getelementptr i64, i64* %D, i64 -1
441 %tmp = extractelement <1 x i64> %A, i32 0
442 store i64 %tmp, i64* %ptr
443 ret void
444 }
445
446 define void @st1lane0_ro_1d(<1 x i64> %A, i64* %D, i64 %offset) {
447 ; CHECK-LABEL: st1lane0_ro_1d
448 ; CHECK: str d0, [x0, x1, lsl #3]
449 %ptr = getelementptr i64, i64* %D, i64 %offset
450 %tmp = extractelement <1 x i64> %A, i32 0
451 store i64 %tmp, i64* %ptr
452 ret void
453 }
454
455 define void @st1lane0_1d_double(<1 x double> %A, double* %D) {
456 ; CHECK-LABEL: st1lane0_1d_double
457 ; CHECK: str
458 %tmp = extractelement <1 x double> %A, i32 0
459 store double %tmp, double* %D
460 ret void
461 }
462
463 define void @st1lane0u_1d_double(<1 x double> %A, double* %D) {
464 ; CHECK-LABEL: st1lane0u_1d_double
465 ; CHECK: stur
466 %ptr = getelementptr double, double* %D, i64 -1
467 %tmp = extractelement <1 x double> %A, i32 0
468 store double %tmp, double* %ptr
469 ret void
470 }
471
472 define void @st1lane0_ro_1d_double(<1 x double> %A, double* %D, i64 %offset) {
473 ; CHECK-LABEL: st1lane0_ro_1d_double
474 ; CHECK: str d0, [x0, x1, lsl #3]
475 %ptr = getelementptr double, double* %D, i64 %offset
476 %tmp = extractelement <1 x double> %A, i32 0
477 store double %tmp, double* %ptr
273478 ret void
274479 }
275480
8787 ret void
8888 }
8989
90 define void @store_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
91 ; CHECK-LABEL: store_lane0_64:
92 ; CHECK: str h0, [x0]
93 entry:
94 %0 = extractelement <4 x half> %b, i32 0
95 store half %0, half* %a, align 2
96 ret void
97 }
98
99 define void @storeu_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
100 ; CHECK-LABEL: storeu_lane0_64:
101 ; CHECK: st1 { v0.h }[0], [x{{[0-9]+}}]
102 entry:
103 %0 = getelementptr half, half* %a, i64 -1
104 %1 = extractelement <4 x half> %b, i32 0
105 store half %1, half* %0, align 2
106 ret void
107 }
108
109 define void @storero_lane_64(half* nocapture %a, <4 x half> %b, i64 %c) #1 {
110 ; CHECK-LABEL: storero_lane_64:
111 ; CHECK: st1 { v0.h }[2], [x{{[0-9]+}}]
112 entry:
113 %0 = getelementptr half, half* %a, i64 %c
114 %1 = extractelement <4 x half> %b, i32 2
115 store half %1, half* %0, align 2
116 ret void
117 }
118
119 define void @storero_lane0_64(half* nocapture %a, <4 x half> %b, i64 %c) #1 {
120 ; CHECK-LABEL: storero_lane0_64:
121 ; CHECK: str h0, [x0, x1, lsl #1]
122 entry:
123 %0 = getelementptr half, half* %a, i64 %c
124 %1 = extractelement <4 x half> %b, i32 0
125 store half %1, half* %0, align 2
126 ret void
127 }
128
90129 ; Store from one lane of v8f16
91130 define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
92131 ; CHECK-LABEL: store_lane_128:
94133 entry:
95134 %0 = extractelement <8 x half> %b, i32 5
96135 store half %0, half* %a, align 2
136 ret void
137 }
138
139 define void @store_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
140 ; CHECK-LABEL: store_lane0_128:
141 ; CHECK: str h0, [x0]
142 entry:
143 %0 = extractelement <8 x half> %b, i32 0
144 store half %0, half* %a, align 2
145 ret void
146 }
147
148 define void @storeu_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
149 ; CHECK-LABEL: storeu_lane0_128:
150 ; CHECK: st1 { v0.h }[0], [x{{[0-9]+}}]
151 entry:
152 %0 = getelementptr half, half* %a, i64 -1
153 %1 = extractelement <8 x half> %b, i32 0
154 store half %1, half* %0, align 2
155 ret void
156 }
157
158 define void @storero_lane_128(half* nocapture %a, <8 x half> %b, i64 %c) #1 {
159 ; CHECK-LABEL: storero_lane_128:
160 ; CHECK: st1 { v0.h }[4], [x{{[0-9]+}}]
161 entry:
162 %0 = getelementptr half, half* %a, i64 %c
163 %1 = extractelement <8 x half> %b, i32 4
164 store half %1, half* %0, align 2
165 ret void
166 }
167
168 define void @storero_lane0_128(half* nocapture %a, <8 x half> %b, i64 %c) #1 {
169 ; CHECK-LABEL: storero_lane0_128:
170 ; CHECK: str h0, [x0, x1, lsl #1]
171 entry:
172 %0 = getelementptr half, half* %a, i64 %c
173 %1 = extractelement <8 x half> %b, i32 0
174 store half %1, half* %0, align 2
97175 ret void
98176 }
99177