llvm.org GIT mirror llvm / 6030329
AMDGPU: Improve extract_vector_elt reduction combine. Handle fmul and fsub, and preserve flags. Also really test minnum/maxnum reductions: the existing tests only checked minnum/maxnum as matched from a fast-math compare and select, which is not the same thing. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@339820 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 1 year, 6 months ago
3 changed file(s) with 151 addition(s) and 48 deletion(s). Raw diff Collapse all Expand all
73487348 return SDValue();
73497349 // TODO: Support other binary operations.
73507350 case ISD::FADD:
7351 case ISD::FSUB:
7352 case ISD::FMUL:
73517353 case ISD::ADD:
73527354 case ISD::UMIN:
73537355 case ISD::UMAX:
73547356 case ISD::SMIN:
73557357 case ISD::SMAX:
73567358 case ISD::FMAXNUM:
7357 case ISD::FMINNUM:
7358 return DAG.getNode(Opc, SL, EltVT,
7359 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7360 Vec.getOperand(0), Idx),
7361 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7362 Vec.getOperand(1), Idx));
7359 case ISD::FMINNUM: {
7360 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7361 Vec.getOperand(0), Idx);
7362 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7363 Vec.getOperand(1), Idx);
7364
7365 DCI.AddToWorklist(Elt0.getNode());
7366 DCI.AddToWorklist(Elt1.getNode());
7367 return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
7368 }
73637369 }
73647370 }
73657371
782782
783783 ; GCN-LABEL: {{^}}v_test_canonicalize_extract_element_v2f16:
784784 ; GFX9: s_waitcnt
785 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, 4.0 op_sel_hi:[1,0]
785 ; GFX9-NEXT: v_mul_f16_e32 v0, 4.0, v0
786786 ; GFX9-NEXT: s_setpc_b64
787787 define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) {
788788 %vec.op = fmul <2 x half> %vec,
0 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
2
3 ; GCN-LABEL: {{^}}reduction_half4:
0 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
2
3 ; GCN-LABEL: {{^}}reduction_fadd_v4f16:
44 ; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
55 ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
66
77 ; VI: v_add_f16_sdwa
88 ; VI-NEXT: v_add_f16_e32
99 ; VI-NEXT: v_add_f16_e32
10 define half @reduction_half4(<4 x half> %vec4) {
11 entry:
12 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32>
13 %bin.rdx = fadd fast <4 x half> %vec4, %rdx.shuf
10 define half @reduction_fadd_v4f16(<4 x half> %vec4) {
11 entry:
12 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32>
13 %bin.rdx = fadd <4 x half> %vec4, %rdx.shuf
1414 %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32>
15 %bin.rdx2 = fadd fast <4 x half> %bin.rdx, %rdx.shuf1
15 %bin.rdx2 = fadd <4 x half> %bin.rdx, %rdx.shuf1
16 %res = extractelement <4 x half> %bin.rdx2, i32 0
17 ret half %res
18 }
19
20 ; GCN-LABEL: {{^}}reduction_fsub_v4f16:
21 ; GFX9: s_waitcnt
22 ; GFX9-NEXT: v_pk_add_f16 [[ADD:v[0-9]+]], v0, v1 neg_lo:[0,1] neg_hi:[0,1]{{$}}
23 ; GFX9-NEXT: v_sub_f16_sdwa v0, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
24 ; GFX9-NEXT: s_setpc_b64
25
26 ; VI: v_sub_f16_sdwa
27 ; VI-NEXT: v_sub_f16_e32
28 ; VI-NEXT: v_sub_f16_e32
29 ; VI-NEXT: s_setpc_b64
30 define half @reduction_fsub_v4f16(<4 x half> %vec4) {
31 entry:
32 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32>
33 %bin.rdx = fsub <4 x half> %vec4, %rdx.shuf
34 %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32>
35 %bin.rdx2 = fsub <4 x half> %bin.rdx, %rdx.shuf1
36 %res = extractelement <4 x half> %bin.rdx2, i32 0
37 ret half %res
38 }
39
40 ; Make sure nsz is preserved when the operations are split.
41 ; GCN-LABEL: {{^}}reduction_fsub_v4f16_preserve_fmf:
42 ; GFX9: s_waitcnt
43 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]{{$}}
44 ; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
45 ; GFX9-NEXT: s_setpc_b64
46
47 ; VI: s_waitcnt
48 ; VI-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
49 ; VI-NEXT: v_sub_f16_e32 v0, v0, v1
50 ; VI-NEXT: v_sub_f16_e32 v0, v2, v0
51 ; VI-NEXT: s_setpc_b64
52 define half @reduction_fsub_v4f16_preserve_fmf(<4 x half> %vec4) {
53 entry:
54 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32>
55 %bin.rdx = fsub nsz <4 x half> %vec4, %rdx.shuf
56 %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32>
57 %bin.rdx2 = fsub nsz <4 x half> %bin.rdx, %rdx.shuf1
58 %res = extractelement <4 x half> %bin.rdx2, i32 0
59 %neg.res = fsub half -0.0, %res
60 ret half %neg.res
61 }
62
63 ; GCN-LABEL: {{^}}reduction_fmul_half4:
64 ; GFX9: v_pk_mul_f16 [[MUL:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
65 ; GFX9-NEXT: v_mul_f16_sdwa v{{[0-9]+}}, [[MUL]], [[MUL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
66
67 ; VI: v_mul_f16_sdwa
68 ; VI-NEXT: v_mul_f16_e32
69 ; VI-NEXT: v_mul_f16_e32
70 define half @reduction_fmul_half4(<4 x half> %vec4) {
71 entry:
72 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32>
73 %bin.rdx = fmul <4 x half> %vec4, %rdx.shuf
74 %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32>
75 %bin.rdx2 = fmul <4 x half> %bin.rdx, %rdx.shuf1
1676 %res = extractelement <4 x half> %bin.rdx2, i32 0
1777 ret half %res
1878 }
51111 define half @reduction_half8(<8 x half> %vec8) {
52112 entry:
53113 %rdx.shuf = shufflevector <8 x half> %vec8, <8 x half> undef, <8 x i32>
54 %bin.rdx = fadd fast <8 x half> %vec8, %rdx.shuf
114 %bin.rdx = fadd <8 x half> %vec8, %rdx.shuf
55115 %rdx.shuf1 = shufflevector <8 x half> %bin.rdx, <8 x half> undef, <8 x i32>
56 %bin.rdx2 = fadd fast <8 x half> %bin.rdx, %rdx.shuf1
116 %bin.rdx2 = fadd <8 x half> %bin.rdx, %rdx.shuf1
57117 %rdx.shuf3 = shufflevector <8 x half> %bin.rdx2, <8 x half> undef, <8 x i32>
58 %bin.rdx4 = fadd fast <8 x half> %bin.rdx2, %rdx.shuf3
118 %bin.rdx4 = fadd <8 x half> %bin.rdx2, %rdx.shuf3
59119 %res = extractelement <8 x half> %bin.rdx4, i32 0
60120 ret half %res
61121 }
62122
63123 ; GCN-LABEL: {{^}}reduction_v8i16:
64 ; GFX9: v_pk_add_u16 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
65 ; GFX9-NEXT: v_pk_add_u16 [[ADD2]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
66 ; GFX9-NEXT: v_pk_add_u16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}}
124 ; GFX9: v_pk_add_u16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
125 ; GFX9-NEXT: v_pk_add_u16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
126 ; GFX9-NEXT: v_pk_add_u16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
67127 ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
68128
69129 ; VI: v_add_u16_sdwa
91151 ; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
92152 ; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
93153 ; GFX9: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
94 ; GFX9-NEXT: v_pk_add_f16 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
95 ; GFX9-NEXT: v_pk_add_f16 [[ADD2]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
96 ; GFX9-NEXT: v_pk_add_f16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}}
154 ; GFX9-NEXT: v_pk_add_f16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
155 ; GFX9-NEXT: v_pk_add_f16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
156 ; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
97157 ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
98158
99159 ; VI: v_add_f16_sdwa
115175 define half @reduction_half16(<16 x half> %vec16) {
116176 entry:
117177 %rdx.shuf = shufflevector <16 x half> %vec16, <16 x half> undef, <16 x i32>
118 %bin.rdx = fadd fast <16 x half> %vec16, %rdx.shuf
178 %bin.rdx = fadd <16 x half> %vec16, %rdx.shuf
119179 %rdx.shuf1 = shufflevector <16 x half> %bin.rdx, <16 x half> undef, <16 x i32>
120 %bin.rdx2 = fadd fast <16 x half> %bin.rdx, %rdx.shuf1
180 %bin.rdx2 = fadd <16 x half> %bin.rdx, %rdx.shuf1
121181 %rdx.shuf3 = shufflevector <16 x half> %bin.rdx2, <16 x half> undef, <16 x i32>
122 %bin.rdx4 = fadd fast <16 x half> %bin.rdx2, %rdx.shuf3
182 %bin.rdx4 = fadd <16 x half> %bin.rdx2, %rdx.shuf3
123183 %rdx.shuf5 = shufflevector <16 x half> %bin.rdx4, <16 x half> undef, <16 x i32>
124 %bin.rdx6 = fadd fast <16 x half> %bin.rdx4, %rdx.shuf5
184 %bin.rdx6 = fadd <16 x half> %bin.rdx4, %rdx.shuf5
125185 %res = extractelement <16 x half> %bin.rdx6, i32 0
126186 ret half %res
127187 }
372432 ret i16 %res
373433 }
374434
375 ; GCN-LABEL: {{^}}reduction_fmax_v4half:
435 ; GCN-LABEL: {{^}}reduction_maxnum_v4f16:
376436 ; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
377437 ; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
378438
379439 ; VI: v_max_f16_sdwa
380440 ; VI-NEXT: v_max_f16_e32
381441 ; VI-NEXT: v_max_f16_e32
382 define half @reduction_fmax_v4half(<4 x half> %vec4) {
383 entry:
384 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32>
385 %rdx.minmax.cmp = fcmp fast ogt <4 x half> %vec4, %rdx.shuf
386 %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
387 %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32>
388 %rdx.minmax.cmp2 = fcmp fast ogt <4 x half> %rdx.minmax.select, %rdx.shuf1
389 %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
390 %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
391 ret half %res
392 }
393
394 ; GCN-LABEL: {{^}}reduction_fmin_v4half:
442 define half @reduction_maxnum_v4f16(<4 x half> %vec4) {
443 entry:
444 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32>
445 %rdx.minmax = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf)
446 %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> undef, <4 x i32>
447 %rdx.minmax3 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1)
448 %res = extractelement <4 x half> %rdx.minmax3, i32 0
449 ret half %res
450 }
451
452 ; GCN-LABEL: {{^}}reduction_minnum_v4f16:
395453 ; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
396454 ; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
397455
398456 ; VI: v_min_f16_sdwa
399457 ; VI-NEXT: v_min_f16_e32
400458 ; VI-NEXT: v_min_f16_e32
401 define half @reduction_fmin_v4half(<4 x half> %vec4) {
402 entry:
403 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32>
404 %rdx.minmax.cmp = fcmp fast olt <4 x half> %vec4, %rdx.shuf
459 define half @reduction_minnum_v4f16(<4 x half> %vec4) {
460 entry:
461 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32>
462 %rdx.minmax = call <4 x half> @llvm.minnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf)
463 %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> undef, <4 x i32>
464 %rdx.minmax3 = call <4 x half> @llvm.minnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1)
465 %res = extractelement <4 x half> %rdx.minmax3, i32 0
466 ret half %res
467 }
468
469 ; GCN-LABEL: {{^}}reduction_fast_max_pattern_v4f16:
470 ; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
471 ; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
472
473 ; VI: v_max_f16_sdwa
474 ; VI-NEXT: v_max_f16_e32
475 ; VI-NEXT: v_max_f16_e32
476 define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) {
477 entry:
478 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32>
479 %rdx.minmax.cmp = fcmp nnan nsz ogt <4 x half> %vec4, %rdx.shuf
405480 %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
406481 %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32>
407 %rdx.minmax.cmp2 = fcmp fast olt <4 x half> %rdx.minmax.select, %rdx.shuf1
482 %rdx.minmax.cmp2 = fcmp nnan nsz ogt <4 x half> %rdx.minmax.select, %rdx.shuf1
408483 %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
409484 %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
410485 ret half %res
411486 }
487
488 ; GCN-LABEL: {{^}}reduction_fast_min_pattern_v4f16:
489 ; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
490 ; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
491
492 ; VI: v_min_f16_sdwa
493 ; VI-NEXT: v_min_f16_e32
494 ; VI-NEXT: v_min_f16_e32
495 define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) {
496 entry:
497 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32>
498 %rdx.minmax.cmp = fcmp nnan nsz olt <4 x half> %vec4, %rdx.shuf
499 %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
500 %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32>
501 %rdx.minmax.cmp2 = fcmp nnan nsz olt <4 x half> %rdx.minmax.select, %rdx.shuf1
502 %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
503 %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
504 ret half %res
505 }
506
507 declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>)
508 declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>)