llvm.org GIT mirror llvm / 596516b
[X86] Add broadcast instructions to the table used by ExeDepsFix pass.

Adds the different broadcast instructions to the ReplaceableInstrsAVX2 table. That way the ExeDepsFix pass can make better decisions when AVX2 broadcasts cross domains (int <-> float).

In particular, prior to this patch we were generating:
  vpbroadcastd LCPI1_0(%rip), %ymm2
  vpand %ymm2, %ymm0, %ymm0
  vmaxps %ymm1, %ymm0, %ymm0 ## <- domain change penalty

Now, we generate the following nice sequence where everything is in the float domain:
  vbroadcastss LCPI1_0(%rip), %ymm2
  vandps %ymm2, %ymm0, %ymm0
  vmaxps %ymm1, %ymm0, %ymm0

<rdar://problem/16354675>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@204770 91177308-0d34-0410-b5e6-96231b3b80d8

Quentin Colombet 5 years ago
4 changed file(s) with 144 addition(s) and 10 deletion(s). Raw diff Collapse all Expand all
51245124 { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm },
51255125 { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr },
51265126 { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm },
5127 { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr }
5127 { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr },
5128 { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
5129 { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr},
5130 { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
5131 { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
5132 { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
5133 { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm}
51285134 };
51295135
51305136 // FIXME: Some shuffle and unpack instructions have equivalents in different
752752
753753
754754 define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) {
755 ; CHECK: vpbroadcastd
755 ; CHECK: vbroadcastss
756756 %res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1]
757757 ret <4 x i32> %res
758758 }
760760
761761
762762 define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) {
763 ; CHECK: vpbroadcastd
763 ; CHECK: vbroadcastss {{[^,]+}}, %ymm{{[0-9]+}}
764764 %res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0) ; <<8 x i32>> [#uses=1]
765765 ret <8 x i32> %res
766766 }
776776
777777
778778 define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) {
779 ; CHECK: vpbroadcastq
779 ; CHECK: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}}
780780 %res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0) ; <<4 x i64>> [#uses=1]
781781 ret <4 x i64> %res
782782 }
9797 %qf = insertelement <16 x i16> %qe, i16 %q, i32 15
9898 ret <16 x i16> %qf
9999 }
100 ; CHECK: vpbroadcastd (%
100 ; CHECK: vbroadcastss (%
101101 define <4 x i32> @D32(i32* %ptr) nounwind uwtable readnone ssp {
102102 entry:
103103 %q = load i32* %ptr, align 4
107107 %q3 = insertelement <4 x i32> %q2, i32 %q, i32 3
108108 ret <4 x i32> %q3
109109 }
110 ; CHECK: vpbroadcastd (%
110 ; CHECK: vbroadcastss (%
111111 define <8 x i32> @DD32(i32* %ptr) nounwind uwtable readnone ssp {
112112 entry:
113113 %q = load i32* %ptr, align 4
129129 %q1 = insertelement <2 x i64> %q0, i64 %q, i32 1
130130 ret <2 x i64> %q1
131131 }
132 ; CHECK: vpbroadcastq (%
132 ; CHECK: vbroadcastsd (%
133133 define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp {
134134 entry:
135135 %q = load i64* %ptr, align 4
292292
293293
294294 ;CHECK-LABEL: _inreg4xi64:
295 ;CHECK: vpbroadcastq
295 ;CHECK: vbroadcastsd
296296 ;CHECK: ret
297297 define <4 x i64> @_inreg4xi64(<4 x i64> %a) {
298298 %b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> zeroinitializer
324324 }
325325
326326 ;CHECK-LABEL: _inreg8xi32:
327 ;CHECK: vpbroadcastd
327 ;CHECK: vbroadcastss
328328 ;CHECK: ret
329329 define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
330330 %b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> zeroinitializer
332332 }
333333
334334 ;CHECK-LABEL: _inreg4xi32:
335 ;CHECK: vpbroadcastd
335 ;CHECK: vbroadcastss
336336 ;CHECK: ret
337337 define <4 x i32> @_inreg4xi32(<4 x i32> %a) {
338338 %b = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> zeroinitializer
0 ; RUN: llc -O3 -mtriple=x86_64-apple-macosx -o - < %s -mattr=+avx2 -enable-unsafe-fp-math -mcpu=core2 | FileCheck %s
1 ; Check that the ExeDepsFix pass correctly fixes the domain for broadcast instructions.
2 ;
3
4 ; CHECK-LABEL: ExeDepsFix_broadcastss
5 ; CHECK: broadcastss
6 ; CHECK: vandps
7 ; CHECK: vmaxps
8 ; CHECK: ret
9 define <4 x float> @ExeDepsFix_broadcastss(<4 x float> %arg, <4 x float> %arg2) {
10 %bitcast = bitcast <4 x float> %arg to <4 x i32>
11 %and = and <4 x i32> %bitcast, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
12 %floatcast = bitcast <4 x i32> %and to <4 x float>
13 %max_is_x = fcmp oge <4 x float> %floatcast, %arg2
14 %max = select <4 x i1> %max_is_x, <4 x float> %floatcast, <4 x float> %arg2
15 ret <4 x float> %max
16 }
17
18 ; CHECK-LABEL: ExeDepsFix_broadcastss256
19 ; CHECK: broadcastss
20 ; CHECK: vandps
21 ; CHECK: vmaxps
22 ; CHECK: ret
23 define <8 x float> @ExeDepsFix_broadcastss256(<8 x float> %arg, <8 x float> %arg2) {
24 %bitcast = bitcast <8 x float> %arg to <8 x i32>
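25 %and = and <8 x i32> %bitcast, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>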
26 %floatcast = bitcast <8 x i32> %and to <8 x float>
27 %max_is_x = fcmp oge <8 x float> %floatcast, %arg2
28 %max = select <8 x i1> %max_is_x, <8 x float> %floatcast, <8 x float> %arg2
29 ret <8 x float> %max
30 }
31
32
33 ; CHECK-LABEL: ExeDepsFix_broadcastss_inreg
34 ; CHECK: broadcastss
35 ; CHECK: vandps
36 ; CHECK: vmaxps
37 ; CHECK: ret
38 define <4 x float> @ExeDepsFix_broadcastss_inreg(<4 x float> %arg, <4 x float> %arg2, i32 %broadcastvalue) {
39 %bitcast = bitcast <4 x float> %arg to <4 x i32>
40 %in = insertelement <4 x i32> undef, i32 %broadcastvalue, i32 0
41 %mask = shufflevector <4 x i32> %in, <4 x i32> undef, <4 x i32> zeroinitializer
42 %and = and <4 x i32> %bitcast, %mask
43 %floatcast = bitcast <4 x i32> %and to <4 x float>
44 %max_is_x = fcmp oge <4 x float> %floatcast, %arg2
45 %max = select <4 x i1> %max_is_x, <4 x float> %floatcast, <4 x float> %arg2
46 ret <4 x float> %max
47 }
48
49 ; CHECK-LABEL: ExeDepsFix_broadcastss256_inreg
50 ; CHECK: broadcastss
51 ; CHECK: vandps
52 ; CHECK: vmaxps
53 ; CHECK: ret
54 define <8 x float> @ExeDepsFix_broadcastss256_inreg(<8 x float> %arg, <8 x float> %arg2, i32 %broadcastvalue) {
55 %bitcast = bitcast <8 x float> %arg to <8 x i32>
56 %in = insertelement <8 x i32> undef, i32 %broadcastvalue, i32 0
57 %mask = shufflevector <8 x i32> %in, <8 x i32> undef, <8 x i32> zeroinitializer
58 %and = and <8 x i32> %bitcast, %mask
59 %floatcast = bitcast <8 x i32> %and to <8 x float>
60 %max_is_x = fcmp oge <8 x float> %floatcast, %arg2
61 %max = select <8 x i1> %max_is_x, <8 x float> %floatcast, <8 x float> %arg2
62 ret <8 x float> %max
63 }
64
65 ; CHECK-LABEL: ExeDepsFix_broadcastsd
66 ; In this case the broadcast is folded directly into vandpd.
67 ; CHECK: vandpd
68 ; CHECK: vmaxpd
69 ; CHECK:ret
70 define <2 x double> @ExeDepsFix_broadcastsd(<2 x double> %arg, <2 x double> %arg2) {
71 %bitcast = bitcast <2 x double> %arg to <2 x i64>
72 %and = and <2 x i64> %bitcast, <i64 2147483647, i64 2147483647>
73 %floatcast = bitcast <2 x i64> %and to <2 x double>
74 %max_is_x = fcmp oge <2 x double> %floatcast, %arg2
75 %max = select <2 x i1> %max_is_x, <2 x double> %floatcast, <2 x double> %arg2
76 ret <2 x double> %max
77 }
78
79 ; CHECK-LABEL: ExeDepsFix_broadcastsd256
80 ; CHECK: broadcastsd
81 ; CHECK: vandpd
82 ; CHECK: vmaxpd
83 ; CHECK: ret
84 define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %arg2) {
85 %bitcast = bitcast <4 x double> %arg to <4 x i64>
86 %and = and <4 x i64> %bitcast, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
87 %floatcast = bitcast <4 x i64> %and to <4 x double>
88 %max_is_x = fcmp oge <4 x double> %floatcast, %arg2
89 %max = select <4 x i1> %max_is_x, <4 x double> %floatcast, <4 x double> %arg2
90 ret <4 x double> %max
91 }
92
93
94 ; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
95 ; ExeDepsFix works top-down, so it coalesces the vmovlhps domain with
96 ; vandps, and nothing more can be done to match vmaxpd.
97 ; CHECK: vmovlhps
98 ; CHECK: vandps
99 ; CHECK: vmaxpd
100 ; CHECK: ret
101 define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
102 %bitcast = bitcast <2 x double> %arg to <2 x i64>
103 %in = insertelement <2 x i64> undef, i64 %broadcastvalue, i32 0
104 %mask = shufflevector <2 x i64> %in, <2 x i64> undef, <2 x i32> zeroinitializer
105 %and = and <2 x i64> %bitcast, %mask
106 %floatcast = bitcast <2 x i64> %and to <2 x double>
107 %max_is_x = fcmp oge <2 x double> %floatcast, %arg2
108 %max = select <2 x i1> %max_is_x, <2 x double> %floatcast, <2 x double> %arg2
109 ret <2 x double> %max
110 }
111
112 ; CHECK-LABEL: ExeDepsFix_broadcastsd256_inreg
113 ; CHECK: broadcastsd
114 ; CHECK: vandpd
115 ; CHECK: vmaxpd
116 ; CHECK: ret
117 define <4 x double> @ExeDepsFix_broadcastsd256_inreg(<4 x double> %arg, <4 x double> %arg2, i64 %broadcastvalue) {
118 %bitcast = bitcast <4 x double> %arg to <4 x i64>
119 %in = insertelement <4 x i64> undef, i64 %broadcastvalue, i32 0
120 %mask = shufflevector <4 x i64> %in, <4 x i64> undef, <4 x i32> zeroinitializer
121 %and = and <4 x i64> %bitcast, %mask
122 %floatcast = bitcast <4 x i64> %and to <4 x double>
123 %max_is_x = fcmp oge <4 x double> %floatcast, %arg2
124 %max = select <4 x i1> %max_is_x, <4 x double> %floatcast, <4 x double> %arg2
125 ret <4 x double> %max
126 }
127