llvm.org GIT mirror llvm / 4ea4968
Recognize unpckh* masks and match 256-bit versions. The new versions are different from the previous 128-bit because they work in lanes. Update a few comments and add testcases. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@136157 91177308-0d34-0410-b5e6-96231b3b80d8 Bruno Cardoso Lopes 8 years ago
11 changed file(s) with 150 addition(s) and 134 deletion(s). Raw diff Collapse all Expand all
166166 SmallVectorImpl<unsigned> &ShuffleMask) {
167167 unsigned NumElts = VT.getVectorNumElements();
168168
169 // Handle vector lengths > 128 bits. Define a "section" as a set of
170 // 128 bits. AVX defines UNPCK* to operate independently on 128-bit
171 // sections.
172 unsigned NumSections = VT.getSizeInBits() / 128;
173 if (NumSections == 0 ) NumSections = 1; // Handle MMX
174 unsigned NumSectionElts = NumElts / NumSections;
169 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
170 // independently on 128-bit lanes.
171 unsigned NumLanes = VT.getSizeInBits() / 128;
172 if (NumLanes == 0 ) NumLanes = 1; // Handle MMX
173 unsigned NumLaneElts = NumElts / NumLanes;
175174
176175 unsigned Start = 0;
177 unsigned End = NumSectionElts / 2;
178 for (unsigned s = 0; s < NumSections; ++s) {
176 unsigned End = NumLaneElts / 2;
177 for (unsigned s = 0; s < NumLanes; ++s) {
179178 for (unsigned i = Start; i != End; ++i) {
180179 ShuffleMask.push_back(i); // Reads from dest/src1
181 ShuffleMask.push_back(i+NumSectionElts); // Reads from src/src2
180 ShuffleMask.push_back(i+NumLaneElts); // Reads from src/src2
182181 }
183182 // Process the next 128 bits.
184 Start += NumSectionElts;
185 End += NumSectionElts;
183 Start += NumLaneElts;
184 End += NumLaneElts;
186185 }
187186 }
188187
27102710 case X86ISD::PUNPCKLQDQ:
27112711 case X86ISD::UNPCKHPS:
27122712 case X86ISD::UNPCKHPD:
2713 case X86ISD::VUNPCKHPSY:
2714 case X86ISD::VUNPCKHPDY:
27132715 case X86ISD::PUNPCKHWD:
27142716 case X86ISD::PUNPCKHBW:
27152717 case X86ISD::PUNPCKHDQ:
27812783 case X86ISD::PUNPCKLQDQ:
27822784 case X86ISD::UNPCKHPS:
27832785 case X86ISD::UNPCKHPD:
2786 case X86ISD::VUNPCKHPSY:
2787 case X86ISD::VUNPCKHPDY:
27842788 case X86ISD::PUNPCKHWD:
27852789 case X86ISD::PUNPCKHBW:
27862790 case X86ISD::PUNPCKHDQ:
32183222 static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
32193223 bool V2IsSplat = false) {
32203224 int NumElts = VT.getVectorNumElements();
3221 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
3225
3226 assert((VT.is128BitVector() || VT.is256BitVector()) &&
3227 "Unsupported vector type for unpckh");
3228
3229 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
32223230 return false;
32233231
3224 // Handle vector lengths > 128 bits. Define a "section" as a set of
3225 // 128 bits. AVX defines UNPCK* to operate independently on 128-bit
3226 // sections.
3227 unsigned NumSections = VT.getSizeInBits() / 128;
3228 if (NumSections == 0 ) NumSections = 1; // Handle MMX
3229 unsigned NumSectionElts = NumElts / NumSections;
3232 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3233 // independently on 128-bit lanes.
3234 unsigned NumLanes = VT.getSizeInBits()/128;
3235 unsigned NumLaneElts = NumElts/NumLanes;
32303236
32313237 unsigned Start = 0;
3232 unsigned End = NumSectionElts;
3233 for (unsigned s = 0; s < NumSections; ++s) {
3234 for (unsigned i = Start, j = s * NumSectionElts;
3238 unsigned End = NumLaneElts;
3239 for (unsigned s = 0; s < NumLanes; ++s) {
3240 for (unsigned i = Start, j = s * NumLaneElts;
32353241 i != End;
32363242 i += 2, ++j) {
32373243 int BitI = Mask[i];
32473253 }
32483254 }
32493255 // Process the next 128 bits.
3250 Start += NumSectionElts;
3251 End += NumSectionElts;
3256 Start += NumLaneElts;
3257 End += NumLaneElts;
32523258 }
32533259
32543260 return true;
32653271 static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
32663272 bool V2IsSplat = false) {
32673273 int NumElts = VT.getVectorNumElements();
3268 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
3274
3275 assert((VT.is128BitVector() || VT.is256BitVector()) &&
3276 "Unsupported vector type for unpckh");
3277
3278 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
32693279 return false;
32703280
3271 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
3272 int BitI = Mask[i];
3273 int BitI1 = Mask[i+1];
3274 if (!isUndefOrEqual(BitI, j + NumElts/2))
3275 return false;
3276 if (V2IsSplat) {
3277 if (isUndefOrEqual(BitI1, NumElts))
3281 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3282 // independently on 128-bit lanes.
3283 unsigned NumLanes = VT.getSizeInBits()/128;
3284 unsigned NumLaneElts = NumElts/NumLanes;
3285
3286 unsigned Start = 0;
3287 unsigned End = NumLaneElts;
3288 for (unsigned l = 0; l != NumLanes; ++l) {
3289 for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2;
3290 i != End; i += 2, ++j) {
3291 int BitI = Mask[i];
3292 int BitI1 = Mask[i+1];
3293 if (!isUndefOrEqual(BitI, j))
32783294 return false;
3279 } else {
3280 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
3281 return false;
3282 }
3295 if (V2IsSplat) {
3296 if (isUndefOrEqual(BitI1, NumElts))
3297 return false;
3298 } else {
3299 if (!isUndefOrEqual(BitI1, j+NumElts))
3300 return false;
3301 }
3302 }
3303 // Process the next 128 bits.
3304 Start += NumLaneElts;
3305 End += NumLaneElts;
32833306 }
32843307 return true;
32853308 }
32983321 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
32993322 return false;
33003323
3301 // Handle vector lengths > 128 bits. Define a "section" as a set of
3302 // 128 bits. AVX defines UNPCK* to operate independently on 128-bit
3303 // sections.
3304 unsigned NumSections = VT.getSizeInBits() / 128;
3305 if (NumSections == 0 ) NumSections = 1; // Handle MMX
3306 unsigned NumSectionElts = NumElems / NumSections;
3307
3308 for (unsigned s = 0; s < NumSections; ++s) {
3309 for (unsigned i = s * NumSectionElts, j = s * NumSectionElts;
3310 i != NumSectionElts * (s + 1);
3324 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3325 // independently on 128-bit lanes.
3326 unsigned NumLanes = VT.getSizeInBits() / 128;
3327 unsigned NumLaneElts = NumElems / NumLanes;
3328
3329 for (unsigned s = 0; s < NumLanes; ++s) {
3330 for (unsigned i = s * NumLaneElts, j = s * NumLaneElts;
3331 i != NumLaneElts * (s + 1);
33113332 i += 2, ++j) {
33123333 int BitI = Mask[i];
33133334 int BitI1 = Mask[i+1];
40944115 break;
40954116 case X86ISD::UNPCKHPS:
40964117 case X86ISD::UNPCKHPD:
4118 case X86ISD::VUNPCKHPSY:
4119 case X86ISD::VUNPCKHPDY:
40974120 DecodeUNPCKHPMask(NumElems, ShuffleMask);
40984121 break;
40994122 case X86ISD::PUNPCKLBW:
57505773 case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
57515774 case MVT::v4f32: return X86ISD::UNPCKHPS;
57525775 case MVT::v2f64: return X86ISD::UNPCKHPD;
5776 case MVT::v8f32: return X86ISD::VUNPCKHPSY;
5777 case MVT::v4f64: return X86ISD::VUNPCKHPDY;
57535778 case MVT::v16i8: return X86ISD::PUNPCKHBW;
57545779 case MVT::v8i16: return X86ISD::PUNPCKHWD;
57555780 default:
1259612621 case X86ISD::PUNPCKHQDQ:
1259712622 case X86ISD::UNPCKHPS:
1259812623 case X86ISD::UNPCKHPD:
12624 case X86ISD::VUNPCKHPSY:
12625 case X86ISD::VUNPCKHPDY:
1259912626 case X86ISD::PUNPCKLBW:
1260012627 case X86ISD::PUNPCKLWD:
1260112628 case X86ISD::PUNPCKLDQ:
260260 VUNPCKLPDY,
261261 UNPCKHPS,
262262 UNPCKHPD,
263 VUNPCKHPSY,
264 VUNPCKHPDY,
263265 PUNPCKLBW,
264266 PUNPCKLWD,
265267 PUNPCKLDQ,
132132 def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
133133 def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
134134
135 def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>;
136 def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>;
135 def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>;
136 def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>;
137137 def X86Unpcklpsy : SDNode<"X86ISD::VUNPCKLPSY", SDTShuff2Op>;
138138 def X86Unpcklpdy : SDNode<"X86ISD::VUNPCKLPDY", SDTShuff2Op>;
139 def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>;
140 def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>;
139
140 def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>;
141 def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>;
142 def X86Unpckhpsy : SDNode<"X86ISD::VUNPCKHPSY", SDTShuff2Op>;
143 def X86Unpckhpdy : SDNode<"X86ISD::VUNPCKHPDY", SDTShuff2Op>;
141144
142145 def X86Punpcklbw : SDNode<"X86ISD::PUNPCKLBW", SDTShuff2Op>;
143146 def X86Punpcklwd : SDNode<"X86ISD::PUNPCKLWD", SDTShuff2Op>;
56765676 def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)),
56775677 (UNPCKHPSrr VR128:$src1, VR128:$src2)>;
56785678
5679 // Shuffle with VUNPCKHPSY
5680 def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, (memopv8f32 addr:$src2))),
5681 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>;
5682 def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, VR256:$src2)),
5683 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>;
5684
56795685 // Shuffle with UNPCKLPD
56805686 def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))),
56815687 (VUNPCKLPDrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>;
57015707 (VUNPCKHPDrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>;
57025708 def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)),
57035709 (UNPCKHPDrr VR128:$src1, VR128:$src2)>;
5710
5711 // Shuffle with VUNPCKHPDY
5712 def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, (memopv4f64 addr:$src2))),
5713 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>;
5714 def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, VR256:$src2)),
5715 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>;
57045716
57055717 // Shuffle with MOVLHPS
57065718 def : Pat<(X86Movlhps VR128:$src1,
+0
-5
test/CodeGen/X86/SIMD/dg.exp less more
None load_lib llvm.exp
1
2 if { [llvm_supports_target X86] } {
3 RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp}]]
4 }
+0
-20
test/CodeGen/X86/SIMD/notvunpcklpd.ll less more
None ; RUN: llc < %s -mattr=+avx | FileCheck %s
1
2 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
3 target triple = "x86_64-unknown-linux-gnu"
4
5 define void @try_([2 x <4 x double>]* noalias %incarray, [2 x <4 x double>]* noalias %incarrayb ) {
6 entry:
7 %incarray1 = alloca [2 x <4 x double>]*, align 8
8 %incarrayb1 = alloca [2 x <4 x double>]*, align 8
9 %carray = alloca [2 x <4 x double>], align 16
10 %r = getelementptr [2 x <4 x double>]* %incarray, i32 0, i32 0
11 %rb = getelementptr [2 x <4 x double>]* %incarrayb, i32 0, i32 0
12 %r3 = load <4 x double>* %r, align 8
13 %r4 = load <4 x double>* %rb, align 8
14 %r11 = shufflevector <4 x double> %r3, <4 x double> %r4, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x double>> [#uses=1]
15 ; CHECK-NOT: vunpcklpd %ymm
16 %r12 = getelementptr [2 x <4 x double>]* %carray, i32 0, i32 1
17 store <4 x double> %r11, <4 x double>* %r12, align 4
18 ret void
19 }
+0
-20
test/CodeGen/X86/SIMD/notvunpcklps.ll less more
None ; RUN: llc < %s -mattr=+avx | FileCheck %s
1
2 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
3 target triple = "x86_64-unknown-linux-gnu"
4
5 define void @try_([2 x <8 x float>]* noalias %incarray, [2 x <8 x float>]* noalias %incarrayb ) {
6 enmtry:
7 %incarray1 = alloca [2 x <8 x float>]*, align 8
8 %incarrayb1 = alloca [2 x <8 x float>]*, align 8
9 %carray = alloca [2 x <8 x float>], align 16
10 %r = getelementptr [2 x <8 x float>]* %incarray, i32 0, i32 0
11 %rb = getelementptr [2 x <8 x float>]* %incarrayb, i32 0, i32 0
12 %r3 = load <8 x float>* %r, align 8
13 %r4 = load <8 x float>* %rb, align 8
14 %r8 = shufflevector <8 x float> %r3, <8 x float> %r4, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 > ; <<8 x float>> [#uses=1]
15 ; CHECK-NOT: vunpcklps %ymm
16 %r9 = getelementptr [2 x <8 x float>]* %carray, i32 0, i32 0
17 store <8 x float> %r8, <8 x float>* %r9, align 4
18 ret void
19 }
+0
-20
test/CodeGen/X86/SIMD/vunpcklpd.ll less more
None ; RUN: llc < %s -mattr=+avx | FileCheck %s
1
2 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
3 target triple = "x86_64-unknown-linux-gnu"
4
5 define void @try_([2 x <4 x double>]* noalias %incarray, [2 x <4 x double>]* noalias %incarrayb ) {
6 entry:
7 %incarray1 = alloca [2 x <4 x double>]*, align 8
8 %incarrayb1 = alloca [2 x <4 x double>]*, align 8
9 %carray = alloca [2 x <4 x double>], align 16
10 %r = getelementptr [2 x <4 x double>]* %incarray, i32 0, i32 0
11 %rb = getelementptr [2 x <4 x double>]* %incarrayb, i32 0, i32 0
12 %r3 = load <4 x double>* %r, align 8
13 %r4 = load <4 x double>* %rb, align 8
14 %r11 = shufflevector <4 x double> %r3, <4 x double> %r4, <4 x i32> < i32 0, i32 4, i32 2, i32 6 > ; <<4 x double>> [#uses=1]
15 ; CHECK: vunpcklpd
16 %r12 = getelementptr [2 x <4 x double>]* %carray, i32 0, i32 1
17 store <4 x double> %r11, <4 x double>* %r12, align 4
18 ret void
19 }
+0
-20
test/CodeGen/X86/SIMD/vunpcklps.ll less more
None ; RUN: llc < %s -mattr=+avx | FileCheck %s
1
2 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
3 target triple = "x86_64-unknown-linux-gnu"
4
5 define void @try_([2 x <8 x float>]* noalias %incarray, [2 x <8 x float>]* noalias %incarrayb ) {
6 entry:
7 %incarray1 = alloca [2 x <8 x float>]*, align 8
8 %incarrayb1 = alloca [2 x <8 x float>]*, align 8
9 %carray = alloca [2 x <8 x float>], align 16
10 %r = getelementptr [2 x <8 x float>]* %incarray, i32 0, i32 0
11 %rb = getelementptr [2 x <8 x float>]* %incarrayb, i32 0, i32 0
12 %r3 = load <8 x float>* %r, align 8
13 %r4 = load <8 x float>* %rb, align 8
14 %r11 = shufflevector <8 x float> %r3, <8 x float> %r4, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13 > ; <<8 x float>> [#uses=1]
15 ; CHECK: vunpcklps
16 %r12 = getelementptr [2 x <8 x float>]* %carray, i32 0, i32 1
17 store <8 x float> %r11, <8 x float>* %r12, align 4
18 ret void
19 }
0 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
1
2 ; CHECK: vunpckhps
3 define <8 x float> @unpackhips(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
4 entry:
5 %shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
6 ret <8 x float> %shuffle.i
7 }
8
9 ; CHECK: vunpckhpd
10 define <4 x double> @unpackhipd(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
11 entry:
12 %shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
13 ret <4 x double> %shuffle.i
14 }
15
16 ; CHECK: vunpcklps
17 define <8 x float> @unpacklops(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
18 entry:
19 %shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
20 ret <8 x float> %shuffle.i
21 }
22
23 ; CHECK: vunpcklpd
24 define <4 x double> @unpacklopd(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
25 entry:
26 %shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
27 ret <4 x double> %shuffle.i
28 }
29
30 ; CHECK-NOT: vunpcklps %ymm
31 define <8 x float> @unpacklops-not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
32 entry:
33 %shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
34 ret <8 x float> %shuffle.i
35 }
36
37 ; CHECK-NOT: vunpcklpd %ymm
38 define <4 x double> @unpacklopd-not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
39 entry:
40 %shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
41 ret <4 x double> %shuffle.i
42 }
43
44 ; CHECK-NOT: vunpckhps %ymm
45 define <8 x float> @unpackhips-not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
46 entry:
47 %shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32>
48 ret <8 x float> %shuffle.i
49 }
50
51 ; CHECK-NOT: vunpckhpd %ymm
52 define <4 x double> @unpackhipd-not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
53 entry:
54 %shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32>
55 ret <4 x double> %shuffle.i
56 }
57