llvm.org GIT mirror llvm / 08780d4
[x86] Teach the x86 DAG combiner to form MOVSLDUP and MOVSHDUP instructions when it finds an appropriate pattern. These are lovely instructions, and it's a shame to not use them. =] They are fast, and can have loads folded into their operands, etc. I've also plumbed the comment shuffle decoding through the various layers so that the test cases are printed nicely. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217758 91177308-0d34-0410-b5e6-96231b3b80d8 Chandler Carruth 6 years ago
5 changed file(s) with 137 addition(s) and 32 deletion(s). Raw diff Collapse all Expand all
125125 Src1Name = getRegName(MI->getOperand(1).getReg());
126126 DestName = getRegName(MI->getOperand(0).getReg());
127127 DecodeMOVHLPSMask(2, ShuffleMask);
128 break;
129
130 case X86::MOVSLDUPrr:
131 case X86::VMOVSLDUPrr:
132 Src1Name = getRegName(MI->getOperand(1).getReg());
133 // FALL THROUGH.
134 case X86::MOVSLDUPrm:
135 case X86::VMOVSLDUPrm:
136 DestName = getRegName(MI->getOperand(0).getReg());
137 DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask);
138 break;
139
140 case X86::VMOVSHDUPYrr:
141 Src1Name = getRegName(MI->getOperand(1).getReg());
142 // FALL THROUGH.
143 case X86::VMOVSHDUPYrm:
144 DestName = getRegName(MI->getOperand(0).getReg());
145 DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask);
146 break;
147
148 case X86::VMOVSLDUPYrr:
149 Src1Name = getRegName(MI->getOperand(1).getReg());
150 // FALL THROUGH.
151 case X86::VMOVSLDUPYrm:
152 DestName = getRegName(MI->getOperand(0).getReg());
153 DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask);
154 break;
155
156 case X86::MOVSHDUPrr:
157 case X86::VMOVSHDUPrr:
158 Src1Name = getRegName(MI->getOperand(1).getReg());
159 // FALL THROUGH.
160 case X86::MOVSHDUPrm:
161 case X86::VMOVSHDUPrm:
162 DestName = getRegName(MI->getOperand(0).getReg());
163 DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask);
128164 break;
129165
130166 case X86::PALIGNR128rr:
6060
6161 for (unsigned i = 0; i != NElts/2; ++i)
6262 ShuffleMask.push_back(NElts+i);
63 }
64
65 void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl &ShuffleMask) {
66 unsigned NumElts = VT.getVectorNumElements();
67 for (int i = 0, e = NumElts / 2; i < e; ++i) {
68 ShuffleMask.push_back(2 * i);
69 ShuffleMask.push_back(2 * i);
70 }
71 }
72
73 void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl &ShuffleMask) {
74 unsigned NumElts = VT.getVectorNumElements();
75 for (int i = 0, e = NumElts / 2; i < e; ++i) {
76 ShuffleMask.push_back(2 * i + 1);
77 ShuffleMask.push_back(2 * i + 1);
78 }
6379 }
6480
6581 void DecodePALIGNRMask(MVT VT, unsigned Imm,
3636
3737 // <0,2> or <0,1,4,5>
3838 void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
39
40 void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
41
42 void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
3943
4044 void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
4145
54065406 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
54075407 if (Mask.empty()) return false;
54085408 break;
5409 case X86ISD::MOVSLDUP:
5410 DecodeMOVSLDUPMask(VT, Mask);
5411 break;
5412 case X86ISD::MOVSHDUP:
5413 DecodeMOVSHDUPMask(VT, Mask);
5414 break;
54095415 case X86ISD::MOVDDUP:
54105416 case X86ISD::MOVLHPD:
54115417 case X86ISD::MOVLPD:
54125418 case X86ISD::MOVLPS:
5413 case X86ISD::MOVSHDUP:
5414 case X86ISD::MOVSLDUP:
54155419 // Not yet implemented
54165420 return false;
54175421 default: llvm_unreachable("unknown target shuffle node");
1936319367 // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
1936419368 // vectors because it can have a load folded into it that UNPCK cannot. This
1936519369 // doesn't preclude something switching to the shorter encoding post-RA.
19366 if (FloatDomain && (Mask.equals(0, 0) || Mask.equals(1, 1))) {
19367 bool Lo = Mask.equals(0, 0);
19368 unsigned Shuffle;
19369 MVT ShuffleVT;
19370 // Check if we have SSE3 which will let us use MOVDDUP. That instruction
19371 // is no slower than UNPCKLPD but has the option to fold the input operand
19372 // into even an unaligned memory load.
19373 if (Lo && Subtarget->hasSSE3()) {
19374 Shuffle = X86ISD::MOVDDUP;
19375 ShuffleVT = MVT::v2f64;
19376 } else {
19377 // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
19378 // than the UNPCK variants.
19379 Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
19380 ShuffleVT = MVT::v4f32;
19381 }
19382 if (Depth == 1 && Root->getOpcode() == Shuffle)
19383 return false; // Nothing to do!
19384 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
19385 DCI.AddToWorklist(Op.getNode());
19386 if (Shuffle == X86ISD::MOVDDUP)
19370 if (FloatDomain) {
19371 if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
19372 bool Lo = Mask.equals(0, 0);
19373 unsigned Shuffle;
19374 MVT ShuffleVT;
19375 // Check if we have SSE3 which will let us use MOVDDUP. That instruction
19376 // is no slower than UNPCKLPD but has the option to fold the input operand
19377 // into even an unaligned memory load.
19378 if (Lo && Subtarget->hasSSE3()) {
19379 Shuffle = X86ISD::MOVDDUP;
19380 ShuffleVT = MVT::v2f64;
19381 } else {
19382 // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
19383 // than the UNPCK variants.
19384 Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
19385 ShuffleVT = MVT::v4f32;
19386 }
19387 if (Depth == 1 && Root->getOpcode() == Shuffle)
19388 return false; // Nothing to do!
19389 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
19390 DCI.AddToWorklist(Op.getNode());
19391 if (Shuffle == X86ISD::MOVDDUP)
19392 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
19393 else
19394 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
19395 DCI.AddToWorklist(Op.getNode());
19396 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
19397 /*AddTo*/ true);
19398 return true;
19399 }
19400 if (Subtarget->hasSSE3() &&
19401 (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
19402 bool Lo = Mask.equals(0, 0, 2, 2);
19403 unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
19404 MVT ShuffleVT = MVT::v4f32;
19405 if (Depth == 1 && Root->getOpcode() == Shuffle)
19406 return false; // Nothing to do!
19407 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
19408 DCI.AddToWorklist(Op.getNode());
1938719409 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
19388 else
19389 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
19390 DCI.AddToWorklist(Op.getNode());
19391 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
19392 /*AddTo*/ true);
19393 return true;
19394 }
19395
19396 // FIXME: We should match UNPCKLPS and UNPCKHPS here.
19410 DCI.AddToWorklist(Op.getNode());
19411 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
19412 /*AddTo*/ true);
19413 return true;
19414 }
19415 }
1939719416
1939819417 // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
1939919418 // variants as none of these have single-instruction variants that are
116116 ; ALL: shufps {{.*}} # xmm0 = xmm0[3,2,1,0]
117117 ; ALL-NEXT: retq
118118 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
119 ret <4 x float> %shuffle
120 }
121 define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
122 ; SSE2-LABEL: @shuffle_v4f32_0022
123 ; SSE2: shufps {{.*}} # xmm0 = xmm0[0,0,2,2]
124 ; SSE2-NEXT: retq
125 ;
126 ; SSE41-LABEL: @shuffle_v4f32_0022
127 ; SSE41: movsldup {{.*}} # xmm0 = xmm0[0,0,2,2]
128 ; SSE41-NEXT: retq
129 ;
130 ; AVX1-LABEL: @shuffle_v4f32_0022
131 ; AVX1: vmovsldup {{.*}} # xmm0 = xmm0[0,0,2,2]
132 ; AVX1-NEXT: retq
133 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
134 ret <4 x float> %shuffle
135 }
136 define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
137 ; SSE2-LABEL: @shuffle_v4f32_1133
138 ; SSE2: shufps {{.*}} # xmm0 = xmm0[1,1,3,3]
139 ; SSE2-NEXT: retq
140 ;
141 ; SSE41-LABEL: @shuffle_v4f32_1133
142 ; SSE41: movshdup {{.*}} # xmm0 = xmm0[1,1,3,3]
143 ; SSE41-NEXT: retq
144 ;
145 ; AVX1-LABEL: @shuffle_v4f32_1133
146 ; AVX1: vmovshdup {{.*}} # xmm0 = xmm0[1,1,3,3]
147 ; AVX1-NEXT: retq
148 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
119149 ret <4 x float> %shuffle
120150 }
121151