llvm.org GIT mirror: llvm / 2463f3b
[X86] Remove SSE/AVX unaligned store intrinsics as clang no longer uses them. Auto upgrade to native unaligned store instructions.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@271236 91177308-0d34-0410-b5e6-96231b3b80d8
Craig Topper, 4 years ago
14 changed files with 190 additions and 333 deletions.
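For context, the net effect of the auto-upgrade (see the AutoUpgrade hunk below) is that old bitcode calling the removed storeu intrinsics is rewritten into a plain unaligned store: the i8* destination is bitcast to a pointer to the stored vector type and written with align 1. A minimal before/after IR sketch, using a hypothetical function @f purely for illustration:

; Before: old bitcode calls the removed intrinsic through an i8* destination.
declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>)

define void @f(i8* %p, <4 x float> %v) {
  call void @llvm.x86.sse.storeu.ps(i8* %p, <4 x float> %v)
  ret void
}

; After auto-upgrade: a native unaligned store replaces the intrinsic call.
define void @f(i8* %p, <4 x float> %v) {
  %cast = bitcast i8* %p to <4 x float>*
  store <4 x float> %v, <4 x float>* %cast, align 1
  ret void
}

Because the result is an ordinary store, the dedicated handling elsewhere is no longer needed: InstCombine's "storeu -> store when the pointer is known aligned" cases and LoopStrengthReduce's addressing-mode folding for these intrinsics are simply deleted in the hunks below, since regular stores already get that treatment.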
256256 def int_x86_sse_cvtpi2ps : GCCBuiltin<"__builtin_ia32_cvtpi2ps">,
257257 Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
258258 llvm_x86mmx_ty], [IntrNoMem]>;
259 }
260
261 // SIMD store ops
262 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
263 def int_x86_sse_storeu_ps : GCCBuiltin<"__builtin_ia32_storeups">,
264 Intrinsic<[], [llvm_ptr_ty,
265 llvm_v4f32_ty], [IntrArgMemOnly]>;
266259 }
267260
268261 // Cacheability support ops
522515 Intrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
523516 def int_x86_sse_cvtpi2pd : GCCBuiltin<"__builtin_ia32_cvtpi2pd">,
524517 Intrinsic<[llvm_v2f64_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
525 }
526
527 // SIMD store ops
528 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
529 def int_x86_sse2_storeu_pd : GCCBuiltin<"__builtin_ia32_storeupd">,
530 Intrinsic<[], [llvm_ptr_ty,
531 llvm_v2f64_ty], [IntrArgMemOnly]>;
532 def int_x86_sse2_storeu_dq : GCCBuiltin<"__builtin_ia32_storedqu">,
533 Intrinsic<[], [llvm_ptr_ty,
534 llvm_v16i8_ty], [IntrArgMemOnly]>;
535518 }
536519
537520 // Misc.
19351918 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
19361919 def int_x86_avx_ldu_dq_256 : GCCBuiltin<"__builtin_ia32_lddqu256">,
19371920 Intrinsic<[llvm_v32i8_ty], [llvm_ptr_ty], [IntrReadMem]>;
1938 }
1939
1940 // SIMD store ops
1941 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
1942 def int_x86_avx_storeu_pd_256 : GCCBuiltin<"__builtin_ia32_storeupd256">,
1943 Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty], [IntrArgMemOnly]>;
1944 def int_x86_avx_storeu_ps_256 : GCCBuiltin<"__builtin_ia32_storeups256">,
1945 Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty], [IntrArgMemOnly]>;
1946 def int_x86_avx_storeu_dq_256 : GCCBuiltin<"__builtin_ia32_storedqu256">,
1947 Intrinsic<[], [llvm_ptr_ty, llvm_v32i8_ty], [IntrArgMemOnly]>;
19481921 }
19491922
19501923 // Conditional load ops
190190 Name == "x86.avx2.vextracti128" ||
191191 Name.startswith("x86.avx.movnt.") ||
192192 Name == "x86.sse2.storel.dq" ||
193 Name.startswith("x86.sse.storeu.") ||
194 Name.startswith("x86.sse2.storeu.") ||
195 Name.startswith("x86.avx.storeu.") ||
193196 Name == "x86.sse42.crc32.64.8" ||
194197 Name.startswith("x86.avx.vbroadcast.s") ||
195198 Name.startswith("x86.sse2.psll.dq") ||
437440 PointerType::getUnqual(Elt->getType()),
438441 "cast");
439442 Builder.CreateAlignedStore(Elt, BC, 1);
443
444 // Remove intrinsic.
445 CI->eraseFromParent();
446 return;
447 } else if (Name.startswith("llvm.x86.sse.storeu.") ||
448 Name.startswith("llvm.x86.sse2.storeu.") ||
449 Name.startswith("llvm.x86.avx.storeu.")) {
450 Value *Arg0 = CI->getArgOperand(0);
451 Value *Arg1 = CI->getArgOperand(1);
452
453 Arg0 = Builder.CreateBitCast(Arg0,
454 PointerType::getUnqual(Arg1->getType()),
455 "cast");
456 Builder.CreateAlignedStore(Arg1, Arg0, 1);
440457
441458 // Remove intrinsic.
442459 CI->eraseFromParent();
904904 IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
905905 }
906906
907 def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
908 (VMOVUPSYmr addr:$dst, VR256:$src)>;
909 def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
910 (VMOVUPDYmr addr:$dst, VR256:$src)>;
911
912907 // Aliases to help the assembler pick two byte VEX encodings by swapping the
913908 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
914909 def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
963958 "movupd\t{$src, $dst|$dst, $src}", [],
964959 IIC_SSE_MOVU_P_RR>;
965960 }
966
967 let Predicates = [HasAVX] in {
968 def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
969 (VMOVUPSmr addr:$dst, VR128:$src)>;
970 def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
971 (VMOVUPDmr addr:$dst, VR128:$src)>;
972 }
973
974 let Predicates = [UseSSE1] in
975 def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
976 (MOVUPSmr addr:$dst, VR128:$src)>;
977 let Predicates = [UseSSE2] in
978 def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
979 (MOVUPDmr addr:$dst, VR128:$src)>;
980961
981962 // Use vmovaps/vmovups for AVX integer load/store.
982963 let Predicates = [HasAVX, NoVLX] in {
38853866 }
38863867
38873868 } // ExeDomain = SSEPackedInt
3888
3889 let Predicates = [HasAVX] in {
3890 def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
3891 (VMOVDQUmr addr:$dst, VR128:$src)>;
3892 def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
3893 (VMOVDQUYmr addr:$dst, VR256:$src)>;
3894 }
3895 let Predicates = [UseSSE2] in
3896 def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
3897 (MOVDQUmr addr:$dst, VR128:$src)>;
38983869
38993870 // Aliases to help the assembler pick two byte VEX encodings by swapping the
39003871 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
13931393 PointerType::getUnqual(II->getArgOperand(0)->getType());
13941394 Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
13951395 return new StoreInst(II->getArgOperand(0), Ptr);
1396 }
1397 break;
1398
1399 case Intrinsic::x86_sse_storeu_ps:
1400 case Intrinsic::x86_sse2_storeu_pd:
1401 case Intrinsic::x86_sse2_storeu_dq:
1402 // Turn X86 storeu -> store if the pointer is known aligned.
1403 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >=
1404 16) {
1405 Type *OpPtrTy =
1406 PointerType::getUnqual(II->getArgOperand(1)->getType());
1407 Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy);
1408 return new StoreInst(II->getArgOperand(1), Ptr);
1409 }
1410 break;
1411
1412 case Intrinsic::x86_avx_storeu_ps_256:
1413 case Intrinsic::x86_avx_storeu_pd_256:
1414 case Intrinsic::x86_avx_storeu_dq_256:
1415 // Turn X86 storeu -> store if the pointer is known aligned.
1416 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, AC, DT) >=
1417 32) {
1418 Type *OpPtrTy =
1419 PointerType::getUnqual(II->getArgOperand(1)->getType());
1420 Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy);
1421 return new StoreInst(II->getArgOperand(1), Ptr);
14221396 }
14231397 break;
14241398
683683 switch (II->getIntrinsicID()) {
684684 default: break;
685685 case Intrinsic::prefetch:
686 case Intrinsic::x86_sse_storeu_ps:
687 case Intrinsic::x86_sse2_storeu_pd:
688 case Intrinsic::x86_sse2_storeu_dq:
689 case Intrinsic::x86_avx_storeu_ps_256:
690 case Intrinsic::x86_avx_storeu_pd_256:
691 case Intrinsic::x86_avx_storeu_dq_256:
692686 if (II->getArgOperand(0) == OperandVal)
693687 isAddress = true;
694688 break;
705699 AccessTy.AddrSpace = SI->getPointerAddressSpace();
706700 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
707701 AccessTy.AddrSpace = LI->getPointerAddressSpace();
708 } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
709 // Addressing modes can also be folded into prefetches and a variety
710 // of intrinsics.
711 switch (II->getIntrinsicID()) {
712 default: break;
713 case Intrinsic::x86_sse_storeu_ps:
714 case Intrinsic::x86_sse2_storeu_pd:
715 case Intrinsic::x86_sse2_storeu_dq:
716 case Intrinsic::x86_avx_storeu_ps_256:
717 case Intrinsic::x86_avx_storeu_pd_256:
718 case Intrinsic::x86_avx_storeu_dq_256:
719 AccessTy.MemTy = II->getArgOperand(0)->getType();
720 break;
721 }
722702 }
723703
724704 // All pointers have the same requirements, so canonicalize them to an
354354 ret <4 x double> %res
355355 }
356356 declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
357
358
359 define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
360 ; add operation forces the execution domain.
361 ; CHECK-LABEL: test_x86_sse2_storeu_dq:
362 ; CHECK: ## BB#0:
363 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
364 ; CHECK-NEXT: vpaddb LCPI32_0, %xmm0, %xmm0
365 ; CHECK-NEXT: vmovdqu %xmm0, (%eax)
366 ; CHECK-NEXT: retl
367 %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
368 call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
369 ret void
370 }
371 declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
372
373
374 define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
375 ; fadd operation forces the execution domain.
376 ; CHECK-LABEL: test_x86_sse2_storeu_pd:
377 ; CHECK: ## BB#0:
378 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
379 ; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
380 ; CHECK-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
381 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
382 ; CHECK-NEXT: vmovupd %xmm0, (%eax)
383 ; CHECK-NEXT: retl
384 %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
385 call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
386 ret void
387 }
388 declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
389
390
391 define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
392 ; CHECK-LABEL: test_x86_sse_storeu_ps:
393 ; CHECK: ## BB#0:
394 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
395 ; CHECK-NEXT: vmovups %xmm0, (%eax)
396 ; CHECK-NEXT: retl
397 call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
398 ret void
399 }
400 declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
401
402
403 define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
404 ; FIXME: unfortunately the execution domain fix pass changes this to vmovups and its hard to force with no 256-bit integer instructions
405 ; add operation forces the execution domain.
406 ; CHECK-LABEL: test_x86_avx_storeu_dq_256:
407 ; CHECK: ## BB#0:
408 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
409 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
410 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
411 ; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
412 ; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
413 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
414 ; CHECK-NEXT: vmovups %ymm0, (%eax)
415 ; CHECK-NEXT: vzeroupper
416 ; CHECK-NEXT: retl
417 %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
418 call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
419 ret void
420 }
421 declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
422
423
424 define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
425 ; add operation forces the execution domain.
426 ; CHECK-LABEL: test_x86_avx_storeu_pd_256:
427 ; CHECK: ## BB#0:
428 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
429 ; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
430 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
431 ; CHECK-NEXT: vmovupd %ymm0, (%eax)
432 ; CHECK-NEXT: vzeroupper
433 ; CHECK-NEXT: retl
434 %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
435 call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
436 ret void
437 }
438 declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
439
440
441 define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
442 ; CHECK-LABEL: test_x86_avx_storeu_ps_256:
443 ; CHECK: ## BB#0:
444 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
445 ; CHECK-NEXT: vmovups %ymm0, (%eax)
446 ; CHECK-NEXT: vzeroupper
447 ; CHECK-NEXT: retl
448 call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1)
449 ret void
450 }
451 declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
12181218 ret <2 x double> %res
12191219 }
12201220 declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
1221
1222
1223 define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
1224 ; add operation forces the execution domain.
1225 ; AVX-LABEL: test_x86_sse2_storeu_dq:
1226 ; AVX: ## BB#0:
1227 ; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1228 ; AVX-NEXT: vpaddb LCPI74_0, %xmm0, %xmm0
1229 ; AVX-NEXT: vmovdqu %xmm0, (%eax)
1230 ; AVX-NEXT: retl
1231 ;
1232 ; AVX512VL-LABEL: test_x86_sse2_storeu_dq:
1233 ; AVX512VL: ## BB#0:
1234 ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
1235 ; AVX512VL-NEXT: vpaddb LCPI74_0, %xmm0, %xmm0
1236 ; AVX512VL-NEXT: vmovdqu %xmm0, (%eax)
1237 ; AVX512VL-NEXT: retl
1238 %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
1239 call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
1240 ret void
1241 }
1242 declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
1243
1244
1245 define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
1246 ; fadd operation forces the execution domain.
1247 ; AVX-LABEL: test_x86_sse2_storeu_pd:
1248 ; AVX: ## BB#0:
1249 ; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1250 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1251 ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
1252 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
1253 ; AVX-NEXT: vmovupd %xmm0, (%eax)
1254 ; AVX-NEXT: retl
1255 ;
1256 ; AVX512VL-LABEL: test_x86_sse2_storeu_pd:
1257 ; AVX512VL: ## BB#0:
1258 ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
1259 ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1260 ; AVX512VL-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
1261 ; AVX512VL-NEXT: vaddpd %xmm1, %xmm0, %xmm0
1262 ; AVX512VL-NEXT: vmovups %xmm0, (%eax)
1263 ; AVX512VL-NEXT: retl
1264 %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
1265 call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
1266 ret void
1267 }
1268 declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
12691221
12701222
12711223 define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
27992751 ret void
28002752 }
28012753 declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
2802
2803
2804 define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
2805 ; AVX-LABEL: test_x86_sse_storeu_ps:
2806 ; AVX: ## BB#0:
2807 ; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
2808 ; AVX-NEXT: vmovups %xmm0, (%eax)
2809 ; AVX-NEXT: retl
2810 ;
2811 ; AVX512VL-LABEL: test_x86_sse_storeu_ps:
2812 ; AVX512VL: ## BB#0:
2813 ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
2814 ; AVX512VL-NEXT: vmovups %xmm0, (%eax)
2815 ; AVX512VL-NEXT: retl
2816 call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
2817 ret void
2818 }
2819 declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
28202754
28212755
28222756 define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
40113945 declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
40123946
40133947
4014 define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
4015 ; FIXME: unfortunately the execution domain fix pass changes this to vmovups and its hard to force with no 256-bit integer instructions
4016 ; add operation forces the execution domain.
4017 ; AVX-LABEL: test_x86_avx_storeu_dq_256:
4018 ; AVX: ## BB#0:
4019 ; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
4020 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
4021 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
4022 ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
4023 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
4024 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
4025 ; AVX-NEXT: vmovups %ymm0, (%eax)
4026 ; AVX-NEXT: vzeroupper
4027 ; AVX-NEXT: retl
4028 ;
4029 ; AVX512VL-LABEL: test_x86_avx_storeu_dq_256:
4030 ; AVX512VL: ## BB#0:
4031 ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
4032 ; AVX512VL-NEXT: vpaddb LCPI225_0, %ymm0, %ymm0
4033 ; AVX512VL-NEXT: vmovdqu %ymm0, (%eax)
4034 ; AVX512VL-NEXT: retl
4035 %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
4036 call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
4037 ret void
4038 }
4039 declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
4040
4041
4042 define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
4043 ; add operation forces the execution domain.
4044 ; AVX-LABEL: test_x86_avx_storeu_pd_256:
4045 ; AVX: ## BB#0:
4046 ; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
4047 ; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
4048 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
4049 ; AVX-NEXT: vmovupd %ymm0, (%eax)
4050 ; AVX-NEXT: vzeroupper
4051 ; AVX-NEXT: retl
4052 ;
4053 ; AVX512VL-LABEL: test_x86_avx_storeu_pd_256:
4054 ; AVX512VL: ## BB#0:
4055 ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
4056 ; AVX512VL-NEXT: vpxord %ymm1, %ymm1, %ymm1
4057 ; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0
4058 ; AVX512VL-NEXT: vmovups %ymm0, (%eax)
4059 ; AVX512VL-NEXT: retl
4060 %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
4061 call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
4062 ret void
4063 }
4064 declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
4065
4066
4067 define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
4068 ; AVX-LABEL: test_x86_avx_storeu_ps_256:
4069 ; AVX: ## BB#0:
4070 ; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
4071 ; AVX-NEXT: vmovups %ymm0, (%eax)
4072 ; AVX-NEXT: vzeroupper
4073 ; AVX-NEXT: retl
4074 ;
4075 ; AVX512VL-LABEL: test_x86_avx_storeu_ps_256:
4076 ; AVX512VL: ## BB#0:
4077 ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
4078 ; AVX512VL-NEXT: vmovups %ymm0, (%eax)
4079 ; AVX512VL-NEXT: retl
4080 call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1)
4081 ret void
4082 }
4083 declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
4084
4085
40863948 define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) {
40873949 ; AVX-LABEL: test_x86_avx_vbroadcastf128_pd_256:
40883950 ; AVX: ## BB#0:
42704132 ;
42714133 ; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256_2:
42724134 ; AVX512VL: ## BB#0:
4273 ; AVX512VL-NEXT: vpermilpd LCPI239_0, %ymm0, %ymm0
4135 ; AVX512VL-NEXT: vpermilpd LCPI233_0, %ymm0, %ymm0
42744136 ; AVX512VL-NEXT: retl
42754137 %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> ) ; <<4 x double>> [#uses=1]
42764138 ret <4 x double> %res
47624624 ; AVX-LABEL: movnt_dq:
47634625 ; AVX: ## BB#0:
47644626 ; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
4765 ; AVX-NEXT: vpaddq LCPI266_0, %xmm0, %xmm0
4627 ; AVX-NEXT: vpaddq LCPI260_0, %xmm0, %xmm0
47664628 ; AVX-NEXT: vmovntdq %ymm0, (%eax)
47674629 ; AVX-NEXT: vzeroupper
47684630 ; AVX-NEXT: retl
47704632 ; AVX512VL-LABEL: movnt_dq:
47714633 ; AVX512VL: ## BB#0:
47724634 ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
4773 ; AVX512VL-NEXT: vpaddq LCPI266_0, %xmm0, %xmm0
4635 ; AVX512VL-NEXT: vpaddq LCPI260_0, %xmm0, %xmm0
47744636 ; AVX512VL-NEXT: vmovntdq %ymm0, (%eax)
47754637 ; AVX512VL-NEXT: retl
47764638 %a2 = add <2 x i64> %a1, <i64 1, i64 1>
364364 ret <4 x i64> %res
365365 }
366366 declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
367
368 ; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode since we don't have 256-bit integer instructions
369 define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
370 ; add operation forces the execution domain.
371 ; CHECK-LABEL: test_x86_avx_storeu_dq_256:
372 ; CHECK: ## BB#0:
373 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
374 ; CHECK-NEXT: vpaddb LCPI33_0, %ymm0, %ymm0
375 ; CHECK-NEXT: vmovdqu %ymm0, (%eax)
376 ; CHECK-NEXT: vzeroupper
377 ; CHECK-NEXT: retl
378 %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
379 call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
380 ret void
381 }
382 declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
14741474 }
14751475 declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
14761476
1477 ; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode since we don't have 256-bit integer instructions
1478 define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
1479 ; add operation forces the execution domain.
1480 ; AVX2-LABEL: test_x86_avx_storeu_dq_256:
1481 ; AVX2: ## BB#0:
1482 ; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1483 ; AVX2-NEXT: vpaddb LCPI91_0, %ymm0, %ymm0
1484 ; AVX2-NEXT: vmovdqu %ymm0, (%eax)
1485 ; AVX2-NEXT: vzeroupper
1486 ; AVX2-NEXT: retl
1487 ;
1488 ; AVX512VL-LABEL: test_x86_avx_storeu_dq_256:
1489 ; AVX512VL: ## BB#0:
1490 ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
1491 ; AVX512VL-NEXT: vpaddb LCPI91_0, %ymm0, %ymm0
1492 ; AVX512VL-NEXT: vmovdqu %ymm0, (%eax)
1493 ; AVX512VL-NEXT: retl
1494 %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
1495 call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
1496 ret void
1497 }
1498 declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
1499
15001477 define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1, <4 x i32> %idx, <2 x double> %mask) {
15011478 ; AVX2-LABEL: test_x86_avx2_gather_d_pd:
15021479 ; AVX2: ## BB#0:
0 ; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
3
4 define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
5 ; SSE-LABEL: test_x86_sse_storeu_ps:
6 ; SSE: ## BB#0:
7 ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
8 ; SSE-NEXT: movups %xmm0, (%eax)
9 ; SSE-NEXT: retl
10 ;
11 ; KNL-LABEL: test_x86_sse_storeu_ps:
12 ; KNL: ## BB#0:
13 ; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
14 ; KNL-NEXT: vmovups %xmm0, (%eax)
15 ; KNL-NEXT: retl
16 ; CHECK-LABEL: test_x86_sse_storeu_ps:
17 ; CHECK: ## BB#0:
18 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
19 ; CHECK-NEXT: movups %xmm0, (%eax)
20 ; CHECK-NEXT: retl
21 call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
22 ret void
23 }
24 declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
25
26
473473 declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
474474
475475
476 define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
477 ; SSE-LABEL: test_x86_sse_storeu_ps:
478 ; SSE: ## BB#0:
479 ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
480 ; SSE-NEXT: movups %xmm0, (%eax)
481 ; SSE-NEXT: retl
482 ;
483 ; KNL-LABEL: test_x86_sse_storeu_ps:
484 ; KNL: ## BB#0:
485 ; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
486 ; KNL-NEXT: vmovups %xmm0, (%eax)
487 ; KNL-NEXT: retl
488 call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
489 ret void
490 }
491 declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
492
493
494476 define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
495477 ; SSE-LABEL: test_x86_sse_sub_ss:
496478 ; SSE: ## BB#0:
9595 declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind
9696
9797
98 define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
99 ; add operation forces the execution domain.
100 ; CHECK-LABEL: test_x86_sse2_storeu_dq:
101 ; CHECK: ## BB#0:
102 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
103 ; CHECK-NEXT: paddb LCPI7_0, %xmm0
104 ; CHECK-NEXT: movdqu %xmm0, (%eax)
105 ; CHECK-NEXT: retl
106 %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
107 call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
108 ret void
109 }
110 declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
98111
112
113 define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
114 ; fadd operation forces the execution domain.
115 ; CHECK-LABEL: test_x86_sse2_storeu_pd:
116 ; CHECK: ## BB#0:
117 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
118 ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
119 ; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
120 ; CHECK-NEXT: addpd %xmm0, %xmm1
121 ; CHECK-NEXT: movupd %xmm1, (%eax)
122 ; CHECK-NEXT: retl
123 %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
124 call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
125 ret void
126 }
127 declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
128
129
11241124 declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
11251125
11261126
1127 define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
1128 ; add operation forces the execution domain.
1129 ; SSE-LABEL: test_x86_sse2_storeu_dq:
1130 ; SSE: ## BB#0:
1131 ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1132 ; SSE-NEXT: paddb LCPI68_0, %xmm0
1133 ; SSE-NEXT: movdqu %xmm0, (%eax)
1134 ; SSE-NEXT: retl
1135 ;
1136 ; KNL-LABEL: test_x86_sse2_storeu_dq:
1137 ; KNL: ## BB#0:
1138 ; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
1139 ; KNL-NEXT: vpaddb LCPI68_0, %xmm0, %xmm0
1140 ; KNL-NEXT: vmovdqu %xmm0, (%eax)
1141 ; KNL-NEXT: retl
1142 %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
1143 call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
1144 ret void
1145 }
1146 declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
1147
1148
1149 define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
1150 ; fadd operation forces the execution domain.
1151 ; SSE-LABEL: test_x86_sse2_storeu_pd:
1152 ; SSE: ## BB#0:
1153 ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1154 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
1155 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
1156 ; SSE-NEXT: addpd %xmm0, %xmm1
1157 ; SSE-NEXT: movupd %xmm1, (%eax)
1158 ; SSE-NEXT: retl
1159 ;
1160 ; KNL-LABEL: test_x86_sse2_storeu_pd:
1161 ; KNL: ## BB#0:
1162 ; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
1163 ; KNL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1164 ; KNL-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
1165 ; KNL-NEXT: vaddpd %xmm1, %xmm0, %xmm0
1166 ; KNL-NEXT: vmovupd %xmm0, (%eax)
1167 ; KNL-NEXT: retl
1168 %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
1169 call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
1170 ret void
1171 }
1172 declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
1173
1174
11751127 define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
11761128 ; SSE-LABEL: test_x86_sse2_sub_sd:
11771129 ; SSE: ## BB#0:
630630 ; CHECK-NOT: br
631631 ; CHECK-NOT: = or
632632 ; CHECK: store <4 x i32> {{.*}} align 1
633 ; CHECK: call void @llvm.x86.sse.storeu.ps
633 ; CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
634634 ; CHECK: ret void
635635
636636