llvm.org GIT mirror llvm / fd950f0
AMDGPU: Add baseline test for vector sub x, c canonicalization This will catch regressions from D62341, and show improvements from a future patch to fix them. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363888 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 3 months ago
1 changed file(s) with 1448 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
22 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI
3 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX9
34
45 ; Test that add/sub with a constant is swapped to sub/add with negated
56 ; constant to minimize code size.
3738 ; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
3839 ; VI-NEXT: flat_store_dword v[0:1], v2
3940 ; VI-NEXT: s_endpgm
41 ;
42 ; GFX9-LABEL: v_test_i32_x_sub_64:
43 ; GFX9: ; %bb.0:
44 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
45 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
46 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
47 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
48 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
49 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
50 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
51 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
52 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
53 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
54 ; GFX9-NEXT: s_waitcnt vmcnt(0)
55 ; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3
56 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
57 ; GFX9-NEXT: s_endpgm
4058 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4159 %tid.ext = sext i32 %tid to i64
4260 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
88106 ; VI-NEXT: flat_store_dword v[2:3], v1
89107 ; VI-NEXT: flat_store_dword v[2:3], v0
90108 ; VI-NEXT: s_endpgm
109 ;
110 ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
111 ; GFX9: ; %bb.0:
112 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
113 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
114 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
115 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
116 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
117 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
118 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
119 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
120 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
121 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
122 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
123 ; GFX9-NEXT: s_waitcnt vmcnt(1)
124 ; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v4
125 ; GFX9-NEXT: s_waitcnt vmcnt(0)
126 ; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v0
127 ; GFX9-NEXT: global_store_dword v[2:3], v1, off
128 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
129 ; GFX9-NEXT: s_endpgm
91130 %tid = call i32 @llvm.amdgcn.workitem.id.x()
92131 %tid.ext = sext i32 %tid to i64
93132 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
134173 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 64, v3
135174 ; VI-NEXT: flat_store_dword v[0:1], v2
136175 ; VI-NEXT: s_endpgm
176 ;
177 ; GFX9-LABEL: v_test_i32_64_sub_x:
178 ; GFX9: ; %bb.0:
179 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
180 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
181 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
182 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
183 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
184 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
185 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
186 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
187 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
188 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
189 ; GFX9-NEXT: s_waitcnt vmcnt(0)
190 ; GFX9-NEXT: v_sub_u32_e32 v2, 64, v3
191 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
192 ; GFX9-NEXT: s_endpgm
137193 %tid = call i32 @llvm.amdgcn.workitem.id.x()
138194 %tid.ext = sext i32 %tid to i64
139195 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
177233 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3
178234 ; VI-NEXT: flat_store_dword v[0:1], v2
179235 ; VI-NEXT: s_endpgm
236 ;
237 ; GFX9-LABEL: v_test_i32_x_sub_65:
238 ; GFX9: ; %bb.0:
239 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
240 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
241 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
242 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
243 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
244 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
245 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
246 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
247 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
248 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
249 ; GFX9-NEXT: s_waitcnt vmcnt(0)
250 ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffbf, v3
251 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
252 ; GFX9-NEXT: s_endpgm
180253 %tid = call i32 @llvm.amdgcn.workitem.id.x()
181254 %tid.ext = sext i32 %tid to i64
182255 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
220293 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 0x41, v3
221294 ; VI-NEXT: flat_store_dword v[0:1], v2
222295 ; VI-NEXT: s_endpgm
296 ;
297 ; GFX9-LABEL: v_test_i32_65_sub_x:
298 ; GFX9: ; %bb.0:
299 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
300 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
301 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
302 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
303 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
304 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
305 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
306 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
307 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
308 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
309 ; GFX9-NEXT: s_waitcnt vmcnt(0)
310 ; GFX9-NEXT: v_sub_u32_e32 v2, 0x41, v3
311 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
312 ; GFX9-NEXT: s_endpgm
223313 %tid = call i32 @llvm.amdgcn.workitem.id.x()
224314 %tid.ext = sext i32 %tid to i64
225315 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
263353 ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v3
264354 ; VI-NEXT: flat_store_dword v[0:1], v2
265355 ; VI-NEXT: s_endpgm
356 ;
357 ; GFX9-LABEL: v_test_i32_x_sub_neg16:
358 ; GFX9: ; %bb.0:
359 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
360 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
361 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
362 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
363 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
364 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
365 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
366 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
367 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
368 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
369 ; GFX9-NEXT: s_waitcnt vmcnt(0)
370 ; GFX9-NEXT: v_add_u32_e32 v2, 16, v3
371 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
372 ; GFX9-NEXT: s_endpgm
266373 %tid = call i32 @llvm.amdgcn.workitem.id.x()
267374 %tid.ext = sext i32 %tid to i64
268375 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
306413 ; VI-NEXT: v_sub_u32_e32 v2, vcc, -16, v3
307414 ; VI-NEXT: flat_store_dword v[0:1], v2
308415 ; VI-NEXT: s_endpgm
416 ;
417 ; GFX9-LABEL: v_test_i32_neg16_sub_x:
418 ; GFX9: ; %bb.0:
419 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
420 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
421 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
422 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
423 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
424 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
425 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
426 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
427 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
428 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
429 ; GFX9-NEXT: s_waitcnt vmcnt(0)
430 ; GFX9-NEXT: v_sub_u32_e32 v2, -16, v3
431 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
432 ; GFX9-NEXT: s_endpgm
309433 %tid = call i32 @llvm.amdgcn.workitem.id.x()
310434 %tid.ext = sext i32 %tid to i64
311435 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
349473 ; VI-NEXT: v_add_u32_e32 v2, vcc, 17, v3
350474 ; VI-NEXT: flat_store_dword v[0:1], v2
351475 ; VI-NEXT: s_endpgm
476 ;
477 ; GFX9-LABEL: v_test_i32_x_sub_neg17:
478 ; GFX9: ; %bb.0:
479 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
480 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
481 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
482 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
483 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
484 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
485 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
486 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
487 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
488 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
489 ; GFX9-NEXT: s_waitcnt vmcnt(0)
490 ; GFX9-NEXT: v_add_u32_e32 v2, 17, v3
491 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
492 ; GFX9-NEXT: s_endpgm
352493 %tid = call i32 @llvm.amdgcn.workitem.id.x()
353494 %tid.ext = sext i32 %tid to i64
354495 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
392533 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 0xffffffef, v3
393534 ; VI-NEXT: flat_store_dword v[0:1], v2
394535 ; VI-NEXT: s_endpgm
536 ;
537 ; GFX9-LABEL: v_test_i32_neg17_sub_x:
538 ; GFX9: ; %bb.0:
539 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
540 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
541 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
542 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
543 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
544 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
545 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
546 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
547 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
548 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
549 ; GFX9-NEXT: s_waitcnt vmcnt(0)
550 ; GFX9-NEXT: v_sub_u32_e32 v2, 0xffffffef, v3
551 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
552 ; GFX9-NEXT: s_endpgm
395553 %tid = call i32 @llvm.amdgcn.workitem.id.x()
396554 %tid.ext = sext i32 %tid to i64
397555 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
422580 ; VI-NEXT: ; use s0
423581 ; VI-NEXT: ;;#ASMEND
424582 ; VI-NEXT: s_endpgm
583 ;
584 ; GFX9-LABEL: s_test_i32_x_sub_64:
585 ; GFX9: ; %bb.0:
586 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
587 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
588 ; GFX9-NEXT: s_sub_i32 s0, s0, 64
589 ; GFX9-NEXT: ;;#ASMSTART
590 ; GFX9-NEXT: ; use s0
591 ; GFX9-NEXT: ;;#ASMEND
592 ; GFX9-NEXT: s_endpgm
425593 %result = sub i32 %x, 64
426594 call void asm sideeffect "; use $0", "s"(i32 %result)
427595 ret void
460628 ; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
461629 ; VI-NEXT: flat_store_short v[0:1], v2
462630 ; VI-NEXT: s_endpgm
631 ;
632 ; GFX9-LABEL: v_test_i16_x_sub_64:
633 ; GFX9: ; %bb.0:
634 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
635 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0
636 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
637 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
638 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
639 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
640 ; GFX9-NEXT: global_load_ushort v3, v[0:1], off
641 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
642 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
643 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
644 ; GFX9-NEXT: s_waitcnt vmcnt(0)
645 ; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3
646 ; GFX9-NEXT: global_store_short v[0:1], v2, off
647 ; GFX9-NEXT: s_endpgm
463648 %tid = call i32 @llvm.amdgcn.workitem.id.x()
464649 %tid.ext = sext i32 %tid to i64
465650 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
511696 ; VI-NEXT: flat_store_short v[2:3], v1
512697 ; VI-NEXT: flat_store_short v[2:3], v0
513698 ; VI-NEXT: s_endpgm
699 ;
700 ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
701 ; GFX9: ; %bb.0:
702 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
703 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0
704 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
705 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
706 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
707 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
708 ; GFX9-NEXT: global_load_ushort v4, v[0:1], off
709 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
710 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
711 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
712 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
713 ; GFX9-NEXT: s_waitcnt vmcnt(1)
714 ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v4
715 ; GFX9-NEXT: s_waitcnt vmcnt(0)
716 ; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0
717 ; GFX9-NEXT: global_store_short v[2:3], v1, off
718 ; GFX9-NEXT: global_store_short v[2:3], v0, off
719 ; GFX9-NEXT: s_endpgm
514720 %tid = call i32 @llvm.amdgcn.workitem.id.x()
515721 %tid.ext = sext i32 %tid to i64
516722 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
524730 ret void
525731 }
526732
733 define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
734 ; SI-LABEL: v_test_v2i16_x_sub_64_64:
735 ; SI: ; %bb.0:
736 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
737 ; SI-NEXT: s_mov_b32 s7, 0xf000
738 ; SI-NEXT: s_mov_b32 s6, 0
739 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
740 ; SI-NEXT: v_mov_b32_e32 v1, 0
741 ; SI-NEXT: s_waitcnt lgkmcnt(0)
742 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
743 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
744 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
745 ; SI-NEXT: s_waitcnt vmcnt(0)
746 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
747 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
748 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
749 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
750 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
751 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
752 ; SI-NEXT: s_endpgm
753 ;
754 ; VI-LABEL: v_test_v2i16_x_sub_64_64:
755 ; VI: ; %bb.0:
756 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
757 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
758 ; VI-NEXT: v_mov_b32_e32 v3, 64
759 ; VI-NEXT: s_waitcnt lgkmcnt(0)
760 ; VI-NEXT: v_mov_b32_e32 v1, s3
761 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
762 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
763 ; VI-NEXT: flat_load_dword v4, v[0:1]
764 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
765 ; VI-NEXT: v_mov_b32_e32 v1, s1
766 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
767 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
768 ; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4
769 ; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
770 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
771 ; VI-NEXT: flat_store_dword v[0:1], v2
772 ; VI-NEXT: s_endpgm
773 ;
774 ; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
775 ; GFX9: ; %bb.0:
776 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
777 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
778 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
779 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
780 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
781 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
782 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
783 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
784 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
785 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
786 ; GFX9-NEXT: s_waitcnt vmcnt(0)
787 ; GFX9-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0]
788 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
789 ; GFX9-NEXT: s_endpgm
790 %tid = call i32 @llvm.amdgcn.workitem.id.x()
791 %tid.ext = sext i32 %tid to i64
792 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
793 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
794 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
795 %result = sub <2 x i16> %x, <i16 64, i16 64>
796 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
797 ret void
798 }
799
800 define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
801 ; SI-LABEL: v_test_v2i16_x_sub_7_64:
802 ; SI: ; %bb.0:
803 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
804 ; SI-NEXT: s_mov_b32 s7, 0xf000
805 ; SI-NEXT: s_mov_b32 s6, 0
806 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
807 ; SI-NEXT: v_mov_b32_e32 v1, 0
808 ; SI-NEXT: s_waitcnt lgkmcnt(0)
809 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
810 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
811 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
812 ; SI-NEXT: s_waitcnt vmcnt(0)
813 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
814 ; SI-NEXT: v_add_i32_e32 v2, vcc, -7, v2
815 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
816 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
817 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
818 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
819 ; SI-NEXT: s_endpgm
820 ;
821 ; VI-LABEL: v_test_v2i16_x_sub_7_64:
822 ; VI: ; %bb.0:
823 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
824 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
825 ; VI-NEXT: v_mov_b32_e32 v3, 64
826 ; VI-NEXT: s_waitcnt lgkmcnt(0)
827 ; VI-NEXT: v_mov_b32_e32 v1, s3
828 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
829 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
830 ; VI-NEXT: flat_load_dword v4, v[0:1]
831 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
832 ; VI-NEXT: v_mov_b32_e32 v1, s1
833 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
834 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
835 ; VI-NEXT: v_add_u16_e32 v2, -7, v4
836 ; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
837 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
838 ; VI-NEXT: flat_store_dword v[0:1], v2
839 ; VI-NEXT: s_endpgm
840 ;
841 ; GFX9-LABEL: v_test_v2i16_x_sub_7_64:
842 ; GFX9: ; %bb.0:
843 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
844 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
845 ; GFX9-NEXT: s_mov_b32 s4, 0x400007
846 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
847 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
848 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
849 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
850 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
851 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
852 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
853 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
854 ; GFX9-NEXT: s_waitcnt vmcnt(0)
855 ; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4
856 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
857 ; GFX9-NEXT: s_endpgm
858 %tid = call i32 @llvm.amdgcn.workitem.id.x()
859 %tid.ext = sext i32 %tid to i64
860 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
861 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
862 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
863 %result = sub <2 x i16> %x, <i16 7, i16 64>
864 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
865 ret void
866 }
867
868 define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
869 ; SI-LABEL: v_test_v2i16_x_sub_64_123:
870 ; SI: ; %bb.0:
871 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
872 ; SI-NEXT: s_mov_b32 s7, 0xf000
873 ; SI-NEXT: s_mov_b32 s6, 0
874 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
875 ; SI-NEXT: v_mov_b32_e32 v1, 0
876 ; SI-NEXT: s_waitcnt lgkmcnt(0)
877 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
878 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
879 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
880 ; SI-NEXT: s_waitcnt vmcnt(0)
881 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
882 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
883 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
884 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
885 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xff850000, v2
886 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
887 ; SI-NEXT: s_endpgm
888 ;
889 ; VI-LABEL: v_test_v2i16_x_sub_64_123:
890 ; VI: ; %bb.0:
891 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
892 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
893 ; VI-NEXT: v_mov_b32_e32 v3, 0xffffff85
894 ; VI-NEXT: s_waitcnt lgkmcnt(0)
895 ; VI-NEXT: v_mov_b32_e32 v1, s3
896 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
897 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
898 ; VI-NEXT: flat_load_dword v4, v[0:1]
899 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
900 ; VI-NEXT: v_mov_b32_e32 v1, s1
901 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
902 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
903 ; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4
904 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
905 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
906 ; VI-NEXT: flat_store_dword v[0:1], v2
907 ; VI-NEXT: s_endpgm
908 ;
909 ; GFX9-LABEL: v_test_v2i16_x_sub_64_123:
910 ; GFX9: ; %bb.0:
911 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
912 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
913 ; GFX9-NEXT: s_mov_b32 s4, 0x7b0040
914 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
915 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
916 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
917 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
918 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
919 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
920 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
921 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
922 ; GFX9-NEXT: s_waitcnt vmcnt(0)
923 ; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4
924 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
925 ; GFX9-NEXT: s_endpgm
926 %tid = call i32 @llvm.amdgcn.workitem.id.x()
927 %tid.ext = sext i32 %tid to i64
928 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
929 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
930 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
931 %result = sub <2 x i16> %x, <i16 64, i16 123>
932 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
933 ret void
934 }
935
936 ; Can fold 0 and inline immediate in other half.
937 define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
938 ; SI-LABEL: v_test_v2i16_x_sub_7_0:
939 ; SI: ; %bb.0:
940 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
941 ; SI-NEXT: s_mov_b32 s7, 0xf000
942 ; SI-NEXT: s_mov_b32 s6, 0
943 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
944 ; SI-NEXT: v_mov_b32_e32 v1, 0
945 ; SI-NEXT: s_waitcnt lgkmcnt(0)
946 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
947 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
948 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
949 ; SI-NEXT: s_waitcnt vmcnt(0)
950 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
951 ; SI-NEXT: v_add_i32_e32 v2, vcc, -7, v2
952 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
953 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
954 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
955 ; SI-NEXT: s_endpgm
956 ;
957 ; VI-LABEL: v_test_v2i16_x_sub_7_0:
958 ; VI: ; %bb.0:
959 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
960 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
961 ; VI-NEXT: s_waitcnt lgkmcnt(0)
962 ; VI-NEXT: v_mov_b32_e32 v1, s3
963 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
964 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
965 ; VI-NEXT: flat_load_dword v3, v[0:1]
966 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
967 ; VI-NEXT: v_mov_b32_e32 v1, s1
968 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
969 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
970 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
971 ; VI-NEXT: v_add_u16_e32 v3, -7, v3
972 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
973 ; VI-NEXT: flat_store_dword v[0:1], v2
974 ; VI-NEXT: s_endpgm
975 ;
976 ; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
977 ; GFX9: ; %bb.0:
978 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
979 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
980 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
981 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
982 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
983 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
984 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
985 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
986 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
987 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
988 ; GFX9-NEXT: s_waitcnt vmcnt(0)
989 ; GFX9-NEXT: v_pk_sub_i16 v2, v3, 7
990 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
991 ; GFX9-NEXT: s_endpgm
992 %tid = call i32 @llvm.amdgcn.workitem.id.x()
993 %tid.ext = sext i32 %tid to i64
994 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
995 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
996 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
997 %result = sub <2 x i16> %x, <i16 7, i16 0>
998 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
999 ret void
1000 }
1001
1002 ; Can fold 0 and inline immediate in other half.
1003 define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1004 ; SI-LABEL: v_test_v2i16_x_sub_0_16:
1005 ; SI: ; %bb.0:
1006 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1007 ; SI-NEXT: s_mov_b32 s7, 0xf000
1008 ; SI-NEXT: s_mov_b32 s6, 0
1009 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1010 ; SI-NEXT: v_mov_b32_e32 v1, 0
1011 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1012 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1013 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1014 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1015 ; SI-NEXT: s_waitcnt vmcnt(0)
1016 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
1017 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1018 ; SI-NEXT: s_endpgm
1019 ;
1020 ; VI-LABEL: v_test_v2i16_x_sub_0_16:
1021 ; VI: ; %bb.0:
1022 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1023 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1024 ; VI-NEXT: v_mov_b32_e32 v3, -16
1025 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1026 ; VI-NEXT: v_mov_b32_e32 v1, s3
1027 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1028 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1029 ; VI-NEXT: flat_load_dword v4, v[0:1]
1030 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1031 ; VI-NEXT: v_mov_b32_e32 v1, s1
1032 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1033 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1034 ; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1035 ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1036 ; VI-NEXT: flat_store_dword v[0:1], v2
1037 ; VI-NEXT: s_endpgm
1038 ;
1039 ; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
1040 ; GFX9: ; %bb.0:
1041 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1042 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1043 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1044 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1045 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1046 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1047 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1048 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1049 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1050 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1051 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1052 ; GFX9-NEXT: v_pk_sub_i16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0]
1053 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1054 ; GFX9-NEXT: s_endpgm
1055 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1056 %tid.ext = sext i32 %tid to i64
1057 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1058 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1059 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1060 %result = sub <2 x i16> %x, <i16 0, i16 16>
1061 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1062 ret void
1063 }
1064
1065 define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1066 ; SI-LABEL: v_test_v2i16_x_sub_0_1_0:
1067 ; SI: ; %bb.0:
1068 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1069 ; SI-NEXT: s_mov_b32 s7, 0xf000
1070 ; SI-NEXT: s_mov_b32 s6, 0
1071 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1072 ; SI-NEXT: v_mov_b32_e32 v1, 0
1073 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1074 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1075 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1076 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1077 ; SI-NEXT: s_waitcnt vmcnt(0)
1078 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3c000000, v2
1079 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1080 ; SI-NEXT: s_endpgm
1081 ;
1082 ; VI-LABEL: v_test_v2i16_x_sub_0_1_0:
1083 ; VI: ; %bb.0:
1084 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1085 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1086 ; VI-NEXT: v_mov_b32_e32 v3, 0x3c00
1087 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1088 ; VI-NEXT: v_mov_b32_e32 v1, s3
1089 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1090 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1091 ; VI-NEXT: flat_load_dword v4, v[0:1]
1092 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1093 ; VI-NEXT: v_mov_b32_e32 v1, s1
1094 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1095 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1096 ; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1097 ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1098 ; VI-NEXT: flat_store_dword v[0:1], v2
1099 ; VI-NEXT: s_endpgm
1100 ;
1101 ; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0:
1102 ; GFX9: ; %bb.0:
1103 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1104 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1105 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1106 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1107 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1108 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1109 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1110 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1111 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1112 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1113 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1114 ; GFX9-NEXT: v_pk_sub_i16 v2, v3, -4.0 op_sel:[0,1] op_sel_hi:[1,0]
1115 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1116 ; GFX9-NEXT: s_endpgm
1117 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1118 %tid.ext = sext i32 %tid to i64
1119 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1120 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1121 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1122 %result = sub <2 x i16> %x, <i16 0, i16 -15360>
1123 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1124 ret void
1125 }
1126
; Subtracts <0, 17408> from the <2 x i16> loaded at %in[tid] and stores the
; result to %out[tid]; only the high element changes. 17408 is 0x4400, which
; the gfx900 checks print as the inline constant 4.0 applied to the high half,
; while the other two runs add its 16-bit negation 0xbc00 instead.
define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xbc000000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0xffffbc00
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v2, v3, 4.0 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  ; The work-item x id selects which element of %in and %out is used.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  ; Low element untouched; high element has 0x4400 subtracted.
  %result = sub <2 x i16> %x, <i16 0, i16 17408>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1188
1189 ; -32 isn't an inline immediate, but 32 is
; Adds the splat <-32, -32> to the <2 x i16> loaded at %in[tid] and stores the
; result to %out[tid]. The checks record mixed forms: the default run subtracts
; 32 from the low half and adds 0xffe00000 for the high half, the tonga run adds
; 0xffffffe0 to the low word and subtracts 32 from the high word, and the gfx900
; run keeps 0xffe0 in s4 as the v_pk_add_u16 operand.
define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg32_neg32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg32_neg32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 32
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 0xffffffe0, v4
; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_movk_i32 s4, 0xffe0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v2, v3, s4 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  ; The work-item x id selects which element of %in and %out is used.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  ; Both elements get -32 added.
  %result = add <2 x i16> %x, <i16 -32, i16 -32>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1257
; Adds <0, -32> to the <2 x i16> loaded at %in[tid] and stores the result to
; %out[tid]; only the high element changes. The default run adds 0xffe00000 to
; the whole dword, the tonga run subtracts 32 from the high word, and the gfx900
; run passes 0xffe00000 in s4 to v_pk_add_u16.
define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_0_neg32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_0_neg32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 32
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s4, 0xffe00000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v2, v3, s4
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  ; The work-item x id selects which element of %in and %out is used.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  ; Low element untouched; high element gets -32 added.
  %result = add <2 x i16> %x, <i16 0, i16 -32>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1320
; Adds <-32, 0> to the <2 x i16> loaded at %in[tid] and stores the result to
; %out[tid]; only the low element changes. The default run subtracts 32, masks
; to the low 16 bits and ORs the saved high half back in; the tonga run adds
; 0xffffffe0 with v_add_u16; the gfx900 run passes 0xffe0 in s4 to v_pk_add_u16.
define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg32_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT: v_add_u16_e32 v3, 0xffffffe0, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s4, 0xffe0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v2, v3, s4
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  ; The work-item x id selects which element of %in and %out is used.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  ; Low element gets -32 added; high element untouched.
  %result = add <2 x i16> %x, <i16 -32, i16 0>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1386
1387 ; 16 and -16 are both inline immediates
; Adds the splat <-16, -16> to the <2 x i16> loaded at %in[tid] and stores the
; result to %out[tid]. The tonga and gfx900 checks use -16 directly as an
; operand; the default run uses -16 for the low half and the literal 0xfff00000
; for the high half.
define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg16_neg16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, -16, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg16_neg16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, -16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, -16, v4
; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v2, v3, -16 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  ; The work-item x id selects which element of %in and %out is used.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  ; Both elements get -16 added.
  %result = add <2 x i16> %x, <i16 -16, i16 -16>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1454
; Adds <0, -16> to the <2 x i16> loaded at %in[tid] and stores the result to
; %out[tid]; only the high element changes. The default run adds 0xfff00000 to
; the whole dword, the tonga run adds -16 to the high word, and the gfx900 run
; uses -16 with op_sel routing it to the high half of v_pk_add_u16.
define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_0_neg16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_0_neg16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, -16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v2, v3, -16 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  ; The work-item x id selects which element of %in and %out is used.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  ; Low element untouched; high element gets -16 added.
  %result = add <2 x i16> %x, <i16 0, i16 -16>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1516
; Adds <-16, 0> to the <2 x i16> loaded at %in[tid] and stores the result to
; %out[tid]; only the low element changes. The default and tonga runs add -16
; to the low half and OR the saved high half back in; the gfx900 run passes -16
; to v_pk_add_u16 with no op_sel modifiers.
define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg16_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, -16, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg16_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT: v_add_u16_e32 v3, -16, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v2, v3, -16
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  ; The work-item x id selects which element of %in and %out is used.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  ; Low element gets -16 added; high element untouched.
  %result = add <2 x i16> %x, <i16 -16, i16 0>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1581
; Adds the splat <-15360, -15360> to the <2 x i16> loaded at %in[tid] and stores
; the result to %out[tid]. -15360 is 0xc400 as a 16-bit pattern: the default and
; tonga checks spell it as the literals 0xffffc400 / 0xc4000000, and the gfx900
; checks print the same bits as the inline constant -4.0.
define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg_fpone:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffc400, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xc4000000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg_fpone:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0xffffc400
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 0xffffc400, v4
; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v2, v3, -4.0 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  ; The work-item x id selects which element of %in and %out is used.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  ; Both elements get 0xc400 (-15360) added.
  %result = add <2 x i16> %x, <i16 -15360, i16 -15360>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1648
; Adds the splat <17408, 17408> to the <2 x i16> loaded at %in[tid] and stores
; the result to %out[tid]. 17408 is 0x4400: the default run uses the literals
; 0x4400 / 0x44000000, while the tonga low-half add and the gfx900 packed add
; print the same bits as the inline constant 4.0.
define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg_negfpone:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4400, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44000000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg_negfpone:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0x4400
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 4.0, v4
; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v2, v3, 4.0 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  ; The work-item x id selects which element of %in and %out is used.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  ; Both elements get 0x4400 (17408) added.
  %result = add <2 x i16> %x, <i16 17408, i16 17408>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1715
; Adds the splat <16384, 16384> to the <2 x i16> loaded at %in[tid] and stores
; the result to %out[tid]. 16384 is 0x4000: the default run adds the literal
; 0x4000 to the low half and prints the high-half addend as 2.0, and the tonga
; low-half add and the gfx900 packed add also print the operand as 2.0.
define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg_fptwo:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4000, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 2.0, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg_fptwo:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 2.0, v4
; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v2, v3, 2.0 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  ; The work-item x id selects which element of %in and %out is used.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  ; Both elements get 0x4000 (16384) added.
  %result = add <2 x i16> %x, <i16 16384, i16 16384>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1782
; Adds the splat <-16384, -16384> to the <2 x i16> loaded at %in[tid] and stores
; the result to %out[tid]. -16384 is 0xc000 as a 16-bit pattern: the default run
; adds the literal 0xffffc000 to the low half and prints the high-half addend as
; -2.0, the tonga run uses the literal 0xffffc000 for both halves, and the
; gfx900 packed add prints the operand as -2.0.
define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffc000, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, -2.0, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0xffffc000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 0xffffc000, v4
; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v2, v3, -2.0 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  ; The work-item x id selects which element of %in and %out is used.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  ; Both elements get 0xc000 (-16384) added.
  %result = add <2 x i16> %x, <i16 -16384, i16 -16384>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1849
; Adds <undef, -32> to the <2 x i16> loaded at %in[tid] and stores the result to
; %out[tid]; only the high element is defined. The default run clears the low
; half before adding 0xffe00000, the tonga run computes only the high word
; (subtracting 32) and stores it without merging a low word, and the gfx900 run
; passes 0xffe00000 in s4 to v_pk_add_u16.
define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_undef_neg32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_undef_neg32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 32
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s4, 0xffe00000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v2, v3, s4
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  ; The work-item x id selects which element of %in and %out is used.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  ; Low addend is undef; high element gets -32 added.
  %result = add <2 x i16> %x, <i16 undef, i16 -32>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1912
; Each workitem loads one <2 x i16> from %in, adds the constant vector
; <i16 -32, i16 undef> (only the low lane has a defined addend), and stores
; the sum to the matching element of %out.
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg32_undef:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg32_undef:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_subrev_u16_e32 v2, 32, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_movk_i32 s4, 0xffe0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v2, v3, s4
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  ; Low lane addend is -32, high lane addend is undef; this is the 0xffe0
  ; constant that appears in the expected code above.
  %result = add <2 x i16> %x, <i16 -32, i16 undef>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1974
; Declaration of the workitem-id intrinsic that the test kernels call to
; derive the element index used for their %in / %out addressing.
; NOTE(review): the leading digits on the lines below look like old/new
; diff-viewer line numbers fused into the text, not part of the IR -- confirm
; against the upstream test file.
5271975 declare i32 @llvm.amdgcn.workitem.id.x() #1
5281976
; Attribute group #0 is the one attached to the kernel definitions.
5291977 attributes #0 = { nounwind }