llvm.org GIT mirror: llvm / 508c80f

[NVPTX] Add support for efficient rotate instructions on SM 3.2+

Author: Justin Holewinski
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211934 91177308-0d34-0410-b5e6-96231b3b80d8

4 changed files with 247 additions and 4 deletions.
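What the change buys: on sm_32 and later, PTX has a funnel-shift instruction (shf), so 32-bit rotates can be emitted as a single shf.l.wrap.b32 / shf.r.wrap.b32 instead of a shift/subtract/shift/add sequence. As a rough sketch of the IR this affects (not part of the commit; the function name is illustrative), a variable-amount rotate through the llvm.nvvm.rotate.b32 intrinsic now lowers differently per target, and the new test at the bottom of this diff checks exactly these sequences:

declare i32 @llvm.nvvm.rotate.b32(i32, i32)

; Rotate %x left by %n (variable amount).
; sm_20 (no hw funnel shift): shl.b32 / sub.s32 / shr.b32 / add.u32
; sm_35 (hasHWROT32):         shf.l.wrap.b32
define i32 @example_rot32(i32 %x, i32 %n) {
  %r = tail call i32 @llvm.nvvm.rotate.b32(i32 %x, i32 %n)
  ret i32 %r
}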
     "llvm.nvvm.sust.p.3d.v4i32.trap">,
     GCCBuiltin<"__nvvm_sust_p_3d_v4i32_trap">;

+def int_nvvm_rotate_b32
+  : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+              [IntrNoMem], "llvm.nvvm.rotate.b32">,
+    GCCBuiltin<"__nvvm_rotate_b32">;
+
+def int_nvvm_rotate_b64
+  : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
+              [IntrNoMem], "llvm.nvvm.rotate.b64">,
+    GCCBuiltin<"__nvvm_rotate_b64">;
+
+def int_nvvm_rotate_right_b64
+  : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
+              [IntrNoMem], "llvm.nvvm.rotate.right.b64">,
+    GCCBuiltin<"__nvvm_rotate_right_b64">;
+
+def int_nvvm_swap_lo_hi_b64
+  : Intrinsic<[llvm_i64_ty], [llvm_i64_ty],
+              [IntrNoMem], "llvm.nvvm.swap.lo.hi.b64">,
+    GCCBuiltin<"__nvvm_swap_lo_hi_b64">;


 // Old PTX back-end intrinsics retained here for backwards-compatibility
 def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;

 def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
+def noHWROT32 : Predicate<"!Subtarget.hasHWROT32()">;

 def true : Predicate<"1">;

 defm SRA : RSHIFT_FORMAT<"shr.s", sra>;
 defm SRL : RSHIFT_FORMAT<"shr.u", srl>;

+//
+// Rotate: use ptx shf instruction if available.
+//
+
+// 32 bit r2 = rotl r1, n
+//    =>
+//        r2 = shf.l r1, r1, n
+def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
+                             (ins Int32Regs:$src, i32imm:$amt),
+                             "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+                             [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
+                   Requires<[hasHWROT32]>;
+
+def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
+                             (ins Int32Regs:$src, Int32Regs:$amt),
+                             "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+                             [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+                   Requires<[hasHWROT32]>;
+
+// 32 bit r2 = rotr r1, n
+//    =>
+//        r2 = shf.r r1, r1, n
+def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
+                             (ins Int32Regs:$src, i32imm:$amt),
+                             "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+                             [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
+                   Requires<[hasHWROT32]>;
+
+def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
+                             (ins Int32Regs:$src, Int32Regs:$amt),
+                             "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+                             [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+                   Requires<[hasHWROT32]>;
+
+//
+// Rotate: if ptx shf instruction is not available, then use shift+add
+//
 // 32bit
 def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst),
                             (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
 ...
   }]>;

 def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
-          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>;
+          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+      Requires<[noHWROT32]>;
 def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
-          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>;
+          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
+      Requires<[noHWROT32]>;

 def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
                                                          Int32Regs:$amt),
 ...
              !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t",
              !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
              !strconcat("}}", ""))))))))),
-             [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>;
+             [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+             Requires<[noHWROT32]>;

 def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
                                                          Int32Regs:$amt),
 ...
              !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t",
              !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
              !strconcat("}}", ""))))))))),
-             [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>;
+             [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+             Requires<[noHWROT32]>;

 // 64bit
 def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
 ...
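The patterns above cover rotates that reach instruction selection as the generic rotl/rotr nodes rather than through an intrinsic: with hasHWROT32 they now match the new ROTL32*/ROTR32*_hw definitions, and the pre-existing shift+add expansions are fenced off behind noHWROT32. A small illustrative sketch, not part of the commit (the function name and the constant amount are made up; 27 is simply 32 - 5), in the spirit of the rotl0 case in the new test:

; A constant-amount rotate written as shifts; the DAG recognizes it as rotl.
; sm_20: shl.b32 / shr.b32 / add.u32   (ROT32imm_sw)
; sm_35: shf.l.wrap.b32                (ROTL32imm_hw)
define i32 @rotl_by_5(i32 %x) {
  %hi = shl i32 %x, 5
  %lo = lshr i32 %x, 27
  %r  = or i32 %hi, %lo
  ret i32 %r
}

The hunk that follows applies the same gating to the llvm.nvvm.rotate.* intrinsics themselves.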
 def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;


+// rotate builtin support
+
+def ROTATE_B32_HW_IMM
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins  Int32Regs:$src, i32imm:$amt),
+              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+              [(set Int32Regs:$dst,
+                 (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>,
+    Requires<[hasHWROT32]>;
+
+def ROTATE_B32_HW_REG
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins  Int32Regs:$src, Int32Regs:$amt),
+              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+              [(set Int32Regs:$dst,
+                 (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>,
+    Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)),
+          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+      Requires<[noHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
+          (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>,
+      Requires<[noHWROT32]>;
+
+def GET_LO_INT64
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+              !strconcat("{{\n\t",
+              !strconcat(".reg .b32 %dummy;\n\t",
+              !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t",
+              !strconcat("}}", "")))),
+              []>;
+
+def GET_HI_INT64
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+              !strconcat("{{\n\t",
+              !strconcat(".reg .b32 %dummy;\n\t",
+              !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t",
+              !strconcat("}}", "")))),
+              []>;
+
+def PACK_TWO_INT32
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
+              "mov.b64 \t$dst, {{$lo, $hi}};", []>;
+
+def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src),
+          (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src),
+                          (GET_LO_INT64 Int64Regs:$src))>;
+
+// funnel shift, requires >= sm_32
+def SHF_L_WRAP_B32_IMM
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+              "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;", []>,
+    Requires<[hasHWROT32]>;
+
+def SHF_L_WRAP_B32_REG
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+              "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;", []>,
+    Requires<[hasHWROT32]>;
+
+def SHF_R_WRAP_B32_IMM
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+              "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;", []>,
+    Requires<[hasHWROT32]>;
+
+def SHF_R_WRAP_B32_REG
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+              "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;", []>,
+    Requires<[hasHWROT32]>;
+
+// HW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (PACK_TWO_INT32
+            (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+                                (GET_LO_INT64 Int64Regs:$src), imm:$amt),
+            (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+                                (GET_HI_INT64 Int64Regs:$src), imm:$amt))>,
+      Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+          (PACK_TWO_INT32
+            (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+                                (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt),
+            (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+                                (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+      Requires<[hasHWROT32]>;
+
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (PACK_TWO_INT32
+            (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+                                (GET_HI_INT64 Int64Regs:$src), imm:$amt),
+            (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+                                (GET_LO_INT64 Int64Regs:$src), imm:$amt))>,
+      Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+          (PACK_TWO_INT32
+            (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+                                (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt),
+            (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+                                (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+      Requires<[hasHWROT32]>;
+
+// SW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>,
+      Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+          (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+      Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>,
+      Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+          (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+      Requires<[noHWROT32]>;
+
+
 //-----------------------------------
 // Texture Intrinsics
 //-----------------------------------
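For the 64-bit intrinsics, the hardware path above unpacks the value into 32-bit halves (GET_LO_INT64 / GET_HI_INT64), funnel-shifts each half against the other with shf, and repacks with mov.b64 (PACK_TWO_INT32); without hardware support the pre-existing ROT64imm_sw / ROTL64reg_sw / ROTR64reg_sw expansions are used. The new test below covers llvm.nvvm.rotate.b32, rotate.b64, and rotate.right.b64 but not llvm.nvvm.swap.lo.hi.b64; here is a hedged usage sketch of the remaining cases (function names are illustrative, not from the commit):

declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
declare i64 @llvm.nvvm.swap.lo.hi.b64(i64)

; sm_35: two shf.r.wrap.b32 over the halves, then mov.b64 to repack.
define i64 @example_rotr64(i64 %x, i32 %n) {
  %r = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %x, i32 %n)
  ret i64 %r
}

; Swap the 32-bit halves of %x; selects the GET_HI/GET_LO + PACK_TWO_INT32
; pattern, which is not gated on hasHWROT32.
define i64 @example_swap_lo_hi(i64 %x) {
  %r = tail call i64 @llvm.nvvm.swap.lo.hi.b64(i64 %x)
  ret i64 %r
}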
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
+
+
+declare i32 @llvm.nvvm.rotate.b32(i32, i32)
+declare i64 @llvm.nvvm.rotate.b64(i64, i32)
+declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
+
+; SM20: rotate32
+; SM35: rotate32
+define i32 @rotate32(i32 %a, i32 %b) {
+; SM20: shl.b32
+; SM20: sub.s32
+; SM20: shr.b32
+; SM20: add.u32
+; SM35: shf.l.wrap.b32
+  %val = tail call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 %b)
+  ret i32 %val
+}
+
+; SM20: rotate64
+; SM35: rotate64
+define i64 @rotate64(i64 %a, i32 %b) {
+; SM20: shl.b64
+; SM20: sub.u32
+; SM20: shr.b64
+; SM20: add.u64
+; SM35: shf.l.wrap.b32
+; SM35: shf.l.wrap.b32
+  %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b)
+  ret i64 %val
+}
+
+; SM20: rotateright64
+; SM35: rotateright64
+define i64 @rotateright64(i64 %a, i32 %b) {
+; SM20: shr.b64
+; SM20: sub.u32
+; SM20: shl.b64
+; SM20: add.u64
+; SM35: shf.r.wrap.b32
+; SM35: shf.r.wrap.b32
+  %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b)
+  ret i64 %val
+}
+
+; SM20: rotl0
+; SM35: rotl0
+define i32 @rotl0(i32 %x) {
+; SM20: shl.b32
+; SM20: shr.b32
+; SM20: add.u32
+; SM35: shf.l.wrap.b32
+  %t0 = shl i32 %x, 8
+  %t1 = lshr i32 %x, 24
+  %t2 = or i32 %t0, %t1
+  ret i32 %t2
+}