llvm.org GIT mirror llvm / 87ba52d
[ARM] Add bitcast/extract_subvec. of fp16 vectors Summary: This patch adds some basic operations for fp16 vectors, such as bitcast from fp16 to i16, required to perform extract_subvector (also added here) and extract_element. Reviewers: SjoerdMeijer, DavidSpickett, t.p.northover, ostannard Reviewed By: ostannard Subscribers: javed.absar, kristof.beyls, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D60618 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359433 91177308-0d34-0410-b5e6-96231b3b80d8 Diogo N. Sampaio 8 months ago
3 changed file(s) with 871 addition(s) and 98 deletion(s). Raw diff Collapse all Expand all
66936693
66946694 def : AlignedVEXTq;
66956695
6696 def : AlignedVEXTq; // v8f16 -> v4f16
66966697
66976698 // VEXT : Vector Extract
66986699
71237124 Requires<[HasNEON, DontUseVMOVSR]>;
71247125
71257126 //===----------------------------------------------------------------------===//
7126 // Non-Instruction Patterns
7127 // Non-Instruction Patterns or Endiness - Revert Patterns
71277128 //===----------------------------------------------------------------------===//
71287129
71297130 // bit_convert
7131 // 64 bit conversions
7132 def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>;
7133 def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>;
7134
7135 def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
7136 def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
7137
7138 def : Pat<(v4i16 (bitconvert (v4f16 DPR:$src))), (v4i16 DPR:$src)>;
7139 def : Pat<(v4f16 (bitconvert (v4i16 DPR:$src))), (v4f16 DPR:$src)>;
7140
7141 // 128 bit conversions
7142 def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
7143 def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
7144
7145 def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
7146 def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
7147
7148 def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>;
7149 def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>;
7150
71307151 let Predicates = [IsLE] in {
7152 // 64 bit conversions
7153 def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>;
7154 def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
7155 def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>;
7156 def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>;
7157 def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>;
7158
7159 def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
71317160 def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>;
7161 def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (v1i64 DPR:$src)>;
71327162 def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>;
71337163 def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>;
7134 }
7135 def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>;
7136 let Predicates = [IsLE] in {
7137 def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
7164
7165 def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>;
7166 def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
7167 def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (v2f32 DPR:$src)>;
7168 def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
7169 def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>;
7170
7171 def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>;
71387172 def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>;
7173 def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (v2i32 DPR:$src)>;
71397174 def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>;
71407175 def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>;
7141 def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>;
7142 }
7143 def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
7144 let Predicates = [IsLE] in {
7176
7177 def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (v4f16 DPR:$src)>;
7178 def : Pat<(v4f16 (bitconvert (v1i64 DPR:$src))), (v4f16 DPR:$src)>;
7179 def : Pat<(v4f16 (bitconvert (v2f32 DPR:$src))), (v4f16 DPR:$src)>;
7180 def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (v4f16 DPR:$src)>;
7181 def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (v4f16 DPR:$src)>;
7182
7183 def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>;
71457184 def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>;
7185 def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
71467186 def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>;
71477187 def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>;
7148 def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>;
7149 def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
7188
7189 def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>;
71507190 def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>;
7191 def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>;
71517192 def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>;
7193 def : Pat<(v8i8 (bitconvert (v4f16 DPR:$src))), (v8i8 DPR:$src)>;
71527194 def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>;
7153 def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>;
7154 def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>;
7155 }
7156 def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>;
7157 let Predicates = [IsLE] in {
7158 def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
7159 def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>;
7160 def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>;
7161 def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>;
7162 def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>;
7163 def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>;
7164 def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (v4f16 DPR:$src)>;
7165 def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
7166 }
7167 def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
7168 let Predicates = [IsLE] in {
7169 def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
7170 def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>;
7171 }
7172
7173 let Predicates = [IsLE] in {
7195
7196 // 128 bit conversions
7197 def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
7198 def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
7199 def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
7200 def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
7201 def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
7202
7203 def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
71747204 def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
7205 def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>;
71757206 def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
71767207 def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
7177 }
7178 def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
7179 let Predicates = [IsLE] in {
7180 def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
7208
7209 def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
7210 def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
7211 def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>;
7212 def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
7213 def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
7214
7215 def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
71817216 def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
7217 def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>;
71827218 def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
71837219 def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
7184 def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
7185 }
7186 def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
7187 let Predicates = [IsLE] in {
7220
7221 def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
7222 def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 QPR:$src)>;
7223 def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>;
7224 def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>;
7225 def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>;
7226
7227 def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
71887228 def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
7229 def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
71897230 def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
71907231 def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
7191 def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
7192 def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
7193 def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
7232
7233 def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
71947234 def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
7235 def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
71957236 def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
7237 def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>;
71967238 def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
7197 def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
7198 def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
7199 def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
7200 }
7201 def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
7202 let Predicates = [IsLE] in {
7203 def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
7204 def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
7205 def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
7206 }
7207 def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
7208 let Predicates = [IsLE] in {
7209 def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
7210 def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
7211 def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
7212 def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
7213 def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
72147239 }
72157240
72167241 let Predicates = [IsBE] in {
72177242 // 64 bit conversions
7218 def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
7219 def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
7220 def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
7221 def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
7222 def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
7223 def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
7224 def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
7225 def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
7226 def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
7227 def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
7228 def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>;
7229 def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
7230 def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
7231 def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (VREV64d8 DPR:$src)>;
7232 def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (VREV32d8 DPR:$src)>;
7233 def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (VREV16d8 DPR:$src)>;
7234 def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>;
7235 def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>;
7243 def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
72367244 def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
72377245 def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
72387246 def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
72397247 def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
7240 def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
7248
7249 def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
7250 def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
7251 def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
7252 def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
7253 def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
7254
72417255 def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
72427256 def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
7257 def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>;
72437258 def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
72447259 def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
72457260
7261 def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
7262 def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
7263 def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>;
7264 def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
7265 def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
7266
7267 def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
7268 def : Pat<(v4f16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
7269 def : Pat<(v4f16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
7270 def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
7271 def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>;
7272
7273 def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
7274 def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
7275 def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
7276 def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
7277 def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>;
7278
7279 def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>;
7280 def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (VREV64d8 DPR:$src)>;
7281 def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>;
7282 def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (VREV32d8 DPR:$src)>;
7283 def : Pat<(v8i8 (bitconvert (v4f16 DPR:$src))), (VREV16d8 DPR:$src)>;
7284 def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (VREV16d8 DPR:$src)>;
7285
72467286 // 128 bit conversions
7287 def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
7288 def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
7289 def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
7290 def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
7291 def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
7292
7293 def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
72477294 def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
7295 def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
72487296 def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
72497297 def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
7250 def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
7298
7299 def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
7300 def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
7301 def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
7302 def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
7303 def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
7304
7305 def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
72517306 def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
7307 def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
72527308 def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
72537309 def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
7254 def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
7310
7311 def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
7312 def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
7313 def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
7314 def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
7315 def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
7316
7317 def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
72557318 def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
7319 def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
72567320 def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
72577321 def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
7258 def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
7259 def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
7260 def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
7322
7323 def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>;
72617324 def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>;
7325 def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;
72627326 def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>;
7327 def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (VREV16q8 QPR:$src)>;
72637328 def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;
7264 def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>;
7265 def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;
7266 def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
7267 def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
7268 def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
7269 def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
7270 def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
7271 def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
7272 def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
7273 def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
7274 def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
7275 def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
72767329 }
72777330
72787331 // Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -mtriple armeb-eabi -mattr=armv8.2-a,neon,fullfp16 -target-abi=aapcs-gnu -float-abi hard -o - %s | FileCheck %s
2
3 ;64 bit conversions to v4f16
4 define void @conv_i64_to_v4f16( i64 %val, <4 x half>* %store ) {
5 ; CHECK-LABEL: conv_i64_to_v4f16:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: vmov d16, r1, r0
8 ; CHECK-NEXT: vldr d17, [r2]
9 ; CHECK-NEXT: vrev64.16 d16, d16
10 ; CHECK-NEXT: vrev64.16 d17, d17
11 ; CHECK-NEXT: vadd.f16 d16, d16, d17
12 ; CHECK-NEXT: vrev64.16 d16, d16
13 ; CHECK-NEXT: vstr d16, [r2]
14 ; CHECK-NEXT: bx lr
15 entry:
16 %v = bitcast i64 %val to <4 x half>
17 %w = load <4 x half>, <4 x half>* %store
18 %a = fadd <4 x half> %v, %w
19 store <4 x half> %a, <4 x half>* %store
20 ret void
21 }
22
23 define void @conv_f64_to_v4f16( double %val, <4 x half>* %store ) {
24 ; CHECK-LABEL: conv_f64_to_v4f16:
25 ; CHECK: @ %bb.0: @ %entry
26 ; CHECK-NEXT: vldr d16, [r0]
27 ; CHECK-NEXT: vrev64.16 d17, d0
28 ; CHECK-NEXT: vrev64.16 d16, d16
29 ; CHECK-NEXT: vadd.f16 d16, d17, d16
30 ; CHECK-NEXT: vrev64.16 d16, d16
31 ; CHECK-NEXT: vstr d16, [r0]
32 ; CHECK-NEXT: bx lr
33 entry:
34 %v = bitcast double %val to <4 x half>
35 %w = load <4 x half>, <4 x half>* %store
36 %a = fadd <4 x half> %v, %w
37 store <4 x half> %a, <4 x half>* %store
38 ret void
39 }
40
41 define void @conv_v2f32_to_v4f16( <2 x float> %a, <4 x half>* %store ) {
42 ; CHECK-LABEL: conv_v2f32_to_v4f16:
43 ; CHECK: @ %bb.0: @ %entry
44 ; CHECK-NEXT: vldr d16, .LCPI2_0
45 ; CHECK-NEXT: vrev64.32 d17, d0
46 ; CHECK-NEXT: vrev64.32 d16, d16
47 ; CHECK-NEXT: vadd.f32 d16, d17, d16
48 ; CHECK-NEXT: vldr d17, [r0]
49 ; CHECK-NEXT: vrev64.16 d17, d17
50 ; CHECK-NEXT: vrev32.16 d16, d16
51 ; CHECK-NEXT: vadd.f16 d16, d16, d17
52 ; CHECK-NEXT: vrev64.16 d16, d16
53 ; CHECK-NEXT: vstr d16, [r0]
54 ; CHECK-NEXT: bx lr
55 ; CHECK-NEXT: .p2align 3
56 ; CHECK-NEXT: @ %bb.1:
57 ; CHECK-NEXT: .LCPI2_0:
58 ; CHECK-NEXT: .long 3212836864 @ float -1
59 ; CHECK-NEXT: .long 1065353216 @ float 1
60 entry:
61 %c = fadd <2 x float> %a,
62 %v = bitcast <2 x float> %c to <4 x half>
63 %w = load <4 x half>, <4 x half>* %store
64 %z = fadd <4 x half> %v, %w
65 store <4 x half> %z, <4 x half>* %store
66 ret void
67 }
68
69 define void @conv_v2i32_to_v4f16( <2 x i32> %a, <4 x half>* %store ) {
70 ; CHECK-LABEL: conv_v2i32_to_v4f16:
71 ; CHECK: @ %bb.0: @ %entry
72 ; CHECK-NEXT: vldr d16, .LCPI3_0
73 ; CHECK-NEXT: vrev64.32 d17, d0
74 ; CHECK-NEXT: vrev64.32 d16, d16
75 ; CHECK-NEXT: vadd.i32 d16, d17, d16
76 ; CHECK-NEXT: vldr d18, [r0]
77 ; CHECK-NEXT: vrev64.16 d17, d18
78 ; CHECK-NEXT: vrev32.16 d16, d16
79 ; CHECK-NEXT: vadd.f16 d16, d16, d17
80 ; CHECK-NEXT: vrev64.16 d16, d16
81 ; CHECK-NEXT: vstr d16, [r0]
82 ; CHECK-NEXT: bx lr
83 ; CHECK-NEXT: .p2align 3
84 ; CHECK-NEXT: @ %bb.1:
85 ; CHECK-NEXT: .LCPI3_0:
86 ; CHECK-NEXT: .long 1 @ 0x1
87 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
88 entry:
89 %c = add <2 x i32> %a,
90 %v = bitcast <2 x i32> %c to <4 x half>
91 %w = load <4 x half>, <4 x half>* %store
92 %z = fadd <4 x half> %v, %w
93 store <4 x half> %z, <4 x half>* %store
94 ret void
95 }
96
97 define void @conv_v4i16_to_v4f16( <4 x i16> %a, <4 x half>* %store ) {
98 ; CHECK-LABEL: conv_v4i16_to_v4f16:
99 ; CHECK: @ %bb.0: @ %entry
100 ; CHECK-NEXT: vmov.i64 d16, #0xffffffff0000
101 ; CHECK-NEXT: vldr d17, [r0]
102 ; CHECK-NEXT: vrev64.16 d18, d0
103 ; CHECK-NEXT: vrev64.16 d17, d17
104 ; CHECK-NEXT: vrev64.16 d16, d16
105 ; CHECK-NEXT: vadd.i16 d16, d18, d16
106 ; CHECK-NEXT: vadd.f16 d16, d16, d17
107 ; CHECK-NEXT: vrev64.16 d16, d16
108 ; CHECK-NEXT: vstr d16, [r0]
109 ; CHECK-NEXT: bx lr
110 entry:
111 %c = add <4 x i16> %a,
112 %v = bitcast <4 x i16> %c to <4 x half>
113 %w = load <4 x half>, <4 x half>* %store
114 %z = fadd <4 x half> %v, %w
115 store <4 x half> %z, <4 x half>* %store
116 ret void
117 }
118
119 define void @conv_v8i8_to_v4f16( <8 x i8> %a, <4 x half>* %store ) {
120 ; CHECK-LABEL: conv_v8i8_to_v4f16:
121 ; CHECK: @ %bb.0: @ %entry
122 ; CHECK-NEXT: vmov.i8 d16, #0x1
123 ; CHECK-NEXT: vrev64.8 d17, d0
124 ; CHECK-NEXT: vldr d18, [r0]
125 ; CHECK-NEXT: vadd.i8 d16, d17, d16
126 ; CHECK-NEXT: vrev64.16 d17, d18
127 ; CHECK-NEXT: vrev16.8 d16, d16
128 ; CHECK-NEXT: vadd.f16 d16, d16, d17
129 ; CHECK-NEXT: vrev64.16 d16, d16
130 ; CHECK-NEXT: vstr d16, [r0]
131 ; CHECK-NEXT: bx lr
132 entry:
133 %c = add <8 x i8> %a,
134 %v = bitcast <8 x i8> %c to <4 x half>
135 %w = load <4 x half>, <4 x half>* %store
136 %z = fadd <4 x half> %v, %w
137 store <4 x half> %z, <4 x half>* %store
138 ret void
139 }
140
141 define void @conv_v2i64_to_v8f16( <2 x i64> %val, <8 x half>* %store ) {
142 ; CHECK-LABEL: conv_v2i64_to_v8f16:
143 ; CHECK: @ %bb.0: @ %entry
144 ; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
145 ; CHECK-NEXT: adr r1, .LCPI6_0
146 ; CHECK-NEXT: vld1.64 {d18, d19}, [r1:128]
147 ; CHECK-NEXT: vadd.i64 q9, q0, q9
148 ; CHECK-NEXT: vrev64.16 q8, q8
149 ; CHECK-NEXT: vrev64.16 q9, q9
150 ; CHECK-NEXT: vadd.f16 q8, q9, q8
151 ; CHECK-NEXT: vrev64.16 q8, q8
152 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
153 ; CHECK-NEXT: bx lr
154 ; CHECK-NEXT: .p2align 4
155 ; CHECK-NEXT: @ %bb.1:
156 ; CHECK-NEXT: .LCPI6_0:
157 ; CHECK-NEXT: .long 0 @ 0x0
158 ; CHECK-NEXT: .long 1 @ 0x1
159 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
160 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
161 entry:
162 %v = add <2 x i64> %val,
163 %v1 = bitcast <2 x i64> %v to <8 x half>
164 %w = load <8 x half>, <8 x half>* %store
165 %a = fadd <8 x half> %v1, %w
166 store <8 x half> %a, <8 x half>* %store
167 ret void
168 }
169 define void @conv_v2f64_to_v8f16( <2 x double> %val, <8 x half>* %store ) {
170 ; CHECK-LABEL: conv_v2f64_to_v8f16:
171 ; CHECK: @ %bb.0: @ %entry
172 ; CHECK-NEXT: vmov.f64 d16, #-1.000000e+00
173 ; CHECK-NEXT: vmov.f64 d17, #1.000000e+00
174 ; CHECK-NEXT: vadd.f64 d19, d1, d16
175 ; CHECK-NEXT: vadd.f64 d18, d0, d17
176 ; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
177 ; CHECK-NEXT: vrev64.16 q8, q8
178 ; CHECK-NEXT: vrev64.16 q9, q9
179 ; CHECK-NEXT: vadd.f16 q8, q9, q8
180 ; CHECK-NEXT: vrev64.16 q8, q8
181 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
182 ; CHECK-NEXT: bx lr
183 entry:
184 %v = fadd <2 x double> %val,
185 %v1 = bitcast <2 x double> %v to <8 x half>
186 %w = load <8 x half>, <8 x half>* %store
187 %a = fadd <8 x half> %v1, %w
188 store <8 x half> %a, <8 x half>* %store
189 ret void
190 }
191
192 define void @conv_v4f32_to_v8f16( <4 x float> %a, <8 x half>* %store ) {
193 ; CHECK-LABEL: conv_v4f32_to_v8f16:
194 ; CHECK: @ %bb.0: @ %entry
195 ; CHECK-NEXT: adr r1, .LCPI8_0
196 ; CHECK-NEXT: vrev64.32 q9, q0
197 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128]
198 ; CHECK-NEXT: vrev64.32 q8, q8
199 ; CHECK-NEXT: vadd.f32 q8, q9, q8
200 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
201 ; CHECK-NEXT: vrev64.16 q9, q9
202 ; CHECK-NEXT: vrev32.16 q8, q8
203 ; CHECK-NEXT: vadd.f16 q8, q8, q9
204 ; CHECK-NEXT: vrev64.16 q8, q8
205 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
206 ; CHECK-NEXT: bx lr
207 ; CHECK-NEXT: .p2align 4
208 ; CHECK-NEXT: @ %bb.1:
209 ; CHECK-NEXT: .LCPI8_0:
210 ; CHECK-NEXT: .long 3212836864 @ float -1
211 ; CHECK-NEXT: .long 1065353216 @ float 1
212 ; CHECK-NEXT: .long 3212836864 @ float -1
213 ; CHECK-NEXT: .long 1065353216 @ float 1
214 entry:
215 %c = fadd <4 x float> %a,
216 %v = bitcast <4 x float> %c to <8 x half>
217 %w = load <8 x half>, <8 x half>* %store
218 %z = fadd <8 x half> %v, %w
219 store <8 x half> %z, <8 x half>* %store
220 ret void
221 }
222
223 define void @conv_v4i32_to_v8f16( <4 x i32> %a, <8 x half>* %store ) {
224 ; CHECK-LABEL: conv_v4i32_to_v8f16:
225 ; CHECK: @ %bb.0: @ %entry
226 ; CHECK-NEXT: adr r1, .LCPI9_0
227 ; CHECK-NEXT: vrev64.32 q9, q0
228 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128]
229 ; CHECK-NEXT: vrev64.32 q8, q8
230 ; CHECK-NEXT: vadd.i32 q8, q9, q8
231 ; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
232 ; CHECK-NEXT: vrev64.16 q9, q10
233 ; CHECK-NEXT: vrev32.16 q8, q8
234 ; CHECK-NEXT: vadd.f16 q8, q8, q9
235 ; CHECK-NEXT: vrev64.16 q8, q8
236 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
237 ; CHECK-NEXT: bx lr
238 ; CHECK-NEXT: .p2align 4
239 ; CHECK-NEXT: @ %bb.1:
240 ; CHECK-NEXT: .LCPI9_0:
241 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
242 ; CHECK-NEXT: .long 1 @ 0x1
243 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
244 ; CHECK-NEXT: .long 1 @ 0x1
245 entry:
246 %c = add <4 x i32> %a,
247 %v = bitcast <4 x i32> %c to <8 x half>
248 %w = load <8 x half>, <8 x half>* %store
249 %z = fadd <8 x half> %v, %w
250 store <8 x half> %z, <8 x half>* %store
251 ret void
252 }
253
254 define void @conv_v8i16_to_v8f16( <8 x i16> %a, <8 x half>* %store ) {
255 ; CHECK-LABEL: conv_v8i16_to_v8f16:
256 ; CHECK: @ %bb.0: @ %entry
257 ; CHECK-NEXT: adr r1, .LCPI10_0
258 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
259 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128]
260 ; CHECK-NEXT: vrev64.16 q10, q0
261 ; CHECK-NEXT: vrev64.16 q8, q8
262 ; CHECK-NEXT: vrev64.16 q9, q9
263 ; CHECK-NEXT: vadd.i16 q8, q10, q8
264 ; CHECK-NEXT: vadd.f16 q8, q8, q9
265 ; CHECK-NEXT: vrev64.16 q8, q8
266 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
267 ; CHECK-NEXT: bx lr
268 ; CHECK-NEXT: .p2align 4
269 ; CHECK-NEXT: @ %bb.1:
270 ; CHECK-NEXT: .LCPI10_0:
271 ; CHECK-NEXT: .short 65535 @ 0xffff
272 ; CHECK-NEXT: .short 1 @ 0x1
273 ; CHECK-NEXT: .short 0 @ 0x0
274 ; CHECK-NEXT: .short 7 @ 0x7
275 ; CHECK-NEXT: .short 65535 @ 0xffff
276 ; CHECK-NEXT: .short 1 @ 0x1
277 ; CHECK-NEXT: .short 0 @ 0x0
278 ; CHECK-NEXT: .short 7 @ 0x7
279 entry:
280 %c = add <8 x i16> %a,
281 %v = bitcast <8 x i16> %c to <8 x half>
282 %w = load <8 x half>, <8 x half>* %store
283 %z = fadd <8 x half> %v, %w
284 store <8 x half> %z, <8 x half>* %store
285 ret void
286 }
287
288 define void @conv_v16i8_to_v8f16( <16 x i8> %a, <8 x half>* %store ) {
289 ; CHECK-LABEL: conv_v16i8_to_v8f16:
290 ; CHECK: @ %bb.0: @ %entry
291 ; CHECK-NEXT: vrev64.8 q8, q0
292 ; CHECK-NEXT: vmov.i8 q9, #0x1
293 ; CHECK-NEXT: vadd.i8 q8, q8, q9
294 ; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
295 ; CHECK-NEXT: vrev64.16 q9, q10
296 ; CHECK-NEXT: vrev16.8 q8, q8
297 ; CHECK-NEXT: vadd.f16 q8, q8, q9
298 ; CHECK-NEXT: vrev64.16 q8, q8
299 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
300 ; CHECK-NEXT: bx lr
301 entry:
302 %c = add <16 x i8> %a,
303 %v = bitcast <16 x i8> %c to <8 x half>
304 %w = load <8 x half>, <8 x half>* %store
305 %z = fadd <8 x half> %v, %w
306 store <8 x half> %z, <8 x half>* %store
307 ret void
308 }
309
310 define void @conv_v4f16_to_i64( <4 x half> %a, i64* %store ) {
311 ; CHECK-LABEL: conv_v4f16_to_i64:
312 ; CHECK: @ %bb.0: @ %entry
313 ; CHECK-NEXT: vldr d16, .LCPI12_0
314 ; CHECK-NEXT: vrev64.16 d17, d0
315 ; CHECK-NEXT: vrev64.16 d16, d16
316 ; CHECK-NEXT: vadd.f16 d16, d17, d16
317 ; CHECK-NEXT: vrev64.16 d16, d16
318 ; CHECK-NEXT: vmov r1, r2, d16
319 ; CHECK-NEXT: subs r1, r1, #1
320 ; CHECK-NEXT: sbc r2, r2, #0
321 ; CHECK-NEXT: str r2, [r0]
322 ; CHECK-NEXT: str r1, [r0, #4]
323 ; CHECK-NEXT: bx lr
324 ; CHECK-NEXT: .p2align 3
325 ; CHECK-NEXT: @ %bb.1:
326 ; CHECK-NEXT: .LCPI12_0:
327 ; CHECK-NEXT: .short 48128 @ half -1
328 ; CHECK-NEXT: .short 15360 @ half 1
329 ; CHECK-NEXT: .short 48128 @ half -1
330 ; CHECK-NEXT: .short 15360 @ half 1
331 entry:
332 %z = fadd <4 x half> %a,
333 %y = bitcast <4 x half> %z to i64
334 %w = add i64 %y, -1
335 store i64 %w, i64* %store
336 ret void
337 }
338
339 define void @conv_v4f16_to_f64( <4 x half> %a, double* %store ) {
340 ; CHECK-LABEL: conv_v4f16_to_f64:
341 ; CHECK: @ %bb.0: @ %entry
342 ; CHECK-NEXT: vldr d16, .LCPI13_0
343 ; CHECK-NEXT: vrev64.16 d17, d0
344 ; CHECK-NEXT: vrev64.16 d16, d16
345 ; CHECK-NEXT: vadd.f16 d16, d17, d16
346 ; CHECK-NEXT: vmov.f64 d17, #-1.000000e+00
347 ; CHECK-NEXT: vrev64.16 d16, d16
348 ; CHECK-NEXT: vadd.f64 d16, d16, d17
349 ; CHECK-NEXT: vstr d16, [r0]
350 ; CHECK-NEXT: bx lr
351 ; CHECK-NEXT: .p2align 3
352 ; CHECK-NEXT: @ %bb.1:
353 ; CHECK-NEXT: .LCPI13_0:
354 ; CHECK-NEXT: .short 48128 @ half -1
355 ; CHECK-NEXT: .short 15360 @ half 1
356 ; CHECK-NEXT: .short 48128 @ half -1
357 ; CHECK-NEXT: .short 15360 @ half 1
358 entry:
359 %z = fadd <4 x half> %a,
360 %y = bitcast <4 x half> %z to double
361 %w = fadd double %y, -1.0
362 store double %w, double* %store
363 ret void
364 }
365
; Bitcast <4 x half> -> <2 x i32>; the i32 add constant is recovered from
; the .LCPI14_1 pool (0xffffffff, 1), the fp16 constant from .LCPI14_0.
define void @conv_v4f16_to_v2i32( <4 x half> %a, <2 x i32>* %store ) {
; CHECK-LABEL: conv_v4f16_to_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, .LCPI14_0
; CHECK-NEXT:    vrev64.16 d17, d0
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d17, d16
; CHECK-NEXT:    vldr d17, .LCPI14_1
; CHECK-NEXT:    vrev64.32 d17, d17
; CHECK-NEXT:    vrev32.16 d16, d16
; CHECK-NEXT:    vadd.i32 d16, d16, d17
; CHECK-NEXT:    vrev64.32 d16, d16
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI14_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:  .LCPI14_1:
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:    .long 1 @ 0x1
entry:
  %z = fadd <4 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <4 x half> %z to <2 x i32>
  %w = add <2 x i32> %y, <i32 -1, i32 1>
  store <2 x i32> %w, <2 x i32>* %store
  ret void
}
397
; Bitcast <4 x half> -> <2 x float>; f32 add constant recovered from the
; .LCPI15_1 pool (float -1, float 1).
define void @conv_v4f16_to_v2f32( <4 x half> %a, <2 x float>* %store ) {
; CHECK-LABEL: conv_v4f16_to_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, .LCPI15_0
; CHECK-NEXT:    vrev64.16 d17, d0
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d17, d16
; CHECK-NEXT:    vldr d17, .LCPI15_1
; CHECK-NEXT:    vrev64.32 d17, d17
; CHECK-NEXT:    vrev32.16 d16, d16
; CHECK-NEXT:    vadd.f32 d16, d16, d17
; CHECK-NEXT:    vrev64.32 d16, d16
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI15_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:  .LCPI15_1:
; CHECK-NEXT:    .long 3212836864 @ float -1
; CHECK-NEXT:    .long 1065353216 @ float 1
entry:
  %z = fadd <4 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <4 x half> %z to <2 x float>
  %w = fadd <2 x float> %y, <float -1.0, float 1.0>
  store <2 x float> %w, <2 x float>* %store
  ret void
}
429
; Bitcast <4 x half> -> <4 x i16>; i16 add constant recovered from the
; .LCPI16_1 pool (0xffff, 1, 0, 7).
define void @conv_v4f16_to_v4i16( <4 x half> %a, <4 x i16>* %store ) {
; CHECK-LABEL: conv_v4f16_to_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, .LCPI16_0
; CHECK-NEXT:    vrev64.16 d17, d0
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d17, d16
; CHECK-NEXT:    vldr d17, .LCPI16_1
; CHECK-NEXT:    vrev64.16 d17, d17
; CHECK-NEXT:    vadd.i16 d16, d16, d17
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI16_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:  .LCPI16_1:
; CHECK-NEXT:    .short 65535 @ 0xffff
; CHECK-NEXT:    .short 1 @ 0x1
; CHECK-NEXT:    .short 0 @ 0x0
; CHECK-NEXT:    .short 7 @ 0x7
entry:
  %z = fadd <4 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <4 x half> %z to <4 x i16>
  %w = add <4 x i16> %y, <i16 -1, i16 1, i16 0, i16 7>
  store <4 x i16> %w, <4 x i16>* %store
  ret void
}
462
; Bitcast <4 x half> -> <8 x i8>; the i8 addend is an all-ones splat of 1,
; matching the `vmov.i8 d17, #0x1` in the checks.
define void @conv_v4f16_to_v8f8( <4 x half> %a, <8 x i8>* %store ) {
; CHECK-LABEL: conv_v4f16_to_v8f8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, .LCPI17_0
; CHECK-NEXT:    vrev64.16 d17, d0
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d17, d16
; CHECK-NEXT:    vmov.i8 d17, #0x1
; CHECK-NEXT:    vrev16.8 d16, d16
; CHECK-NEXT:    vadd.i8 d16, d16, d17
; CHECK-NEXT:    vrev64.8 d16, d16
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI17_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
entry:
  %z = fadd <4 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <4 x half> %z to <8 x i8>
  %w = add <8 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store <8 x i8> %w, <8 x i8>* %store
  ret void
}
490
; Bitcast <8 x half> -> i128 after an fp16 add; the 8-element fadd constant
; alternates -1/+1, matching the .LCPI18_0 pool.
define void @conv_v8f16_to_i128( <8 x half> %a, i128* %store ) {
; CHECK-LABEL: conv_v8f16_to_i128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI18_0
; CHECK-NEXT:    vrev64.16 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vrev32.16 q8, q8
; CHECK-NEXT:    vmov.32 r12, d17[1]
; CHECK-NEXT:    vmov.32 r2, d17[0]
; CHECK-NEXT:    vmov.32 r3, d16[1]
; CHECK-NEXT:    vmov.32 r1, d16[0]
; CHECK-NEXT:    subs r12, r12, #1
; CHECK-NEXT:    sbcs r2, r2, #0
; CHECK-NEXT:    sbcs r3, r3, #0
; CHECK-NEXT:    sbc r1, r1, #0
; CHECK-NEXT:    stm r0, {r1, r3}
; CHECK-NEXT:    str r2, [r0, #8]
; CHECK-NEXT:    str r12, [r0, #12]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI18_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
entry:
  %z = fadd <8 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <8 x half> %z to i128
  %w = add i128 %y, -1
  store i128 %w, i128* %store
  ret void
}
530
; Bitcast <8 x half> -> <2 x double>; the f64 addend <-1, 1> is recovered
; from the `vmov.f64 #±1.0` pair feeding the two vadd.f64 instructions.
define void @conv_v8f16_to_v2f64( <8 x half> %a, <2 x double>* %store ) {
; CHECK-LABEL: conv_v8f16_to_v2f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI19_0
; CHECK-NEXT:    vrev64.16 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vmov.f64 d18, #1.000000e+00
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vmov.f64 d19, #-1.000000e+00
; CHECK-NEXT:    vadd.f64 d21, d17, d18
; CHECK-NEXT:    vadd.f64 d20, d16, d19
; CHECK-NEXT:    vst1.64 {d20, d21}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI19_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
entry:
  %z = fadd <8 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <8 x half> %z to <2 x double>
  %w = fadd <2 x double> %y, <double -1.0, double 1.0>
  store <2 x double> %w, <2 x double>* %store
  ret void
}
564
; Bitcast <8 x half> -> <4 x i32>; i32 add constant recovered from the
; .LCPI20_1 pool (0xffffffff, 1, 0xffffffff, 1).
define void @conv_v8f16_to_v4i32( <8 x half> %a, <4 x i32>* %store ) {
; CHECK-LABEL: conv_v8f16_to_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI20_0
; CHECK-NEXT:    vrev64.16 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    adr r1, .LCPI20_1
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1:128]
; CHECK-NEXT:    vrev64.32 q9, q9
; CHECK-NEXT:    vrev32.16 q8, q8
; CHECK-NEXT:    vadd.i32 q8, q8, q9
; CHECK-NEXT:    vrev64.32 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI20_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:  .LCPI20_1:
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:    .long 1 @ 0x1
entry:
  %z = fadd <8 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <8 x half> %z to <4 x i32>
  %w = add <4 x i32> %y, <i32 -1, i32 1, i32 -1, i32 1>
  store <4 x i32> %w, <4 x i32>* %store
  ret void
}
604
; Bitcast <8 x half> -> <4 x float>; f32 add constant recovered from the
; .LCPI21_1 pool (float -1, 1, -1, 1).
define void @conv_v8f16_to_v4f32( <8 x half> %a, <4 x float>* %store ) {
; CHECK-LABEL: conv_v8f16_to_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI21_0
; CHECK-NEXT:    vrev64.16 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    adr r1, .LCPI21_1
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1:128]
; CHECK-NEXT:    vrev64.32 q9, q9
; CHECK-NEXT:    vrev32.16 q8, q8
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vrev64.32 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI21_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:  .LCPI21_1:
; CHECK-NEXT:    .long 3212836864 @ float -1
; CHECK-NEXT:    .long 1065353216 @ float 1
; CHECK-NEXT:    .long 3212836864 @ float -1
; CHECK-NEXT:    .long 1065353216 @ float 1
entry:
  %z = fadd <8 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <8 x half> %z to <4 x float>
  %w = fadd <4 x float> %y, <float -1.0, float 1.0, float -1.0, float 1.0>
  store <4 x float> %w, <4 x float>* %store
  ret void
}
644
; Bitcast <8 x half> -> <8 x i16>; i16 add constant recovered from the
; .LCPI22_1 pool (0xffff, 1, 0, 7, repeated).
define void @conv_v8f16_to_v8i16( <8 x half> %a, <8 x i16>* %store ) {
; CHECK-LABEL: conv_v8f16_to_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI22_0
; CHECK-NEXT:    vrev64.16 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    adr r1, .LCPI22_1
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1:128]
; CHECK-NEXT:    vrev64.16 q9, q9
; CHECK-NEXT:    vadd.i16 q8, q8, q9
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI22_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:  .LCPI22_1:
; CHECK-NEXT:    .short 65535 @ 0xffff
; CHECK-NEXT:    .short 1 @ 0x1
; CHECK-NEXT:    .short 0 @ 0x0
; CHECK-NEXT:    .short 7 @ 0x7
; CHECK-NEXT:    .short 65535 @ 0xffff
; CHECK-NEXT:    .short 1 @ 0x1
; CHECK-NEXT:    .short 0 @ 0x0
; CHECK-NEXT:    .short 7 @ 0x7
entry:
  %z = fadd <8 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <8 x half> %z to <8 x i16>
  %w = add <8 x i16> %y, <i16 -1, i16 1, i16 0, i16 7, i16 -1, i16 1, i16 0, i16 7>
  store <8 x i16> %w, <8 x i16>* %store
  ret void
}
687
; Bitcast <8 x half> -> <16 x i8>; the i8 addend is an all-lanes splat of 1,
; matching the `vmov.i8 q9, #0x1` in the checks.
define void @conv_v8f16_to_v8f8( <8 x half> %a, <16 x i8>* %store ) {
; CHECK-LABEL: conv_v8f16_to_v8f8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI23_0
; CHECK-NEXT:    vrev64.16 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vmov.i8 q9, #0x1
; CHECK-NEXT:    vrev16.8 q8, q8
; CHECK-NEXT:    vadd.i8 q8, q8, q9
; CHECK-NEXT:    vrev64.8 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI23_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
entry:
  %z = fadd <8 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <8 x half> %z to <16 x i8>
  %w = add <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store <16 x i8> %w, <16 x i8>* %store
  ret void
}
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=armv8a -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=aapcs-gnu -float-abi=soft -o - %s | FileCheck %s --check-prefix=SOFT
; RUN: llc -mtriple=armv8a -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=aapcs-gnu -float-abi=hard -o - %s | FileCheck %s --check-prefix=HARD
; RUN: llc -mtriple=armeb-eabi -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=aapcs-gnu -float-abi=soft -o - %s | FileCheck %s --check-prefix=SOFTEB
; RUN: llc -mtriple=armeb-eabi -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=aapcs-gnu -float-abi=hard -o - %s | FileCheck %s --check-prefix=HARDEB
55
66 declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
77 declare <8 x half> @llvm.fabs.v8f16(<8 x half>)