| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_NEON_H | ||
| 6 | #define KLEIDICV_NEON_H | ||
| 7 | |||
| 8 | #include <utility> | ||
| 9 | |||
| 10 | #include "kleidicv/neon_intrinsics.h" | ||
| 11 | #include "kleidicv/operations.h" | ||
| 12 | #include "kleidicv/utils.h" | ||
| 13 | |||
| 14 | namespace kleidicv::neon { | ||
| 15 | |||
| 16 | template <> | ||
| 17 | class half_element_width<uint16x8_t> { | ||
| 18 | public: | ||
| 19 | using type = uint8x16_t; | ||
| 20 | }; | ||
| 21 | |||
| 22 | template <> | ||
| 23 | class half_element_width<uint32x4_t> { | ||
| 24 | public: | ||
| 25 | using type = uint16x8_t; | ||
| 26 | }; | ||
| 27 | |||
| 28 | template <> | ||
| 29 | class half_element_width<uint64x2_t> { | ||
| 30 | public: | ||
| 31 | using type = uint32x4_t; | ||
| 32 | }; | ||
| 33 | |||
| 34 | template <> | ||
| 35 | class double_element_width<uint8x16_t> { | ||
| 36 | public: | ||
| 37 | using type = uint16x8_t; | ||
| 38 | }; | ||
| 39 | |||
| 40 | template <> | ||
| 41 | class double_element_width<uint16x8_t> { | ||
| 42 | public: | ||
| 43 | using type = uint32x4_t; | ||
| 44 | }; | ||
| 45 | |||
| 46 | template <> | ||
| 47 | class double_element_width<uint32x4_t> { | ||
| 48 | public: | ||
| 49 | using type = uint64x2_t; | ||
| 50 | }; | ||
| 51 | |||
| 52 | // Primary template to describe logically grouped properties of vectors. | ||
| 53 | template <typename ScalarType> | ||
| 54 | class VectorTypes; | ||
| 55 | |||
| 56 | template <> | ||
| 57 | class VectorTypes<int8_t> { | ||
| 58 | public: | ||
| 59 | using ScalarType = int8_t; | ||
| 60 | using VectorType = int8x16_t; | ||
| 61 | using Vector2Type = int8x16x2_t; | ||
| 62 | using Vector3Type = int8x16x3_t; | ||
| 63 | using Vector4Type = int8x16x4_t; | ||
| 64 | }; // end of class VectorTypes<int8_t> | ||
| 65 | |||
| 66 | template <> | ||
| 67 | class VectorTypes<uint8_t> { | ||
| 68 | public: | ||
| 69 | using ScalarType = uint8_t; | ||
| 70 | using VectorType = uint8x16_t; | ||
| 71 | using Vector2Type = uint8x16x2_t; | ||
| 72 | using Vector3Type = uint8x16x3_t; | ||
| 73 | using Vector4Type = uint8x16x4_t; | ||
| 74 | }; // end of class VectorTypes<uint8_t> | ||
| 75 | |||
| 76 | template <> | ||
| 77 | class VectorTypes<int16_t> { | ||
| 78 | public: | ||
| 79 | using ScalarType = int16_t; | ||
| 80 | using VectorType = int16x8_t; | ||
| 81 | using Vector2Type = int16x8x2_t; | ||
| 82 | using Vector3Type = int16x8x3_t; | ||
| 83 | using Vector4Type = int16x8x4_t; | ||
| 84 | }; // end of class VectorTypes<int16_t> | ||
| 85 | |||
| 86 | template <> | ||
| 87 | class VectorTypes<uint16_t> { | ||
| 88 | public: | ||
| 89 | using ScalarType = uint16_t; | ||
| 90 | using VectorType = uint16x8_t; | ||
| 91 | using Vector2Type = uint16x8x2_t; | ||
| 92 | using Vector3Type = uint16x8x3_t; | ||
| 93 | using Vector4Type = uint16x8x4_t; | ||
| 94 | }; // end of class VectorTypes<uint16_t> | ||
| 95 | |||
| 96 | template <> | ||
| 97 | class VectorTypes<int32_t> { | ||
| 98 | public: | ||
| 99 | using ScalarType = int32_t; | ||
| 100 | using VectorType = int32x4_t; | ||
| 101 | using Vector2Type = int32x4x2_t; | ||
| 102 | using Vector3Type = int32x4x3_t; | ||
| 103 | using Vector4Type = int32x4x4_t; | ||
| 104 | }; // end of class VectorTypes<int32_t> | ||
| 105 | |||
| 106 | template <> | ||
| 107 | class VectorTypes<uint32_t> { | ||
| 108 | public: | ||
| 109 | using ScalarType = uint32_t; | ||
| 110 | using VectorType = uint32x4_t; | ||
| 111 | using Vector2Type = uint32x4x2_t; | ||
| 112 | using Vector3Type = uint32x4x3_t; | ||
| 113 | using Vector4Type = uint32x4x4_t; | ||
| 114 | }; // end of class VectorTypes<uint32_t> | ||
| 115 | |||
| 116 | template <> | ||
| 117 | class VectorTypes<int64_t> { | ||
| 118 | public: | ||
| 119 | using ScalarType = int64_t; | ||
| 120 | using VectorType = int64x2_t; | ||
| 121 | using Vector2Type = int64x2x2_t; | ||
| 122 | using Vector3Type = int64x2x3_t; | ||
| 123 | using Vector4Type = int64x2x4_t; | ||
| 124 | }; // end of class VectorTypes<int64_t> | ||
| 125 | |||
| 126 | template <> | ||
| 127 | class VectorTypes<uint64_t> { | ||
| 128 | public: | ||
| 129 | using ScalarType = uint64_t; | ||
| 130 | using VectorType = uint64x2_t; | ||
| 131 | using Vector2Type = uint64x2x2_t; | ||
| 132 | using Vector3Type = uint64x2x3_t; | ||
| 133 | using Vector4Type = uint64x2x4_t; | ||
| 134 | }; // end of class VectorTypes<uint64_t> | ||
| 135 | |||
| 136 | template <> | ||
| 137 | class VectorTypes<float> { | ||
| 138 | public: | ||
| 139 | using ScalarType = float; | ||
| 140 | using VectorType = float32x4_t; | ||
| 141 | using Vector2Type = float32x4x2_t; | ||
| 142 | using Vector3Type = float32x4x3_t; | ||
| 143 | using Vector4Type = float32x4x4_t; | ||
| 144 | }; // end of class VectorTypes<float> | ||
| 145 | |||
| 146 | template <> | ||
| 147 | class VectorTypes<double> { | ||
| 148 | public: | ||
| 149 | using ScalarType = double; | ||
| 150 | using VectorType = float64x2_t; | ||
| 151 | using Vector2Type = float64x2x2_t; | ||
| 152 | using Vector3Type = float64x2x3_t; | ||
| 153 | using Vector4Type = float64x2x4_t; | ||
| 154 | }; // end of class VectorTypes<double> | ||
| 155 | |||
| 156 | template <> | ||
| 157 | class VectorTypes<float16_t> { | ||
| 158 | public: | ||
| 159 | using ScalarType = float16_t; | ||
| 160 | using VectorType = float16x8_t; | ||
| 161 | using Vector2Type = float16x8x2_t; | ||
| 162 | using Vector3Type = float16x8x3_t; | ||
| 163 | using Vector4Type = float16x8x4_t; | ||
| 164 | }; // end of class VectorTypes<float16_t> | ||
| 165 | |||
| 166 | // NEON vector length in bytes. | ||
| 167 | static constexpr size_t kVectorLength = 16; | ||
| 168 | |||
| 169 | // Base class for all NEON vector traits. | ||
| 170 | template <typename ScalarType> | ||
| 171 | class VecTraitsBase : public VectorTypes<ScalarType> { | ||
| 172 | public: | ||
| 173 | using typename VectorTypes<ScalarType>::VectorType; | ||
| 174 | using typename VectorTypes<ScalarType>::Vector2Type; | ||
| 175 | using typename VectorTypes<ScalarType>::Vector3Type; | ||
| 176 | using typename VectorTypes<ScalarType>::Vector4Type; | ||
| 177 | |||
| 178 | // Number of lanes in a vector. | ||
| 179 | 621408 | static constexpr size_t num_lanes() { | |
| 180 | 621408 | return kVectorLength / sizeof(ScalarType); | |
| 181 | } | ||
| 182 | |||
| 183 | // Maximum number of lanes in a vector. | ||
| 184 | static constexpr size_t max_num_lanes() { return num_lanes(); } | ||
| 185 | |||
| 186 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 187 | |||
| 188 | private: | ||
| 189 | static inline int8x16x2_t vld1q_x2(const int8_t *src) { | ||
| 190 | return vld1q_s8_x2(src); | ||
| 191 | } | ||
| 192 | |||
| 193 | 3892 | static inline uint8x16x2_t vld1q_x2(const uint8_t *src) { | |
| 194 | 3892 | return vld1q_u8_x2(src); | |
| 195 | } | ||
| 196 | |||
| 197 | 4728 | static inline int16x8x2_t vld1q_x2(const int16_t *src) { | |
| 198 | 4728 | return vld1q_s16_x2(src); | |
| 199 | } | ||
| 200 | |||
| 201 | 568 | static inline uint16x8x2_t vld1q_x2(const uint16_t *src) { | |
| 202 | 568 | return vld1q_u16_x2(src); | |
| 203 | } | ||
| 204 | |||
| 205 | static inline int32x4x2_t vld1q_x2(const int32_t *src) { | ||
| 206 | return vld1q_s32_x2(src); | ||
| 207 | } | ||
| 208 | |||
| 209 | 400 | static inline uint32x4x2_t vld1q_x2(const uint32_t *src) { | |
| 210 | 400 | return vld1q_u32_x2(src); | |
| 211 | } | ||
| 212 | |||
| 213 | static inline int64x2x2_t vld1q_x2(const int64_t *src) { | ||
| 214 | return vld1q_s64_x2(src); | ||
| 215 | } | ||
| 216 | |||
| 217 | 400 | static inline uint64x2x2_t vld1q_x2(const uint64_t *src) { | |
| 218 | 400 | return vld1q_u64_x2(src); | |
| 219 | } | ||
| 220 | |||
| 221 | static inline float32x4x2_t vld1q_x2(const float32_t *src) { | ||
| 222 | return vld1q_f32_x2(src); | ||
| 223 | } | ||
| 224 | |||
| 225 | static inline int8x16x3_t vld1q_x3(const int8_t *src) { | ||
| 226 | return vld1q_s8_x3(src); | ||
| 227 | } | ||
| 228 | |||
| 229 | 2103 | static inline uint8x16x3_t vld1q_x3(const uint8_t *src) { | |
| 230 | 2103 | return vld1q_u8_x3(src); | |
| 231 | } | ||
| 232 | |||
| 233 | static inline int16x8x3_t vld1q_x3(const int16_t *src) { | ||
| 234 | return vld1q_s16_x3(src); | ||
| 235 | } | ||
| 236 | |||
| 237 | 720 | static inline uint16x8x3_t vld1q_x3(const uint16_t *src) { | |
| 238 | 720 | return vld1q_u16_x3(src); | |
| 239 | } | ||
| 240 | |||
| 241 | static inline int32x4x3_t vld1q_x3(const int32_t *src) { | ||
| 242 | return vld1q_s32_x3(src); | ||
| 243 | } | ||
| 244 | |||
| 245 | 720 | static inline uint32x4x3_t vld1q_x3(const uint32_t *src) { | |
| 246 | 720 | return vld1q_u32_x3(src); | |
| 247 | } | ||
| 248 | |||
| 249 | static inline int64x2x3_t vld1q_x3(const int64_t *src) { | ||
| 250 | return vld1q_s64_x3(src); | ||
| 251 | } | ||
| 252 | |||
| 253 | 720 | static inline uint64x2x3_t vld1q_x3(const uint64_t *src) { | |
| 254 | 720 | return vld1q_u64_x3(src); | |
| 255 | } | ||
| 256 | |||
| 257 | static inline float32x4x3_t vld1q_x3(const float32_t *src) { | ||
| 258 | return vld1q_f32_x3(src); | ||
| 259 | } | ||
| 260 | |||
| 261 | 884 | static inline int8x16x4_t vld1q_x4(const int8_t *src) { | |
| 262 | 884 | return vld1q_s8_x4(src); | |
| 263 | } | ||
| 264 | |||
| 265 | 2207 | static inline uint8x16x4_t vld1q_x4(const uint8_t *src) { | |
| 266 | 2207 | return vld1q_u8_x4(src); | |
| 267 | } | ||
| 268 | |||
| 269 | static inline int16x8x4_t vld1q_x4(const int16_t *src) { | ||
| 270 | return vld1q_s16_x4(src); | ||
| 271 | } | ||
| 272 | |||
| 273 | 720 | static inline uint16x8x4_t vld1q_x4(const uint16_t *src) { | |
| 274 | 720 | return vld1q_u16_x4(src); | |
| 275 | } | ||
| 276 | |||
| 277 | static inline int32x4x4_t vld1q_x4(const int32_t *src) { | ||
| 278 | return vld1q_s32_x4(src); | ||
| 279 | } | ||
| 280 | |||
| 281 | 720 | static inline uint32x4x4_t vld1q_x4(const uint32_t *src) { | |
| 282 | 720 | return vld1q_u32_x4(src); | |
| 283 | } | ||
| 284 | |||
| 285 | static inline int64x2x4_t vld1q_x4(const int64_t *src) { | ||
| 286 | return vld1q_s64_x4(src); | ||
| 287 | } | ||
| 288 | |||
| 289 | 720 | static inline uint64x2x4_t vld1q_x4(const uint64_t *src) { | |
| 290 | 720 | return vld1q_u64_x4(src); | |
| 291 | } | ||
| 292 | |||
| 293 | 457 | static inline float32x4x4_t vld1q_x4(const float32_t *src) { | |
| 294 | 457 | return vld1q_f32_x4(src); | |
| 295 | } | ||
| 296 | |||
| 297 | static inline void vst1q_x2(int8_t *dst, int8x16x2_t vec) { | ||
| 298 | vst1q_s8_x2(dst, vec); | ||
| 299 | } | ||
| 300 | |||
| 301 | 13871 | static inline void vst1q_x2(uint8_t *dst, uint8x16x2_t vec) { | |
| 302 | 13871 | vst1q_u8_x2(dst, vec); | |
| 303 | 13871 | } | |
| 304 | |||
| 305 | static inline void vst1q_x2(int16_t *dst, int16x8x2_t vec) { | ||
| 306 | vst1q_s16_x2(dst, vec); | ||
| 307 | } | ||
| 308 | |||
| 309 | 3320 | static inline void vst1q_x2(uint16_t *dst, uint16x8x2_t vec) { | |
| 310 | 3320 | vst1q_u16_x2(dst, vec); | |
| 311 | 3320 | } | |
| 312 | |||
| 313 | static inline void vst1q_x2(int32_t *dst, int32x4x2_t vec) { | ||
| 314 | vst1q_s32_x2(dst, vec); | ||
| 315 | } | ||
| 316 | |||
| 317 | 3320 | static inline void vst1q_x2(uint32_t *dst, uint32x4x2_t vec) { | |
| 318 | 3320 | vst1q_u32_x2(dst, vec); | |
| 319 | 3320 | } | |
| 320 | |||
| 321 | static inline void vst1q_x2(int64_t *dst, int64x2x2_t vec) { | ||
| 322 | vst1q_s64_x2(dst, vec); | ||
| 323 | } | ||
| 324 | |||
| 325 | 3320 | static inline void vst1q_x2(uint64_t *dst, uint64x2x2_t vec) { | |
| 326 | 3320 | vst1q_u64_x2(dst, vec); | |
| 327 | 3320 | } | |
| 328 | |||
| 329 | 73024 | static inline void vst1q_x2(float32_t *dst, float32x4x2_t vec) { | |
| 330 | 73024 | vst1q_f32_x2(dst, vec); | |
| 331 | 73024 | } | |
| 332 | |||
| 333 | static inline void vst1q_x2(float16_t *dst, float16x8x2_t vec) { | ||
| 334 | vst1q_f16_x2(dst, vec); | ||
| 335 | } | ||
| 336 | |||
| 337 | static inline void vst1q_x3(int8_t *dst, int8x16x3_t vec) { | ||
| 338 | vst1q_s8_x3(dst, vec); | ||
| 339 | } | ||
| 340 | |||
| 341 | 1628 | static inline void vst1q_x3(uint8_t *dst, uint8x16x3_t vec) { | |
| 342 | 1628 | vst1q_u8_x3(dst, vec); | |
| 343 | 1628 | } | |
| 344 | |||
| 345 | static inline void vst1q_x3(int16_t *dst, int16x8x3_t vec) { | ||
| 346 | vst1q_s16_x3(dst, vec); | ||
| 347 | } | ||
| 348 | |||
| 349 | 720 | static inline void vst1q_x3(uint16_t *dst, uint16x8x3_t vec) { | |
| 350 | 720 | vst1q_u16_x3(dst, vec); | |
| 351 | 720 | } | |
| 352 | |||
| 353 | static inline void vst1q_x3(int32_t *dst, int32x4x3_t vec) { | ||
| 354 | vst1q_s32_x3(dst, vec); | ||
| 355 | } | ||
| 356 | |||
| 357 | static inline void vst1q_x3(uint32_t *dst, uint32x4x3_t vec) { | ||
| 358 | vst1q_u32_x3(dst, vec); | ||
| 359 | } | ||
| 360 | |||
| 361 | static inline void vst1q_x3(int64_t *dst, int64x2x3_t vec) { | ||
| 362 | vst1q_s64_x3(dst, vec); | ||
| 363 | } | ||
| 364 | |||
| 365 | 720 | static inline void vst1q_x3(uint64_t *dst, uint64x2x3_t vec) { | |
| 366 | 720 | vst1q_u64_x3(dst, vec); | |
| 367 | 720 | } | |
| 368 | |||
| 369 | static inline void vst1q_x3(float32_t *dst, float32x4x3_t vec) { | ||
| 370 | vst1q_f32_x3(dst, vec); | ||
| 371 | } | ||
| 372 | |||
| 373 | static inline void vst1q_x3(float16_t *dst, float16x8x3_t vec) { | ||
| 374 | vst1q_f16_x3(dst, vec); | ||
| 375 | } | ||
| 376 | |||
| 377 | static inline void vst1q_x4(int8_t *dst, int8x16x4_t vec) { | ||
| 378 | vst1q_s8_x4(dst, vec); | ||
| 379 | } | ||
| 380 | |||
| 381 | 908 | static inline void vst1q_x4(uint8_t *dst, uint8x16x4_t vec) { | |
| 382 | 908 | vst1q_u8_x4(dst, vec); | |
| 383 | 908 | } | |
| 384 | |||
| 385 | static inline void vst1q_x4(int16_t *dst, int16x8x4_t vec) { | ||
| 386 | vst1q_s16_x4(dst, vec); | ||
| 387 | } | ||
| 388 | |||
| 389 | 11304 | static inline void vst1q_x4(uint16_t *dst, uint16x8x4_t vec) { | |
| 390 | 11304 | vst1q_u16_x4(dst, vec); | |
| 391 | 11304 | } | |
| 392 | |||
| 393 | static inline void vst1q_x4(int32_t *dst, int32x4x4_t vec) { | ||
| 394 | vst1q_s32_x4(dst, vec); | ||
| 395 | } | ||
| 396 | |||
| 397 | 1360 | static inline void vst1q_x4(uint32_t *dst, uint32x4x4_t vec) { | |
| 398 | 1360 | vst1q_u32_x4(dst, vec); | |
| 399 | 1360 | } | |
| 400 | |||
| 401 | static inline void vst1q_x4(int64_t *dst, int64x2x4_t vec) { | ||
| 402 | vst1q_s64_x4(dst, vec); | ||
| 403 | } | ||
| 404 | |||
| 405 | 2720 | static inline void vst1q_x4(uint64_t *dst, uint64x2x4_t vec) { | |
| 406 | 2720 | vst1q_u64_x4(dst, vec); | |
| 407 | 2720 | } | |
| 408 | |||
| 409 | 312 | static inline void vst1q_x4(float32_t *dst, float32x4x4_t vec) { | |
| 410 | 312 | vst1q_f32_x4(dst, vec); | |
| 411 | 312 | } | |
| 412 | |||
| 413 | 480 | static inline void vst1q_x4(float16_t *dst, float16x8x4_t vec) { | |
| 414 | 480 | vst1q_f16_x4(dst, vec); | |
| 415 | 480 | } | |
| 416 | |||
| 417 | public: | ||
| 418 | #endif | ||
| 419 | |||
| 420 | // Loads a single vector from 'src'. | ||
| 421 | 4264 | static inline void load(const ScalarType *src, VectorType &vec) { | |
| 422 | 4264 | vec = vld1q(&src[0]); | |
| 423 | 4264 | } | |
| 424 | |||
| 425 | // Loads two consecutive vectors from 'src'. | ||
| 426 | 8388 | static inline void load(const ScalarType *src, Vector2Type &vec) { | |
| 427 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 428 | 8388 | vec = vld1q_x2(&src[0]); | |
| 429 | #else | ||
| 430 | vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes())}; | ||
| 431 | #endif | ||
| 432 | 8388 | } | |
| 433 | |||
| 434 | // Loads three consecutive vectors from 'src'. | ||
| 435 | 1383 | static inline void load(const ScalarType *src, Vector3Type &vec) { | |
| 436 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 437 | 1383 | vec = vld1q_x3(&src[0]); | |
| 438 | #else | ||
| 439 | vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), | ||
| 440 | vld1q(&src[0] + (2 * num_lanes()))}; | ||
| 441 | #endif | ||
| 442 | 1383 | } | |
| 443 | |||
| 444 | // Loads four consecutive vectors from 'src'. | ||
| 445 | 2828 | static inline void load(const ScalarType *src, Vector4Type &vec) { | |
| 446 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 447 | 2828 | vec = vld1q_x4(&src[0]); | |
| 448 | #else | ||
| 449 | vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), | ||
| 450 | vld1q(&src[0] + (2 * num_lanes())), | ||
| 451 | vld1q(&src[0] + (3 * num_lanes()))}; | ||
| 452 | #endif | ||
| 453 | 2828 | } | |
| 454 | |||
| 455 | // Loads two consecutive vectors from 'src'. | ||
| 456 | 490235 | static inline void load_consecutive(const ScalarType *src, VectorType &vec_0, | |
| 457 | VectorType &vec_1) { | ||
| 458 | 490235 | vec_0 = vld1q(&src[0]); | |
| 459 | 490235 | vec_1 = vld1q(&src[num_lanes()]); | |
| 460 | 490235 | } | |
| 461 | |||
| 462 | // Loads 2x2 consecutive vectors from 'src'. | ||
| 463 | 800 | static inline void load_consecutive(const ScalarType *src, Vector2Type &vec_0, | |
| 464 | Vector2Type &vec_1) { | ||
| 465 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 466 | 800 | vec_0 = vld1q_x2(&src[0]); | |
| 467 | 800 | vec_1 = vld1q_x2(&src[num_lanes() * 2]); | |
| 468 | #else | ||
| 469 | vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes())}; | ||
| 470 | vec_1 = {vld1q(&src[num_lanes() * 2]), | ||
| 471 | vld1q(&src[num_lanes() * 2] + num_lanes())}; | ||
| 472 | #endif | ||
| 473 | 800 | } | |
| 474 | |||
| 475 | // Loads 2x3 consecutive vectors from 'src'. | ||
| 476 | 1440 | static inline void load_consecutive(const ScalarType *src, Vector3Type &vec_0, | |
| 477 | Vector3Type &vec_1) { | ||
| 478 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 479 | 1440 | vec_0 = vld1q_x3(&src[0]); | |
| 480 | 1440 | vec_1 = vld1q_x3(&src[num_lanes() * 3]); | |
| 481 | #else | ||
| 482 | vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), | ||
| 483 | vld1q(&src[0] + (2 * num_lanes()))}; | ||
| 484 | vec_1 = {vld1q(&src[num_lanes() * 3]), | ||
| 485 | vld1q(&src[num_lanes() * 3] + num_lanes()), | ||
| 486 | vld1q(&src[num_lanes() * 3] + (2 * num_lanes()))}; | ||
| 487 | #endif | ||
| 488 | 1440 | } | |
| 489 | |||
| 490 | // Loads 2x4 consecutive vectors from 'src'. | ||
| 491 | 1440 | static inline void load_consecutive(const ScalarType *src, Vector4Type &vec_0, | |
| 492 | Vector4Type &vec_1) { | ||
| 493 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 494 | 1440 | vec_0 = vld1q_x4(&src[0]); | |
| 495 | 1440 | vec_1 = vld1q_x4(&src[num_lanes() * 4]); | |
| 496 | |||
| 497 | #else | ||
| 498 | vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), | ||
| 499 | vld1q(&src[0] + (2 * num_lanes())), | ||
| 500 | vld1q(&src[0] + (3 * num_lanes()))}; | ||
| 501 | vec_1 = {vld1q(&src[num_lanes() * 4]), | ||
| 502 | vld1q(&src[num_lanes() * 4] + num_lanes()), | ||
| 503 | vld1q(&src[num_lanes() * 4] + (2 * num_lanes())), | ||
| 504 | vld1q(&src[num_lanes() * 4] + (3 * num_lanes()))}; | ||
| 505 | #endif | ||
| 506 | 1440 | } | |
| 507 | |||
| 508 | // Stores a single vector to 'dst'. | ||
| 509 | 12166 | static inline void store(VectorType vec, ScalarType *dst) { | |
| 510 | 12166 | vst1q(&dst[0], vec); | |
| 511 | 12166 | } | |
| 512 | |||
| 513 | // Stores two consecutive vectors to 'dst'. | ||
| 514 | 96855 | static inline void store(Vector2Type vec, ScalarType *dst) { | |
| 515 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 516 | 96855 | vst1q_x2(&dst[0], vec); | |
| 517 | #else | ||
| 518 | vst1q(&dst[0], vec.val[0]); | ||
| 519 | vst1q(&dst[0] + num_lanes(), vec.val[1]); | ||
| 520 | #endif | ||
| 521 | 96855 | } | |
| 522 | |||
| 523 | // Stores three consecutive vectors to 'dst'. | ||
| 524 | 3068 | static inline void store(Vector3Type vec, ScalarType *dst) { | |
| 525 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 526 | 3068 | vst1q_x3(&dst[0], vec); | |
| 527 | #else | ||
| 528 | vst1q(&dst[0], vec.val[0]); | ||
| 529 | vst1q(&dst[0] + num_lanes(), vec.val[1]); | ||
| 530 | vst1q(&dst[0] + (2 * num_lanes()), vec.val[2]); | ||
| 531 | #endif | ||
| 532 | 3068 | } | |
| 533 | |||
| 534 | // Stores four consecutive vectors to 'dst'. | ||
| 535 | 17084 | static inline void store(Vector4Type vec, ScalarType *dst) { | |
| 536 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 537 | 17084 | vst1q_x4(&dst[0], vec); | |
| 538 | #else | ||
| 539 | vst1q(&dst[0], vec.val[0]); | ||
| 540 | vst1q(&dst[0] + num_lanes(), vec.val[1]); | ||
| 541 | vst1q(&dst[0] + (2 * num_lanes()), vec.val[2]); | ||
| 542 | vst1q(&dst[0] + (3 * num_lanes()), vec.val[3]); | ||
| 543 | #endif | ||
| 544 | 17084 | } | |
| 545 | |||
| 546 | // Stores two consecutive vectors to 'dst'. | ||
| 547 | 28627 | static inline void store_consecutive(VectorType vec_0, VectorType vec_1, | |
| 548 | ScalarType *dst) { | ||
| 549 | 28627 | vst1q(&dst[0], vec_0); | |
| 550 | 28627 | vst1q(&dst[num_lanes()], vec_1); | |
| 551 | 28627 | } | |
| 552 | }; // end of class VecTraitsBase<ScalarType> | ||
| 553 | |||
| 554 | // Available NEON vector traits. | ||
| 555 | template <typename ScalarType> | ||
| 556 | class VecTraits : public VecTraitsBase<ScalarType> {}; | ||
| 557 | |||
| 558 | // NEON has no associated context yet. | ||
| 559 | using NeonContextType = Monostate; | ||
| 560 | |||
| 561 | // Adapter which simply adds context and forwards all arguments. | ||
| 562 | template <typename OperationType> | ||
| 563 | class OperationContextAdapter : public OperationBase<OperationType> { | ||
| 564 | // Shorten rows: no need to write 'this->'. | ||
| 565 | using OperationBase<OperationType>::operation; | ||
| 566 | |||
| 567 | public: | ||
| 568 | using ContextType = NeonContextType; | ||
| 569 | |||
| 570 | 7255 | explicit OperationContextAdapter(OperationType &operation) | |
| 571 | 7255 | : OperationBase<OperationType>(operation) {} | |
| 572 | |||
| 573 | // Forwards vector_path_2x() calls to the inner operation. | ||
| 574 | template <typename... ArgTypes> | ||
| 575 | 462471 | void vector_path_2x(ArgTypes &&...args) { | |
| 576 | 462471 | operation().vector_path_2x(ContextType{}, std::forward<ArgTypes>(args)...); | |
| 577 | 462471 | } | |
| 578 | |||
| 579 | // Forwards vector_path() calls to the inner operation. | ||
| 580 | template <typename... ArgTypes> | ||
| 581 | 7762 | void vector_path(ArgTypes &&...args) { | |
| 582 | 7762 | operation().vector_path(ContextType{}, std::forward<ArgTypes>(args)...); | |
| 583 | 7762 | } | |
| 584 | |||
| 585 | // Forwards remaining_path() calls to the inner operation. | ||
| 586 | template <typename... ArgTypes> | ||
| 587 | 8596 | void remaining_path(ArgTypes &&...args) { | |
| 588 | 8596 | operation().remaining_path(ContextType{}, std::forward<ArgTypes>(args)...); | |
| 589 | 8596 | } | |
| 590 | }; // end of class OperationContextAdapter<OperationType> | ||
| 591 | |||
| 592 | // Adapter which implements remaining_path() for general NEON operations. | ||
| 593 | template <typename OperationType> | ||
| 594 | class RemainingPathAdapter : public OperationBase<OperationType> { | ||
| 595 | public: | ||
| 596 | using ContextType = NeonContextType; | ||
| 597 | |||
| 598 | 6951 | explicit RemainingPathAdapter(OperationType &operation) | |
| 599 | 6951 | : OperationBase<OperationType>(operation) {} | |
| 600 | |||
| 601 | // Forwards remaining_path() calls to scalar_path() of the inner operation | ||
| 602 | // element by element. | ||
| 603 | template <typename... ColumnTypes> | ||
| 604 | 7940 | void remaining_path(ContextType ctx, size_t length, ColumnTypes... columns) { | |
| 605 |
24/24✓ Branch 0 taken 2068 times.
✓ Branch 1 taken 33794 times.
✓ Branch 2 taken 990 times.
✓ Branch 3 taken 11388 times.
✓ Branch 4 taken 957 times.
✓ Branch 5 taken 4668 times.
✓ Branch 6 taken 664 times.
✓ Branch 7 taken 3983 times.
✓ Branch 8 taken 764 times.
✓ Branch 9 taken 2353 times.
✓ Branch 10 taken 705 times.
✓ Branch 11 taken 1596 times.
✓ Branch 12 taken 312 times.
✓ Branch 13 taken 356 times.
✓ Branch 14 taken 360 times.
✓ Branch 15 taken 400 times.
✓ Branch 16 taken 376 times.
✓ Branch 17 taken 384 times.
✓ Branch 18 taken 120 times.
✓ Branch 19 taken 128 times.
✓ Branch 20 taken 248 times.
✓ Branch 21 taken 256 times.
✓ Branch 22 taken 376 times.
✓ Branch 23 taken 384 times.
|
67630 | for (size_t index = 0; index < length; ++index) { |
| 606 | 59690 | disable_loop_vectorization(); | |
| 607 | 59690 | this->operation().scalar_path(ctx, columns.at(index)...); | |
| 608 | 59690 | } | |
| 609 | 7940 | } | |
| 610 | }; // end of class RemainingPathAdapter<OperationType> | ||
| 611 | |||
| 612 | // Adapter which implements remaining_path() for NEON operations which | ||
| 613 | // implement custom processing of remaining elements. | ||
| 614 | template <typename OperationType> | ||
| 615 | class RemainingPathToScalarPathAdapter : public OperationBase<OperationType> { | ||
| 616 | public: | ||
| 617 | using ContextType = NeonContextType; | ||
| 618 | |||
| 619 | 304 | explicit RemainingPathToScalarPathAdapter(OperationType &operation) | |
| 620 | 304 | : OperationBase<OperationType>(operation) {} | |
| 621 | |||
| 622 | // Forwards remaining_path() calls to scalar_path() of the inner operation. | ||
| 623 | template <typename... ArgTypes> | ||
| 624 | 656 | void remaining_path(ArgTypes &&...args) { | |
| 625 | 656 | this->operation().scalar_path(std::forward<ArgTypes>(args)...); | |
| 626 | 656 | } | |
| 627 | }; // end of class RemainingPathToScalarPathAdapter<OperationType> | ||
| 628 | |||
| 629 | // Shorthand for applying a generic unrolled NEON operation. | ||
| 630 | template <typename OperationType, typename... ArgTypes> | ||
| 631 | 6327 | void apply_operation_by_rows(OperationType &operation, ArgTypes &&...args) { | |
| 632 | 6327 | RemoveContextAdapter remove_context_adapter{operation}; | |
| 633 | 6327 | OperationAdapter operation_adapter{remove_context_adapter}; | |
| 634 | 6327 | RemainingPathAdapter remaining_path_adapter{operation_adapter}; | |
| 635 | 6327 | OperationContextAdapter context_adapter{remaining_path_adapter}; | |
| 636 | 6327 | RowBasedOperation row_based_operation{context_adapter}; | |
| 637 | 6327 | zip_rows(row_based_operation, std::forward<ArgTypes>(args)...); | |
| 638 | 6327 | } | |
| 639 | |||
| 640 | // Shorthand for applying a generic unrolled and block-based NEON operation. | ||
| 641 | template <typename OperationType, typename... ArgTypes> | ||
| 642 | 624 | void apply_block_operation_by_rows(OperationType &operation, | |
| 643 | ArgTypes &&...args) { | ||
| 644 | 624 | RemoveContextAdapter remove_context_adapter{operation}; | |
| 645 | 624 | OperationAdapter operation_adapter{remove_context_adapter}; | |
| 646 | 624 | RemainingPathAdapter remaining_path_adapter{operation_adapter}; | |
| 647 | 624 | OperationContextAdapter context_adapter{remaining_path_adapter}; | |
| 648 | 624 | RowBasedBlockOperation block_operation{context_adapter}; | |
| 649 | 624 | zip_rows(block_operation, std::forward<ArgTypes>(args)...); | |
| 650 | 624 | } | |
| 651 | |||
| 652 | } // namespace kleidicv::neon | ||
| 653 | |||
| 654 | #endif // KLEIDICV_NEON_H | ||
| 655 |