| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_SVE2_H | ||
| 6 | #define KLEIDICV_SVE2_H | ||
| 7 | |||
| 8 | #include <arm_sve.h> | ||
| 9 | |||
| 10 | #include <utility> | ||
| 11 | |||
| 12 | #include "kleidicv/operations.h" | ||
| 13 | #include "kleidicv/utils.h" | ||
| 14 | |||
| 15 | // This code is shared by the SVE2 and SME builds; the namespace actually used is whatever KLEIDICV_TARGET_NAMESPACE expands to. | ||
| 16 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 17 | |||
| 18 | // Context passed to SVE operations; it carries the governing predicate. | ||
| 19 | class Context { | ||
| 20 | public: | ||
| 21 | 83463 | explicit Context(svbool_t &pg) KLEIDICV_STREAMING : pg_{pg} {} | |
| 22 | |||
| 23 | // Stores 'pg' into the predicate variable that this context refers to. | ||
| 24 | 109894 | void set_predicate(svbool_t pg) KLEIDICV_STREAMING { pg_ = pg; } | |
| 25 | |||
| 26 | // Returns the current value of the predicate that this context refers to. | ||
| 27 | 500619 | svbool_t predicate() const KLEIDICV_STREAMING { return pg_; } | |
| 28 | |||
| 29 | protected: | ||
| 30 | // Holds a reference to a caller-owned svbool_t because a sizeless type cannot be a data member; the referenced variable must stay alive while this context is used. | ||
| 31 | svbool_t &pg_; | ||
| 32 | }; // end of class Context | ||
| 33 | |||
| 34 | // Primary template (declared here, not defined) that groups the SVE types belonging to a scalar type. Each explicit specialization provides ScalarType, VectorType, Vector2Type, Vector3Type and Vector4Type. | ||
| 35 | template <typename ScalarType> | ||
| 36 | class VectorTypes; | ||
| 37 | |||
| 38 | template <> | ||
| 39 | class VectorTypes<int8_t> { | ||
| 40 | public: | ||
| 41 | using ScalarType = int8_t; | ||
| 42 | using VectorType = svint8_t; | ||
| 43 | using Vector2Type = svint8x2_t; | ||
| 44 | using Vector3Type = svint8x3_t; | ||
| 45 | using Vector4Type = svint8x4_t; | ||
| 46 | }; // end of class VectorTypes<int8_t> | ||
| 47 | |||
| 48 | template <> | ||
| 49 | class VectorTypes<uint8_t> { | ||
| 50 | public: | ||
| 51 | using ScalarType = uint8_t; | ||
| 52 | using VectorType = svuint8_t; | ||
| 53 | using Vector2Type = svuint8x2_t; | ||
| 54 | using Vector3Type = svuint8x3_t; | ||
| 55 | using Vector4Type = svuint8x4_t; | ||
| 56 | }; // end of class VectorTypes<uint8_t> | ||
| 57 | |||
| 58 | template <> | ||
| 59 | class VectorTypes<int16_t> { | ||
| 60 | public: | ||
| 61 | using ScalarType = int16_t; | ||
| 62 | using VectorType = svint16_t; | ||
| 63 | using Vector2Type = svint16x2_t; | ||
| 64 | using Vector3Type = svint16x3_t; | ||
| 65 | using Vector4Type = svint16x4_t; | ||
| 66 | }; // end of class VectorTypes<int16_t> | ||
| 67 | |||
| 68 | template <> | ||
| 69 | class VectorTypes<uint16_t> { | ||
| 70 | public: | ||
| 71 | using ScalarType = uint16_t; | ||
| 72 | using VectorType = svuint16_t; | ||
| 73 | using Vector2Type = svuint16x2_t; | ||
| 74 | using Vector3Type = svuint16x3_t; | ||
| 75 | using Vector4Type = svuint16x4_t; | ||
| 76 | }; // end of class VectorTypes<uint16_t> | ||
| 77 | |||
| 78 | template <> | ||
| 79 | class VectorTypes<int32_t> { | ||
| 80 | public: | ||
| 81 | using ScalarType = int32_t; | ||
| 82 | using VectorType = svint32_t; | ||
| 83 | using Vector2Type = svint32x2_t; | ||
| 84 | using Vector3Type = svint32x3_t; | ||
| 85 | using Vector4Type = svint32x4_t; | ||
| 86 | }; // end of class VectorTypes<int32_t> | ||
| 87 | |||
| 88 | template <> | ||
| 89 | class VectorTypes<uint32_t> { | ||
| 90 | public: | ||
| 91 | using ScalarType = uint32_t; | ||
| 92 | using VectorType = svuint32_t; | ||
| 93 | using Vector2Type = svuint32x2_t; | ||
| 94 | using Vector3Type = svuint32x3_t; | ||
| 95 | using Vector4Type = svuint32x4_t; | ||
| 96 | }; // end of class VectorTypes<uint32_t> | ||
| 97 | |||
| 98 | template <> | ||
| 99 | class VectorTypes<int64_t> { | ||
| 100 | public: | ||
| 101 | using ScalarType = int64_t; | ||
| 102 | using VectorType = svint64_t; | ||
| 103 | using Vector2Type = svint64x2_t; | ||
| 104 | using Vector3Type = svint64x3_t; | ||
| 105 | using Vector4Type = svint64x4_t; | ||
| 106 | }; // end of class VectorTypes<int64_t> | ||
| 107 | |||
| 108 | template <> | ||
| 109 | class VectorTypes<uint64_t> { | ||
| 110 | public: | ||
| 111 | using ScalarType = uint64_t; | ||
| 112 | using VectorType = svuint64_t; | ||
| 113 | using Vector2Type = svuint64x2_t; | ||
| 114 | using Vector3Type = svuint64x3_t; | ||
| 115 | using Vector4Type = svuint64x4_t; | ||
| 116 | }; // end of class VectorTypes<uint64_t> | ||
| 117 | |||
| 118 | template <> | ||
| 119 | class VectorTypes<float> { | ||
| 120 | public: | ||
| 121 | using ScalarType = float; | ||
| 122 | using VectorType = svfloat32_t; | ||
| 123 | using Vector2Type = svfloat32x2_t; | ||
| 124 | using Vector3Type = svfloat32x3_t; | ||
| 125 | using Vector4Type = svfloat32x4_t; | ||
| 126 | }; // end of class VectorTypes<float> | ||
| 127 | |||
| 128 | template <> | ||
| 129 | class VectorTypes<double> { | ||
| 130 | public: | ||
| 131 | using ScalarType = double; | ||
| 132 | using VectorType = svfloat64_t; | ||
| 133 | using Vector2Type = svfloat64x2_t; | ||
| 134 | using Vector3Type = svfloat64x3_t; | ||
| 135 | using Vector4Type = svfloat64x4_t; | ||
| 136 | }; // end of class VectorTypes<double> | ||
| 137 | |||
| 138 | template <> | ||
| 139 | class VectorTypes<float16_t> { | ||
| 140 | public: | ||
| 141 | using ScalarType = float16_t; | ||
| 142 | using VectorType = svfloat16_t; | ||
| 143 | using Vector2Type = svfloat16x2_t; | ||
| 144 | using Vector3Type = svfloat16x3_t; | ||
| 145 | using Vector4Type = svfloat16x4_t; | ||
| 146 | }; // end of class VectorTypes<float16_t> | ||
| 147 | |||
| 148 | // Base class for all SVE vector traits. Besides the load/store helpers it wraps svcnt, svcntp, svptrue, svptrue_pat and svwhilelt (plus svptrue_c on SME2 builds), choosing the intrinsic variant from sizeof(ScalarType). | ||
| 149 | template <typename ScalarType> | ||
| 150 | class VecTraitsBase : public VectorTypes<ScalarType> { | ||
| 151 | public: | ||
| 152 | using typename VectorTypes<ScalarType>::VectorType; | ||
| 153 | using typename VectorTypes<ScalarType>::Vector2Type; | ||
| 154 | |||
| 155 | // Number of lanes in a vector for this lane size, as returned by svcnt(). | ||
| 156 | 201667 | static inline size_t num_lanes() KLEIDICV_STREAMING { | |
| 157 | 201667 | return static_cast<size_t>(svcnt()); | |
| 158 | } | ||
| 159 | |||
| 160 | // Maximum number of lanes in a vector: 256 bytes divided by the lane size (NOTE(review): 256 is presumably the largest SVE vector length in bytes - confirm). | ||
| 161 | static constexpr size_t max_num_lanes() KLEIDICV_STREAMING { | ||
| 162 | return 256 / sizeof(ScalarType); | ||
| 163 | } | ||
| 164 | |||
| 165 | // Loads a single vector from 'src' using the predicate held by 'ctx'. | ||
| 166 | 30015 | static inline void load(Context ctx, const ScalarType *src, | |
| 167 | VectorType &vec) KLEIDICV_STREAMING { | ||
| 168 | 30015 | vec = svld1(ctx.predicate(), &src[0]); | |
| 169 | 30015 | } | |
| 170 | |||
| 171 | // Loads two consecutive vectors from 'src'. SME2 builds issue one two-vector load with an all-true predicate-as-counter; other builds issue two loads governed by the predicate held by 'ctx'. | ||
| 172 | 83995 | static inline void load_consecutive(Context ctx, const ScalarType *src, | |
| 173 | VectorType &vec_0, | ||
| 174 | VectorType &vec_1) KLEIDICV_STREAMING { | ||
| 175 | #if KLEIDICV_TARGET_SME2 | ||
| 176 | // Assuming that ctx contains a full predicate, since ctx is not consulted on this path. | ||
| 177 | (void)ctx; | ||
| 178 | 16391 | svcount_t p_counter = svptrue_c(); | |
| 179 | 16391 | Vector2Type v = svld1_x2(p_counter, &src[0]); | |
| 180 | 16391 | vec_0 = svget2(v, 0); | |
| 181 | 16391 | vec_1 = svget2(v, 1); | |
| 182 | #else | ||
| 183 | 67604 | vec_0 = svld1(ctx.predicate(), &src[0]); | |
| 184 | 67604 | vec_1 = svld1_vnum(ctx.predicate(), &src[0], 1); | |
| 185 | #endif | ||
| 186 | 83995 | } | |
| 187 | |||
| 188 | // Stores a single vector to 'dst' using the predicate held by 'ctx'. | ||
| 189 | 17066 | static inline void store(Context ctx, VectorType vec, | |
| 190 | ScalarType *dst) KLEIDICV_STREAMING { | ||
| 191 | 17066 | svst1(ctx.predicate(), &dst[0], vec); | |
| 192 | 17066 | } | |
| 193 | |||
| 194 | // Stores two consecutive vectors to 'dst'. SME2 builds issue one two-vector store with an all-true predicate-as-counter; other builds issue two stores governed by the predicate held by 'ctx'. | ||
| 195 | 45597 | static inline void store_consecutive(Context ctx, VectorType vec_0, | |
| 196 | VectorType vec_1, | ||
| 197 | ScalarType *dst) KLEIDICV_STREAMING { | ||
| 198 | #if KLEIDICV_TARGET_SME2 | ||
| 199 | // Assuming that ctx contains a full predicate, since ctx is not consulted on this path. | ||
| 200 | (void)ctx; | ||
| 201 | 8648 | svcount_t p_counter = svptrue_c(); | |
| 202 | 8648 | Vector2Type v = svcreate2(vec_0, vec_1); | |
| 203 | 8648 | svst1(p_counter, &dst[0], v); | |
| 204 | #else | ||
| 205 | 36949 | svst1(ctx.predicate(), &dst[0], vec_0); | |
| 206 | 36949 | svst1_vnum(ctx.predicate(), &dst[0], 1, vec_1); | |
| 207 | #endif | ||
| 208 | 45597 | } | |
| 209 | |||
| 210 | template <typename T = ScalarType> | ||
| 211 | 116555 | static std::enable_if_t<sizeof(T) == sizeof(int8_t), uint64_t> svcnt() | |
| 212 | KLEIDICV_STREAMING { | ||
| 213 | 116555 | return svcntb(); | |
| 214 | } | ||
| 215 | |||
| 216 | template <typename T = ScalarType> | ||
| 217 | 51641 | static std::enable_if_t<sizeof(T) == sizeof(int16_t), uint64_t> svcnt() | |
| 218 | KLEIDICV_STREAMING { | ||
| 219 | 51641 | return svcnth(); | |
| 220 | } | ||
| 221 | |||
| 222 | template <typename T = ScalarType> | ||
| 223 | 30627 | static std::enable_if_t<sizeof(T) == sizeof(int32_t), uint64_t> svcnt() | |
| 224 | KLEIDICV_STREAMING { | ||
| 225 | 30627 | return svcntw(); | |
| 226 | } | ||
| 227 | |||
| 228 | template <typename T = ScalarType> | ||
| 229 | 2844 | static std::enable_if_t<sizeof(T) == sizeof(int64_t), uint64_t> svcnt() | |
| 230 | KLEIDICV_STREAMING { | ||
| 231 | 2844 | return svcntd(); | |
| 232 | } | ||
| 233 | |||
| 234 | template <typename T = ScalarType> | ||
| 235 | 708 | static std::enable_if_t<sizeof(T) == sizeof(int8_t), uint64_t> svcntp( | |
| 236 | svbool_t pg) KLEIDICV_STREAMING { | ||
| 237 | 708 | return svcntp_b8(pg, pg); | |
| 238 | } | ||
| 239 | |||
| 240 | template <typename T = ScalarType> | ||
| 241 | static std::enable_if_t<sizeof(T) == sizeof(int16_t), uint64_t> svcntp( | ||
| 242 | svbool_t pg) KLEIDICV_STREAMING { | ||
| 243 | return svcntp_b16(pg, pg); | ||
| 244 | } | ||
| 245 | |||
| 246 | template <typename T = ScalarType> | ||
| 247 | static std::enable_if_t<sizeof(T) == sizeof(int32_t), uint64_t> svcntp( | ||
| 248 | svbool_t pg) KLEIDICV_STREAMING { | ||
| 249 | return svcntp_b32(pg, pg); | ||
| 250 | } | ||
| 251 | |||
| 252 | template <typename T = ScalarType> | ||
| 253 | static std::enable_if_t<sizeof(T) == sizeof(int64_t), uint64_t> svcntp( | ||
| 254 | svbool_t pg) KLEIDICV_STREAMING { | ||
| 255 | return svcntp_b64(pg, pg); | ||
| 256 | } | ||
| 257 | |||
| 258 | template <typename T = ScalarType> | ||
| 259 | 130943 | static std::enable_if_t<sizeof(T) == sizeof(int8_t), svbool_t> svptrue() | |
| 260 | KLEIDICV_STREAMING { | ||
| 261 | 130943 | return svptrue_b8(); | |
| 262 | } | ||
| 263 | |||
| 264 | template <typename T = ScalarType> | ||
| 265 | 66226 | static std::enable_if_t<sizeof(T) == sizeof(int16_t), svbool_t> svptrue() | |
| 266 | KLEIDICV_STREAMING { | ||
| 267 | 66226 | return svptrue_b16(); | |
| 268 | } | ||
| 269 | |||
| 270 | template <typename T = ScalarType> | ||
| 271 | 66160 | static std::enable_if_t<sizeof(T) == sizeof(int32_t), svbool_t> svptrue() | |
| 272 | KLEIDICV_STREAMING { | ||
| 273 | 66160 | return svptrue_b32(); | |
| 274 | } | ||
| 275 | |||
| 276 | template <typename T = ScalarType> | ||
| 277 | 13720 | static std::enable_if_t<sizeof(T) == sizeof(int64_t), svbool_t> svptrue() | |
| 278 | KLEIDICV_STREAMING { | ||
| 279 | 13720 | return svptrue_b64(); | |
| 280 | } | ||
| 281 | |||
| 282 | #if KLEIDICV_TARGET_SME2 | ||
| 283 | template <typename T = ScalarType> | ||
| 284 | 5114 | static std::enable_if_t<sizeof(T) == sizeof(int8_t), svcount_t> svptrue_c() | |
| 285 | KLEIDICV_STREAMING { | ||
| 286 | 5114 | return svptrue_c8(); | |
| 287 | } | ||
| 288 | |||
| 289 | template <typename T = ScalarType> | ||
| 290 | 4723 | static std::enable_if_t<sizeof(T) == sizeof(int16_t), svcount_t> svptrue_c() | |
| 291 | KLEIDICV_STREAMING { | ||
| 292 | 4723 | return svptrue_c16(); | |
| 293 | } | ||
| 294 | |||
| 295 | template <typename T = ScalarType> | ||
| 296 | 10504 | static std::enable_if_t<sizeof(T) == sizeof(int32_t), svcount_t> svptrue_c() | |
| 297 | KLEIDICV_STREAMING { | ||
| 298 | 10504 | return svptrue_c32(); | |
| 299 | } | ||
| 300 | |||
| 301 | template <typename T = ScalarType> | ||
| 302 | 6246 | static std::enable_if_t<sizeof(T) == sizeof(int64_t), svcount_t> svptrue_c() | |
| 303 | KLEIDICV_STREAMING { | ||
| 304 | 6246 | return svptrue_c64(); | |
| 305 | } | ||
| 306 | #endif // KLEIDICV_TARGET_SME2 | ||
| 307 | |||
| 308 | template <enum svpattern pat, typename T = ScalarType> | ||
| 309 | 73656 | static std::enable_if_t<sizeof(T) == sizeof(int8_t), svbool_t> svptrue_pat() | |
| 310 | KLEIDICV_STREAMING { | ||
| 311 | 73656 | return svptrue_pat_b8(pat); | |
| 312 | } | ||
| 313 | |||
| 314 | template <enum svpattern pat, typename T = ScalarType> | ||
| 315 | 88512 | static std::enable_if_t<sizeof(T) == sizeof(int16_t), svbool_t> svptrue_pat() | |
| 316 | KLEIDICV_STREAMING { | ||
| 317 | 88512 | return svptrue_pat_b16(pat); | |
| 318 | } | ||
| 319 | |||
| 320 | template <enum svpattern pat, typename T = ScalarType> | ||
| 321 | 103056 | static std::enable_if_t<sizeof(T) == sizeof(int32_t), svbool_t> svptrue_pat() | |
| 322 | KLEIDICV_STREAMING { | ||
| 323 | 103056 | return svptrue_pat_b32(pat); | |
| 324 | } | ||
| 325 | |||
| 326 | template <enum svpattern pat, typename T = ScalarType> | ||
| 327 | static std::enable_if_t<sizeof(T) == sizeof(int64_t), svbool_t> svptrue_pat() | ||
| 328 | KLEIDICV_STREAMING { | ||
| 329 | return svptrue_pat_b64(pat); | ||
| 330 | } | ||
| 331 | |||
| 332 | template <typename IndexType, typename T = ScalarType> | ||
| 333 | 104247 | static std::enable_if_t<sizeof(T) == sizeof(int8_t), svbool_t> svwhilelt( | |
| 334 | IndexType index, IndexType max_index) KLEIDICV_STREAMING { | ||
| 335 | if constexpr (std::is_same_v<IndexType, size_t>) { | ||
| 336 | 95579 | return svwhilelt_b8_u64(index, max_index); | |
| 337 | } else if constexpr (std::is_same_v<IndexType, ptrdiff_t>) { | ||
| 338 | 8668 | return svwhilelt_b8_s64(index, max_index); | |
| 339 | } else { | ||
| 340 | return svwhilelt_b8(index, max_index); | ||
| 341 | } | ||
| 342 | } | ||
| 343 | |||
| 344 | template <typename IndexType, typename T = ScalarType> | ||
| 345 | 33843 | static std::enable_if_t<sizeof(T) == sizeof(int16_t), svbool_t> svwhilelt( | |
| 346 | IndexType index, IndexType max_index) KLEIDICV_STREAMING { | ||
| 347 | if constexpr (std::is_same_v<IndexType, size_t>) { | ||
| 348 | 29333 | return svwhilelt_b16_u64(index, max_index); | |
| 349 | } else if constexpr (std::is_same_v<IndexType, ptrdiff_t>) { | ||
| 350 | 4510 | return svwhilelt_b16_s64(index, max_index); | |
| 351 | } else { | ||
| 352 | return svwhilelt_b16(index, max_index); | ||
| 353 | } | ||
| 354 | } | ||
| 355 | |||
| 356 | template <typename IndexType, typename T = ScalarType> | ||
| 357 | 17425 | static std::enable_if_t<sizeof(T) == sizeof(int32_t), svbool_t> svwhilelt( | |
| 358 | IndexType index, IndexType max_index) KLEIDICV_STREAMING { | ||
| 359 | if constexpr (std::is_same_v<IndexType, size_t>) { | ||
| 360 | 17425 | return svwhilelt_b32_u64(index, max_index); | |
| 361 | } else if constexpr (std::is_same_v<IndexType, ptrdiff_t>) { | ||
| 362 | return svwhilelt_b32_s64(index, max_index); | ||
| 363 | } else { | ||
| 364 | return svwhilelt_b32(index, max_index); | ||
| 365 | } | ||
| 366 | } | ||
| 367 | |||
| 368 | template <typename IndexType, typename T = ScalarType> | ||
| 369 | 2058 | static std::enable_if_t<sizeof(T) == sizeof(int64_t), svbool_t> svwhilelt( | |
| 370 | IndexType index, IndexType max_index) KLEIDICV_STREAMING { | ||
| 371 | if constexpr (std::is_same_v<IndexType, size_t>) { | ||
| 372 | 2058 | return svwhilelt_b64_u64(index, max_index); | |
| 373 | } else if constexpr (std::is_same_v<IndexType, ptrdiff_t>) { | ||
| 374 | return svwhilelt_b64_s64(index, max_index); | ||
| 375 | } else { | ||
| 376 | return svwhilelt_b64(index, max_index); | ||
| 377 | } | ||
| 378 | } | ||
| 379 | |||
| 380 | // Transforms a single predicate into three other predicates that then can be | ||
| 381 | // used for consecutive operations. The input predicate can only have | ||
| 382 | // consecutive ones starting at the lowest element. | ||
| 383 | 188 | static void make_consecutive_predicates(svbool_t pg, svbool_t &pg_0, | |
| 384 | svbool_t &pg_1, | ||
| 385 | svbool_t &pg_2) KLEIDICV_STREAMING { | ||
| 386 | // Length of data. Must be signed because of the unconditional subtraction | ||
| 387 | // of fixed values. | ||
| 388 | 188 | int64_t length = 3 * svcntp(pg); | |
| 389 | // Handle up to VL length worth of data with the first predicated operation. | ||
| 390 | 188 | pg_0 = svwhilelt(int64_t{0}, length); | |
| 391 | // Handle up to VL length worth of data with the second predicated operation | ||
| 392 | // taking into account data stored in the first predicated operation. | ||
| 393 | 188 | length -= num_lanes(); | |
| 394 | 188 | pg_1 = svwhilelt(int64_t{0}, length); | |
| 395 | // Handle up to VL length worth of data with the third predicated operation | ||
| 396 | // taking into account data stored in the first and second predicated | ||
| 397 | // operations. | ||
| 398 | 188 | length -= num_lanes(); | |
| 399 | 188 | pg_2 = svwhilelt(int64_t{0}, length); | |
| 400 | 188 | } | |
| 401 | |||
| 402 | // Transforms a single predicate into four other predicates that then can be | ||
| 403 | // used for consecutive operations. The input predicate can only have | ||
| 404 | // consecutive ones starting at the lowest element. | ||
| 405 | 520 | static void make_consecutive_predicates(svbool_t pg, svbool_t &pg_0, | |
| 406 | svbool_t &pg_1, svbool_t &pg_2, | ||
| 407 | svbool_t &pg_3) KLEIDICV_STREAMING { | ||
| 408 | // Length of data. Must be signed because of the unconditional subtraction | ||
| 409 | // of fixed values. | ||
| 410 | 520 | int64_t length = 4 * svcntp(pg); | |
| 411 | // Handle up to VL length worth of data with the first predicated operation. | ||
| 412 | 520 | pg_0 = svwhilelt(int64_t{0}, length); | |
| 413 | // Handle up to VL length worth of data with the second predicated operation | ||
| 414 | // taking into account data stored in the first predicated operation. | ||
| 415 | 520 | length -= num_lanes(); | |
| 416 | 520 | pg_1 = svwhilelt(int64_t{0}, length); | |
| 417 | // Handle up to VL length worth of data with the third predicated operation | ||
| 418 | // taking into account data stored in the first and second predicated | ||
| 419 | // operations. | ||
| 420 | 520 | length -= num_lanes(); | |
| 421 | 520 | pg_2 = svwhilelt(int64_t{0}, length); | |
| 422 | // Handle up to VL length worth of data with the fourth predicated operation | ||
| 423 | // taking into account data stored in the first, second and third predicated | ||
| 424 | // operations. | ||
| 425 | 520 | length -= num_lanes(); | |
| 426 | 520 | pg_3 = svwhilelt(int64_t{0}, length); | |
| 427 | 520 | } | |
| 428 | }; // end of class VecTraitsBase<ScalarType> | ||
| 429 | |||
| 430 | // Primary template for SVE vector traits; it adds nothing to VecTraitsBase. The explicit specializations add type-specific wrappers: svdup for every listed type and, for some types, svreinterpret, svasr_n, svsub and svhsub. | ||
| 431 | template <typename ScalarType> | ||
| 432 | class VecTraits : public VecTraitsBase<ScalarType> {}; | ||
| 433 | |||
| 434 | template <> | ||
| 435 | class VecTraits<int8_t> : public VecTraitsBase<int8_t> { | ||
| 436 | public: | ||
| 437 | 888 | static inline svint8_t svdup(int8_t v) KLEIDICV_STREAMING { | |
| 438 | 888 | return svdup_s8(v); | |
| 439 | } | ||
| 440 | 2583 | static inline svint8_t svreinterpret(svuint8_t v) KLEIDICV_STREAMING { | |
| 441 | 2583 | return svreinterpret_s8(v); | |
| 442 | } | ||
| 443 | 1722 | static inline svint8_t svasr_n(svbool_t pg, svint8_t v, | |
| 444 | uint8_t s) KLEIDICV_STREAMING { | ||
| 445 | 1722 | return svasr_n_s8_x(pg, v, s); | |
| 446 | } | ||
| 447 | }; // end of class VecTraits<int8_t> | ||
| 448 | |||
| 449 | template <> | ||
| 450 | class VecTraits<uint8_t> : public VecTraitsBase<uint8_t> { | ||
| 451 | public: | ||
| 452 | 7832 | static inline svuint8_t svdup(uint8_t v) KLEIDICV_STREAMING { | |
| 453 | 7832 | return svdup_u8(v); | |
| 454 | } | ||
| 455 | 2583 | static inline svuint8_t svreinterpret(svint8_t v) KLEIDICV_STREAMING { | |
| 456 | 2583 | return svreinterpret_u8(v); | |
| 457 | } | ||
| 458 | static inline svuint8_t svsub(svbool_t pg, svuint8_t v, | ||
| 459 | svuint8_t u) KLEIDICV_STREAMING { | ||
| 460 | return svsub_u8_x(pg, v, u); | ||
| 461 | } | ||
| 462 | static inline svuint8_t svhsub(svbool_t pg, svuint8_t v, | ||
| 463 | svuint8_t u) KLEIDICV_STREAMING { | ||
| 464 | return svhsub_u8_x(pg, v, u); | ||
| 465 | } | ||
| 466 | }; // end of class VecTraits<uint8_t> | ||
| 467 | |||
| 468 | template <> | ||
| 469 | class VecTraits<int16_t> : public VecTraitsBase<int16_t> { | ||
| 470 | public: | ||
| 471 | 3436 | static inline svint16_t svdup(int16_t v) KLEIDICV_STREAMING { | |
| 472 | 3436 | return svdup_s16(v); | |
| 473 | } | ||
| 474 | static inline svint16_t svreinterpret(svuint16_t v) KLEIDICV_STREAMING { | ||
| 475 | return svreinterpret_s16(v); | ||
| 476 | } | ||
| 477 | }; // end of class VecTraits<int16_t> | ||
| 478 | |||
| 479 | template <> | ||
| 480 | class VecTraits<uint16_t> : public VecTraitsBase<uint16_t> { | ||
| 481 | public: | ||
| 482 | 2538 | static inline svuint16_t svdup(uint16_t v) KLEIDICV_STREAMING { | |
| 483 | 2538 | return svdup_u16(v); | |
| 484 | } | ||
| 485 | static inline svuint16_t svreinterpret(svint16_t v) KLEIDICV_STREAMING { | ||
| 486 | return svreinterpret_u16(v); | ||
| 487 | } | ||
| 488 | }; // end of class VecTraits<uint16_t> | ||
| 489 | |||
| 490 | template <> | ||
| 491 | class VecTraits<int32_t> : public VecTraitsBase<int32_t> { | ||
| 492 | public: | ||
| 493 | 1698 | static inline svint32_t svdup(int32_t v) KLEIDICV_STREAMING { | |
| 494 | 1698 | return svdup_s32(v); | |
| 495 | } | ||
| 496 | static inline svint32_t svreinterpret(svuint32_t v) KLEIDICV_STREAMING { | ||
| 497 | return svreinterpret_s32(v); | ||
| 498 | } | ||
| 499 | }; // end of class VecTraits<int32_t> | ||
| 500 | |||
| 501 | template <> | ||
| 502 | class VecTraits<uint32_t> : public VecTraitsBase<uint32_t> { | ||
| 503 | public: | ||
| 504 | 825 | static inline svuint32_t svdup(uint32_t v) KLEIDICV_STREAMING { | |
| 505 | 825 | return svdup_u32(v); | |
| 506 | } | ||
| 507 | static inline svuint32_t svreinterpret(svint32_t v) KLEIDICV_STREAMING { | ||
| 508 | return svreinterpret_u32(v); | ||
| 509 | } | ||
| 510 | }; // end of class VecTraits<uint32_t> | ||
| 511 | |||
| 512 | template <> | ||
| 513 | class VecTraits<int64_t> : public VecTraitsBase<int64_t> { | ||
| 514 | public: | ||
| 515 | static inline svint64_t svdup(int64_t v) KLEIDICV_STREAMING { | ||
| 516 | return svdup_s64(v); | ||
| 517 | } | ||
| 518 | static inline svint64_t svreinterpret(svuint64_t v) KLEIDICV_STREAMING { | ||
| 519 | return svreinterpret_s64(v); | ||
| 520 | } | ||
| 521 | }; // end of class VecTraits<int64_t> | ||
| 522 | |||
| 523 | template <> | ||
| 524 | class VecTraits<uint64_t> : public VecTraitsBase<uint64_t> { | ||
| 525 | public: | ||
| 526 | static inline svuint64_t svdup(uint64_t v) KLEIDICV_STREAMING { | ||
| 527 | return svdup_u64(v); | ||
| 528 | } | ||
| 529 | static inline svuint64_t svreinterpret(svint64_t v) KLEIDICV_STREAMING { | ||
| 530 | return svreinterpret_u64(v); | ||
| 531 | } | ||
| 532 | }; // end of class VecTraits<uint64_t> | ||
| 533 | |||
| 534 | template <> | ||
| 535 | class VecTraits<float> : public VecTraitsBase<float> { | ||
| 536 | public: | ||
| 537 | 900 | static inline svfloat32_t svdup(float v) KLEIDICV_STREAMING { | |
| 538 | 900 | return svdup_f32(v); | |
| 539 | } | ||
| 540 | static inline svfloat32_t svsub(svbool_t pg, svfloat32_t v, | ||
| 541 | svfloat32_t u) KLEIDICV_STREAMING { | ||
| 542 | return svsub_f32_x(pg, v, u); | ||
| 543 | } | ||
| 544 | }; // end of class VecTraits<float> | ||
| 545 | |||
| 546 | template <> | ||
| 547 | class VecTraits<double> : public VecTraitsBase<double> { | ||
| 548 | public: | ||
| 549 | 42 | static inline svfloat64_t svdup(double v) KLEIDICV_STREAMING { | |
| 550 | 42 | return svdup_f64(v); | |
| 551 | } | ||
| 552 | }; // end of class VecTraits<double> | ||
| 553 | |||
| 554 | template <> | ||
| 555 | class VecTraits<float16_t> : public VecTraitsBase<float16_t> { | ||
| 556 | public: | ||
| 557 | static inline svfloat16_t svdup(float16_t v) KLEIDICV_STREAMING { | ||
| 558 | return svdup_f16(v); | ||
| 559 | } | ||
| 560 | }; // end of class VecTraits<float16_t> | ||
| 561 | |||
| 562 | // Adapter which creates a Context (backed by a local svbool_t), sets its predicate and forwards it together with the arguments to the inner operation. | ||
| 563 | template <typename OperationType> | ||
| 564 | class OperationContextAdapter : public OperationBase<OperationType> { | ||
| 565 | // Shorten rows: no need to write 'this->'. | ||
| 566 | using OperationBase<OperationType>::operation; | ||
| 567 | using OperationBase<OperationType>::num_lanes; | ||
| 568 | |||
| 569 | public: | ||
| 570 | using ContextType = Context; | ||
| 571 | using VecTraits = typename OperationBase<OperationType>::VecTraits; | ||
| 572 | |||
| 573 | 22021 | explicit OperationContextAdapter(OperationType &operation) KLEIDICV_STREAMING | |
| 574 | 22021 | : OperationBase<OperationType>(operation) {} | |
| 575 | |||
| 576 | // Forwards vector_path_2x() calls to the inner operation with the context predicate set to VecTraits::svptrue(). | ||
| 577 | template <typename... ArgTypes> | ||
| 578 | 53223 | void vector_path_2x(ArgTypes &&...args) KLEIDICV_STREAMING { | |
| 579 | 53223 | svbool_t ctx_pg; | |
| 580 | 53223 | ContextType ctx{ctx_pg}; | |
| 581 | 53223 | ctx.set_predicate(VecTraits::svptrue()); | |
| 582 | 53223 | operation().vector_path_2x(ctx, std::forward<ArgTypes>(args)...); | |
| 583 | 53223 | } | |
| 584 | |||
| 585 | // Forwards vector_path() calls to the inner operation with the context predicate set to VecTraits::svptrue(). | ||
| 586 | template <typename... ArgTypes> | ||
| 587 | 5720 | void vector_path(ArgTypes &&...args) KLEIDICV_STREAMING { | |
| 588 | 5720 | svbool_t ctx_pg; | |
| 589 | 5720 | ContextType ctx{ctx_pg}; | |
| 590 | 5720 | ctx.set_predicate(VecTraits::svptrue()); | |
| 591 | 5720 | operation().vector_path(ctx, std::forward<ArgTypes>(args)...); | |
| 592 | 5720 | } | |
| 593 | |||
| 594 | // Forwards remaining_path() calls to the inner operation if the concrete | ||
| 595 | // operation is unrolled once. A single call is made, with the predicate set to svwhilelt(0, length). | ||
| 596 | template <typename... ColumnTypes, typename T = OperationType> | ||
| 597 | 1344 | std::enable_if_t<T::is_unrolled_once()> remaining_path( | |
| 598 | size_t length, ColumnTypes &&...columns) KLEIDICV_STREAMING { | ||
| 599 | 1344 | svbool_t ctx_pg; | |
| 600 | 1344 | ContextType ctx{ctx_pg}; | |
| 601 | 1344 | ctx.set_predicate(VecTraits::svwhilelt(size_t{0}, length)); | |
| 602 | 1344 | operation().remaining_path(ctx, std::forward<ColumnTypes>(columns)...); | |
| 603 | 1344 | } | |
| 604 | |||
| 605 | // Forwards remaining_path() calls to the inner operation if the concrete | ||
| 606 | // operation is not unrolled once. Calls are repeated, advancing by num_lanes() each time, while the first lane of svwhilelt(index, length) is active. | ||
| 607 | template <typename... ColumnTypes, typename T = OperationType> | ||
| 608 | 23176 | std::enable_if_t<!T::is_unrolled_once()> remaining_path( | |
| 609 | size_t length, ColumnTypes... columns) KLEIDICV_STREAMING { | ||
| 610 | 23176 | svbool_t ctx_pg; | |
| 611 | 23176 | ContextType ctx{ctx_pg}; | |
| 612 | |||
| 613 | 23176 | size_t index = 0; | |
| 614 | 23176 | ctx.set_predicate(VecTraits::svwhilelt(index, length)); | |
| 615 |
16/16✓ Branch 0 taken 13104 times.
✓ Branch 1 taken 11459 times.
✓ Branch 2 taken 3218 times.
✓ Branch 3 taken 2794 times.
✓ Branch 4 taken 2968 times.
✓ Branch 5 taken 2664 times.
✓ Branch 6 taken 2588 times.
✓ Branch 7 taken 2309 times.
✓ Branch 8 taken 2028 times.
✓ Branch 9 taken 1729 times.
✓ Branch 10 taken 1475 times.
✓ Branch 11 taken 1213 times.
✓ Branch 12 taken 650 times.
✓ Branch 13 taken 624 times.
✓ Branch 14 taken 400 times.
✓ Branch 15 taken 384 times.
|
49607 | while (svptest_first(VecTraits::svptrue(), ctx.predicate())) { |
| 616 | 26431 | operation().remaining_path(ctx, columns.at(index)...); | |
| 617 | // Update loop counter and calculate the next governing predicate. | ||
| 618 | 26431 | index += num_lanes(); | |
| 619 | 26431 | ctx.set_predicate(VecTraits::svwhilelt(index, length)); | |
| 620 | } | ||
| 621 | 23176 | } | |
| 622 | }; // end of class OperationContextAdapter<OperationType> | ||
| 623 | |||
| 624 | // Adapter which implements remaining_path() for general SVE2 operations. | ||
| 625 | template <typename OperationType> | ||
| 626 | class RemainingPathAdapter : public OperationBase<OperationType> { | ||
| 627 | public: | ||
| 628 | using ContextType = Context; | ||
| 629 | |||
| 630 | 22021 | explicit RemainingPathAdapter(OperationType &operation) KLEIDICV_STREAMING | |
| 631 | 22021 | : OperationBase<OperationType>(operation) {} | |
| 632 | |||
| 633 | // Forwards remaining_path() to either vector_path() or tail_path() of the | ||
| 634 | // inner operation; the choice is made at compile time: tail_path() if OperationType::uses_tail_path() is true, vector_path() otherwise. | ||
| 635 | template <typename... ArgTypes> | ||
| 636 | 27775 | void remaining_path(ArgTypes... args) KLEIDICV_STREAMING { | |
| 637 | if constexpr (OperationType::uses_tail_path()) { | ||
| 638 | 708 | this->operation().tail_path(std::forward<ArgTypes>(args)...); | |
| 639 | } else { | ||
| 640 | 27067 | this->operation().vector_path(std::forward<ArgTypes>(args)...); | |
| 641 | } | ||
| 642 | 27775 | } | |
| 643 | }; // end of class RemainingPathAdapter<OperationType> | ||
| 644 | |||
| 645 | // Shorthand for applying a generic unrolled SVE2 operation: wraps 'operation' in ForwardingOperation, OperationAdapter, RemainingPathAdapter, OperationContextAdapter and RowBasedOperation (innermost first), then passes the outermost wrapper and 'args' to zip_rows(). | ||
| 646 | template <typename OperationType, typename... ArgTypes> | ||
| 647 | 20805 | void apply_operation_by_rows(OperationType &operation, | |
| 648 | ArgTypes &&...args) KLEIDICV_STREAMING { | ||
| 649 | 20805 | ForwardingOperation forwarding_operation{operation}; | |
| 650 | 20805 | OperationAdapter operation_adapter{forwarding_operation}; | |
| 651 | 20805 | RemainingPathAdapter remaining_path_adapter{operation_adapter}; | |
| 652 | 20805 | OperationContextAdapter context_adapter{remaining_path_adapter}; | |
| 653 | 20805 | RowBasedOperation row_based_operation{context_adapter}; | |
| 654 | 20805 | zip_rows(row_based_operation, std::forward<ArgTypes>(args)...); | |
| 655 | 20805 | } | |
| 656 | |||
| 657 | // Swaps the values of 'a' and 'b' through a temporary copy, since some C++ Standard Library implementations do not | ||
| 658 | // allow using std::swap for SVE vectors. | ||
| 659 | template <typename T> | ||
| 660 | 9024 | static inline void swap_scalable(T &a, T &b) KLEIDICV_STREAMING { | |
| 661 | 9024 | T tmp = a; | |
| 662 | 9024 | a = b; | |
| 663 | 9024 | b = tmp; | |
| 664 | 9024 | } | |
| 665 | |||
| 666 | // The following wrappers are used as a workaround to treat SVE variables as a 2D | ||
| 667 | // array (ScalableVectorArray2D) or a 1D array (ScalableVectorArray1D). They store std::reference_wrapper elements and operator() returns the referenced vector; no bounds checking is done. | ||
| 668 | template <typename VectorType, size_t Rows, size_t Cols> | ||
| 669 | class ScalableVectorArray2D { | ||
| 670 | public: | ||
| 671 | std::reference_wrapper<VectorType> window[Rows][Cols]; | ||
| 672 | 138939872 | VectorType &operator()(int row, int col) KLEIDICV_STREAMING { | |
| 673 | 138939872 | return window[row][col].get(); | |
| 674 | } | ||
| 675 | }; | ||
| 676 | |||
| 677 | template <typename VectorType, size_t element_size> | ||
| 678 | class ScalableVectorArray1D { | ||
| 679 | public: | ||
| 680 | std::reference_wrapper<VectorType> window[element_size]; | ||
| 681 | 7969600 | VectorType &operator()(int index) KLEIDICV_STREAMING { | |
| 682 | 7969600 | return window[index].get(); | |
| 683 | } | ||
| 684 | }; | ||
| 685 | |||
| 686 | #if KLEIDICV_TARGET_SME2 | ||
| 687 | |||
| 688 | // To provide svqcvt_[u8|s8|...] functionality for any scalable SIMD backend. This SME2 variant calls svqcvt_u8/svqcvt_s8 directly and ignores its constructor argument (NOTE(review): the argument is presumably kept so both variants are constructed the same way - confirm). | ||
| 689 | class SvqvctWrapper { | ||
| 690 | public: | ||
| 691 | 150 | explicit SvqvctWrapper(svuint8_t &) KLEIDICV_STREAMING {} | |
| 692 | |||
| 693 | 103 | svuint8_t operator()(svuint32x4_t input) const KLEIDICV_STREAMING { | |
| 694 | 103 | return svqcvt_u8(input); | |
| 695 | } | ||
| 696 | |||
| 697 | 103 | svint8_t operator()(svint32x4_t input) const KLEIDICV_STREAMING { | |
| 698 | 103 | return svqcvt_s8(input); | |
| 699 | } | ||
| 700 | }; | ||
| 701 | |||
| 702 | #else // KLEIDICV_TARGET_SME2 | ||
| 703 | |||
| 704 | // To provide svqcvt_[u8|s8|...] functionality for any scalable SIMD backend. This variant narrows the four input vectors with svqxtnb/svqxtnt and then reorders the result with svtbl using an index vector built in the constructor. | ||
| 705 | class SvqvctWrapper { | ||
| 706 | public: | ||
| 707 | 300 | explicit SvqvctWrapper(svuint8_t &index) KLEIDICV_STREAMING : index_(index) { | |
| 708 | // Index generation to reorder converted values by tbl instruction: four stride-4 index sequences (starting at 0, 1, 2 and 3) are spliced together, a quarter of a vector from each. | ||
| 709 | 300 | auto index0 = svindex_u8(0, 4); | |
| 710 | 300 | auto index1 = svindex_u8(1, 4); | |
| 711 | 300 | auto index2 = svindex_u8(2, 4); | |
| 712 | 300 | auto index3 = svindex_u8(3, 4); | |
| 713 | |||
| 714 | 300 | svbool_t pg = svwhilelt_b8(uint64_t(0), svcntb() / 4); | |
| 715 | |||
| 716 | 300 | index_ = svsplice(pg, index3, svdup_u8(0)); | |
| 717 | 300 | index_ = svsplice(pg, index2, index_); | |
| 718 | 300 | index_ = svsplice(pg, index1, index_); | |
| 719 | 300 | index_ = svsplice(pg, index0, index_); | |
| 720 | 300 | } | |
| 721 | |||
| 722 | template <typename T> | ||
| 723 | 1130 | auto operator()(T input) const KLEIDICV_STREAMING { | |
| 724 | 1130 | auto half_width_res0 = svqxtnb(svget4(input, 0)); | |
| 725 | 1130 | half_width_res0 = svqxtnt(half_width_res0, svget4(input, 2)); | |
| 726 | |||
| 727 | 1130 | auto half_width_res1 = svqxtnb(svget4(input, 1)); | |
| 728 | 1130 | half_width_res1 = svqxtnt(half_width_res1, svget4(input, 3)); | |
| 729 | |||
| 730 | 1130 | auto quarter_width_res = svqxtnb(half_width_res0); | |
| 731 | 1130 | quarter_width_res = svqxtnt(quarter_width_res, half_width_res1); | |
| 732 | |||
| 733 | 2260 | return svtbl(quarter_width_res, index_); | |
| 734 | 1130 | } | |
| 735 | |||
| 736 | private: | ||
| 737 | // This mimics treating `index_` as a member variable, but scalable types | ||
| 738 | // cannot be declared that way, so the caller owns the storage. | ||
| 739 | svuint8_t &index_; | ||
| 740 | }; | ||
| 741 | |||
| 742 | #endif // KLEIDICV_TARGET_SME2 | ||
| 743 | |||
| 744 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 745 | |||
| 746 | #endif // KLEIDICV_SVE2_H | ||
| 747 |