KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/sve2.h
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 186 186 100.0%
Functions: 1406 1418 99.2%
Branches: 16 16 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_SVE2_H
6 #define KLEIDICV_SVE2_H
7
8 #include <arm_sve.h>
9
10 #include <utility>
11
12 #include "kleidicv/operations.h"
13 #include "kleidicv/utils.h"
14
15 // This header is used by both SVE2 and SME; the actual namespace reflects the target.
16 namespace KLEIDICV_TARGET_NAMESPACE {
17
18 // Context associated with SVE operations.
19 class Context {
20 public:
21 83463 explicit Context(svbool_t &pg) KLEIDICV_STREAMING : pg_{pg} {}
22
23 // Sets the predicate associated with the context to a given predicate.
24 109894 void set_predicate(svbool_t pg) KLEIDICV_STREAMING { pg_ = pg; }
25
26 // Returns predicate associated with the context.
27 500619 svbool_t predicate() const KLEIDICV_STREAMING { return pg_; }
28
29 protected:
30 // Hold a reference to an svbool_t because a sizeless type cannot be a member.
31 svbool_t &pg_;
32 }; // end of class Context
33
34 // Primary template to describe logically grouped properties of vectors.
35 template <typename ScalarType>
36 class VectorTypes;
37
38 template <>
39 class VectorTypes<int8_t> {
40 public:
41 using ScalarType = int8_t;
42 using VectorType = svint8_t;
43 using Vector2Type = svint8x2_t;
44 using Vector3Type = svint8x3_t;
45 using Vector4Type = svint8x4_t;
46 }; // end of class VectorTypes<int8_t>
47
48 template <>
49 class VectorTypes<uint8_t> {
50 public:
51 using ScalarType = uint8_t;
52 using VectorType = svuint8_t;
53 using Vector2Type = svuint8x2_t;
54 using Vector3Type = svuint8x3_t;
55 using Vector4Type = svuint8x4_t;
56 }; // end of class VectorTypes<uint8_t>
57
58 template <>
59 class VectorTypes<int16_t> {
60 public:
61 using ScalarType = int16_t;
62 using VectorType = svint16_t;
63 using Vector2Type = svint16x2_t;
64 using Vector3Type = svint16x3_t;
65 using Vector4Type = svint16x4_t;
66 }; // end of class VectorTypes<int16_t>
67
68 template <>
69 class VectorTypes<uint16_t> {
70 public:
71 using ScalarType = uint16_t;
72 using VectorType = svuint16_t;
73 using Vector2Type = svuint16x2_t;
74 using Vector3Type = svuint16x3_t;
75 using Vector4Type = svuint16x4_t;
76 }; // end of class VectorTypes<uint16_t>
77
78 template <>
79 class VectorTypes<int32_t> {
80 public:
81 using ScalarType = int32_t;
82 using VectorType = svint32_t;
83 using Vector2Type = svint32x2_t;
84 using Vector3Type = svint32x3_t;
85 using Vector4Type = svint32x4_t;
86 }; // end of class VectorTypes<int32_t>
87
88 template <>
89 class VectorTypes<uint32_t> {
90 public:
91 using ScalarType = uint32_t;
92 using VectorType = svuint32_t;
93 using Vector2Type = svuint32x2_t;
94 using Vector3Type = svuint32x3_t;
95 using Vector4Type = svuint32x4_t;
96 }; // end of class VectorTypes<uint32_t>
97
98 template <>
99 class VectorTypes<int64_t> {
100 public:
101 using ScalarType = int64_t;
102 using VectorType = svint64_t;
103 using Vector2Type = svint64x2_t;
104 using Vector3Type = svint64x3_t;
105 using Vector4Type = svint64x4_t;
106 }; // end of class VectorTypes<int64_t>
107
108 template <>
109 class VectorTypes<uint64_t> {
110 public:
111 using ScalarType = uint64_t;
112 using VectorType = svuint64_t;
113 using Vector2Type = svuint64x2_t;
114 using Vector3Type = svuint64x3_t;
115 using Vector4Type = svuint64x4_t;
116 }; // end of class VectorTypes<uint64_t>
117
118 template <>
119 class VectorTypes<float> {
120 public:
121 using ScalarType = float;
122 using VectorType = svfloat32_t;
123 using Vector2Type = svfloat32x2_t;
124 using Vector3Type = svfloat32x3_t;
125 using Vector4Type = svfloat32x4_t;
126 }; // end of class VectorTypes<float>
127
128 template <>
129 class VectorTypes<double> {
130 public:
131 using ScalarType = double;
132 using VectorType = svfloat64_t;
133 using Vector2Type = svfloat64x2_t;
134 using Vector3Type = svfloat64x3_t;
135 using Vector4Type = svfloat64x4_t;
136 }; // end of class VectorTypes<double>
137
138 template <>
139 class VectorTypes<float16_t> {
140 public:
141 using ScalarType = float16_t;
142 using VectorType = svfloat16_t;
143 using Vector2Type = svfloat16x2_t;
144 using Vector3Type = svfloat16x3_t;
145 using Vector4Type = svfloat16x4_t;
146 }; // end of class VectorTypes<float16_t>
147
148 // Base class for all SVE vector traits.
149 template <typename ScalarType>
150 class VecTraitsBase : public VectorTypes<ScalarType> {
151 public:
152 using typename VectorTypes<ScalarType>::VectorType;
153 using typename VectorTypes<ScalarType>::Vector2Type;
154
155 // Number of lanes in a vector.
156 201667 static inline size_t num_lanes() KLEIDICV_STREAMING {
157 201667 return static_cast<size_t>(svcnt());
158 }
159
160 // Maximum number of lanes in a vector.
161 static constexpr size_t max_num_lanes() KLEIDICV_STREAMING {
162 return 256 / sizeof(ScalarType);
163 }
164
165 // Loads a single vector from 'src'.
166 30015 static inline void load(Context ctx, const ScalarType *src,
167 VectorType &vec) KLEIDICV_STREAMING {
168 30015 vec = svld1(ctx.predicate(), &src[0]);
169 30015 }
170
171 // Loads two consecutive vectors from 'src'.
172 83995 static inline void load_consecutive(Context ctx, const ScalarType *src,
173 VectorType &vec_0,
174 VectorType &vec_1) KLEIDICV_STREAMING {
175 #if KLEIDICV_TARGET_SME2
176 // Assuming that ctx contains a full predicate.
177 (void)ctx;
178 16391 svcount_t p_counter = svptrue_c();
179 16391 Vector2Type v = svld1_x2(p_counter, &src[0]);
180 16391 vec_0 = svget2(v, 0);
181 16391 vec_1 = svget2(v, 1);
182 #else
183 67604 vec_0 = svld1(ctx.predicate(), &src[0]);
184 67604 vec_1 = svld1_vnum(ctx.predicate(), &src[0], 1);
185 #endif
186 83995 }
187
188 // Stores a single vector to 'dst'.
189 17066 static inline void store(Context ctx, VectorType vec,
190 ScalarType *dst) KLEIDICV_STREAMING {
191 17066 svst1(ctx.predicate(), &dst[0], vec);
192 17066 }
193
194 // Stores two consecutive vectors to 'dst'.
195 45597 static inline void store_consecutive(Context ctx, VectorType vec_0,
196 VectorType vec_1,
197 ScalarType *dst) KLEIDICV_STREAMING {
198 #if KLEIDICV_TARGET_SME2
199 // Assuming that ctx contains a full predicate.
200 (void)ctx;
201 8648 svcount_t p_counter = svptrue_c();
202 8648 Vector2Type v = svcreate2(vec_0, vec_1);
203 8648 svst1(p_counter, &dst[0], v);
204 #else
205 36949 svst1(ctx.predicate(), &dst[0], vec_0);
206 36949 svst1_vnum(ctx.predicate(), &dst[0], 1, vec_1);
207 #endif
208 45597 }
209
210 template <typename T = ScalarType>
211 116555 static std::enable_if_t<sizeof(T) == sizeof(int8_t), uint64_t> svcnt()
212 KLEIDICV_STREAMING {
213 116555 return svcntb();
214 }
215
216 template <typename T = ScalarType>
217 51641 static std::enable_if_t<sizeof(T) == sizeof(int16_t), uint64_t> svcnt()
218 KLEIDICV_STREAMING {
219 51641 return svcnth();
220 }
221
222 template <typename T = ScalarType>
223 30627 static std::enable_if_t<sizeof(T) == sizeof(int32_t), uint64_t> svcnt()
224 KLEIDICV_STREAMING {
225 30627 return svcntw();
226 }
227
228 template <typename T = ScalarType>
229 2844 static std::enable_if_t<sizeof(T) == sizeof(int64_t), uint64_t> svcnt()
230 KLEIDICV_STREAMING {
231 2844 return svcntd();
232 }
233
234 template <typename T = ScalarType>
235 708 static std::enable_if_t<sizeof(T) == sizeof(int8_t), uint64_t> svcntp(
236 svbool_t pg) KLEIDICV_STREAMING {
237 708 return svcntp_b8(pg, pg);
238 }
239
240 template <typename T = ScalarType>
241 static std::enable_if_t<sizeof(T) == sizeof(int16_t), uint64_t> svcntp(
242 svbool_t pg) KLEIDICV_STREAMING {
243 return svcntp_b16(pg, pg);
244 }
245
246 template <typename T = ScalarType>
247 static std::enable_if_t<sizeof(T) == sizeof(int32_t), uint64_t> svcntp(
248 svbool_t pg) KLEIDICV_STREAMING {
249 return svcntp_b32(pg, pg);
250 }
251
252 template <typename T = ScalarType>
253 static std::enable_if_t<sizeof(T) == sizeof(int64_t), uint64_t> svcntp(
254 svbool_t pg) KLEIDICV_STREAMING {
255 return svcntp_b64(pg, pg);
256 }
257
258 template <typename T = ScalarType>
259 130943 static std::enable_if_t<sizeof(T) == sizeof(int8_t), svbool_t> svptrue()
260 KLEIDICV_STREAMING {
261 130943 return svptrue_b8();
262 }
263
264 template <typename T = ScalarType>
265 66226 static std::enable_if_t<sizeof(T) == sizeof(int16_t), svbool_t> svptrue()
266 KLEIDICV_STREAMING {
267 66226 return svptrue_b16();
268 }
269
270 template <typename T = ScalarType>
271 66160 static std::enable_if_t<sizeof(T) == sizeof(int32_t), svbool_t> svptrue()
272 KLEIDICV_STREAMING {
273 66160 return svptrue_b32();
274 }
275
276 template <typename T = ScalarType>
277 13720 static std::enable_if_t<sizeof(T) == sizeof(int64_t), svbool_t> svptrue()
278 KLEIDICV_STREAMING {
279 13720 return svptrue_b64();
280 }
281
282 #if KLEIDICV_TARGET_SME2
283 template <typename T = ScalarType>
284 5114 static std::enable_if_t<sizeof(T) == sizeof(int8_t), svcount_t> svptrue_c()
285 KLEIDICV_STREAMING {
286 5114 return svptrue_c8();
287 }
288
289 template <typename T = ScalarType>
290 4723 static std::enable_if_t<sizeof(T) == sizeof(int16_t), svcount_t> svptrue_c()
291 KLEIDICV_STREAMING {
292 4723 return svptrue_c16();
293 }
294
295 template <typename T = ScalarType>
296 10504 static std::enable_if_t<sizeof(T) == sizeof(int32_t), svcount_t> svptrue_c()
297 KLEIDICV_STREAMING {
298 10504 return svptrue_c32();
299 }
300
301 template <typename T = ScalarType>
302 6246 static std::enable_if_t<sizeof(T) == sizeof(int64_t), svcount_t> svptrue_c()
303 KLEIDICV_STREAMING {
304 6246 return svptrue_c64();
305 }
306 #endif // KLEIDICV_TARGET_SME2
307
308 template <enum svpattern pat, typename T = ScalarType>
309 73656 static std::enable_if_t<sizeof(T) == sizeof(int8_t), svbool_t> svptrue_pat()
310 KLEIDICV_STREAMING {
311 73656 return svptrue_pat_b8(pat);
312 }
313
314 template <enum svpattern pat, typename T = ScalarType>
315 88512 static std::enable_if_t<sizeof(T) == sizeof(int16_t), svbool_t> svptrue_pat()
316 KLEIDICV_STREAMING {
317 88512 return svptrue_pat_b16(pat);
318 }
319
320 template <enum svpattern pat, typename T = ScalarType>
321 103056 static std::enable_if_t<sizeof(T) == sizeof(int32_t), svbool_t> svptrue_pat()
322 KLEIDICV_STREAMING {
323 103056 return svptrue_pat_b32(pat);
324 }
325
326 template <enum svpattern pat, typename T = ScalarType>
327 static std::enable_if_t<sizeof(T) == sizeof(int64_t), svbool_t> svptrue_pat()
328 KLEIDICV_STREAMING {
329 return svptrue_pat_b64(pat);
330 }
331
332 template <typename IndexType, typename T = ScalarType>
333 104247 static std::enable_if_t<sizeof(T) == sizeof(int8_t), svbool_t> svwhilelt(
334 IndexType index, IndexType max_index) KLEIDICV_STREAMING {
335 if constexpr (std::is_same_v<IndexType, size_t>) {
336 95579 return svwhilelt_b8_u64(index, max_index);
337 } else if constexpr (std::is_same_v<IndexType, ptrdiff_t>) {
338 8668 return svwhilelt_b8_s64(index, max_index);
339 } else {
340 return svwhilelt_b8(index, max_index);
341 }
342 }
343
344 template <typename IndexType, typename T = ScalarType>
345 33843 static std::enable_if_t<sizeof(T) == sizeof(int16_t), svbool_t> svwhilelt(
346 IndexType index, IndexType max_index) KLEIDICV_STREAMING {
347 if constexpr (std::is_same_v<IndexType, size_t>) {
348 29333 return svwhilelt_b16_u64(index, max_index);
349 } else if constexpr (std::is_same_v<IndexType, ptrdiff_t>) {
350 4510 return svwhilelt_b16_s64(index, max_index);
351 } else {
352 return svwhilelt_b16(index, max_index);
353 }
354 }
355
356 template <typename IndexType, typename T = ScalarType>
357 17425 static std::enable_if_t<sizeof(T) == sizeof(int32_t), svbool_t> svwhilelt(
358 IndexType index, IndexType max_index) KLEIDICV_STREAMING {
359 if constexpr (std::is_same_v<IndexType, size_t>) {
360 17425 return svwhilelt_b32_u64(index, max_index);
361 } else if constexpr (std::is_same_v<IndexType, ptrdiff_t>) {
362 return svwhilelt_b32_s64(index, max_index);
363 } else {
364 return svwhilelt_b32(index, max_index);
365 }
366 }
367
368 template <typename IndexType, typename T = ScalarType>
369 2058 static std::enable_if_t<sizeof(T) == sizeof(int64_t), svbool_t> svwhilelt(
370 IndexType index, IndexType max_index) KLEIDICV_STREAMING {
371 if constexpr (std::is_same_v<IndexType, size_t>) {
372 2058 return svwhilelt_b64_u64(index, max_index);
373 } else if constexpr (std::is_same_v<IndexType, ptrdiff_t>) {
374 return svwhilelt_b64_s64(index, max_index);
375 } else {
376 return svwhilelt_b64(index, max_index);
377 }
378 }
379
380 // Transforms a single predicate into three other predicates that then can be
381 // used for consecutive operations. The input predicate can only have
382 // consecutive ones starting at the lowest element.
383 188 static void make_consecutive_predicates(svbool_t pg, svbool_t &pg_0,
384 svbool_t &pg_1,
385 svbool_t &pg_2) KLEIDICV_STREAMING {
386 // Length of data. Must be signed because of the unconditional subtraction
387 // of fixed values.
388 188 int64_t length = 3 * svcntp(pg);
389 // Handle up to VL length worth of data with the first predicated operation.
390 188 pg_0 = svwhilelt(int64_t{0}, length);
391 // Handle up to VL length worth of data with the second predicated operation
392 // taking into account data stored in the first predicated operation.
393 188 length -= num_lanes();
394 188 pg_1 = svwhilelt(int64_t{0}, length);
395 // Handle up to VL length worth of data with the third predicated operation
396 // taking into account data stored in the first and second predicated
397 // operations.
398 188 length -= num_lanes();
399 188 pg_2 = svwhilelt(int64_t{0}, length);
400 188 }
401
402 // Transforms a single predicate into four other predicates that then can be
403 // used for consecutive operations. The input predicate can only have
404 // consecutive ones starting at the lowest element.
405 520 static void make_consecutive_predicates(svbool_t pg, svbool_t &pg_0,
406 svbool_t &pg_1, svbool_t &pg_2,
407 svbool_t &pg_3) KLEIDICV_STREAMING {
408 // Length of data. Must be signed because of the unconditional subtraction
409 // of fixed values.
410 520 int64_t length = 4 * svcntp(pg);
411 // Handle up to VL length worth of data with the first predicated operation.
412 520 pg_0 = svwhilelt(int64_t{0}, length);
413 // Handle up to VL length worth of data with the second predicated operation
414 // taking into account data stored in the first predicated operation.
415 520 length -= num_lanes();
416 520 pg_1 = svwhilelt(int64_t{0}, length);
417 // Handle up to VL length worth of data with the third predicated operation
418 // taking into account data stored in the first and second predicated
419 // operations.
420 520 length -= num_lanes();
421 520 pg_2 = svwhilelt(int64_t{0}, length);
422 // Handle up to VL length worth of data with the fourth predicated operation
423 // taking into account data stored in the first, second and third predicated
424 // operations.
425 520 length -= num_lanes();
426 520 pg_3 = svwhilelt(int64_t{0}, length);
427 520 }
428 }; // end of class VecTraitsBase<ScalarType>
429
430 // Primary template for SVE vector traits.
431 template <typename ScalarType>
432 class VecTraits : public VecTraitsBase<ScalarType> {};
433
434 template <>
435 class VecTraits<int8_t> : public VecTraitsBase<int8_t> {
436 public:
437 888 static inline svint8_t svdup(int8_t v) KLEIDICV_STREAMING {
438 888 return svdup_s8(v);
439 }
440 2583 static inline svint8_t svreinterpret(svuint8_t v) KLEIDICV_STREAMING {
441 2583 return svreinterpret_s8(v);
442 }
443 1722 static inline svint8_t svasr_n(svbool_t pg, svint8_t v,
444 uint8_t s) KLEIDICV_STREAMING {
445 1722 return svasr_n_s8_x(pg, v, s);
446 }
447 }; // end of class VecTraits<int8_t>
448
449 template <>
450 class VecTraits<uint8_t> : public VecTraitsBase<uint8_t> {
451 public:
452 7832 static inline svuint8_t svdup(uint8_t v) KLEIDICV_STREAMING {
453 7832 return svdup_u8(v);
454 }
455 2583 static inline svuint8_t svreinterpret(svint8_t v) KLEIDICV_STREAMING {
456 2583 return svreinterpret_u8(v);
457 }
458 static inline svuint8_t svsub(svbool_t pg, svuint8_t v,
459 svuint8_t u) KLEIDICV_STREAMING {
460 return svsub_u8_x(pg, v, u);
461 }
462 static inline svuint8_t svhsub(svbool_t pg, svuint8_t v,
463 svuint8_t u) KLEIDICV_STREAMING {
464 return svhsub_u8_x(pg, v, u);
465 }
466 }; // end of class VecTraits<uint8_t>
467
468 template <>
469 class VecTraits<int16_t> : public VecTraitsBase<int16_t> {
470 public:
471 3436 static inline svint16_t svdup(int16_t v) KLEIDICV_STREAMING {
472 3436 return svdup_s16(v);
473 }
474 static inline svint16_t svreinterpret(svuint16_t v) KLEIDICV_STREAMING {
475 return svreinterpret_s16(v);
476 }
477 }; // end of class VecTraits<int16_t>
478
479 template <>
480 class VecTraits<uint16_t> : public VecTraitsBase<uint16_t> {
481 public:
482 2538 static inline svuint16_t svdup(uint16_t v) KLEIDICV_STREAMING {
483 2538 return svdup_u16(v);
484 }
485 static inline svuint16_t svreinterpret(svint16_t v) KLEIDICV_STREAMING {
486 return svreinterpret_u16(v);
487 }
488 }; // end of class VecTraits<uint16_t>
489
490 template <>
491 class VecTraits<int32_t> : public VecTraitsBase<int32_t> {
492 public:
493 1698 static inline svint32_t svdup(int32_t v) KLEIDICV_STREAMING {
494 1698 return svdup_s32(v);
495 }
496 static inline svint32_t svreinterpret(svuint32_t v) KLEIDICV_STREAMING {
497 return svreinterpret_s32(v);
498 }
499 }; // end of class VecTraits<int32_t>
500
501 template <>
502 class VecTraits<uint32_t> : public VecTraitsBase<uint32_t> {
503 public:
504 825 static inline svuint32_t svdup(uint32_t v) KLEIDICV_STREAMING {
505 825 return svdup_u32(v);
506 }
507 static inline svuint32_t svreinterpret(svint32_t v) KLEIDICV_STREAMING {
508 return svreinterpret_u32(v);
509 }
510 }; // end of class VecTraits<uint32_t>
511
512 template <>
513 class VecTraits<int64_t> : public VecTraitsBase<int64_t> {
514 public:
515 static inline svint64_t svdup(int64_t v) KLEIDICV_STREAMING {
516 return svdup_s64(v);
517 }
518 static inline svint64_t svreinterpret(svuint64_t v) KLEIDICV_STREAMING {
519 return svreinterpret_s64(v);
520 }
521 }; // end of class VecTraits<int64_t>
522
523 template <>
524 class VecTraits<uint64_t> : public VecTraitsBase<uint64_t> {
525 public:
526 static inline svuint64_t svdup(uint64_t v) KLEIDICV_STREAMING {
527 return svdup_u64(v);
528 }
529 static inline svuint64_t svreinterpret(svint64_t v) KLEIDICV_STREAMING {
530 return svreinterpret_u64(v);
531 }
532 }; // end of class VecTraits<uint64_t>
533
534 template <>
535 class VecTraits<float> : public VecTraitsBase<float> {
536 public:
537 900 static inline svfloat32_t svdup(float v) KLEIDICV_STREAMING {
538 900 return svdup_f32(v);
539 }
540 static inline svfloat32_t svsub(svbool_t pg, svfloat32_t v,
541 svfloat32_t u) KLEIDICV_STREAMING {
542 return svsub_f32_x(pg, v, u);
543 }
544 }; // end of class VecTraits<float>
545
546 template <>
547 class VecTraits<double> : public VecTraitsBase<double> {
548 public:
549 42 static inline svfloat64_t svdup(double v) KLEIDICV_STREAMING {
550 42 return svdup_f64(v);
551 }
552 }; // end of class VecTraits<double>
553
554 template <>
555 class VecTraits<float16_t> : public VecTraitsBase<float16_t> {
556 public:
557 static inline svfloat16_t svdup(float16_t v) KLEIDICV_STREAMING {
558 return svdup_f16(v);
559 }
560 }; // end of class VecTraits<float16_t>
561
562 // Adapter which adds context and forwards arguments.
563 template <typename OperationType>
564 class OperationContextAdapter : public OperationBase<OperationType> {
565 // Shorten rows: no need to write 'this->'.
566 using OperationBase<OperationType>::operation;
567 using OperationBase<OperationType>::num_lanes;
568
569 public:
570 using ContextType = Context;
571 using VecTraits = typename OperationBase<OperationType>::VecTraits;
572
573 22021 explicit OperationContextAdapter(OperationType &operation) KLEIDICV_STREAMING
574 22021 : OperationBase<OperationType>(operation) {}
575
576 // Forwards vector_path_2x() calls to the inner operation.
577 template <typename... ArgTypes>
578 53223 void vector_path_2x(ArgTypes &&...args) KLEIDICV_STREAMING {
579 53223 svbool_t ctx_pg;
580 53223 ContextType ctx{ctx_pg};
581 53223 ctx.set_predicate(VecTraits::svptrue());
582 53223 operation().vector_path_2x(ctx, std::forward<ArgTypes>(args)...);
583 53223 }
584
585 // Forwards vector_path() calls to the inner operation.
586 template <typename... ArgTypes>
587 5720 void vector_path(ArgTypes &&...args) KLEIDICV_STREAMING {
588 5720 svbool_t ctx_pg;
589 5720 ContextType ctx{ctx_pg};
590 5720 ctx.set_predicate(VecTraits::svptrue());
591 5720 operation().vector_path(ctx, std::forward<ArgTypes>(args)...);
592 5720 }
593
594 // Forwards remaining_path() calls to the inner operation if the concrete
595 // operation is unrolled once.
596 template <typename... ColumnTypes, typename T = OperationType>
597 1344 std::enable_if_t<T::is_unrolled_once()> remaining_path(
598 size_t length, ColumnTypes &&...columns) KLEIDICV_STREAMING {
599 1344 svbool_t ctx_pg;
600 1344 ContextType ctx{ctx_pg};
601 1344 ctx.set_predicate(VecTraits::svwhilelt(size_t{0}, length));
602 1344 operation().remaining_path(ctx, std::forward<ColumnTypes>(columns)...);
603 1344 }
604
605 // Forwards remaining_path() calls to the inner operation if the concrete
606 // operation is not unrolled once.
607 template <typename... ColumnTypes, typename T = OperationType>
608 23176 std::enable_if_t<!T::is_unrolled_once()> remaining_path(
609 size_t length, ColumnTypes... columns) KLEIDICV_STREAMING {
610 23176 svbool_t ctx_pg;
611 23176 ContextType ctx{ctx_pg};
612
613 23176 size_t index = 0;
614 23176 ctx.set_predicate(VecTraits::svwhilelt(index, length));
615
16/16
✓ Branch 0 taken 13104 times.
✓ Branch 1 taken 11459 times.
✓ Branch 2 taken 3218 times.
✓ Branch 3 taken 2794 times.
✓ Branch 4 taken 2968 times.
✓ Branch 5 taken 2664 times.
✓ Branch 6 taken 2588 times.
✓ Branch 7 taken 2309 times.
✓ Branch 8 taken 2028 times.
✓ Branch 9 taken 1729 times.
✓ Branch 10 taken 1475 times.
✓ Branch 11 taken 1213 times.
✓ Branch 12 taken 650 times.
✓ Branch 13 taken 624 times.
✓ Branch 14 taken 400 times.
✓ Branch 15 taken 384 times.
49607 while (svptest_first(VecTraits::svptrue(), ctx.predicate())) {
616 26431 operation().remaining_path(ctx, columns.at(index)...);
617 // Update loop counter and calculate the next governing predicate.
618 26431 index += num_lanes();
619 26431 ctx.set_predicate(VecTraits::svwhilelt(index, length));
620 }
621 23176 }
622 }; // end of class OperationContextAdapter<OperationType>
623
624 // Adapter which implements remaining_path() for general SVE2 operations.
625 template <typename OperationType>
626 class RemainingPathAdapter : public OperationBase<OperationType> {
627 public:
628 using ContextType = Context;
629
630 22021 explicit RemainingPathAdapter(OperationType &operation) KLEIDICV_STREAMING
631 22021 : OperationBase<OperationType>(operation) {}
632
633 // Forwards remaining_path() to either vector_path() or tail_path() of the
634 // inner operation depending on what is requested by the innermost operation.
635 template <typename... ArgTypes>
636 27775 void remaining_path(ArgTypes... args) KLEIDICV_STREAMING {
637 if constexpr (OperationType::uses_tail_path()) {
638 708 this->operation().tail_path(std::forward<ArgTypes>(args)...);
639 } else {
640 27067 this->operation().vector_path(std::forward<ArgTypes>(args)...);
641 }
642 27775 }
643 }; // end of class RemainingPathAdapter<OperationType>
644
645 // Shorthand for applying a generic unrolled SVE2 operation.
646 template <typename OperationType, typename... ArgTypes>
647 20805 void apply_operation_by_rows(OperationType &operation,
648 ArgTypes &&...args) KLEIDICV_STREAMING {
649 20805 ForwardingOperation forwarding_operation{operation};
650 20805 OperationAdapter operation_adapter{forwarding_operation};
651 20805 RemainingPathAdapter remaining_path_adapter{operation_adapter};
652 20805 OperationContextAdapter context_adapter{remaining_path_adapter};
653 20805 RowBasedOperation row_based_operation{context_adapter};
654 20805 zip_rows(row_based_operation, std::forward<ArgTypes>(args)...);
655 20805 }
656
657 // Swap two variables, since some C++ Standard Library implementations do not
658 // allow using std::swap for SVE vectors.
659 template <typename T>
660 9024 static inline void swap_scalable(T &a, T &b) KLEIDICV_STREAMING {
661 9024 T tmp = a;
662 9024 a = b;
663 9024 b = tmp;
664 9024 }
665
666 // The following wrapper is used as a workaround to treat SVE variables as a 2D
667 // array.
668 template <typename VectorType, size_t Rows, size_t Cols>
669 class ScalableVectorArray2D {
670 public:
671 std::reference_wrapper<VectorType> window[Rows][Cols];
672 138939872 VectorType &operator()(int row, int col) KLEIDICV_STREAMING {
673 138939872 return window[row][col].get();
674 }
675 };
676
677 template <typename VectorType, size_t element_size>
678 class ScalableVectorArray1D {
679 public:
680 std::reference_wrapper<VectorType> window[element_size];
681 7969600 VectorType &operator()(int index) KLEIDICV_STREAMING {
682 7969600 return window[index].get();
683 }
684 };
685
686 #if KLEIDICV_TARGET_SME2
687
688 // To provide svqcvt_[u8|s8|...] functionality for any scalable SIMD backend.
689 class SvqvctWrapper {
690 public:
691 150 explicit SvqvctWrapper(svuint8_t &) KLEIDICV_STREAMING {}
692
693 103 svuint8_t operator()(svuint32x4_t input) const KLEIDICV_STREAMING {
694 103 return svqcvt_u8(input);
695 }
696
697 103 svint8_t operator()(svint32x4_t input) const KLEIDICV_STREAMING {
698 103 return svqcvt_s8(input);
699 }
700 };
701
702 #else // KLEIDICV_TARGET_SME2
703
704 // To provide svqcvt_[u8|s8|...] functionality for any scalable SIMD backend.
705 class SvqvctWrapper {
706 public:
707 300 explicit SvqvctWrapper(svuint8_t &index) KLEIDICV_STREAMING : index_(index) {
708 // Index generation to reorder converted values by tbl instruction
709 300 auto index0 = svindex_u8(0, 4);
710 300 auto index1 = svindex_u8(1, 4);
711 300 auto index2 = svindex_u8(2, 4);
712 300 auto index3 = svindex_u8(3, 4);
713
714 300 svbool_t pg = svwhilelt_b8(uint64_t(0), svcntb() / 4);
715
716 300 index_ = svsplice(pg, index3, svdup_u8(0));
717 300 index_ = svsplice(pg, index2, index_);
718 300 index_ = svsplice(pg, index1, index_);
719 300 index_ = svsplice(pg, index0, index_);
720 300 }
721
722 template <typename T>
723 1130 auto operator()(T input) const KLEIDICV_STREAMING {
724 1130 auto half_width_res0 = svqxtnb(svget4(input, 0));
725 1130 half_width_res0 = svqxtnt(half_width_res0, svget4(input, 2));
726
727 1130 auto half_width_res1 = svqxtnb(svget4(input, 1));
728 1130 half_width_res1 = svqxtnt(half_width_res1, svget4(input, 3));
729
730 1130 auto quarter_width_res = svqxtnb(half_width_res0);
731 1130 quarter_width_res = svqxtnt(quarter_width_res, half_width_res1);
732
733 2260 return svtbl(quarter_width_res, index_);
734 1130 }
735
736 private:
737 // This mimics treating `index_` as a member variable, but scalable types
738 // cannot be declared that way, so the caller owns the storage.
739 svuint8_t &index_;
740 };
741
742 #endif // KLEIDICV_TARGET_SME2
743
744 } // namespace KLEIDICV_TARGET_NAMESPACE
745
746 #endif // KLEIDICV_SVE2_H
747