KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/sve2.h
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 186 186 100.0%
Functions: 1422 1434 99.2%
Branches: 16 16 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_SVE2_H
6 #define KLEIDICV_SVE2_H
7
8 #include <arm_sve.h>
9
10 #include <utility>
11
12 #include "kleidicv/operations.h"
13 #include "kleidicv/utils.h"
14
15 // This code is used by both SVE2 and SME; the actual namespace reflects the target.
16 namespace KLEIDICV_TARGET_NAMESPACE {
17
18 // Context associated with SVE operations.
19 class Context {
20 public:
21 86482 explicit Context(svbool_t &pg) KLEIDICV_STREAMING : pg_{pg} {}
22
23 // Sets the predicate associated with the context to a given predicate.
24 117318 void set_predicate(svbool_t pg) KLEIDICV_STREAMING { pg_ = pg; }
25
26 // Returns predicate associated with the context.
27 519805 svbool_t predicate() const KLEIDICV_STREAMING { return pg_; }
28
29 protected:
30 // Hold a reference to an svbool_t because a sizeless type cannot be a member.
31 svbool_t &pg_;
32 }; // end of class Context
33
34 // Primary template to describe logically grouped properties of vectors.
35 template <typename ScalarType>
36 class VectorTypes;
37
38 template <>
39 class VectorTypes<int8_t> {
40 public:
41 using ScalarType = int8_t;
42 using VectorType = svint8_t;
43 using Vector2Type = svint8x2_t;
44 using Vector3Type = svint8x3_t;
45 using Vector4Type = svint8x4_t;
46 }; // end of class VectorTypes<int8_t>
47
48 template <>
49 class VectorTypes<uint8_t> {
50 public:
51 using ScalarType = uint8_t;
52 using VectorType = svuint8_t;
53 using Vector2Type = svuint8x2_t;
54 using Vector3Type = svuint8x3_t;
55 using Vector4Type = svuint8x4_t;
56 }; // end of class VectorTypes<uint8_t>
57
58 template <>
59 class VectorTypes<int16_t> {
60 public:
61 using ScalarType = int16_t;
62 using VectorType = svint16_t;
63 using Vector2Type = svint16x2_t;
64 using Vector3Type = svint16x3_t;
65 using Vector4Type = svint16x4_t;
66 }; // end of class VectorTypes<int16_t>
67
68 template <>
69 class VectorTypes<uint16_t> {
70 public:
71 using ScalarType = uint16_t;
72 using VectorType = svuint16_t;
73 using Vector2Type = svuint16x2_t;
74 using Vector3Type = svuint16x3_t;
75 using Vector4Type = svuint16x4_t;
76 }; // end of class VectorTypes<uint16_t>
77
78 template <>
79 class VectorTypes<int32_t> {
80 public:
81 using ScalarType = int32_t;
82 using VectorType = svint32_t;
83 using Vector2Type = svint32x2_t;
84 using Vector3Type = svint32x3_t;
85 using Vector4Type = svint32x4_t;
86 }; // end of class VectorTypes<int32_t>
87
88 template <>
89 class VectorTypes<uint32_t> {
90 public:
91 using ScalarType = uint32_t;
92 using VectorType = svuint32_t;
93 using Vector2Type = svuint32x2_t;
94 using Vector3Type = svuint32x3_t;
95 using Vector4Type = svuint32x4_t;
96 }; // end of class VectorTypes<uint32_t>
97
98 template <>
99 class VectorTypes<int64_t> {
100 public:
101 using ScalarType = int64_t;
102 using VectorType = svint64_t;
103 using Vector2Type = svint64x2_t;
104 using Vector3Type = svint64x3_t;
105 using Vector4Type = svint64x4_t;
106 }; // end of class VectorTypes<int64_t>
107
108 template <>
109 class VectorTypes<uint64_t> {
110 public:
111 using ScalarType = uint64_t;
112 using VectorType = svuint64_t;
113 using Vector2Type = svuint64x2_t;
114 using Vector3Type = svuint64x3_t;
115 using Vector4Type = svuint64x4_t;
116 }; // end of class VectorTypes<uint64_t>
117
118 template <>
119 class VectorTypes<float> {
120 public:
121 using ScalarType = float;
122 using VectorType = svfloat32_t;
123 using Vector2Type = svfloat32x2_t;
124 using Vector3Type = svfloat32x3_t;
125 using Vector4Type = svfloat32x4_t;
126 }; // end of class VectorTypes<float>
127
128 template <>
129 class VectorTypes<double> {
130 public:
131 using ScalarType = double;
132 using VectorType = svfloat64_t;
133 using Vector2Type = svfloat64x2_t;
134 using Vector3Type = svfloat64x3_t;
135 using Vector4Type = svfloat64x4_t;
136 }; // end of class VectorTypes<double>
137
138 template <>
139 class VectorTypes<float16_t> {
140 public:
141 using ScalarType = float16_t;
142 using VectorType = svfloat16_t;
143 using Vector2Type = svfloat16x2_t;
144 using Vector3Type = svfloat16x3_t;
145 using Vector4Type = svfloat16x4_t;
146 }; // end of class VectorTypes<float16_t>
147
148 // Base class for all SVE vector traits.
149 template <typename ScalarType>
150 class VecTraitsBase : public VectorTypes<ScalarType> {
151 public:
152 using typename VectorTypes<ScalarType>::VectorType;
153 using typename VectorTypes<ScalarType>::Vector2Type;
154
155 // Number of lanes in a vector.
156 211305 static inline size_t num_lanes() KLEIDICV_STREAMING {
157 211305 return static_cast<size_t>(svcnt());
158 }
159
160 // Maximum number of lanes in a vector.
161 static constexpr size_t max_num_lanes() KLEIDICV_STREAMING {
162 return 256 / sizeof(ScalarType);
163 }
164
165 // Loads a single vector from 'src'.
166 36459 static inline void load(Context ctx, const ScalarType *src,
167 VectorType &vec) KLEIDICV_STREAMING {
168 36459 vec = svld1(ctx.predicate(), &src[0]);
169 36459 }
170
171 // Loads two consecutive vectors from 'src'.
172 82437 static inline void load_consecutive(Context ctx, const ScalarType *src,
173 VectorType &vec_0,
174 VectorType &vec_1) KLEIDICV_STREAMING {
175 #if KLEIDICV_TARGET_SME2
176 // Assuming that ctx contains a full predicate.
177 (void)ctx;
178 15917 svcount_t p_counter = svptrue_c();
179 15917 Vector2Type v = svld1_x2(p_counter, &src[0]);
180 15917 vec_0 = svget2(v, 0);
181 15917 vec_1 = svget2(v, 1);
182 #else
183 66520 vec_0 = svld1(ctx.predicate(), &src[0]);
184 66520 vec_1 = svld1_vnum(ctx.predicate(), &src[0], 1);
185 #endif
186 82437 }
187
188 // Stores a single vector to 'dst'.
189 20383 static inline void store(Context ctx, VectorType vec,
190 ScalarType *dst) KLEIDICV_STREAMING {
191 20383 svst1(ctx.predicate(), &dst[0], vec);
192 20383 }
193
194 // Stores two consecutive vectors to 'dst'.
195 44802 static inline void store_consecutive(Context ctx, VectorType vec_0,
196 VectorType vec_1,
197 ScalarType *dst) KLEIDICV_STREAMING {
198 #if KLEIDICV_TARGET_SME2
199 // Assuming that ctx contains a full predicate.
200 (void)ctx;
201 8406 svcount_t p_counter = svptrue_c();
202 8406 Vector2Type v = svcreate2(vec_0, vec_1);
203 8406 svst1(p_counter, &dst[0], v);
204 #else
205 36396 svst1(ctx.predicate(), &dst[0], vec_0);
206 36396 svst1_vnum(ctx.predicate(), &dst[0], 1, vec_1);
207 #endif
208 44802 }
209
210 template <typename T = ScalarType>
211 120899 static std::enable_if_t<sizeof(T) == sizeof(int8_t), uint64_t> svcnt()
212 KLEIDICV_STREAMING {
213 120899 return svcntb();
214 }
215
216 template <typename T = ScalarType>
217 53219 static std::enable_if_t<sizeof(T) == sizeof(int16_t), uint64_t> svcnt()
218 KLEIDICV_STREAMING {
219 53219 return svcnth();
220 }
221
222 template <typename T = ScalarType>
223 33875 static std::enable_if_t<sizeof(T) == sizeof(int32_t), uint64_t> svcnt()
224 KLEIDICV_STREAMING {
225 33875 return svcntw();
226 }
227
228 template <typename T = ScalarType>
229 3312 static std::enable_if_t<sizeof(T) == sizeof(int64_t), uint64_t> svcnt()
230 KLEIDICV_STREAMING {
231 3312 return svcntd();
232 }
233
234 template <typename T = ScalarType>
235 1044 static std::enable_if_t<sizeof(T) == sizeof(int8_t), uint64_t> svcntp(
236 svbool_t pg) KLEIDICV_STREAMING {
237 1044 return svcntp_b8(pg, pg);
238 }
239
240 template <typename T = ScalarType>
241 static std::enable_if_t<sizeof(T) == sizeof(int16_t), uint64_t> svcntp(
242 svbool_t pg) KLEIDICV_STREAMING {
243 return svcntp_b16(pg, pg);
244 }
245
246 template <typename T = ScalarType>
247 static std::enable_if_t<sizeof(T) == sizeof(int32_t), uint64_t> svcntp(
248 svbool_t pg) KLEIDICV_STREAMING {
249 return svcntp_b32(pg, pg);
250 }
251
252 template <typename T = ScalarType>
253 static std::enable_if_t<sizeof(T) == sizeof(int64_t), uint64_t> svcntp(
254 svbool_t pg) KLEIDICV_STREAMING {
255 return svcntp_b64(pg, pg);
256 }
257
258 template <typename T = ScalarType>
259 135431 static std::enable_if_t<sizeof(T) == sizeof(int8_t), svbool_t> svptrue()
260 KLEIDICV_STREAMING {
261 135431 return svptrue_b8();
262 }
263
264 template <typename T = ScalarType>
265 67783 static std::enable_if_t<sizeof(T) == sizeof(int16_t), svbool_t> svptrue()
266 KLEIDICV_STREAMING {
267 67783 return svptrue_b16();
268 }
269
270 template <typename T = ScalarType>
271 69093 static std::enable_if_t<sizeof(T) == sizeof(int32_t), svbool_t> svptrue()
272 KLEIDICV_STREAMING {
273 69093 return svptrue_b32();
274 }
275
276 template <typename T = ScalarType>
277 14124 static std::enable_if_t<sizeof(T) == sizeof(int64_t), svbool_t> svptrue()
278 KLEIDICV_STREAMING {
279 14124 return svptrue_b64();
280 }
281
282 #if KLEIDICV_TARGET_SME2
283 template <typename T = ScalarType>
284 4903 static std::enable_if_t<sizeof(T) == sizeof(int8_t), svcount_t> svptrue_c()
285 KLEIDICV_STREAMING {
286 4903 return svptrue_c8();
287 }
288
289 template <typename T = ScalarType>
290 4604 static std::enable_if_t<sizeof(T) == sizeof(int16_t), svcount_t> svptrue_c()
291 KLEIDICV_STREAMING {
292 4604 return svptrue_c16();
293 }
294
295 template <typename T = ScalarType>
296 10208 static std::enable_if_t<sizeof(T) == sizeof(int32_t), svcount_t> svptrue_c()
297 KLEIDICV_STREAMING {
298 10208 return svptrue_c32();
299 }
300
301 template <typename T = ScalarType>
302 6138 static std::enable_if_t<sizeof(T) == sizeof(int64_t), svcount_t> svptrue_c()
303 KLEIDICV_STREAMING {
304 6138 return svptrue_c64();
305 }
306 #endif // KLEIDICV_TARGET_SME2
307
308 template <enum svpattern pat, typename T = ScalarType>
309 73692 static std::enable_if_t<sizeof(T) == sizeof(int8_t), svbool_t> svptrue_pat()
310 KLEIDICV_STREAMING {
311 73692 return svptrue_pat_b8(pat);
312 }
313
314 template <enum svpattern pat, typename T = ScalarType>
315 88584 static std::enable_if_t<sizeof(T) == sizeof(int16_t), svbool_t> svptrue_pat()
316 KLEIDICV_STREAMING {
317 88584 return svptrue_pat_b16(pat);
318 }
319
320 template <enum svpattern pat, typename T = ScalarType>
321 103092 static std::enable_if_t<sizeof(T) == sizeof(int32_t), svbool_t> svptrue_pat()
322 KLEIDICV_STREAMING {
323 103092 return svptrue_pat_b32(pat);
324 }
325
326 template <enum svpattern pat, typename T = ScalarType>
327 static std::enable_if_t<sizeof(T) == sizeof(int64_t), svbool_t> svptrue_pat()
328 KLEIDICV_STREAMING {
329 return svptrue_pat_b64(pat);
330 }
331
332 template <typename IndexType, typename T = ScalarType>
333 109695 static std::enable_if_t<sizeof(T) == sizeof(int8_t), svbool_t> svwhilelt(
334 IndexType index, IndexType max_index) KLEIDICV_STREAMING {
335 if constexpr (std::is_same_v<IndexType, size_t>) {
336 99780 return svwhilelt_b8_u64(index, max_index);
337 } else if constexpr (std::is_same_v<IndexType, ptrdiff_t>) {
338 9915 return svwhilelt_b8_s64(index, max_index);
339 } else {
340 return svwhilelt_b8(index, max_index);
341 }
342 }
343
344 template <typename IndexType, typename T = ScalarType>
345 35599 static std::enable_if_t<sizeof(T) == sizeof(int16_t), svbool_t> svwhilelt(
346 IndexType index, IndexType max_index) KLEIDICV_STREAMING {
347 if constexpr (std::is_same_v<IndexType, size_t>) {
348 31089 return svwhilelt_b16_u64(index, max_index);
349 } else if constexpr (std::is_same_v<IndexType, ptrdiff_t>) {
350 4510 return svwhilelt_b16_s64(index, max_index);
351 } else {
352 return svwhilelt_b16(index, max_index);
353 }
354 }
355
356 template <typename IndexType, typename T = ScalarType>
357 20541 static std::enable_if_t<sizeof(T) == sizeof(int32_t), svbool_t> svwhilelt(
358 IndexType index, IndexType max_index) KLEIDICV_STREAMING {
359 if constexpr (std::is_same_v<IndexType, size_t>) {
360 20541 return svwhilelt_b32_u64(index, max_index);
361 } else if constexpr (std::is_same_v<IndexType, ptrdiff_t>) {
362 return svwhilelt_b32_s64(index, max_index);
363 } else {
364 return svwhilelt_b32(index, max_index);
365 }
366 }
367
368 template <typename IndexType, typename T = ScalarType>
369 2538 static std::enable_if_t<sizeof(T) == sizeof(int64_t), svbool_t> svwhilelt(
370 IndexType index, IndexType max_index) KLEIDICV_STREAMING {
371 if constexpr (std::is_same_v<IndexType, size_t>) {
372 2538 return svwhilelt_b64_u64(index, max_index);
373 } else if constexpr (std::is_same_v<IndexType, ptrdiff_t>) {
374 return svwhilelt_b64_s64(index, max_index);
375 } else {
376 return svwhilelt_b64(index, max_index);
377 }
378 }
379
380 // Transforms a single predicate into three other predicates that then can be
381 // used for consecutive operations. The input predicate can only have
382 // consecutive ones starting at the lowest element.
383 285 static void make_consecutive_predicates(svbool_t pg, svbool_t &pg_0,
384 svbool_t &pg_1,
385 svbool_t &pg_2) KLEIDICV_STREAMING {
386 // Length of data. Must be signed because of the unconditional subtraction
387 // of fixed values.
388 285 int64_t length = 3 * svcntp(pg);
389 // Handle up to VL length worth of data with the first predicated operation.
390 285 pg_0 = svwhilelt(int64_t{0}, length);
391 // Handle up to VL length worth of data with the second predicated operation
392 // taking into account data stored in the first predicated operation.
393 285 length -= num_lanes();
394 285 pg_1 = svwhilelt(int64_t{0}, length);
395 // Handle up to VL length worth of data with the third predicated operation
396 // taking into account data stored in the first and second predicated
397 // operations.
398 285 length -= num_lanes();
399 285 pg_2 = svwhilelt(int64_t{0}, length);
400 285 }
401
402 // Transforms a single predicate into four other predicates that then can be
403 // used for consecutive operations. The input predicate can only have
404 // consecutive ones starting at the lowest element.
405 759 static void make_consecutive_predicates(svbool_t pg, svbool_t &pg_0,
406 svbool_t &pg_1, svbool_t &pg_2,
407 svbool_t &pg_3) KLEIDICV_STREAMING {
408 // Length of data. Must be signed because of the unconditional subtraction
409 // of fixed values.
410 759 int64_t length = 4 * svcntp(pg);
411 // Handle up to VL length worth of data with the first predicated operation.
412 759 pg_0 = svwhilelt(int64_t{0}, length);
413 // Handle up to VL length worth of data with the second predicated operation
414 // taking into account data stored in the first predicated operation.
415 759 length -= num_lanes();
416 759 pg_1 = svwhilelt(int64_t{0}, length);
417 // Handle up to VL length worth of data with the third predicated operation
418 // taking into account data stored in the first and second predicated
419 // operations.
420 759 length -= num_lanes();
421 759 pg_2 = svwhilelt(int64_t{0}, length);
422 // Handle up to VL length worth of data with the fourth predicated operation
423 // taking into account data stored in the first, second and third predicated
424 // operations.
425 759 length -= num_lanes();
426 759 pg_3 = svwhilelt(int64_t{0}, length);
427 759 }
428 }; // end of class VecTraitsBase<ScalarType>
429
430 // Primary template for SVE vector traits.
431 template <typename ScalarType>
432 class VecTraits : public VecTraitsBase<ScalarType> {};
433
434 template <>
435 class VecTraits<int8_t> : public VecTraitsBase<int8_t> {
436 public:
437 972 static inline svint8_t svdup(int8_t v) KLEIDICV_STREAMING {
438 972 return svdup_s8(v);
439 }
440 2742 static inline svint8_t svreinterpret(svuint8_t v) KLEIDICV_STREAMING {
441 2742 return svreinterpret_s8(v);
442 }
443 1828 static inline svint8_t svasr_n(svbool_t pg, svint8_t v,
444 uint8_t s) KLEIDICV_STREAMING {
445 1828 return svasr_n_s8_x(pg, v, s);
446 }
447 }; // end of class VecTraits<int8_t>
448
449 template <>
450 class VecTraits<uint8_t> : public VecTraitsBase<uint8_t> {
451 public:
452 8721 static inline svuint8_t svdup(uint8_t v) KLEIDICV_STREAMING {
453 8721 return svdup_u8(v);
454 }
455 2742 static inline svuint8_t svreinterpret(svint8_t v) KLEIDICV_STREAMING {
456 2742 return svreinterpret_u8(v);
457 }
458 static inline svuint8_t svsub(svbool_t pg, svuint8_t v,
459 svuint8_t u) KLEIDICV_STREAMING {
460 return svsub_u8_x(pg, v, u);
461 }
462 static inline svuint8_t svhsub(svbool_t pg, svuint8_t v,
463 svuint8_t u) KLEIDICV_STREAMING {
464 return svhsub_u8_x(pg, v, u);
465 }
466 }; // end of class VecTraits<uint8_t>
467
468 template <>
469 class VecTraits<int16_t> : public VecTraitsBase<int16_t> {
470 public:
471 3710 static inline svint16_t svdup(int16_t v) KLEIDICV_STREAMING {
472 3710 return svdup_s16(v);
473 }
474 static inline svint16_t svreinterpret(svuint16_t v) KLEIDICV_STREAMING {
475 return svreinterpret_s16(v);
476 }
477 }; // end of class VecTraits<int16_t>
478
479 template <>
480 class VecTraits<uint16_t> : public VecTraitsBase<uint16_t> {
481 public:
482 2922 static inline svuint16_t svdup(uint16_t v) KLEIDICV_STREAMING {
483 2922 return svdup_u16(v);
484 }
485 static inline svuint16_t svreinterpret(svint16_t v) KLEIDICV_STREAMING {
486 return svreinterpret_u16(v);
487 }
488 }; // end of class VecTraits<uint16_t>
489
490 template <>
491 class VecTraits<int32_t> : public VecTraitsBase<int32_t> {
492 public:
493 1932 static inline svint32_t svdup(int32_t v) KLEIDICV_STREAMING {
494 1932 return svdup_s32(v);
495 }
496 static inline svint32_t svreinterpret(svuint32_t v) KLEIDICV_STREAMING {
497 return svreinterpret_s32(v);
498 }
499 }; // end of class VecTraits<int32_t>
500
501 template <>
502 class VecTraits<uint32_t> : public VecTraitsBase<uint32_t> {
503 public:
504 975 static inline svuint32_t svdup(uint32_t v) KLEIDICV_STREAMING {
505 975 return svdup_u32(v);
506 }
507 static inline svuint32_t svreinterpret(svint32_t v) KLEIDICV_STREAMING {
508 return svreinterpret_u32(v);
509 }
510 }; // end of class VecTraits<uint32_t>
511
512 template <>
513 class VecTraits<int64_t> : public VecTraitsBase<int64_t> {
514 public:
515 static inline svint64_t svdup(int64_t v) KLEIDICV_STREAMING {
516 return svdup_s64(v);
517 }
518 static inline svint64_t svreinterpret(svuint64_t v) KLEIDICV_STREAMING {
519 return svreinterpret_s64(v);
520 }
521 }; // end of class VecTraits<int64_t>
522
523 template <>
524 class VecTraits<uint64_t> : public VecTraitsBase<uint64_t> {
525 public:
526 static inline svuint64_t svdup(uint64_t v) KLEIDICV_STREAMING {
527 return svdup_u64(v);
528 }
529 static inline svuint64_t svreinterpret(svint64_t v) KLEIDICV_STREAMING {
530 return svreinterpret_u64(v);
531 }
532 }; // end of class VecTraits<uint64_t>
533
534 template <>
535 class VecTraits<float> : public VecTraitsBase<float> {
536 public:
537 1008 static inline svfloat32_t svdup(float v) KLEIDICV_STREAMING {
538 1008 return svdup_f32(v);
539 }
540 static inline svfloat32_t svsub(svbool_t pg, svfloat32_t v,
541 svfloat32_t u) KLEIDICV_STREAMING {
542 return svsub_f32_x(pg, v, u);
543 }
544 }; // end of class VecTraits<float>
545
546 template <>
547 class VecTraits<double> : public VecTraitsBase<double> {
548 public:
549 42 static inline svfloat64_t svdup(double v) KLEIDICV_STREAMING {
550 42 return svdup_f64(v);
551 }
552 }; // end of class VecTraits<double>
553
554 template <>
555 class VecTraits<float16_t> : public VecTraitsBase<float16_t> {
556 public:
557 static inline svfloat16_t svdup(float16_t v) KLEIDICV_STREAMING {
558 return svdup_f16(v);
559 }
560 }; // end of class VecTraits<float16_t>
561
562 // Adapter which adds context and forwards arguments.
563 template <typename OperationType>
564 class OperationContextAdapter : public OperationBase<OperationType> {
565 // Shorten rows: no need to write 'this->'.
566 using OperationBase<OperationType>::operation;
567 using OperationBase<OperationType>::num_lanes;
568
569 public:
570 using ContextType = Context;
571 using VecTraits = typename OperationBase<OperationType>::VecTraits;
572
573 25449 explicit OperationContextAdapter(OperationType &operation) KLEIDICV_STREAMING
574 25449 : OperationBase<OperationType>(operation) {}
575
576 // Forwards vector_path_2x() calls to the inner operation.
577 template <typename... ArgTypes>
578 52208 void vector_path_2x(ArgTypes &&...args) KLEIDICV_STREAMING {
579 52208 svbool_t ctx_pg;
580 52208 ContextType ctx{ctx_pg};
581 52208 ctx.set_predicate(VecTraits::svptrue());
582 52208 operation().vector_path_2x(ctx, std::forward<ArgTypes>(args)...);
583 52208 }
584
585 // Forwards vector_path() calls to the inner operation.
586 template <typename... ArgTypes>
587 5768 void vector_path(ArgTypes &&...args) KLEIDICV_STREAMING {
588 5768 svbool_t ctx_pg;
589 5768 ContextType ctx{ctx_pg};
590 5768 ctx.set_predicate(VecTraits::svptrue());
591 5768 operation().vector_path(ctx, std::forward<ArgTypes>(args)...);
592 5768 }
593
594 // Forwards remaining_path() calls to the inner operation if the concrete
595 // operation is unrolled once.
596 template <typename... ColumnTypes, typename T = OperationType>
597 1912 std::enable_if_t<T::is_unrolled_once()> remaining_path(
598 size_t length, ColumnTypes &&...columns) KLEIDICV_STREAMING {
599 1912 svbool_t ctx_pg;
600 1912 ContextType ctx{ctx_pg};
601 1912 ctx.set_predicate(VecTraits::svwhilelt(size_t{0}, length));
602 1912 operation().remaining_path(ctx, std::forward<ColumnTypes>(columns)...);
603 1912 }
604
605 // Forwards remaining_path() calls to the inner operation if the concrete
606 // operation is not unrolled once.
607 template <typename... ColumnTypes, typename T = OperationType>
608 26594 std::enable_if_t<!T::is_unrolled_once()> remaining_path(
609 size_t length, ColumnTypes... columns) KLEIDICV_STREAMING {
610 26594 svbool_t ctx_pg;
611 26594 ContextType ctx{ctx_pg};
612
613 26594 size_t index = 0;
614 26594 ctx.set_predicate(VecTraits::svwhilelt(index, length));
615
16/16
✓ Branch 0 taken 14456 times.
✓ Branch 1 taken 12462 times.
✓ Branch 2 taken 4104 times.
✓ Branch 3 taken 3461 times.
✓ Branch 4 taken 3519 times.
✓ Branch 5 taken 3129 times.
✓ Branch 6 taken 3042 times.
✓ Branch 7 taken 2708 times.
✓ Branch 8 taken 2572 times.
✓ Branch 9 taken 2123 times.
✓ Branch 10 taken 1841 times.
✓ Branch 11 taken 1475 times.
✓ Branch 12 taken 776 times.
✓ Branch 13 taken 738 times.
✓ Branch 14 taken 526 times.
✓ Branch 15 taken 498 times.
57430 while (svptest_first(VecTraits::svptrue(), ctx.predicate())) {
616 30836 operation().remaining_path(ctx, columns.at(index)...);
617 // Update loop counter and calculate the next governing predicate.
618 30836 index += num_lanes();
619 30836 ctx.set_predicate(VecTraits::svwhilelt(index, length));
620 }
621 26594 }
622 }; // end of class OperationContextAdapter<OperationType>
623
624 // Adapter which implements remaining_path() for general SVE2 operations.
625 template <typename OperationType>
626 class RemainingPathAdapter : public OperationBase<OperationType> {
627 public:
628 using ContextType = Context;
629
630 25449 explicit RemainingPathAdapter(OperationType &operation) KLEIDICV_STREAMING
631 25449 : OperationBase<OperationType>(operation) {}
632
633 // Forwards remaining_path() to either vector_path() or tail_path() of the
634 // inner operation depending on what is requested by the innermost operation.
635 template <typename... ArgTypes>
636 32748 void remaining_path(ArgTypes... args) KLEIDICV_STREAMING {
637 if constexpr (OperationType::uses_tail_path()) {
638 1044 this->operation().tail_path(std::forward<ArgTypes>(args)...);
639 } else {
640 31704 this->operation().vector_path(std::forward<ArgTypes>(args)...);
641 }
642 32748 }
643 }; // end of class RemainingPathAdapter<OperationType>
644
645 // Shorthand for applying a generic unrolled SVE2 operation.
646 template <typename OperationType, typename... ArgTypes>
647 23881 void apply_operation_by_rows(OperationType &operation,
648 ArgTypes &&...args) KLEIDICV_STREAMING {
649 23881 ForwardingOperation forwarding_operation{operation};
650 23881 OperationAdapter operation_adapter{forwarding_operation};
651 23881 RemainingPathAdapter remaining_path_adapter{operation_adapter};
652 23881 OperationContextAdapter context_adapter{remaining_path_adapter};
653 23881 RowBasedOperation row_based_operation{context_adapter};
654 23881 zip_rows(row_based_operation, std::forward<ArgTypes>(args)...);
655 23881 }
656
657 // Swap two variables, since some C++ Standard Library implementations do not
658 // allow using std::swap for SVE vectors.
659 template <typename T>
660 63104 static inline void swap_scalable(T &a, T &b) KLEIDICV_STREAMING {
661 63104 T tmp = a;
662 63104 a = b;
663 63104 b = tmp;
664 63104 }
665
666 // The following wrapper is used as a workaround to treat SVE variables as a 2D
667 // array.
668 template <typename VectorType, size_t Rows, size_t Cols>
669 class ScalableVectorArray2D {
670 public:
671 std::reference_wrapper<VectorType> window[Rows][Cols];
672 138944448 VectorType &operator()(int row, int col) KLEIDICV_STREAMING {
673 138944448 return window[row][col].get();
674 }
675 };
676
677 template <typename VectorType, size_t element_size>
678 class ScalableVectorArray1D {
679 public:
680 std::reference_wrapper<VectorType> window[element_size];
681 15807552 VectorType &operator()(int index) KLEIDICV_STREAMING {
682 15807552 return window[index].get();
683 }
684 };
685
686 #if KLEIDICV_TARGET_SME2
687
688 // To provide svqcvt_[u8|s8|...] functionality for any scalable SIMD backend.
689 class SvqvctWrapper {
690 public:
691 186 explicit SvqvctWrapper(svuint8_t &) KLEIDICV_STREAMING {}
692
693 98 svuint8_t operator()(svuint32x4_t input) const KLEIDICV_STREAMING {
694 98 return svqcvt_u8(input);
695 }
696
697 98 svint8_t operator()(svint32x4_t input) const KLEIDICV_STREAMING {
698 98 return svqcvt_s8(input);
699 }
700 };
701
702 #else // KLEIDICV_TARGET_SME2
703
704 // To provide svqcvt_[u8|s8|...] functionality for any scalable SIMD backend.
705 class SvqvctWrapper {
706 public:
707 372 explicit SvqvctWrapper(svuint8_t &index) KLEIDICV_STREAMING : index_(index) {
708 // Index generation to reorder converted values by tbl instruction
709 372 auto index0 = svindex_u8(0, 4);
710 372 auto index1 = svindex_u8(1, 4);
711 372 auto index2 = svindex_u8(2, 4);
712 372 auto index3 = svindex_u8(3, 4);
713
714 372 svbool_t pg = svwhilelt_b8(uint64_t(0), svcntb() / 4);
715
716 372 index_ = svsplice(pg, index3, svdup_u8(0));
717 372 index_ = svsplice(pg, index2, index_);
718 372 index_ = svsplice(pg, index1, index_);
719 372 index_ = svsplice(pg, index0, index_);
720 372 }
721
722 template <typename T>
723 1102 auto operator()(T input) const KLEIDICV_STREAMING {
724 1102 auto half_width_res0 = svqxtnb(svget4(input, 0));
725 1102 half_width_res0 = svqxtnt(half_width_res0, svget4(input, 2));
726
727 1102 auto half_width_res1 = svqxtnb(svget4(input, 1));
728 1102 half_width_res1 = svqxtnt(half_width_res1, svget4(input, 3));
729
730 1102 auto quarter_width_res = svqxtnb(half_width_res0);
731 1102 quarter_width_res = svqxtnt(quarter_width_res, half_width_res1);
732
733 2204 return svtbl(quarter_width_res, index_);
734 1102 }
735
736 private:
737 // This mimics treating `index_` as a member variable, but scalable types
738 // cannot be declared that way, so the caller owns the storage.
739 svuint8_t &index_;
740 };
741
742 #endif // KLEIDICV_TARGET_SME2
743
744 } // namespace KLEIDICV_TARGET_NAMESPACE
745
746 #endif // KLEIDICV_SVE2_H
747