KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/neon.h
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 158 158 100.0%
Functions: 622 622 100.0%
Branches: 24 24 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_NEON_H
6 #define KLEIDICV_NEON_H
7
8 #include <utility>
9
10 #include "kleidicv/neon_intrinsics.h"
11 #include "kleidicv/operations.h"
12 #include "kleidicv/utils.h"
13
14 namespace kleidicv::neon {
15
16 template <>
17 class half_element_width<uint16x8_t> {
18 public:
19 using type = uint8x16_t;
20 };
21
22 template <>
23 class half_element_width<uint32x4_t> {
24 public:
25 using type = uint16x8_t;
26 };
27
28 template <>
29 class half_element_width<uint64x2_t> {
30 public:
31 using type = uint32x4_t;
32 };
33
34 template <>
35 class double_element_width<uint8x16_t> {
36 public:
37 using type = uint16x8_t;
38 };
39
40 template <>
41 class double_element_width<uint16x8_t> {
42 public:
43 using type = uint32x4_t;
44 };
45
46 template <>
47 class double_element_width<uint32x4_t> {
48 public:
49 using type = uint64x2_t;
50 };
51
52 // Primary template to describe logically grouped properties of vectors.
53 template <typename ScalarType>
54 class VectorTypes;
55
56 template <>
57 class VectorTypes<int8_t> {
58 public:
59 using ScalarType = int8_t;
60 using VectorType = int8x16_t;
61 using Vector2Type = int8x16x2_t;
62 using Vector3Type = int8x16x3_t;
63 using Vector4Type = int8x16x4_t;
64 }; // end of class VectorTypes<int8_t>
65
66 template <>
67 class VectorTypes<uint8_t> {
68 public:
69 using ScalarType = uint8_t;
70 using VectorType = uint8x16_t;
71 using Vector2Type = uint8x16x2_t;
72 using Vector3Type = uint8x16x3_t;
73 using Vector4Type = uint8x16x4_t;
74 }; // end of class VectorTypes<uint8_t>
75
76 template <>
77 class VectorTypes<int16_t> {
78 public:
79 using ScalarType = int16_t;
80 using VectorType = int16x8_t;
81 using Vector2Type = int16x8x2_t;
82 using Vector3Type = int16x8x3_t;
83 using Vector4Type = int16x8x4_t;
84 }; // end of class VectorTypes<int16_t>
85
86 template <>
87 class VectorTypes<uint16_t> {
88 public:
89 using ScalarType = uint16_t;
90 using VectorType = uint16x8_t;
91 using Vector2Type = uint16x8x2_t;
92 using Vector3Type = uint16x8x3_t;
93 using Vector4Type = uint16x8x4_t;
94 }; // end of class VectorTypes<uint16_t>
95
96 template <>
97 class VectorTypes<int32_t> {
98 public:
99 using ScalarType = int32_t;
100 using VectorType = int32x4_t;
101 using Vector2Type = int32x4x2_t;
102 using Vector3Type = int32x4x3_t;
103 using Vector4Type = int32x4x4_t;
104 }; // end of class VectorTypes<int32_t>
105
106 template <>
107 class VectorTypes<uint32_t> {
108 public:
109 using ScalarType = uint32_t;
110 using VectorType = uint32x4_t;
111 using Vector2Type = uint32x4x2_t;
112 using Vector3Type = uint32x4x3_t;
113 using Vector4Type = uint32x4x4_t;
114 }; // end of class VectorTypes<uint32_t>
115
116 template <>
117 class VectorTypes<int64_t> {
118 public:
119 using ScalarType = int64_t;
120 using VectorType = int64x2_t;
121 using Vector2Type = int64x2x2_t;
122 using Vector3Type = int64x2x3_t;
123 using Vector4Type = int64x2x4_t;
124 }; // end of class VectorTypes<int64_t>
125
126 template <>
127 class VectorTypes<uint64_t> {
128 public:
129 using ScalarType = uint64_t;
130 using VectorType = uint64x2_t;
131 using Vector2Type = uint64x2x2_t;
132 using Vector3Type = uint64x2x3_t;
133 using Vector4Type = uint64x2x4_t;
134 }; // end of class VectorTypes<uint64_t>
135
136 template <>
137 class VectorTypes<float> {
138 public:
139 using ScalarType = float;
140 using VectorType = float32x4_t;
141 using Vector2Type = float32x4x2_t;
142 using Vector3Type = float32x4x3_t;
143 using Vector4Type = float32x4x4_t;
144 }; // end of class VectorTypes<float>
145
146 template <>
147 class VectorTypes<double> {
148 public:
149 using ScalarType = double;
150 using VectorType = float64x2_t;
151 using Vector2Type = float64x2x2_t;
152 using Vector3Type = float64x2x3_t;
153 using Vector4Type = float64x2x4_t;
154 }; // end of class VectorTypes<double>
155
156 template <>
157 class VectorTypes<float16_t> {
158 public:
159 using ScalarType = float16_t;
160 using VectorType = float16x8_t;
161 using Vector2Type = float16x8x2_t;
162 using Vector3Type = float16x8x3_t;
163 using Vector4Type = float16x8x4_t;
164 }; // end of class VectorTypes<float16_t>
165
166 // NEON vector length in bytes.
167 static constexpr size_t kVectorLength = 16;
168
169 // Base class for all NEON vector traits.
170 template <typename ScalarType>
171 class VecTraitsBase : public VectorTypes<ScalarType> {
172 public:
173 using typename VectorTypes<ScalarType>::VectorType;
174 using typename VectorTypes<ScalarType>::Vector2Type;
175 using typename VectorTypes<ScalarType>::Vector3Type;
176 using typename VectorTypes<ScalarType>::Vector4Type;
177
178 // Number of lanes in a vector.
179 621408 static constexpr size_t num_lanes() {
180 621408 return kVectorLength / sizeof(ScalarType);
181 }
182
183 // Maximum number of lanes in a vector.
184 static constexpr size_t max_num_lanes() { return num_lanes(); }
185
186 #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS
187
188 private:
189 static inline int8x16x2_t vld1q_x2(const int8_t *src) {
190 return vld1q_s8_x2(src);
191 }
192
193 3892 static inline uint8x16x2_t vld1q_x2(const uint8_t *src) {
194 3892 return vld1q_u8_x2(src);
195 }
196
197 4728 static inline int16x8x2_t vld1q_x2(const int16_t *src) {
198 4728 return vld1q_s16_x2(src);
199 }
200
201 568 static inline uint16x8x2_t vld1q_x2(const uint16_t *src) {
202 568 return vld1q_u16_x2(src);
203 }
204
205 static inline int32x4x2_t vld1q_x2(const int32_t *src) {
206 return vld1q_s32_x2(src);
207 }
208
209 400 static inline uint32x4x2_t vld1q_x2(const uint32_t *src) {
210 400 return vld1q_u32_x2(src);
211 }
212
213 static inline int64x2x2_t vld1q_x2(const int64_t *src) {
214 return vld1q_s64_x2(src);
215 }
216
217 400 static inline uint64x2x2_t vld1q_x2(const uint64_t *src) {
218 400 return vld1q_u64_x2(src);
219 }
220
221 static inline float32x4x2_t vld1q_x2(const float32_t *src) {
222 return vld1q_f32_x2(src);
223 }
224
225 static inline int8x16x3_t vld1q_x3(const int8_t *src) {
226 return vld1q_s8_x3(src);
227 }
228
229 2103 static inline uint8x16x3_t vld1q_x3(const uint8_t *src) {
230 2103 return vld1q_u8_x3(src);
231 }
232
233 static inline int16x8x3_t vld1q_x3(const int16_t *src) {
234 return vld1q_s16_x3(src);
235 }
236
237 720 static inline uint16x8x3_t vld1q_x3(const uint16_t *src) {
238 720 return vld1q_u16_x3(src);
239 }
240
241 static inline int32x4x3_t vld1q_x3(const int32_t *src) {
242 return vld1q_s32_x3(src);
243 }
244
245 720 static inline uint32x4x3_t vld1q_x3(const uint32_t *src) {
246 720 return vld1q_u32_x3(src);
247 }
248
249 static inline int64x2x3_t vld1q_x3(const int64_t *src) {
250 return vld1q_s64_x3(src);
251 }
252
253 720 static inline uint64x2x3_t vld1q_x3(const uint64_t *src) {
254 720 return vld1q_u64_x3(src);
255 }
256
257 static inline float32x4x3_t vld1q_x3(const float32_t *src) {
258 return vld1q_f32_x3(src);
259 }
260
261 884 static inline int8x16x4_t vld1q_x4(const int8_t *src) {
262 884 return vld1q_s8_x4(src);
263 }
264
265 2207 static inline uint8x16x4_t vld1q_x4(const uint8_t *src) {
266 2207 return vld1q_u8_x4(src);
267 }
268
269 static inline int16x8x4_t vld1q_x4(const int16_t *src) {
270 return vld1q_s16_x4(src);
271 }
272
273 720 static inline uint16x8x4_t vld1q_x4(const uint16_t *src) {
274 720 return vld1q_u16_x4(src);
275 }
276
277 static inline int32x4x4_t vld1q_x4(const int32_t *src) {
278 return vld1q_s32_x4(src);
279 }
280
281 720 static inline uint32x4x4_t vld1q_x4(const uint32_t *src) {
282 720 return vld1q_u32_x4(src);
283 }
284
285 static inline int64x2x4_t vld1q_x4(const int64_t *src) {
286 return vld1q_s64_x4(src);
287 }
288
289 720 static inline uint64x2x4_t vld1q_x4(const uint64_t *src) {
290 720 return vld1q_u64_x4(src);
291 }
292
293 457 static inline float32x4x4_t vld1q_x4(const float32_t *src) {
294 457 return vld1q_f32_x4(src);
295 }
296
297 static inline void vst1q_x2(int8_t *dst, int8x16x2_t vec) {
298 vst1q_s8_x2(dst, vec);
299 }
300
301 13871 static inline void vst1q_x2(uint8_t *dst, uint8x16x2_t vec) {
302 13871 vst1q_u8_x2(dst, vec);
303 13871 }
304
305 static inline void vst1q_x2(int16_t *dst, int16x8x2_t vec) {
306 vst1q_s16_x2(dst, vec);
307 }
308
309 3320 static inline void vst1q_x2(uint16_t *dst, uint16x8x2_t vec) {
310 3320 vst1q_u16_x2(dst, vec);
311 3320 }
312
313 static inline void vst1q_x2(int32_t *dst, int32x4x2_t vec) {
314 vst1q_s32_x2(dst, vec);
315 }
316
317 3320 static inline void vst1q_x2(uint32_t *dst, uint32x4x2_t vec) {
318 3320 vst1q_u32_x2(dst, vec);
319 3320 }
320
321 static inline void vst1q_x2(int64_t *dst, int64x2x2_t vec) {
322 vst1q_s64_x2(dst, vec);
323 }
324
325 3320 static inline void vst1q_x2(uint64_t *dst, uint64x2x2_t vec) {
326 3320 vst1q_u64_x2(dst, vec);
327 3320 }
328
329 73024 static inline void vst1q_x2(float32_t *dst, float32x4x2_t vec) {
330 73024 vst1q_f32_x2(dst, vec);
331 73024 }
332
333 static inline void vst1q_x2(float16_t *dst, float16x8x2_t vec) {
334 vst1q_f16_x2(dst, vec);
335 }
336
337 static inline void vst1q_x3(int8_t *dst, int8x16x3_t vec) {
338 vst1q_s8_x3(dst, vec);
339 }
340
341 1628 static inline void vst1q_x3(uint8_t *dst, uint8x16x3_t vec) {
342 1628 vst1q_u8_x3(dst, vec);
343 1628 }
344
345 static inline void vst1q_x3(int16_t *dst, int16x8x3_t vec) {
346 vst1q_s16_x3(dst, vec);
347 }
348
349 720 static inline void vst1q_x3(uint16_t *dst, uint16x8x3_t vec) {
350 720 vst1q_u16_x3(dst, vec);
351 720 }
352
353 static inline void vst1q_x3(int32_t *dst, int32x4x3_t vec) {
354 vst1q_s32_x3(dst, vec);
355 }
356
357 static inline void vst1q_x3(uint32_t *dst, uint32x4x3_t vec) {
358 vst1q_u32_x3(dst, vec);
359 }
360
361 static inline void vst1q_x3(int64_t *dst, int64x2x3_t vec) {
362 vst1q_s64_x3(dst, vec);
363 }
364
365 720 static inline void vst1q_x3(uint64_t *dst, uint64x2x3_t vec) {
366 720 vst1q_u64_x3(dst, vec);
367 720 }
368
369 static inline void vst1q_x3(float32_t *dst, float32x4x3_t vec) {
370 vst1q_f32_x3(dst, vec);
371 }
372
373 static inline void vst1q_x3(float16_t *dst, float16x8x3_t vec) {
374 vst1q_f16_x3(dst, vec);
375 }
376
377 static inline void vst1q_x4(int8_t *dst, int8x16x4_t vec) {
378 vst1q_s8_x4(dst, vec);
379 }
380
381 908 static inline void vst1q_x4(uint8_t *dst, uint8x16x4_t vec) {
382 908 vst1q_u8_x4(dst, vec);
383 908 }
384
385 static inline void vst1q_x4(int16_t *dst, int16x8x4_t vec) {
386 vst1q_s16_x4(dst, vec);
387 }
388
389 11304 static inline void vst1q_x4(uint16_t *dst, uint16x8x4_t vec) {
390 11304 vst1q_u16_x4(dst, vec);
391 11304 }
392
393 static inline void vst1q_x4(int32_t *dst, int32x4x4_t vec) {
394 vst1q_s32_x4(dst, vec);
395 }
396
397 1360 static inline void vst1q_x4(uint32_t *dst, uint32x4x4_t vec) {
398 1360 vst1q_u32_x4(dst, vec);
399 1360 }
400
401 static inline void vst1q_x4(int64_t *dst, int64x2x4_t vec) {
402 vst1q_s64_x4(dst, vec);
403 }
404
405 2720 static inline void vst1q_x4(uint64_t *dst, uint64x2x4_t vec) {
406 2720 vst1q_u64_x4(dst, vec);
407 2720 }
408
409 312 static inline void vst1q_x4(float32_t *dst, float32x4x4_t vec) {
410 312 vst1q_f32_x4(dst, vec);
411 312 }
412
413 480 static inline void vst1q_x4(float16_t *dst, float16x8x4_t vec) {
414 480 vst1q_f16_x4(dst, vec);
415 480 }
416
417 public:
418 #endif
419
420 // Loads a single vector from 'src'.
421 4264 static inline void load(const ScalarType *src, VectorType &vec) {
422 4264 vec = vld1q(&src[0]);
423 4264 }
424
425 // Loads two consecutive vectors from 'src'.
426 8388 static inline void load(const ScalarType *src, Vector2Type &vec) {
427 #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS
428 8388 vec = vld1q_x2(&src[0]);
429 #else
430 vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes())};
431 #endif
432 8388 }
433
434 // Loads three consecutive vectors from 'src'.
435 1383 static inline void load(const ScalarType *src, Vector3Type &vec) {
436 #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS
437 1383 vec = vld1q_x3(&src[0]);
438 #else
439 vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()),
440 vld1q(&src[0] + (2 * num_lanes()))};
441 #endif
442 1383 }
443
444 // Loads four consecutive vectors from 'src'.
445 2828 static inline void load(const ScalarType *src, Vector4Type &vec) {
446 #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS
447 2828 vec = vld1q_x4(&src[0]);
448 #else
449 vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()),
450 vld1q(&src[0] + (2 * num_lanes())),
451 vld1q(&src[0] + (3 * num_lanes()))};
452 #endif
453 2828 }
454
455 // Loads two consecutive vectors from 'src'.
456 490235 static inline void load_consecutive(const ScalarType *src, VectorType &vec_0,
457 VectorType &vec_1) {
458 490235 vec_0 = vld1q(&src[0]);
459 490235 vec_1 = vld1q(&src[num_lanes()]);
460 490235 }
461
462 // Loads 2x2 consecutive vectors from 'src'.
463 800 static inline void load_consecutive(const ScalarType *src, Vector2Type &vec_0,
464 Vector2Type &vec_1) {
465 #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS
466 800 vec_0 = vld1q_x2(&src[0]);
467 800 vec_1 = vld1q_x2(&src[num_lanes() * 2]);
468 #else
469 vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes())};
470 vec_1 = {vld1q(&src[num_lanes() * 2]),
471 vld1q(&src[num_lanes() * 2] + num_lanes())};
472 #endif
473 800 }
474
475 // Loads 2x3 consecutive vectors from 'src'.
476 1440 static inline void load_consecutive(const ScalarType *src, Vector3Type &vec_0,
477 Vector3Type &vec_1) {
478 #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS
479 1440 vec_0 = vld1q_x3(&src[0]);
480 1440 vec_1 = vld1q_x3(&src[num_lanes() * 3]);
481 #else
482 vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()),
483 vld1q(&src[0] + (2 * num_lanes()))};
484 vec_1 = {vld1q(&src[num_lanes() * 3]),
485 vld1q(&src[num_lanes() * 3] + num_lanes()),
486 vld1q(&src[num_lanes() * 3] + (2 * num_lanes()))};
487 #endif
488 1440 }
489
490 // Loads 2x4 consecutive vectors from 'src'.
491 1440 static inline void load_consecutive(const ScalarType *src, Vector4Type &vec_0,
492 Vector4Type &vec_1) {
493 #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS
494 1440 vec_0 = vld1q_x4(&src[0]);
495 1440 vec_1 = vld1q_x4(&src[num_lanes() * 4]);
496
497 #else
498 vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()),
499 vld1q(&src[0] + (2 * num_lanes())),
500 vld1q(&src[0] + (3 * num_lanes()))};
501 vec_1 = {vld1q(&src[num_lanes() * 4]),
502 vld1q(&src[num_lanes() * 4] + num_lanes()),
503 vld1q(&src[num_lanes() * 4] + (2 * num_lanes())),
504 vld1q(&src[num_lanes() * 4] + (3 * num_lanes()))};
505 #endif
506 1440 }
507
508 // Stores a single vector to 'dst'.
509 12166 static inline void store(VectorType vec, ScalarType *dst) {
510 12166 vst1q(&dst[0], vec);
511 12166 }
512
513 // Stores two consecutive vectors to 'dst'.
514 96855 static inline void store(Vector2Type vec, ScalarType *dst) {
515 #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS
516 96855 vst1q_x2(&dst[0], vec);
517 #else
518 vst1q(&dst[0], vec.val[0]);
519 vst1q(&dst[0] + num_lanes(), vec.val[1]);
520 #endif
521 96855 }
522
523 // Stores three consecutive vectors to 'dst'.
524 3068 static inline void store(Vector3Type vec, ScalarType *dst) {
525 #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS
526 3068 vst1q_x3(&dst[0], vec);
527 #else
528 vst1q(&dst[0], vec.val[0]);
529 vst1q(&dst[0] + num_lanes(), vec.val[1]);
530 vst1q(&dst[0] + (2 * num_lanes()), vec.val[2]);
531 #endif
532 3068 }
533
534 // Stores four consecutive vectors to 'dst'.
535 17084 static inline void store(Vector4Type vec, ScalarType *dst) {
536 #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS
537 17084 vst1q_x4(&dst[0], vec);
538 #else
539 vst1q(&dst[0], vec.val[0]);
540 vst1q(&dst[0] + num_lanes(), vec.val[1]);
541 vst1q(&dst[0] + (2 * num_lanes()), vec.val[2]);
542 vst1q(&dst[0] + (3 * num_lanes()), vec.val[3]);
543 #endif
544 17084 }
545
546 // Stores two consecutive vectors to 'dst'.
547 28627 static inline void store_consecutive(VectorType vec_0, VectorType vec_1,
548 ScalarType *dst) {
549 28627 vst1q(&dst[0], vec_0);
550 28627 vst1q(&dst[num_lanes()], vec_1);
551 28627 }
552 }; // end of class VecTraitsBase<ScalarType>
553
554 // Available NEON vector traits.
555 template <typename ScalarType>
556 class VecTraits : public VecTraitsBase<ScalarType> {};
557
558 // NEON has no associated context yet.
559 using NeonContextType = Monostate;
560
561 // Adapter which simply adds context and forwards all arguments.
562 template <typename OperationType>
563 class OperationContextAdapter : public OperationBase<OperationType> {
564 // Shorten lines: no need to write 'this->'.
565 using OperationBase<OperationType>::operation;
566
567 public:
568 using ContextType = NeonContextType;
569
570 7255 explicit OperationContextAdapter(OperationType &operation)
571 7255 : OperationBase<OperationType>(operation) {}
572
573 // Forwards vector_path_2x() calls to the inner operation.
574 template <typename... ArgTypes>
575 462471 void vector_path_2x(ArgTypes &&...args) {
576 462471 operation().vector_path_2x(ContextType{}, std::forward<ArgTypes>(args)...);
577 462471 }
578
579 // Forwards vector_path() calls to the inner operation.
580 template <typename... ArgTypes>
581 7762 void vector_path(ArgTypes &&...args) {
582 7762 operation().vector_path(ContextType{}, std::forward<ArgTypes>(args)...);
583 7762 }
584
585 // Forwards remaining_path() calls to the inner operation.
586 template <typename... ArgTypes>
587 8596 void remaining_path(ArgTypes &&...args) {
588 8596 operation().remaining_path(ContextType{}, std::forward<ArgTypes>(args)...);
589 8596 }
590 }; // end of class OperationContextAdapter<OperationType>
591
592 // Adapter which implements remaining_path() for general NEON operations.
593 template <typename OperationType>
594 class RemainingPathAdapter : public OperationBase<OperationType> {
595 public:
596 using ContextType = NeonContextType;
597
598 6951 explicit RemainingPathAdapter(OperationType &operation)
599 6951 : OperationBase<OperationType>(operation) {}
600
601 // Forwards remaining_path() calls to scalar_path() of the inner operation
602 // element by element.
603 template <typename... ColumnTypes>
604 7940 void remaining_path(ContextType ctx, size_t length, ColumnTypes... columns) {
605
24/24
✓ Branch 0 taken 2068 times.
✓ Branch 1 taken 33794 times.
✓ Branch 2 taken 990 times.
✓ Branch 3 taken 11388 times.
✓ Branch 4 taken 957 times.
✓ Branch 5 taken 4668 times.
✓ Branch 6 taken 664 times.
✓ Branch 7 taken 3983 times.
✓ Branch 8 taken 764 times.
✓ Branch 9 taken 2353 times.
✓ Branch 10 taken 705 times.
✓ Branch 11 taken 1596 times.
✓ Branch 12 taken 312 times.
✓ Branch 13 taken 356 times.
✓ Branch 14 taken 360 times.
✓ Branch 15 taken 400 times.
✓ Branch 16 taken 376 times.
✓ Branch 17 taken 384 times.
✓ Branch 18 taken 120 times.
✓ Branch 19 taken 128 times.
✓ Branch 20 taken 248 times.
✓ Branch 21 taken 256 times.
✓ Branch 22 taken 376 times.
✓ Branch 23 taken 384 times.
67630 for (size_t index = 0; index < length; ++index) {
606 59690 disable_loop_vectorization();
607 59690 this->operation().scalar_path(ctx, columns.at(index)...);
608 59690 }
609 7940 }
610 }; // end of class RemainingPathAdapter<OperationType>
611
612 // Adapter which implements remaining_path() for NEON operations which
613 // implement custom processing of remaining elements.
614 template <typename OperationType>
615 class RemainingPathToScalarPathAdapter : public OperationBase<OperationType> {
616 public:
617 using ContextType = NeonContextType;
618
619 304 explicit RemainingPathToScalarPathAdapter(OperationType &operation)
620 304 : OperationBase<OperationType>(operation) {}
621
622 // Forwards remaining_path() calls to scalar_path() of the inner operation.
623 template <typename... ArgTypes>
624 656 void remaining_path(ArgTypes &&...args) {
625 656 this->operation().scalar_path(std::forward<ArgTypes>(args)...);
626 656 }
627 }; // end of class RemainingPathToScalarPathAdapter<OperationType>
628
629 // Shorthand for applying a generic unrolled NEON operation.
630 template <typename OperationType, typename... ArgTypes>
631 6327 void apply_operation_by_rows(OperationType &operation, ArgTypes &&...args) {
632 6327 RemoveContextAdapter remove_context_adapter{operation};
633 6327 OperationAdapter operation_adapter{remove_context_adapter};
634 6327 RemainingPathAdapter remaining_path_adapter{operation_adapter};
635 6327 OperationContextAdapter context_adapter{remaining_path_adapter};
636 6327 RowBasedOperation row_based_operation{context_adapter};
637 6327 zip_rows(row_based_operation, std::forward<ArgTypes>(args)...);
638 6327 }
639
640 // Shorthand for applying a generic unrolled and block-based NEON operation.
641 template <typename OperationType, typename... ArgTypes>
642 624 void apply_block_operation_by_rows(OperationType &operation,
643 ArgTypes &&...args) {
644 624 RemoveContextAdapter remove_context_adapter{operation};
645 624 OperationAdapter operation_adapter{remove_context_adapter};
646 624 RemainingPathAdapter remaining_path_adapter{operation_adapter};
647 624 OperationContextAdapter context_adapter{remaining_path_adapter};
648 624 RowBasedBlockOperation block_operation{context_adapter};
649 624 zip_rows(block_operation, std::forward<ArgTypes>(args)...);
650 624 }
651
652 } // namespace kleidicv::neon
653
654 #endif // KLEIDICV_NEON_H
655