| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_NEON_H | ||
| 6 | #define KLEIDICV_NEON_H | ||
| 7 | |||
| 8 | #include <utility> | ||
| 9 | |||
| 10 | #include "kleidicv/neon_intrinsics.h" | ||
| 11 | #include "kleidicv/operations.h" | ||
| 12 | #include "kleidicv/utils.h" | ||
| 13 | |||
| 14 | namespace kleidicv::neon { | ||
| 15 | |||
| 16 | template <> | ||
| 17 | class half_element_width<uint16x8_t> { | ||
| 18 | public: | ||
| 19 | using type = uint8x16_t; | ||
| 20 | }; | ||
| 21 | |||
| 22 | template <> | ||
| 23 | class half_element_width<uint32x4_t> { | ||
| 24 | public: | ||
| 25 | using type = uint16x8_t; | ||
| 26 | }; | ||
| 27 | |||
| 28 | template <> | ||
| 29 | class half_element_width<uint64x2_t> { | ||
| 30 | public: | ||
| 31 | using type = uint32x4_t; | ||
| 32 | }; | ||
| 33 | |||
| 34 | template <> | ||
| 35 | class double_element_width<uint8x16_t> { | ||
| 36 | public: | ||
| 37 | using type = uint16x8_t; | ||
| 38 | }; | ||
| 39 | |||
| 40 | template <> | ||
| 41 | class double_element_width<uint16x8_t> { | ||
| 42 | public: | ||
| 43 | using type = uint32x4_t; | ||
| 44 | }; | ||
| 45 | |||
| 46 | template <> | ||
| 47 | class double_element_width<uint32x4_t> { | ||
| 48 | public: | ||
| 49 | using type = uint64x2_t; | ||
| 50 | }; | ||
| 51 | |||
| 52 | // Primary template to describe logically grouped properties of vectors. | ||
| 53 | template <typename ScalarType> | ||
| 54 | class VectorTypes; | ||
| 55 | |||
| 56 | template <> | ||
| 57 | class VectorTypes<int8_t> { | ||
| 58 | public: | ||
| 59 | using ScalarType = int8_t; | ||
| 60 | using VectorType = int8x16_t; | ||
| 61 | using Vector2Type = int8x16x2_t; | ||
| 62 | using Vector3Type = int8x16x3_t; | ||
| 63 | using Vector4Type = int8x16x4_t; | ||
| 64 | }; // end of class VectorTypes<int8_t> | ||
| 65 | |||
| 66 | template <> | ||
| 67 | class VectorTypes<uint8_t> { | ||
| 68 | public: | ||
| 69 | using ScalarType = uint8_t; | ||
| 70 | using VectorType = uint8x16_t; | ||
| 71 | using Vector2Type = uint8x16x2_t; | ||
| 72 | using Vector3Type = uint8x16x3_t; | ||
| 73 | using Vector4Type = uint8x16x4_t; | ||
| 74 | }; // end of class VectorTypes<uint8_t> | ||
| 75 | |||
| 76 | template <> | ||
| 77 | class VectorTypes<int16_t> { | ||
| 78 | public: | ||
| 79 | using ScalarType = int16_t; | ||
| 80 | using VectorType = int16x8_t; | ||
| 81 | using Vector2Type = int16x8x2_t; | ||
| 82 | using Vector3Type = int16x8x3_t; | ||
| 83 | using Vector4Type = int16x8x4_t; | ||
| 84 | }; // end of class VectorTypes<int16_t> | ||
| 85 | |||
| 86 | template <> | ||
| 87 | class VectorTypes<uint16_t> { | ||
| 88 | public: | ||
| 89 | using ScalarType = uint16_t; | ||
| 90 | using VectorType = uint16x8_t; | ||
| 91 | using Vector2Type = uint16x8x2_t; | ||
| 92 | using Vector3Type = uint16x8x3_t; | ||
| 93 | using Vector4Type = uint16x8x4_t; | ||
| 94 | }; // end of class VectorTypes<uint16_t> | ||
| 95 | |||
| 96 | template <> | ||
| 97 | class VectorTypes<int32_t> { | ||
| 98 | public: | ||
| 99 | using ScalarType = int32_t; | ||
| 100 | using VectorType = int32x4_t; | ||
| 101 | using Vector2Type = int32x4x2_t; | ||
| 102 | using Vector3Type = int32x4x3_t; | ||
| 103 | using Vector4Type = int32x4x4_t; | ||
| 104 | }; // end of class VectorTypes<int32_t> | ||
| 105 | |||
| 106 | template <> | ||
| 107 | class VectorTypes<uint32_t> { | ||
| 108 | public: | ||
| 109 | using ScalarType = uint32_t; | ||
| 110 | using VectorType = uint32x4_t; | ||
| 111 | using Vector2Type = uint32x4x2_t; | ||
| 112 | using Vector3Type = uint32x4x3_t; | ||
| 113 | using Vector4Type = uint32x4x4_t; | ||
| 114 | }; // end of class VectorTypes<uint32_t> | ||
| 115 | |||
| 116 | template <> | ||
| 117 | class VectorTypes<int64_t> { | ||
| 118 | public: | ||
| 119 | using ScalarType = int64_t; | ||
| 120 | using VectorType = int64x2_t; | ||
| 121 | using Vector2Type = int64x2x2_t; | ||
| 122 | using Vector3Type = int64x2x3_t; | ||
| 123 | using Vector4Type = int64x2x4_t; | ||
| 124 | }; // end of class VectorTypes<int64_t> | ||
| 125 | |||
| 126 | template <> | ||
| 127 | class VectorTypes<uint64_t> { | ||
| 128 | public: | ||
| 129 | using ScalarType = uint64_t; | ||
| 130 | using VectorType = uint64x2_t; | ||
| 131 | using Vector2Type = uint64x2x2_t; | ||
| 132 | using Vector3Type = uint64x2x3_t; | ||
| 133 | using Vector4Type = uint64x2x4_t; | ||
| 134 | }; // end of class VectorTypes<uint64_t> | ||
| 135 | |||
| 136 | template <> | ||
| 137 | class VectorTypes<float> { | ||
| 138 | public: | ||
| 139 | using ScalarType = float; | ||
| 140 | using VectorType = float32x4_t; | ||
| 141 | using Vector2Type = float32x4x2_t; | ||
| 142 | using Vector3Type = float32x4x3_t; | ||
| 143 | using Vector4Type = float32x4x4_t; | ||
| 144 | }; // end of class VectorTypes<float> | ||
| 145 | |||
| 146 | template <> | ||
| 147 | class VectorTypes<double> { | ||
| 148 | public: | ||
| 149 | using ScalarType = double; | ||
| 150 | using VectorType = float64x2_t; | ||
| 151 | using Vector2Type = float64x2x2_t; | ||
| 152 | using Vector3Type = float64x2x3_t; | ||
| 153 | using Vector4Type = float64x2x4_t; | ||
| 154 | }; // end of class VectorTypes<double> | ||
| 155 | |||
| 156 | template <> | ||
| 157 | class VectorTypes<float16_t> { | ||
| 158 | public: | ||
| 159 | using ScalarType = float16_t; | ||
| 160 | using VectorType = float16x8_t; | ||
| 161 | using Vector2Type = float16x8x2_t; | ||
| 162 | using Vector3Type = float16x8x3_t; | ||
| 163 | using Vector4Type = float16x8x4_t; | ||
| 164 | }; // end of class VectorTypes<float16_t> | ||
| 165 | |||
| 166 | // NEON vector length in bytes. | ||
| 167 | static constexpr size_t kVectorLength = 16; | ||
| 168 | |||
| 169 | // Base class for all NEON vector traits. | ||
| 170 | template <typename ScalarType> | ||
| 171 | class VecTraitsBase : public VectorTypes<ScalarType> { | ||
| 172 | public: | ||
| 173 | using typename VectorTypes<ScalarType>::VectorType; | ||
| 174 | using typename VectorTypes<ScalarType>::Vector2Type; | ||
| 175 | using typename VectorTypes<ScalarType>::Vector3Type; | ||
| 176 | using typename VectorTypes<ScalarType>::Vector4Type; | ||
| 177 | |||
| 178 | // Number of lanes in a vector. | ||
| 179 | 621408 | static constexpr size_t num_lanes() { | |
| 180 | 621408 | return kVectorLength / sizeof(ScalarType); | |
| 181 | } | ||
| 182 | |||
| 183 | // Maximum number of lanes in a vector. | ||
| 184 | static constexpr size_t max_num_lanes() { return num_lanes(); } | ||
| 185 | |||
| 186 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 187 | |||
| 188 | private: | ||
| 189 | static inline int8x16x2_t vld1q_x2(const int8_t *src) { | ||
| 190 | return vld1q_s8_x2(src); | ||
| 191 | } | ||
| 192 | |||
| 193 | 3892 | static inline uint8x16x2_t vld1q_x2(const uint8_t *src) { | |
| 194 | 3892 | return vld1q_u8_x2(src); | |
| 195 | } | ||
| 196 | |||
| 197 | 4728 | static inline int16x8x2_t vld1q_x2(const int16_t *src) { | |
| 198 | 4728 | return vld1q_s16_x2(src); | |
| 199 | } | ||
| 200 | |||
| 201 | 568 | static inline uint16x8x2_t vld1q_x2(const uint16_t *src) { | |
| 202 | 568 | return vld1q_u16_x2(src); | |
| 203 | } | ||
| 204 | |||
| 205 | static inline int32x4x2_t vld1q_x2(const int32_t *src) { | ||
| 206 | return vld1q_s32_x2(src); | ||
| 207 | } | ||
| 208 | |||
| 209 | 400 | static inline uint32x4x2_t vld1q_x2(const uint32_t *src) { | |
| 210 | 400 | return vld1q_u32_x2(src); | |
| 211 | } | ||
| 212 | |||
| 213 | static inline int64x2x2_t vld1q_x2(const int64_t *src) { | ||
| 214 | return vld1q_s64_x2(src); | ||
| 215 | } | ||
| 216 | |||
| 217 | 400 | static inline uint64x2x2_t vld1q_x2(const uint64_t *src) { | |
| 218 | 400 | return vld1q_u64_x2(src); | |
| 219 | } | ||
| 220 | |||
| 221 | static inline float32x4x2_t vld1q_x2(const float32_t *src) { | ||
| 222 | return vld1q_f32_x2(src); | ||
| 223 | } | ||
| 224 | |||
| 225 | static inline int8x16x3_t vld1q_x3(const int8_t *src) { | ||
| 226 | return vld1q_s8_x3(src); | ||
| 227 | } | ||
| 228 | |||
| 229 | 2103 | static inline uint8x16x3_t vld1q_x3(const uint8_t *src) { | |
| 230 | 2103 | return vld1q_u8_x3(src); | |
| 231 | } | ||
| 232 | |||
| 233 | static inline int16x8x3_t vld1q_x3(const int16_t *src) { | ||
| 234 | return vld1q_s16_x3(src); | ||
| 235 | } | ||
| 236 | |||
| 237 | 720 | static inline uint16x8x3_t vld1q_x3(const uint16_t *src) { | |
| 238 | 720 | return vld1q_u16_x3(src); | |
| 239 | } | ||
| 240 | |||
| 241 | static inline int32x4x3_t vld1q_x3(const int32_t *src) { | ||
| 242 | return vld1q_s32_x3(src); | ||
| 243 | } | ||
| 244 | |||
| 245 | 720 | static inline uint32x4x3_t vld1q_x3(const uint32_t *src) { | |
| 246 | 720 | return vld1q_u32_x3(src); | |
| 247 | } | ||
| 248 | |||
| 249 | static inline int64x2x3_t vld1q_x3(const int64_t *src) { | ||
| 250 | return vld1q_s64_x3(src); | ||
| 251 | } | ||
| 252 | |||
| 253 | 720 | static inline uint64x2x3_t vld1q_x3(const uint64_t *src) { | |
| 254 | 720 | return vld1q_u64_x3(src); | |
| 255 | } | ||
| 256 | |||
| 257 | static inline float32x4x3_t vld1q_x3(const float32_t *src) { | ||
| 258 | return vld1q_f32_x3(src); | ||
| 259 | } | ||
| 260 | |||
| 261 | 884 | static inline int8x16x4_t vld1q_x4(const int8_t *src) { | |
| 262 | 884 | return vld1q_s8_x4(src); | |
| 263 | } | ||
| 264 | |||
| 265 | 2207 | static inline uint8x16x4_t vld1q_x4(const uint8_t *src) { | |
| 266 | 2207 | return vld1q_u8_x4(src); | |
| 267 | } | ||
| 268 | |||
| 269 | static inline int16x8x4_t vld1q_x4(const int16_t *src) { | ||
| 270 | return vld1q_s16_x4(src); | ||
| 271 | } | ||
| 272 | |||
| 273 | 720 | static inline uint16x8x4_t vld1q_x4(const uint16_t *src) { | |
| 274 | 720 | return vld1q_u16_x4(src); | |
| 275 | } | ||
| 276 | |||
| 277 | static inline int32x4x4_t vld1q_x4(const int32_t *src) { | ||
| 278 | return vld1q_s32_x4(src); | ||
| 279 | } | ||
| 280 | |||
| 281 | 720 | static inline uint32x4x4_t vld1q_x4(const uint32_t *src) { | |
| 282 | 720 | return vld1q_u32_x4(src); | |
| 283 | } | ||
| 284 | |||
| 285 | static inline int64x2x4_t vld1q_x4(const int64_t *src) { | ||
| 286 | return vld1q_s64_x4(src); | ||
| 287 | } | ||
| 288 | |||
| 289 | 720 | static inline uint64x2x4_t vld1q_x4(const uint64_t *src) { | |
| 290 | 720 | return vld1q_u64_x4(src); | |
| 291 | } | ||
| 292 | |||
| 293 | 457 | static inline float32x4x4_t vld1q_x4(const float32_t *src) { | |
| 294 | 457 | return vld1q_f32_x4(src); | |
| 295 | } | ||
| 296 | |||
| 297 | static inline void vst1q_x2(int8_t *dst, int8x16x2_t vec) { | ||
| 298 | vst1q_s8_x2(dst, vec); | ||
| 299 | } | ||
| 300 | |||
| 301 | 13871 | static inline void vst1q_x2(uint8_t *dst, uint8x16x2_t vec) { | |
| 302 | 13871 | vst1q_u8_x2(dst, vec); | |
| 303 | 13871 | } | |
| 304 | |||
| 305 | static inline void vst1q_x2(int16_t *dst, int16x8x2_t vec) { | ||
| 306 | vst1q_s16_x2(dst, vec); | ||
| 307 | } | ||
| 308 | |||
| 309 | 3320 | static inline void vst1q_x2(uint16_t *dst, uint16x8x2_t vec) { | |
| 310 | 3320 | vst1q_u16_x2(dst, vec); | |
| 311 | 3320 | } | |
| 312 | |||
| 313 | static inline void vst1q_x2(int32_t *dst, int32x4x2_t vec) { | ||
| 314 | vst1q_s32_x2(dst, vec); | ||
| 315 | } | ||
| 316 | |||
| 317 | 3320 | static inline void vst1q_x2(uint32_t *dst, uint32x4x2_t vec) { | |
| 318 | 3320 | vst1q_u32_x2(dst, vec); | |
| 319 | 3320 | } | |
| 320 | |||
| 321 | static inline void vst1q_x2(int64_t *dst, int64x2x2_t vec) { | ||
| 322 | vst1q_s64_x2(dst, vec); | ||
| 323 | } | ||
| 324 | |||
| 325 | 3320 | static inline void vst1q_x2(uint64_t *dst, uint64x2x2_t vec) { | |
| 326 | 3320 | vst1q_u64_x2(dst, vec); | |
| 327 | 3320 | } | |
| 328 | |||
| 329 | 73024 | static inline void vst1q_x2(float32_t *dst, float32x4x2_t vec) { | |
| 330 | 73024 | vst1q_f32_x2(dst, vec); | |
| 331 | 73024 | } | |
| 332 | |||
| 333 | static inline void vst1q_x2(float16_t *dst, float16x8x2_t vec) { | ||
| 334 | vst1q_f16_x2(dst, vec); | ||
| 335 | } | ||
| 336 | |||
| 337 | static inline void vst1q_x3(int8_t *dst, int8x16x3_t vec) { | ||
| 338 | vst1q_s8_x3(dst, vec); | ||
| 339 | } | ||
| 340 | |||
| 341 | 1628 | static inline void vst1q_x3(uint8_t *dst, uint8x16x3_t vec) { | |
| 342 | 1628 | vst1q_u8_x3(dst, vec); | |
| 343 | 1628 | } | |
| 344 | |||
| 345 | static inline void vst1q_x3(int16_t *dst, int16x8x3_t vec) { | ||
| 346 | vst1q_s16_x3(dst, vec); | ||
| 347 | } | ||
| 348 | |||
| 349 | 720 | static inline void vst1q_x3(uint16_t *dst, uint16x8x3_t vec) { | |
| 350 | 720 | vst1q_u16_x3(dst, vec); | |
| 351 | 720 | } | |
| 352 | |||
| 353 | static inline void vst1q_x3(int32_t *dst, int32x4x3_t vec) { | ||
| 354 | vst1q_s32_x3(dst, vec); | ||
| 355 | } | ||
| 356 | |||
| 357 | static inline void vst1q_x3(uint32_t *dst, uint32x4x3_t vec) { | ||
| 358 | vst1q_u32_x3(dst, vec); | ||
| 359 | } | ||
| 360 | |||
| 361 | static inline void vst1q_x3(int64_t *dst, int64x2x3_t vec) { | ||
| 362 | vst1q_s64_x3(dst, vec); | ||
| 363 | } | ||
| 364 | |||
| 365 | 720 | static inline void vst1q_x3(uint64_t *dst, uint64x2x3_t vec) { | |
| 366 | 720 | vst1q_u64_x3(dst, vec); | |
| 367 | 720 | } | |
| 368 | |||
| 369 | static inline void vst1q_x3(float32_t *dst, float32x4x3_t vec) { | ||
| 370 | vst1q_f32_x3(dst, vec); | ||
| 371 | } | ||
| 372 | |||
| 373 | static inline void vst1q_x3(float16_t *dst, float16x8x3_t vec) { | ||
| 374 | vst1q_f16_x3(dst, vec); | ||
| 375 | } | ||
| 376 | |||
| 377 | static inline void vst1q_x4(int8_t *dst, int8x16x4_t vec) { | ||
| 378 | vst1q_s8_x4(dst, vec); | ||
| 379 | } | ||
| 380 | |||
| 381 | 908 | static inline void vst1q_x4(uint8_t *dst, uint8x16x4_t vec) { | |
| 382 | 908 | vst1q_u8_x4(dst, vec); | |
| 383 | 908 | } | |
| 384 | |||
| 385 | static inline void vst1q_x4(int16_t *dst, int16x8x4_t vec) { | ||
| 386 | vst1q_s16_x4(dst, vec); | ||
| 387 | } | ||
| 388 | |||
| 389 | 11304 | static inline void vst1q_x4(uint16_t *dst, uint16x8x4_t vec) { | |
| 390 | 11304 | vst1q_u16_x4(dst, vec); | |
| 391 | 11304 | } | |
| 392 | |||
| 393 | static inline void vst1q_x4(int32_t *dst, int32x4x4_t vec) { | ||
| 394 | vst1q_s32_x4(dst, vec); | ||
| 395 | } | ||
| 396 | |||
| 397 | 1360 | static inline void vst1q_x4(uint32_t *dst, uint32x4x4_t vec) { | |
| 398 | 1360 | vst1q_u32_x4(dst, vec); | |
| 399 | 1360 | } | |
| 400 | |||
| 401 | static inline void vst1q_x4(int64_t *dst, int64x2x4_t vec) { | ||
| 402 | vst1q_s64_x4(dst, vec); | ||
| 403 | } | ||
| 404 | |||
| 405 | 2720 | static inline void vst1q_x4(uint64_t *dst, uint64x2x4_t vec) { | |
| 406 | 2720 | vst1q_u64_x4(dst, vec); | |
| 407 | 2720 | } | |
| 408 | |||
| 409 | 312 | static inline void vst1q_x4(float32_t *dst, float32x4x4_t vec) { | |
| 410 | 312 | vst1q_f32_x4(dst, vec); | |
| 411 | 312 | } | |
| 412 | |||
| 413 | 480 | static inline void vst1q_x4(float16_t *dst, float16x8x4_t vec) { | |
| 414 | 480 | vst1q_f16_x4(dst, vec); | |
| 415 | 480 | } | |
| 416 | |||
| 417 | public: | ||
| 418 | #endif | ||
| 419 | |||
| 420 | // Loads a single vector from 'src'. | ||
| 421 | 4264 | static inline void load(const ScalarType *src, VectorType &vec) { | |
| 422 | 4264 | vec = vld1q(&src[0]); | |
| 423 | 4264 | } | |
| 424 | |||
| 425 | // Loads two consecutive vectors from 'src'. | ||
| 426 | 8388 | static inline void load(const ScalarType *src, Vector2Type &vec) { | |
| 427 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 428 | 8388 | vec = vld1q_x2(&src[0]); | |
| 429 | #else | ||
| 430 | vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes())}; | ||
| 431 | #endif | ||
| 432 | 8388 | } | |
| 433 | |||
| 434 | // Loads three consecutive vectors from 'src'. | ||
| 435 | 1383 | static inline void load(const ScalarType *src, Vector3Type &vec) { | |
| 436 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 437 | 1383 | vec = vld1q_x3(&src[0]); | |
| 438 | #else | ||
| 439 | vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), | ||
| 440 | vld1q(&src[0] + (2 * num_lanes()))}; | ||
| 441 | #endif | ||
| 442 | 1383 | } | |
| 443 | |||
| 444 | // Loads four consecutive vectors from 'src'. | ||
| 445 | 2828 | static inline void load(const ScalarType *src, Vector4Type &vec) { | |
| 446 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 447 | 2828 | vec = vld1q_x4(&src[0]); | |
| 448 | #else | ||
| 449 | vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), | ||
| 450 | vld1q(&src[0] + (2 * num_lanes())), | ||
| 451 | vld1q(&src[0] + (3 * num_lanes()))}; | ||
| 452 | #endif | ||
| 453 | 2828 | } | |
| 454 | |||
| 455 | // Loads two consecutive vectors from 'src'. | ||
| 456 | 490235 | static inline void load_consecutive(const ScalarType *src, VectorType &vec_0, | |
| 457 | VectorType &vec_1) { | ||
| 458 | 490235 | vec_0 = vld1q(&src[0]); | |
| 459 | 490235 | vec_1 = vld1q(&src[num_lanes()]); | |
| 460 | 490235 | } | |
| 461 | |||
| 462 | // Loads 2x2 consecutive vectors from 'src'. | ||
| 463 | 800 | static inline void load_consecutive(const ScalarType *src, Vector2Type &vec_0, | |
| 464 | Vector2Type &vec_1) { | ||
| 465 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 466 | 800 | vec_0 = vld1q_x2(&src[0]); | |
| 467 | 800 | vec_1 = vld1q_x2(&src[num_lanes() * 2]); | |
| 468 | #else | ||
| 469 | vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes())}; | ||
| 470 | vec_1 = {vld1q(&src[num_lanes() * 2]), | ||
| 471 | vld1q(&src[num_lanes() * 2] + num_lanes())}; | ||
| 472 | #endif | ||
| 473 | 800 | } | |
| 474 | |||
| 475 | // Loads 2x3 consecutive vectors from 'src'. | ||
| 476 | 1440 | static inline void load_consecutive(const ScalarType *src, Vector3Type &vec_0, | |
| 477 | Vector3Type &vec_1) { | ||
| 478 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 479 | 1440 | vec_0 = vld1q_x3(&src[0]); | |
| 480 | 1440 | vec_1 = vld1q_x3(&src[num_lanes() * 3]); | |
| 481 | #else | ||
| 482 | vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), | ||
| 483 | vld1q(&src[0] + (2 * num_lanes()))}; | ||
| 484 | vec_1 = {vld1q(&src[num_lanes() * 3]), | ||
| 485 | vld1q(&src[num_lanes() * 3] + num_lanes()), | ||
| 486 | vld1q(&src[num_lanes() * 3] + (2 * num_lanes()))}; | ||
| 487 | #endif | ||
| 488 | 1440 | } | |
| 489 | |||
| 490 | // Loads 2x4 consecutive vectors from 'src'. | ||
| 491 | 1440 | static inline void load_consecutive(const ScalarType *src, Vector4Type &vec_0, | |
| 492 | Vector4Type &vec_1) { | ||
| 493 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 494 | 1440 | vec_0 = vld1q_x4(&src[0]); | |
| 495 | 1440 | vec_1 = vld1q_x4(&src[num_lanes() * 4]); | |
| 496 | |||
| 497 | #else | ||
| 498 | vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), | ||
| 499 | vld1q(&src[0] + (2 * num_lanes())), | ||
| 500 | vld1q(&src[0] + (3 * num_lanes()))}; | ||
| 501 | vec_1 = {vld1q(&src[num_lanes() * 4]), | ||
| 502 | vld1q(&src[num_lanes() * 4] + num_lanes()), | ||
| 503 | vld1q(&src[num_lanes() * 4] + (2 * num_lanes())), | ||
| 504 | vld1q(&src[num_lanes() * 4] + (3 * num_lanes()))}; | ||
| 505 | #endif | ||
| 506 | 1440 | } | |
| 507 | |||
| 508 | // Stores a single vector to 'dst'. | ||
| 509 | 12166 | static inline void store(VectorType vec, ScalarType *dst) { | |
| 510 | 12166 | vst1q(&dst[0], vec); | |
| 511 | 12166 | } | |
| 512 | |||
| 513 | // Stores two consecutive vectors to 'dst'. | ||
| 514 | 96855 | static inline void store(Vector2Type vec, ScalarType *dst) { | |
| 515 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 516 | 96855 | vst1q_x2(&dst[0], vec); | |
| 517 | #else | ||
| 518 | vst1q(&dst[0], vec.val[0]); | ||
| 519 | vst1q(&dst[0] + num_lanes(), vec.val[1]); | ||
| 520 | #endif | ||
| 521 | 96855 | } | |
| 522 | |||
| 523 | // Stores three consecutive vectors to 'dst'. | ||
| 524 | 3068 | static inline void store(Vector3Type vec, ScalarType *dst) { | |
| 525 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 526 | 3068 | vst1q_x3(&dst[0], vec); | |
| 527 | #else | ||
| 528 | vst1q(&dst[0], vec.val[0]); | ||
| 529 | vst1q(&dst[0] + num_lanes(), vec.val[1]); | ||
| 530 | vst1q(&dst[0] + (2 * num_lanes()), vec.val[2]); | ||
| 531 | #endif | ||
| 532 | 3068 | } | |
| 533 | |||
| 534 | // Stores four consecutive vectors to 'dst'. | ||
| 535 | 17084 | static inline void store(Vector4Type vec, ScalarType *dst) { | |
| 536 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
| 537 | 17084 | vst1q_x4(&dst[0], vec); | |
| 538 | #else | ||
| 539 | vst1q(&dst[0], vec.val[0]); | ||
| 540 | vst1q(&dst[0] + num_lanes(), vec.val[1]); | ||
| 541 | vst1q(&dst[0] + (2 * num_lanes()), vec.val[2]); | ||
| 542 | vst1q(&dst[0] + (3 * num_lanes()), vec.val[3]); | ||
| 543 | #endif | ||
| 544 | 17084 | } | |
| 545 | |||
| 546 | // Stores two consecutive vectors to 'dst'. | ||
| 547 | 28627 | static inline void store_consecutive(VectorType vec_0, VectorType vec_1, | |
| 548 | ScalarType *dst) { | ||
| 549 | 28627 | vst1q(&dst[0], vec_0); | |
| 550 | 28627 | vst1q(&dst[num_lanes()], vec_1); | |
| 551 | 28627 | } | |
| 552 | }; // end of class VecTraitsBase<ScalarType> | ||
| 553 | |||
| 554 | // Available NEON vector traits. | ||
| 555 | template <typename ScalarType> | ||
| 556 | class VecTraits : public VecTraitsBase<ScalarType> {}; | ||
| 557 | |||
| 558 | // NEON has no associated context yet. | ||
| 559 | using NeonContextType = Monostate; | ||
| 560 | |||
| 561 | // Adapter which simply adds context and forwards all arguments. | ||
| 562 | template <typename OperationType> | ||
| 563 | class OperationContextAdapter : public OperationBase<OperationType> { | ||
| 564 | // Shorten rows: no need to write 'this->'. | ||
| 565 | using OperationBase<OperationType>::operation; | ||
| 566 | |||
| 567 | public: | ||
| 568 | using ContextType = NeonContextType; | ||
| 569 | |||
| 570 | 7255 | explicit OperationContextAdapter(OperationType &operation) | |
| 571 | 7255 | : OperationBase<OperationType>(operation) {} | |
| 572 | |||
| 573 | // Forwards vector_path_2x() calls to the inner operation. | ||
| 574 | template <typename... ArgTypes> | ||
| 575 | 462471 | void vector_path_2x(ArgTypes &&...args) { | |
| 576 | 462471 | operation().vector_path_2x(ContextType{}, std::forward<ArgTypes>(args)...); | |
| 577 | 462471 | } | |
| 578 | |||
| 579 | // Forwards vector_path() calls to the inner operation. | ||
| 580 | template <typename... ArgTypes> | ||
| 581 | 7762 | void vector_path(ArgTypes &&...args) { | |
| 582 | 7762 | operation().vector_path(ContextType{}, std::forward<ArgTypes>(args)...); | |
| 583 | 7762 | } | |
| 584 | |||
| 585 | // Forwards remaining_path() calls to the inner operation. | ||
| 586 | template <typename... ArgTypes> | ||
| 587 | 8596 | void remaining_path(ArgTypes &&...args) { | |
| 588 | 8596 | operation().remaining_path(ContextType{}, std::forward<ArgTypes>(args)...); | |
| 589 | 8596 | } | |
| 590 | }; // end of class OperationContextAdapter<OperationType> | ||
| 591 | |||
| 592 | // Adapter which implements remaining_path() for general NEON operations. | ||
| 593 | template <typename OperationType> | ||
| 594 | class RemainingPathAdapter : public OperationBase<OperationType> { | ||
| 595 | public: | ||
| 596 | using ContextType = NeonContextType; | ||
| 597 | |||
| 598 | 6951 | explicit RemainingPathAdapter(OperationType &operation) | |
| 599 | 6951 | : OperationBase<OperationType>(operation) {} | |
| 600 | |||
| 601 | // Forwards remaining_path() calls to scalar_path() of the inner operation | ||
| 602 | // element by element. | ||
| 603 | template <typename... ColumnTypes> | ||
| 604 | 7940 | void remaining_path(ContextType ctx, size_t length, ColumnTypes... columns) { | |
| 605 |
24/24✓ Branch 0 taken 2068 times.
✓ Branch 1 taken 33794 times.
✓ Branch 2 taken 990 times.
✓ Branch 3 taken 11388 times.
✓ Branch 4 taken 957 times.
✓ Branch 5 taken 4668 times.
✓ Branch 6 taken 664 times.
✓ Branch 7 taken 3983 times.
✓ Branch 8 taken 764 times.
✓ Branch 9 taken 2353 times.
✓ Branch 10 taken 705 times.
✓ Branch 11 taken 1596 times.
✓ Branch 12 taken 312 times.
✓ Branch 13 taken 356 times.
✓ Branch 14 taken 360 times.
✓ Branch 15 taken 400 times.
✓ Branch 16 taken 376 times.
✓ Branch 17 taken 384 times.
✓ Branch 18 taken 120 times.
✓ Branch 19 taken 128 times.
✓ Branch 20 taken 248 times.
✓ Branch 21 taken 256 times.
✓ Branch 22 taken 376 times.
✓ Branch 23 taken 384 times.
|
67630 | for (size_t index = 0; index < length; ++index) { |
| 606 | 59690 | disable_loop_vectorization(); | |
| 607 | 59690 | this->operation().scalar_path(ctx, columns.at(index)...); | |
| 608 | 59690 | } | |
| 609 | 7940 | } | |
| 610 | }; // end of class RemainingPathAdapter<OperationType> | ||
| 611 | |||
| 612 | // Adapter which implements remaining_path() for NEON operations which | ||
| 613 | // implement custom processing of remaining elements. | ||
| 614 | template <typename OperationType> | ||
| 615 | class RemainingPathToScalarPathAdapter : public OperationBase<OperationType> { | ||
| 616 | public: | ||
| 617 | using ContextType = NeonContextType; | ||
| 618 | |||
| 619 | 304 | explicit RemainingPathToScalarPathAdapter(OperationType &operation) | |
| 620 | 304 | : OperationBase<OperationType>(operation) {} | |
| 621 | |||
| 622 | // Forwards remaining_path() calls to scalar_path() of the inner operation. | ||
| 623 | template <typename... ArgTypes> | ||
| 624 | 656 | void remaining_path(ArgTypes &&...args) { | |
| 625 | 656 | this->operation().scalar_path(std::forward<ArgTypes>(args)...); | |
| 626 | 656 | } | |
| 627 | }; // end of class RemainingPathToScalarPathAdapter<OperationType> | ||
| 628 | |||
| 629 | // Shorthand for applying a generic unrolled NEON operation. | ||
| 630 | template <typename OperationType, typename... ArgTypes> | ||
| 631 | 6327 | void apply_operation_by_rows(OperationType &operation, ArgTypes &&...args) { | |
| 632 | 6327 | RemoveContextAdapter remove_context_adapter{operation}; | |
| 633 | 6327 | OperationAdapter operation_adapter{remove_context_adapter}; | |
| 634 | 6327 | RemainingPathAdapter remaining_path_adapter{operation_adapter}; | |
| 635 | 6327 | OperationContextAdapter context_adapter{remaining_path_adapter}; | |
| 636 | 6327 | RowBasedOperation row_based_operation{context_adapter}; | |
| 637 | 6327 | zip_rows(row_based_operation, std::forward<ArgTypes>(args)...); | |
| 638 | 6327 | } | |
| 639 | |||
| 640 | // Shorthand for applying a generic unrolled and block-based NEON operation. | ||
| 641 | template <typename OperationType, typename... ArgTypes> | ||
| 642 | 624 | void apply_block_operation_by_rows(OperationType &operation, | |
| 643 | ArgTypes &&...args) { | ||
| 644 | 624 | RemoveContextAdapter remove_context_adapter{operation}; | |
| 645 | 624 | OperationAdapter operation_adapter{remove_context_adapter}; | |
| 646 | 624 | RemainingPathAdapter remaining_path_adapter{operation_adapter}; | |
| 647 | 624 | OperationContextAdapter context_adapter{remaining_path_adapter}; | |
| 648 | 624 | RowBasedBlockOperation block_operation{context_adapter}; | |
| 649 | 624 | zip_rows(block_operation, std::forward<ArgTypes>(args)...); | |
| 650 | 624 | } | |
| 651 | |||
| 652 | } // namespace kleidicv::neon | ||
| 653 | |||
| 654 | #endif // KLEIDICV_NEON_H | ||
| 655 |