| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include "kleidicv/conversions/merge.h" | ||
| 6 | #include "kleidicv/kleidicv.h" | ||
| 7 | #include "kleidicv/neon.h" | ||
| 8 | |||
| 9 | namespace kleidicv::neon { | ||
| 10 | |||
| 11 | // ---------------------------------------- | ||
| 12 | // ------------ Two-way merge ------------- | ||
| 13 | // ---------------------------------------- | ||
| 14 | |||
| 15 | // Generic 2-way merge implementation. | ||
| 16 | // | ||
| 17 | // Algorithm description | ||
| 18 | // | ||
| 19 | // Elements are identified by their intended final position in the output. | ||
| 20 | // The description is for 32-bit elements, but it works just the same way | ||
| 21 | // for different element sizes. | ||
| 22 | // | ||
| 23 | // VECTOR / LANE: 0 1 2 3 | ||
| 24 | // src_a: [ 0, 2, 4, 6 ] | ||
| 25 | // src_b: [ 1, 3, 5, 7 ] | ||
| 26 | // | ||
| 27 | // zip1(a, b): [ 0, 1, 2, 3 ] -> d0 | ||
| 28 | // zip2(a, b): [ 4, 5, 6, 7 ] -> d1 | ||
| 29 | // | ||
| 30 | // Continuous store of { d0, d1 } gives the expected order. | ||
| 31 | template <typename ScalarType> | ||
| 32 | class Merge2 final : public UnrollTwice { | ||
| 33 | public: | ||
| 34 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 35 | using VectorType = typename VecTraits::VectorType; | ||
| 36 | using Vector2Type = typename VecTraits::Vector2Type; | ||
| 37 | |||
| 38 | 1600 | void vector_path(VectorType src_a, VectorType src_b, ScalarType *dst) { | |
| 39 | #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 40 | Vector2Type dst_vect; | ||
| 41 | dst_vect.val[0] = src_a; | ||
| 42 | dst_vect.val[1] = src_b; | ||
| 43 | vst2q(&dst[0], dst_vect); | ||
| 44 | #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 45 | 1600 | Vector2Type dst_vect; | |
| 46 | 1600 | dst_vect.val[0] = vzip1q(src_a, src_b); | |
| 47 | 1600 | dst_vect.val[1] = vzip2q(src_a, src_b); | |
| 48 | 1600 | VecTraits::store(dst_vect, &dst[0]); | |
| 49 | |||
| 50 | #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 51 | 1600 | } | |
| 52 | |||
| 53 | 256 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
| 54 | ScalarType *dst) { | ||
| 55 | 256 | dst[0] = src_a[0]; | |
| 56 | 256 | dst[1] = src_b[0]; | |
| 57 | 256 | } | |
| 58 | }; // end of class Merge2<ScalarType> | ||
| 59 | |||
| 60 | // ---------------------------------------- | ||
| 61 | // ---------- Three-way merge ------------- | ||
| 62 | // ---------------------------------------- | ||
| 63 | |||
| 64 | template <typename ScalarType> | ||
| 65 | class Merge3 final : public UnrollTwice { | ||
| 66 | public: | ||
| 67 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 68 | using VectorType = typename VecTraits::VectorType; | ||
| 69 | using Vector3Type = typename VecTraits::Vector3Type; | ||
| 70 | |||
| 71 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 72 | |||
| 73 | 136 | Merge3() : table_indices_{} { | |
| 74 | 136 | neon::VecTraits<uint8_t>::load(lookup_table(ScalarType()), table_indices_); | |
| 75 | 136 | } | |
| 76 | |||
| 77 | #endif | ||
| 78 | |||
| 79 | 1440 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
| 80 | ScalarType *dst) { | ||
| 81 | 1440 | Vector3Type dst_vect; | |
| 82 | #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 83 | dst_vect.val[0] = src_a; | ||
| 84 | dst_vect.val[1] = src_b; | ||
| 85 | dst_vect.val[2] = src_c; | ||
| 86 | vst3q(&dst[0], dst_vect); | ||
| 87 | #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 88 | 1440 | uint8x16x3_t src_vect; | |
| 89 | 1440 | src_vect.val[0] = vreinterpretq_u8(src_a); | |
| 90 | 1440 | src_vect.val[1] = vreinterpretq_u8(src_b); | |
| 91 | 1440 | src_vect.val[2] = vreinterpretq_u8(src_c); | |
| 92 | 1440 | dst_vect.val[0] = vqtbl3q_u8(src_vect, table_indices_.val[0]); | |
| 93 | 1440 | dst_vect.val[1] = vqtbl3q_u8(src_vect, table_indices_.val[1]); | |
| 94 | 1440 | dst_vect.val[2] = vqtbl3q_u8(src_vect, table_indices_.val[2]); | |
| 95 | 1440 | VecTraits::store(dst_vect, &dst[0]); | |
| 96 | #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 97 | 1440 | } | |
| 98 | |||
| 99 | 256 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
| 100 | const ScalarType *src_c, ScalarType *dst) { | ||
| 101 | 256 | dst[0] = src_a[0]; | |
| 102 | 256 | dst[1] = src_b[0]; | |
| 103 | 256 | dst[2] = src_c[0]; | |
| 104 | 256 | } | |
| 105 | |||
| 106 | private: | ||
| 107 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 108 | 68 | static uint8_t *lookup_table(uint8_t) { | |
| 109 | // clang-format off | ||
| 110 | static uint8_t kIndices[48] = { | ||
| 111 | 0, 16, 32, 1, 17, 33, 2, 18, 34, 3, 19, 35, 4, 20, 36, 5, | ||
| 112 | 21, 37, 6, 22, 38, 7, 23, 39, 8, 24, 40, 9, 25, 41, 10, 26, | ||
| 113 | 42, 11, 27, 43, 12, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, | ||
| 114 | }; | ||
| 115 | 68 | return &kIndices[0]; | |
| 116 | } | ||
| 117 | |||
| 118 | // Lookup table for 16-bit inputs. | ||
| 119 | 68 | static uint8_t *lookup_table(uint16_t) { | |
| 120 | // clang-format off | ||
| 121 | static uint8_t kIndices[48] = { | ||
| 122 | 0, 1, 16, 17, 32, 33, 2, 3, 18, 19, 34, 35, 4, 5, 20, 21, | ||
| 123 | 36, 37, 6, 7, 22, 23, 38, 39, 8, 9, 24, 25, 40, 41, 10, 11, | ||
| 124 | 26, 27, 42, 43, 12, 13, 28, 29, 44, 45, 14, 15, 30, 31, 46, 47, | ||
| 125 | }; | ||
| 126 | // clang-format on | ||
| 127 | 68 | return &kIndices[0]; | |
| 128 | } | ||
| 129 | |||
| 130 | uint8x16x3_t table_indices_; | ||
| 131 | #endif | ||
| 132 | }; // end of class Merge3<ScalarType> | ||
| 133 | |||
| 134 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 135 | |||
| 136 | // Specialized 3-way merge implementation for 32-bit elements. | ||
| 137 | // | ||
| 138 | // Algorithm description | ||
| 139 | // | ||
| 140 | // Elements are identified by their intended final position in the output. | ||
| 141 | // | ||
| 142 | // VECTOR / LANE: 0 1 2 3 | ||
| 143 | // src_a: [ 0, 3, 6, 9 ] | ||
| 144 | // src_b: [ 1, 4, 7, 10 ] | ||
| 145 | // src_c: [ 2, 5, 8, 11 ] | ||
| 146 | // | ||
| 147 | // trn2(a, b): [ 3, 4, 9, 10 ] -> w | ||
| 148 | // trn1(c, w): [ 2, 3, 8, 9 ] -> x | ||
| 149 | // trn2(w, c): [ 4, 5, 10, 11 ] -> y | ||
| 150 | // trn1(a, b): [ 0, 1, 6, 7 ] -> z | ||
| 151 | // | ||
| 152 | // zip1_u64(z, x): [ 0, 1, 2, 3 ] -> d0 | ||
| 153 | // [ y_u64[0], z_u64[1] ]: [ 4, 5, 6, 7 ] -> d1 | ||
| 154 | // zip2_u64(x, y): [ 8, 9, 10, 11 ] -> d2 | ||
| 155 | // | ||
| 156 | // Continuous store of { d0, d1, d2 } gives the expected order. | ||
| 157 | template <> | ||
| 158 | class Merge3<uint32_t> final : public UnrollTwice { | ||
| 159 | public: | ||
| 160 | using ScalarType = uint32_t; | ||
| 161 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 162 | using VectorType = typename VecTraits::VectorType; | ||
| 163 | using Vector3Type = typename VecTraits::Vector3Type; | ||
| 164 | |||
| 165 | 720 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
| 166 | ScalarType *dst) { | ||
| 167 | 720 | uint32x4_t w = vtrn2q_u32(src_a, src_b); | |
| 168 | 720 | uint32x4_t x = vtrn1q_u32(src_c, w); | |
| 169 | 720 | uint32x4_t y = vtrn2q_u32(w, src_c); | |
| 170 | 720 | uint32x4_t z = vtrn1q_u32(src_a, src_b); | |
| 171 | |||
| 172 | 720 | uint32x4_t dst_vect_0 = vzip1q_u64(z, x); | |
| 173 | 720 | uint64x2_t dst_vect_1 = y; | |
| 174 | 720 | dst_vect_1[1] = vreinterpretq_u64_u32(z)[1]; | |
| 175 | 720 | uint32x4_t dst_vect_2 = vzip2q_u64(x, y); | |
| 176 | |||
| 177 | // Not using vst1q_u32_x3, because the requirement on contiguous vector | ||
| 178 | // register allocation may result in longer code. | ||
| 179 | 720 | vst1q_u32(&dst[0 * VecTraits::num_lanes()], dst_vect_0); | |
| 180 | 720 | vst1q_u32(&dst[1 * VecTraits::num_lanes()], dst_vect_1); | |
| 181 | 720 | vst1q_u32(&dst[2 * VecTraits::num_lanes()], dst_vect_2); | |
| 182 | 720 | } | |
| 183 | |||
| 184 | 128 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
| 185 | const ScalarType *src_c, ScalarType *dst) { | ||
| 186 | 128 | dst[0] = src_a[0]; | |
| 187 | 128 | dst[1] = src_b[0]; | |
| 188 | 128 | dst[2] = src_c[0]; | |
| 189 | 128 | } | |
| 190 | }; // end of class Merge3<uint32_t> | ||
| 191 | |||
| 192 | // Specialized 3-way merge implementation for 64-bit elements. | ||
| 193 | // | ||
| 194 | // Algorithm description | ||
| 195 | // | ||
| 196 | // Elements are identified by their intended final position in the output. | ||
| 197 | // | ||
| 198 | // VECTOR / LANE: 0 1 | ||
| 199 | // src_a: [ 0, 3 ] | ||
| 200 | // src_b: [ 1, 4 ] | ||
| 201 | // src_c: [ 2, 5 ] | ||
| 202 | // | ||
| 203 | // zip1(a, b): [ 0, 1 ] -> d0 | ||
| 204 | // [ src_c[0], src_a[1] ]: [ 2, 3 ] -> d1 | ||
| 205 | // zip2(b, c): [ 4, 5 ] -> d2 | ||
| 206 | // | ||
| 207 | // Continuous store of { d0, d1, d2 } gives the expected order. | ||
| 208 | template <> | ||
| 209 | class Merge3<uint64_t> final : public UnrollTwice { | ||
| 210 | public: | ||
| 211 | using ScalarType = uint64_t; | ||
| 212 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 213 | using VectorType = typename VecTraits::VectorType; | ||
| 214 | |||
| 215 | 720 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
| 216 | ScalarType *dst) { | ||
| 217 | 720 | uint64x2x3_t dst_vect; | |
| 218 | 720 | dst_vect.val[0] = vzip1q_u64(src_a, src_b); | |
| 219 | 720 | dst_vect.val[1] = src_c; | |
| 220 | 720 | dst_vect.val[1][1] = src_a[1]; | |
| 221 | 720 | dst_vect.val[2] = vzip2q_u64(src_b, src_c); | |
| 222 | |||
| 223 | 720 | VecTraits::store(dst_vect, &dst[0]); | |
| 224 | 720 | } | |
| 225 | |||
| 226 | 128 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
| 227 | const ScalarType *src_c, ScalarType *dst) { | ||
| 228 | 128 | dst[0] = src_a[0]; | |
| 229 | 128 | dst[1] = src_b[0]; | |
| 230 | 128 | dst[2] = src_c[0]; | |
| 231 | 128 | } | |
| 232 | }; // end of class Merge3<uint64_t> | ||
| 233 | |||
| 234 | #endif // !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 235 | |||
| 236 | // ---------------------------------------- | ||
| 237 | // ----------- Four-way merge ------------- | ||
| 238 | // ---------------------------------------- | ||
| 239 | |||
| 240 | // Generic 4-way merge implementation. | ||
| 241 | // | ||
| 242 | // Algorithm description | ||
| 243 | // | ||
| 244 | // Elements are identified by their intended final position in the output. | ||
| 245 | // The description is for 32-bit elements, but it works just the same way | ||
| 246 | // for smaller element sizes. | ||
| 247 | // | ||
| 248 | // VECTOR / LANE: 0 1 2 3 | ||
| 249 | // src_a: [ 0, 4, 8, 12 ] | ||
| 250 | // src_b: [ 1, 5, 9, 13 ] | ||
| 251 | // src_c: [ 2, 6, 10, 14 ] | ||
| 252 | // src_d: [ 3, 7, 11, 15 ] | ||
| 253 | // | ||
| 254 | // zip1_u32(a, b): [ 0, 1, 4, 5 ] -> w | ||
| 255 | // zip1_u32(c, d): [ 2, 3, 6, 7 ] -> x | ||
| 256 | // zip2_u32(a, b): [ 8, 9, 12, 13 ] -> y | ||
| 257 | // zip2_u32(c, d): [ 10, 11, 14, 15 ] -> z | ||
| 258 | // | ||
| 259 | // zip1_u64(w, x): [ 0, 1, 2, 3 ] -> d0 | ||
| 260 | // zip2_u64(w, x): [ 4, 5, 6, 7 ] -> d1 | ||
| 261 | // zip1_u64(y, z): [ 8, 9, 10, 11 ] -> d2 | ||
| 262 | // zip2_u64(y, z): [ 12, 13, 14, 15 ] -> d3 | ||
| 263 | // | ||
| 264 | // Continuous store of { d0, d1, d2, d3 } gives the expected order. | ||
| 265 | template <typename ScalarType> | ||
| 266 | class Merge4 final : public UnrollTwice { | ||
| 267 | public: | ||
| 268 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 269 | using VectorType = typename VecTraits::VectorType; | ||
| 270 | using Vector4Type = typename VecTraits::Vector4Type; | ||
| 271 | |||
| 272 | 4080 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
| 273 | VectorType src_d, ScalarType *dst) { | ||
| 274 | #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 275 | Vector4Type dst_vect; | ||
| 276 | dst_vect.val[0] = src_a; | ||
| 277 | dst_vect.val[1] = src_b; | ||
| 278 | dst_vect.val[2] = src_c; | ||
| 279 | dst_vect.val[3] = src_d; | ||
| 280 | vst4q(&dst[0], dst_vect); | ||
| 281 | #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 282 | 4080 | auto zip1_a_b = double_width(vzip1q(src_a, src_b)); | |
| 283 | 4080 | auto zip1_c_d = double_width(vzip1q(src_c, src_d)); | |
| 284 | 4080 | auto zip2_a_b = double_width(vzip2q(src_a, src_b)); | |
| 285 | 4080 | auto zip2_c_d = double_width(vzip2q(src_c, src_d)); | |
| 286 | |||
| 287 | // Compilers tend to replace zip instructions with mov, resulting in | ||
| 288 | // longer generated code. Omitting a bitcast appears to help. | ||
| 289 | using DoubleScalarType = double_element_width_t<ScalarType>; | ||
| 290 | 4080 | typename neon::VecTraits<DoubleScalarType>::Vector4Type dst_vect; | |
| 291 | 4080 | dst_vect.val[0] = vzip1q(zip1_a_b, zip1_c_d); | |
| 292 | 4080 | dst_vect.val[1] = vzip2q(zip1_a_b, zip1_c_d); | |
| 293 | 4080 | dst_vect.val[2] = vzip1q(zip2_a_b, zip2_c_d); | |
| 294 | 4080 | dst_vect.val[3] = vzip2q(zip2_a_b, zip2_c_d); | |
| 295 | 4080 | neon::VecTraits<DoubleScalarType>::store( | |
| 296 | 4080 | dst_vect, reinterpret_cast<DoubleScalarType *>(&dst[0])); | |
| 297 | |||
| 298 | #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 299 | 4080 | } | |
| 300 | |||
| 301 | 768 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
| 302 | const ScalarType *src_c, const ScalarType *src_d, | ||
| 303 | ScalarType *dst) { | ||
| 304 | 768 | dst[0] = src_a[0]; | |
| 305 | 768 | dst[1] = src_b[0]; | |
| 306 | 768 | dst[2] = src_c[0]; | |
| 307 | 768 | dst[3] = src_d[0]; | |
| 308 | 768 | } | |
| 309 | |||
| 310 | private: | ||
| 311 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 312 | // Polymorphic reinterpret_cast<>() between vector types where the element | ||
| 313 | // size is doubled. For example, if 'VectorType' is 'uint8x16_t', this | ||
| 314 | // method returns 'reinterpret_cast<uint16x8_t>(vector)'. | ||
| 315 | 16320 | static double_element_width_t<VectorType> double_width(VectorType vector) { | |
| 316 | 16320 | return reinterpret_cast<double_element_width_t<VectorType>>(vector); | |
| 317 | } | ||
| 318 | #endif | ||
| 319 | }; // end of class Merge4<ScalarType> | ||
| 320 | |||
| 321 | #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 322 | |||
| 323 | // Specialized 4-way merge implementation for 64-bit elements. | ||
| 324 | // | ||
| 325 | // Algorithm description | ||
| 326 | // | ||
| 327 | // Elements are identified by their intended final position in the output. | ||
| 328 | // | ||
| 329 | // VECTOR / LANE: 0 1 | ||
| 330 | // src_a: [ 0, 4 ] | ||
| 331 | // src_b: [ 1, 5 ] | ||
| 332 | // src_c: [ 2, 6 ] | ||
| 333 | // src_d: [ 3, 7 ] | ||
| 334 | // | ||
| 335 | // zip1(a, b): [ 0, 1 ] -> d0 | ||
| 336 | // zip1(c, d): [ 2, 3 ] -> d1 | ||
| 337 | // zip2(a, b): [ 4, 5 ] -> d2 | ||
| 338 | // zip2(c, d): [ 6, 7 ] -> d3 | ||
| 339 | // | ||
| 340 | // Continuous store of { d0, d1, d2, d3 } gives the expected order. | ||
| 341 | template <> | ||
| 342 | class Merge4<uint64_t> final : public UnrollTwice { | ||
| 343 | public: | ||
| 344 | using ScalarType = uint64_t; | ||
| 345 | using VecTraits = neon::VecTraits<ScalarType>; | ||
| 346 | using VectorType = typename VecTraits::VectorType; | ||
| 347 | using Vector4Type = typename VecTraits::Vector4Type; | ||
| 348 | |||
| 349 | 1360 | void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, | |
| 350 | VectorType src_d, ScalarType *dst) { | ||
| 351 | 1360 | Vector4Type dst_vect; | |
| 352 | 1360 | dst_vect.val[0] = vzip1q(src_a, src_b); | |
| 353 | 1360 | dst_vect.val[1] = vzip1q(src_c, src_d); | |
| 354 | 1360 | dst_vect.val[2] = vzip2q(src_a, src_b); | |
| 355 | 1360 | dst_vect.val[3] = vzip2q(src_c, src_d); | |
| 356 | 1360 | VecTraits::store(dst_vect, &dst[0]); | |
| 357 | 1360 | } | |
| 358 | |||
| 359 | 256 | void scalar_path(const ScalarType *src_a, const ScalarType *src_b, | |
| 360 | const ScalarType *src_c, const ScalarType *src_d, | ||
| 361 | ScalarType *dst) { | ||
| 362 | 256 | dst[0] = src_a[0]; | |
| 363 | 256 | dst[1] = src_b[0]; | |
| 364 | 256 | dst[2] = src_c[0]; | |
| 365 | 256 | dst[3] = src_d[0]; | |
| 366 | 256 | } | |
| 367 | }; // end of class Merge4<uint64_t> | ||
| 368 | |||
| 369 | #endif // !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE | ||
| 370 | |||
| 371 | // Most of the complexity comes from parameter checking. | ||
| 372 | // NOLINTBEGIN(readability-function-cognitive-complexity) | ||
| 373 | template <typename ScalarType> | ||
| 374 | 1520 | kleidicv_error_t merge(const void **srcs, const size_t *src_strides, | |
| 375 | void *dst_void, size_t dst_stride, size_t width, | ||
| 376 | size_t height, size_t channels) { | ||
| 377 |
8/8✓ Branch 0 taken 4 times.
✓ Branch 1 taken 304 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 400 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 400 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 400 times.
|
1520 | if (channels < 2) { |
| 378 | 16 | return KLEIDICV_ERROR_RANGE; | |
| 379 | } | ||
| 380 |
8/8✓ Branch 0 taken 8 times.
✓ Branch 1 taken 296 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 392 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 392 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 392 times.
|
1504 | CHECK_POINTERS(srcs, src_strides); |
| 381 |
6/6✓ Branch 0 taken 12 times.
✓ Branch 1 taken 380 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 380 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 380 times.
|
1472 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src0, srcs[0]); |
| 382 |
6/6✓ Branch 0 taken 12 times.
✓ Branch 1 taken 368 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 368 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 368 times.
|
1436 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src1, srcs[1]); |
| 383 |
6/6✓ Branch 0 taken 12 times.
✓ Branch 1 taken 356 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 356 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 356 times.
|
1400 | MAKE_POINTER_CHECK_ALIGNMENT(ScalarType, dst, dst_void); |
| 384 |
16/16✓ Branch 0 taken 12 times.
✓ Branch 1 taken 284 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 284 times.
✓ Branch 4 taken 24 times.
✓ Branch 5 taken 332 times.
✓ Branch 6 taken 24 times.
✓ Branch 7 taken 332 times.
✓ Branch 8 taken 24 times.
✓ Branch 9 taken 332 times.
✓ Branch 10 taken 24 times.
✓ Branch 11 taken 332 times.
✓ Branch 12 taken 24 times.
✓ Branch 13 taken 332 times.
✓ Branch 14 taken 24 times.
✓ Branch 15 taken 332 times.
|
1364 | CHECK_POINTER_AND_STRIDE(src0, src_strides[0], height); |
| 385 |
16/16✓ Branch 0 taken 12 times.
✓ Branch 1 taken 272 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 272 times.
✓ Branch 4 taken 24 times.
✓ Branch 5 taken 308 times.
✓ Branch 6 taken 24 times.
✓ Branch 7 taken 308 times.
✓ Branch 8 taken 24 times.
✓ Branch 9 taken 308 times.
✓ Branch 10 taken 24 times.
✓ Branch 11 taken 308 times.
✓ Branch 12 taken 24 times.
✓ Branch 13 taken 308 times.
✓ Branch 14 taken 24 times.
✓ Branch 15 taken 308 times.
|
1280 | CHECK_POINTER_AND_STRIDE(src1, src_strides[1], height); |
| 386 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 268 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 268 times.
✓ Branch 4 taken 16 times.
✓ Branch 5 taken 292 times.
✓ Branch 6 taken 16 times.
✓ Branch 7 taken 292 times.
✓ Branch 8 taken 16 times.
✓ Branch 9 taken 292 times.
✓ Branch 10 taken 16 times.
✓ Branch 11 taken 292 times.
✓ Branch 12 taken 16 times.
✓ Branch 13 taken 292 times.
✓ Branch 14 taken 16 times.
✓ Branch 15 taken 292 times.
|
1196 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 387 |
24/24✓ Branch 0 taken 4 times.
✓ Branch 1 taken 264 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 260 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 260 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 288 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 284 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 284 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 288 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 284 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 284 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 288 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 284 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 284 times.
|
1144 | CHECK_IMAGE_SIZE(width, height); |
| 388 | |||
| 389 | 1112 | Rectangle rect{width, height}; | |
| 390 | 1112 | Rows<const ScalarType> src_a_rows{src0, src_strides[0]}; | |
| 391 | 1112 | Rows<const ScalarType> src_b_rows{src1, src_strides[1]}; | |
| 392 | 1112 | Rows<ScalarType> dst_rows{dst, dst_stride, channels}; | |
| 393 | |||
| 394 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 44 times.
✓ Branch 2 taken 72 times.
✓ Branch 3 taken 140 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 44 times.
✓ Branch 6 taken 80 times.
✓ Branch 7 taken 156 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 44 times.
✓ Branch 10 taken 80 times.
✓ Branch 11 taken 156 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 44 times.
✓ Branch 14 taken 80 times.
✓ Branch 15 taken 156 times.
|
1112 | switch (channels) { |
| 395 | case 2: { | ||
| 396 | 176 | Merge2<ScalarType> operation; | |
| 397 | 176 | apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows, | |
| 398 | dst_rows); | ||
| 399 | 176 | } break; | |
| 400 | |||
| 401 | case 3: { | ||
| 402 |
6/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 76 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 76 times.
|
312 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src2, srcs[2]); |
| 403 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 68 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 68 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 68 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 68 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 68 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 68 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 68 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 68 times.
|
300 | CHECK_POINTER_AND_STRIDE(src2, src_strides[2], height); |
| 404 | 272 | Merge3<ScalarType> operation; | |
| 405 | 272 | Rows<const ScalarType> src_c_rows{src2, src_strides[2]}; | |
| 406 | 272 | apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows, | |
| 407 | src_c_rows, dst_rows); | ||
| 408 |
8/8✓ Branch 0 taken 4 times.
✓ Branch 1 taken 68 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 68 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 68 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 68 times.
|
300 | } break; |
| 409 | |||
| 410 | case 4: { | ||
| 411 |
6/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 152 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 152 times.
|
608 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src2, srcs[2]); |
| 412 |
6/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 148 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 148 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 148 times.
|
596 | MAKE_POINTER_CHECK_ALIGNMENT(const ScalarType, src3, srcs[3]); |
| 413 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 136 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 136 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 140 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 140 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 140 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 140 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 140 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 140 times.
|
584 | CHECK_POINTER_AND_STRIDE(src2, src_strides[2], height); |
| 414 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 132 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 132 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 132 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 132 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 132 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 132 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 132 times.
|
556 | CHECK_POINTER_AND_STRIDE(src3, src_strides[3], height); |
| 415 | 528 | Merge4<ScalarType> operation; | |
| 416 | 528 | Rows<const ScalarType> src_c_rows{src2, src_strides[2]}; | |
| 417 | 528 | Rows<const ScalarType> src_d_rows{src3, src_strides[3]}; | |
| 418 | 528 | apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows, | |
| 419 | src_c_rows, src_d_rows, dst_rows); | ||
| 420 |
8/8✓ Branch 0 taken 8 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 20 times.
✓ Branch 3 taken 132 times.
✓ Branch 4 taken 20 times.
✓ Branch 5 taken 132 times.
✓ Branch 6 taken 20 times.
✓ Branch 7 taken 132 times.
|
596 | } break; |
| 421 | |||
| 422 | default: | ||
| 423 | 16 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 424 | } | ||
| 425 | 976 | return KLEIDICV_OK; | |
| 426 | 1520 | } | |
| 427 | // NOLINTEND(readability-function-cognitive-complexity) | ||
| 428 | |||
| 429 | KLEIDICV_TARGET_FN_ATTRS | ||
| 430 | 1524 | kleidicv_error_t merge(const void **srcs, const size_t *src_strides, void *dst, | |
| 431 | size_t dst_stride, size_t width, size_t height, | ||
| 432 | size_t channels, size_t element_size) { | ||
| 433 |
5/5✓ Branch 0 taken 404 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 308 times.
✓ Branch 3 taken 404 times.
✓ Branch 4 taken 404 times.
|
1524 | switch (element_size) { |
| 434 | case sizeof(uint8_t): | ||
| 435 | 616 | return merge<uint8_t>(srcs, src_strides, dst, dst_stride, width, height, | |
| 436 | 308 | channels); | |
| 437 | |||
| 438 | case sizeof(uint16_t): | ||
| 439 | 808 | return merge<uint16_t>(srcs, src_strides, dst, dst_stride, width, height, | |
| 440 | 404 | channels); | |
| 441 | |||
| 442 | case sizeof(uint32_t): | ||
| 443 | 808 | return merge<uint32_t>(srcs, src_strides, dst, dst_stride, width, height, | |
| 444 | 404 | channels); | |
| 445 | |||
| 446 | case sizeof(uint64_t): | ||
| 447 | 808 | return merge<uint64_t>(srcs, src_strides, dst, dst_stride, width, height, | |
| 448 | 404 | channels); | |
| 449 | |||
| 450 | default: | ||
| 451 | 4 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 452 | } | ||
| 453 | 1524 | } | |
| 454 | |||
| 455 | } // namespace kleidicv::neon | ||
| 456 |