KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/filters/gaussian_blur_fixed_neon.cpp
Date:	2025-09-25 14:13:34

	Exec	Total	Coverage
Lines:	224	224	100.0%
Functions:	55	55	100.0%
Branches:	54	56	96.4%

  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #include <cassert>
    
      #include <cstddef>
    
      #include "kleidicv/config.h"
    
      #include "kleidicv/ctypes.h"
    
      #include "kleidicv/filters/gaussian_blur.h"
    
      #include "kleidicv/filters/separable_filter_15x15_neon.h"
    
      #include "kleidicv/filters/separable_filter_21x21_neon.h"
    
      #include "kleidicv/filters/separable_filter_3x3_neon.h"
    
      #include "kleidicv/filters/separable_filter_5x5_neon.h"
    
      #include "kleidicv/filters/separable_filter_7x7_neon.h"
    
      #include "kleidicv/filters/sigma.h"
    
      #include "kleidicv/neon.h"
    
      #include "kleidicv/workspace/border_types.h"
    
      #include "kleidicv/workspace/separable.h"
    
      namespace kleidicv::neon {
    
      // Primary template for Gaussian Blur filters.
    
      template <typename ScalarType, size_t KernelSize, bool IsBinomial>
    
      class GaussianBlur;
    
      // Template for 3x3 Gaussian Blur binomial filters.
    
      //
    
      //             [ 1, 2, 1 ]          [ 1 ]
    
      //  F = 1/16 * [ 2, 4, 2 ] = 1/16 * [ 2 ] * [ 1, 2, 1 ]
    
      //             [ 1, 2, 1 ]          [ 1 ]
    
      template <>
    
      class GaussianBlur<uint8_t, 3, true> {
    
       public:
    
        using ScalarType = uint8_t;
    
        using SourceType = ScalarType;
    
        using SourceVectorType = typename VecTraits<SourceType>::VectorType;
    
        using BufferType = double_element_width_t<ScalarType>;
    
        using BufferVectorType = typename VecTraits<BufferType>::VectorType;
    
        using DestinationType = ScalarType;
    
        // Applies vertical filtering vector using SIMD operations.
    
        //
    
        // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
    
      114
        void vertical_vector_path(SourceVectorType src[3], BufferType *dst) const {
    
          // acc_0_2 = src[0] + src[2]
    
      114
          BufferVectorType acc_0_2_l = vaddl(vget_low(src[0]), vget_low(src[2]));
    
      114
          BufferVectorType acc_0_2_h = vaddl(vget_high(src[0]), vget_high(src[2]));
    
          // acc_1 = src[1] + src[1]
    
      114
          BufferVectorType acc_1_l = vshll_n<1>(vget_low(src[1]));
    
      114
          BufferVectorType acc_1_h = vshll_n<1>(vget_high(src[1]));
    
          // acc = acc_0_2 + acc_1
    
      114
          BufferVectorType acc_l = vaddq(acc_0_2_l, acc_1_l);
    
      114
          BufferVectorType acc_h = vaddq(acc_0_2_h, acc_1_h);
    
      114
          VecTraits<BufferType>::store_consecutive(acc_l, acc_h, &dst[0]);
    
      114
        }
    
        // Applies vertical filtering vector using scalar operations.
    
        //
    
        // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
    
      320
        void vertical_scalar_path(const SourceType src[3], BufferType *dst) const {
    
      320
          dst[0] = src[0] + 2 * src[1] + src[2];
    
      320
        }
    
        // Applies horizontal filtering vector using SIMD operations.
    
        //
    
        // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
    
      176
        void horizontal_vector_path(BufferVectorType src[3],
    
                                    DestinationType *dst) const {
    
      176
          BufferVectorType acc_wide = vaddq(src[0], src[2]);
    
      176
          acc_wide = vaddq(acc_wide, vshlq_n<1>(src[1]));
    
      176
          auto acc_narrow = vrshrn_n<4>(acc_wide);
    
      176
          vst1(&dst[0], acc_narrow);
    
      176
        }
    
        // Applies horizontal filtering vector using scalar operations.
    
        //
    
        // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
    
      452
        void horizontal_scalar_path(const BufferType src[3],
    
                                    DestinationType *dst) const {
    
      452
          auto acc = src[0] + 2 * src[1] + src[2];
    
      452
          dst[0] = rounding_shift_right(acc, 4);
    
      452
        }
    
      };  // end of class GaussianBlur<uint8_t, 3, true>
    
      // Template for 5x5 Gaussian Blur binomial filters.
    
      //
    
      //              [ 1,  4,  6,  4, 1 ]           [ 1 ]
    
      //              [ 4, 16, 24, 16, 4 ]           [ 4 ]
    
      //  F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1,  4,  6,  4, 1 ]
    
      //              [ 4, 16, 24, 16, 4 ]           [ 4 ]
    
      //              [ 1,  4,  6,  4, 1 ]           [ 1 ]
    
      template <>
    
      class GaussianBlur<uint8_t, 5, true> {
    
       public:
    
        using SourceType = uint8_t;
    
        using BufferType = uint16_t;
    
        using DestinationType = uint8_t;
    
      51
        GaussianBlur()
    
      51
            : const_6_u8_half_{vdup_n_u8(6)},
    
      51
              const_6_u16_{vdupq_n_u16(6)},
    
      51
              const_4_u16_{vdupq_n_u16(4)} {}
    
        // Applies vertical filtering vector using SIMD operations.
    
        //
    
        // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
    
      300
        void vertical_vector_path(uint8x16_t src[5], BufferType *dst) const {
    
      300
          uint16x8_t acc_0_4_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[4]));
    
      300
          uint16x8_t acc_0_4_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[4]));
    
      300
          uint16x8_t acc_1_3_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[3]));
    
      300
          uint16x8_t acc_1_3_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[3]));
    
      600
          uint16x8_t acc_l =
    
      300
              vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_half_);
    
      600
          uint16x8_t acc_h =
    
      300
              vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_half_);
    
      300
          acc_l = vmlaq_u16(acc_l, acc_1_3_l, const_4_u16_);
    
      300
          acc_h = vmlaq_u16(acc_h, acc_1_3_h, const_4_u16_);
    
      300
          vst1q(&dst[0], acc_l);
    
      300
          vst1q(&dst[8], acc_h);
    
      300
        }
    
        // Applies vertical filtering vector using scalar operations.
    
        //
    
        // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
    
      3820
        void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
    
      3820
          dst[0] = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
    
      3820
        }
    
        // Applies horizontal filtering vector using SIMD operations.
    
        //
    
        // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
    
      604
        void horizontal_vector_path(uint16x8_t src[5], DestinationType *dst) const {
    
      604
          uint16x8_t acc_0_4 = vaddq_u16(src[0], src[4]);
    
      604
          uint16x8_t acc_1_3 = vaddq_u16(src[1], src[3]);
    
      604
          uint16x8_t acc_u16 = vmlaq_u16(acc_0_4, src[2], const_6_u16_);
    
      604
          acc_u16 = vmlaq_u16(acc_u16, acc_1_3, const_4_u16_);
    
      604
          uint8x8_t acc_u8 = vrshrn_n_u16(acc_u16, 8);
    
      604
          vst1(&dst[0], acc_u8);
    
      604
        }
    
        // Applies horizontal filtering vector using scalar operations.
    
        //
    
        // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
    
      3068
        void horizontal_scalar_path(const BufferType src[5],
    
                                    DestinationType *dst) const {
    
      3068
          auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
    
      3068
          dst[0] = rounding_shift_right(acc, 8);
    
      3068
        }
    
       private:
    
        uint8x8_t const_6_u8_half_;
    
        uint16x8_t const_6_u16_;
    
        uint16x8_t const_4_u16_;
    
      };  // end of class GaussianBlur<uint8_t, 5, true>
    
      // Template for 7x7 Gaussian Blur binomial filters.
    
      //
    
      //               [  4,  14,  28,  36,  28,  14,  4 ]
    
      //               [ 14,  49,  98, 126,  98,  49, 14 ]
    
      //               [ 28,  98, 196, 252, 196,  98, 28 ]
    
      //  F = 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] =
    
      //               [ 28,  98, 196, 252, 196,  98, 28 ]
    
      //               [ 14,  49,  98, 126,  98,  49, 14 ]
    
      //               [  4,  14,  28,  36,  28,  14,  4 ]
    
      //
    
      //               [  2 ]
    
      //               [  7 ]
    
      //               [ 14 ]
    
      //  = 1/4096  *  [ 18 ] * [ 2, 7, 14, 18, 14, 7, 2 ]
    
      //               [ 14 ]
    
      //               [  7 ]
    
      //               [  2 ]
    
      template <>
    
      class GaussianBlur<uint8_t, 7, true> {
    
       public:
    
        using SourceType = uint8_t;
    
        using BufferType = uint16_t;
    
        using DestinationType = uint8_t;
    
      33
        GaussianBlur()
    
      33
            : const_7_u16_{vdupq_n_u16(7)},
    
      33
              const_7_u32_{vdupq_n_u32(7)},
    
      33
              const_9_u16_{vdupq_n_u16(9)} {}
    
        // Applies vertical filtering vector using SIMD operations.
    
        //
    
        // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
    
        //     * [ 2, 7, 14, 18, 14, 7, 2 ]T
    
      240
        void vertical_vector_path(uint8x16_t src[7], BufferType *dst) const {
    
      240
          uint16x8_t acc_0_6_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[6]));
    
      240
          uint16x8_t acc_0_6_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[6]));
    
      240
          uint16x8_t acc_1_5_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[5]));
    
      240
          uint16x8_t acc_1_5_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[5]));
    
      240
          uint16x8_t acc_2_4_l = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[4]));
    
      240
          uint16x8_t acc_2_4_h = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[4]));
    
      240
          uint16x8_t acc_3_l = vmovl_u8(vget_low_u8(src[3]));
    
      240
          uint16x8_t acc_3_h = vmovl_u8(vget_high_u8(src[3]));
    
      240
          uint16x8_t acc_0_2_4_6_l = vmlaq_u16(acc_0_6_l, acc_2_4_l, const_7_u16_);
    
      240
          uint16x8_t acc_0_2_4_6_h = vmlaq_u16(acc_0_6_h, acc_2_4_h, const_7_u16_);
    
      480
          uint16x8_t acc_0_2_3_4_6_l =
    
      240
              vmlaq_u16(acc_0_2_4_6_l, acc_3_l, const_9_u16_);
    
      480
          uint16x8_t acc_0_2_3_4_6_h =
    
      240
              vmlaq_u16(acc_0_2_4_6_h, acc_3_h, const_9_u16_);
    
      240
          acc_0_2_3_4_6_l = vshlq_n_u16(acc_0_2_3_4_6_l, 1);
    
      240
          acc_0_2_3_4_6_h = vshlq_n_u16(acc_0_2_3_4_6_h, 1);
    
      480
          uint16x8_t acc_0_1_2_3_4_5_6_l =
    
      240
              vmlaq_u16(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u16_);
    
      480
          uint16x8_t acc_0_1_2_3_4_5_6_h =
    
      240
              vmlaq_u16(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u16_);
    
      240
          vst1q(&dst[0], acc_0_1_2_3_4_5_6_l);
    
      240
          vst1q(&dst[8], acc_0_1_2_3_4_5_6_h);
    
      240
        }
    
        // Applies vertical filtering vector using scalar operations.
    
        //
    
        // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
    
        //     * [ 2, 7, 14, 18, 14, 7, 2 ]T
    
      664
        void vertical_scalar_path(const SourceType src[7], BufferType *dst) const {
    
      1992
          uint16_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 +
    
      1328
                         src[4] * 14 + src[5] * 7 + src[6] * 2;
    
      664
          dst[0] = acc;
    
      664
        }
    
        // Applies horizontal filtering vector using SIMD operations.
    
        //
    
        // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
    
        //              * [ 2, 7, 14, 18, 14, 7, 2 ]T
    
      216
        void horizontal_vector_path(uint16x8_t src[7], DestinationType *dst) const {
    
      432
          uint32x4_t acc_0_6_l =
    
      216
              vaddl_u16(vget_low_u16(src[0]), vget_low_u16(src[6]));
    
      432
          uint32x4_t acc_0_6_h =
    
      216
              vaddl_u16(vget_high_u16(src[0]), vget_high_u16(src[6]));
    
      432
          uint32x4_t acc_1_5_l =
    
      216
              vaddl_u16(vget_low_u16(src[1]), vget_low_u16(src[5]));
    
      432
          uint32x4_t acc_1_5_h =
    
      216
              vaddl_u16(vget_high_u16(src[1]), vget_high_u16(src[5]));
    
      216
          uint16x8_t acc_2_4 = vaddq_u16(src[2], src[4]);
    
      432
          uint32x4_t acc_0_2_4_6_l =
    
      216
              vmlal_u16(acc_0_6_l, vget_low_u16(acc_2_4), vget_low_u16(const_7_u16_));
    
      432
          uint32x4_t acc_0_2_4_6_h = vmlal_u16(acc_0_6_h, vget_high_u16(acc_2_4),
    
      216
                                               vget_high_u16(const_7_u16_));
    
      432
          uint32x4_t acc_0_2_3_4_6_l = vmlal_u16(acc_0_2_4_6_l, vget_low_u16(src[3]),
    
      216
                                                 vget_low_u16(const_9_u16_));
    
      432
          uint32x4_t acc_0_2_3_4_6_h = vmlal_u16(acc_0_2_4_6_h, vget_high_u16(src[3]),
    
      216
                                                 vget_high_u16(const_9_u16_));
    
      216
          acc_0_2_3_4_6_l = vshlq_n_u32(acc_0_2_3_4_6_l, 1);
    
      216
          acc_0_2_3_4_6_h = vshlq_n_u32(acc_0_2_3_4_6_h, 1);
    
      432
          uint32x4_t acc_0_1_2_3_4_5_6_l =
    
      216
              vmlaq_u32(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u32_);
    
      432
          uint32x4_t acc_0_1_2_3_4_5_6_h =
    
      216
              vmlaq_u32(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u32_);
    
      216
          uint16x4_t acc_0_1_2_3_4_5_6_u16_l = vrshrn_n_u32(acc_0_1_2_3_4_5_6_l, 12);
    
      216
          uint16x4_t acc_0_1_2_3_4_5_6_u16_h = vrshrn_n_u32(acc_0_1_2_3_4_5_6_h, 12);
    
      432
          uint16x8_t acc_0_1_2_3_4_5_6_u16 =
    
      216
              vcombine_u16(acc_0_1_2_3_4_5_6_u16_l, acc_0_1_2_3_4_5_6_u16_h);
    
      216
          uint8x8_t acc_0_1_2_3_4_5_6_u8 = vmovn_u16(acc_0_1_2_3_4_5_6_u16);
    
      216
          vst1(&dst[0], acc_0_1_2_3_4_5_6_u8);
    
      216
        }
    
        // Applies horizontal filtering vector using scalar operations.
    
        //
    
        // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
    
        //              * [ 2, 7, 14, 18, 14, 7, 2 ]T
    
      1672
        void horizontal_scalar_path(const BufferType src[7],
    
                                    DestinationType *dst) const {
    
      5016
          uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 +
    
      3344
                         src[4] * 14 + src[5] * 7 + src[6] * 2;
    
      1672
          dst[0] = static_cast<DestinationType>(rounding_shift_right(acc, 12));
    
      1672
        }
    
       private:
    
        uint16x8_t const_7_u16_;
    
        uint32x4_t const_7_u32_;
    
        uint16x8_t const_9_u16_;
    
      };  // end of class GaussianBlur<uint8_t, 7, true>
    
      template <size_t KernelSize>
    
      class GaussianBlur<uint8_t, KernelSize, false> {
    
       public:
    
        using SourceType = uint8_t;
    
        using BufferType = uint8_t;
    
        using DestinationType = uint8_t;
    
        static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
    
      115
        explicit GaussianBlur(const uint16_t *half_kernel)
    
      115
            : half_kernel_(half_kernel) {}
    
      3504
        void vertical_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const {
    
      3504
          common_vector_path(src, dst);
    
      3504
        }
    
      40312
        void vertical_scalar_path(const SourceType src[KernelSize],
    
                                  BufferType *dst) const {
    
      80624
          uint16_t acc = static_cast<uint16_t>(src[kHalfKernelSize - 1]) *
    
      40312
                         half_kernel_[kHalfKernelSize - 1];
    
          // Optimization to avoid unnecessary branching in vector code.
    
          KLEIDICV_FORCE_LOOP_UNROLL
    
        10/10✓ Branch 0 taken 13568 times.
✓ Branch 1 taken 94976 times.
✓ Branch 2 taken 23528 times.
✓ Branch 3 taken 235280 times.
✓ Branch 4 taken 336 times.
✓ Branch 5 taken 336 times.
✓ Branch 6 taken 1096 times.
✓ Branch 7 taken 2192 times.
✓ Branch 8 taken 1784 times.
✓ Branch 9 taken 5352 times.

      378448
          for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
    
      1014408
            acc += (static_cast<uint16_t>(src[i]) +
    
      676272
                    static_cast<uint16_t>(src[KernelSize - i - 1])) *
    
      338136
                   half_kernel_[i];
    
      338136
          }
    
      40312
          dst[0] = static_cast<DestinationType>(rounding_shift_right(acc, 8));
    
      40312
        }
    
      960
        void horizontal_vector_path(uint8x16_t src[KernelSize],
    
                                    DestinationType *dst) const {
    
      960
          common_vector_path(src, dst);
    
      960
        }
    
      37268
        void horizontal_scalar_path(const BufferType src[KernelSize],
    
                                    DestinationType *dst) const {
    
      37268
          vertical_scalar_path(src, dst);
    
      37268
        }
    
       private:
    
      4464
        void common_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const {
    
      4464
          uint8x8_t half_kernel_mid = vdup_n_u8(half_kernel_[kHalfKernelSize - 1]);
    
      8928
          uint16x8_t acc_l =
    
      8928
              vmlal_u8(vdupq_n_u16(128), vget_low_u8(src[kHalfKernelSize - 1]),
    
      4464
                       half_kernel_mid);
    
      8928
          uint16x8_t acc_h =
    
      8928
              vmlal_u8(vdupq_n_u16(128), vget_high_u8(src[kHalfKernelSize - 1]),
    
      4464
                       half_kernel_mid);
    
          // Optimization to avoid unnecessary branching in vector code.
    
          KLEIDICV_FORCE_LOOP_UNROLL
    
        10/10✓ Branch 0 taken 1344 times.
✓ Branch 1 taken 9408 times.
✓ Branch 2 taken 2736 times.
✓ Branch 3 taken 27360 times.
✓ Branch 4 taken 56 times.
✓ Branch 5 taken 56 times.
✓ Branch 6 taken 112 times.
✓ Branch 7 taken 224 times.
✓ Branch 8 taken 216 times.
✓ Branch 9 taken 648 times.

      42160
          for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
    
      37696
            const size_t j = KernelSize - i - 1;
    
      37696
            uint16x8_t vec_l = vaddl_u8(vget_low_u8(src[i]), vget_low_u8(src[j]));
    
      37696
            uint16x8_t vec_h = vaddl_high_u8(src[i], src[j]);
    
      37696
            uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]);
    
      37696
            acc_l = vmlaq_u16(acc_l, vec_l, coeff);
    
      37696
            acc_h = vmlaq_u16(acc_h, vec_h, coeff);
    
      37696
          }
    
          // Keep only the highest 8 bits
    
      8928
          uint8x16_t result =
    
      4464
              vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
    
      4464
          neon::VecTraits<uint8_t>::store(result, &dst[0]);
    
      4464
        }
    
        const uint16_t *half_kernel_;
    
      };  // end of class GaussianBlur<uint8_t, KernelSize, false>
    
      template <size_t KernelSize, bool IsBinomial, typename ScalarType>
    
      326
      static kleidicv_error_t gaussian_blur_fixed_kernel_size(
    
          const ScalarType *src, size_t src_stride, ScalarType *dst,
    
          size_t dst_stride, Rectangle &rect, size_t y_begin, size_t y_end,
    
          size_t channels, float sigma, FixedBorderType border_type,
    
          SeparableFilterWorkspace *workspace) {
    
        using GaussianBlurFilter = GaussianBlur<ScalarType, KernelSize, IsBinomial>;
    
      326
        Rows<const ScalarType> src_rows{src, src_stride, channels};
    
      326
        Rows<ScalarType> dst_rows{dst, dst_stride, channels};
    
        if constexpr (IsBinomial) {
    
      131
          GaussianBlurFilter blur;
    
      131
          SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
    
      262
          workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
    
      131
                             border_type, filter);
    
      131
          return KLEIDICV_OK;
    
      131
        } else {
    
      195
          constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
    
      195
          uint16_t half_kernel[128];
    
      195
          generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma);
    
          // If sigma is so small that the middle point gets all the weights, it's
    
          // just a copy
    
        10/10✓ Branch 0 taken 32 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 32 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 17 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 17 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 17 times.
✓ Branch 9 taken 16 times.

      195
          if (half_kernel[kHalfKernelSize - 1] < 256) {
    
      115
            GaussianBlurFilter blur(half_kernel);
    
      115
            SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
    
      230
            workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
    
      115
                               border_type, filter);
    
      115
          } else {
    
        10/10✓ Branch 0 taken 228 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 324 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 36 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 68 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 100 times.
✓ Branch 9 taken 16 times.

      836
            for (size_t row = y_begin; row < y_end; ++row) {
    
      1512
              std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]),
    
      756
                          static_cast<const void *>(&src_rows.at(row)[0]),
    
      756
                          rect.width() * sizeof(ScalarType) * dst_rows.channels());
    
      756
            }
    
          }
    
      195
          return KLEIDICV_OK;
    
      195
        }
    
      326
      }
    
      template <bool IsBinomial, typename ScalarType>
    
      326
      static kleidicv_error_t gaussian_blur_fixed(
    
          size_t kernel_size, const ScalarType *src, size_t src_stride,
    
          ScalarType *dst, size_t dst_stride, Rectangle &rect, size_t y_begin,
    
          size_t y_end, size_t channels, float sigma, FixedBorderType border_type,
    
          SeparableFilterWorkspace *workspace) {
    
        10/12✓ Branch 0 taken 47 times.
✓ Branch 1 taken 51 times.
✓ Branch 2 taken 33 times.
✓ Branch 3 taken 16 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 33 times.
✓ Branch 7 taken 33 times.
✓ Branch 8 taken 33 times.
✓ Branch 9 taken 32 times.
✗ Branch 10 not taken.
✓ Branch 11 taken 32 times.

      326
        switch (kernel_size) {
    
          case 3:
    
      80
            return gaussian_blur_fixed_kernel_size<3, IsBinomial>(
    
      80
                src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
    
      80
                sigma, border_type, workspace);
    
          case 5:
    
      84
            return gaussian_blur_fixed_kernel_size<5, IsBinomial>(
    
      84
                src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
    
      84
                sigma, border_type, workspace);
    
          case 7:
    
      66
            return gaussian_blur_fixed_kernel_size<7, IsBinomial>(
    
      66
                src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
    
      66
                sigma, border_type, workspace);
    
          case 15:
    
            // 15x15 does not have a binomial variant
    
      48
            return gaussian_blur_fixed_kernel_size<15, false>(
    
      48
                src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
    
      48
                sigma, border_type, workspace);
    
          case 21:
    
            // 21x21 does not have a binomial variant
    
      48
            return gaussian_blur_fixed_kernel_size<21, false>(
    
      48
                src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
    
      48
                sigma, border_type, workspace);
    
            // gaussian_blur_is_implemented checked the kernel size already.
    
          // GCOVR_EXCL_START
    
          default:
    
            assert(!"kernel size not implemented");
    
      −
            return KLEIDICV_ERROR_NOT_IMPLEMENTED;
    
            // GCOVR_EXCL_STOP
    
        }
    
      326
      }
    
      KLEIDICV_TARGET_FN_ATTRS
    
      345
      kleidicv_error_t gaussian_blur_fixed_stripe_u8(
    
          const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
    
          size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
    
          size_t kernel_width, size_t /*kernel_height*/, float sigma_x,
    
          float /*sigma_y*/, FixedBorderType fixed_border_type,
    
          kleidicv_filter_context_t *context) {
    
      345
        auto *workspace = reinterpret_cast<SeparableFilterWorkspace *>(context);
    
      690
        kleidicv_error_t checks_result = gaussian_blur_checks(
    
      345
            src, src_stride, dst, dst_stride, width, height, channels, workspace);
    
        2/2✓ Branch 0 taken 19 times.
✓ Branch 1 taken 326 times.

      345
        if (checks_result != KLEIDICV_OK) {
    
      19
          return checks_result;
    
        }
    
      326
        Rectangle rect{width, height};
    
        2/2✓ Branch 0 taken 163 times.
✓ Branch 1 taken 163 times.

      326
        if (sigma_x == 0.0) {
    
      326
          return gaussian_blur_fixed<true>(kernel_width, src, src_stride, dst,
    
      163
                                           dst_stride, rect, y_begin, y_end, channels,
    
      163
                                           sigma_x, fixed_border_type, workspace);
    
        }
    
      326
        return gaussian_blur_fixed<false>(kernel_width, src, src_stride, dst,
    
      163
                                          dst_stride, rect, y_begin, y_end, channels,
    
      163
                                          sigma_x, fixed_border_type, workspace);
    
      345
      }
    
      }  // namespace kleidicv::neon

Line	Branch	Exec	Source
1			// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2			//
3			// SPDX-License-Identifier: Apache-2.0
4
5			#include <cassert>
6			#include <cstddef>
7
8			#include "kleidicv/config.h"
9			#include "kleidicv/ctypes.h"
10			#include "kleidicv/filters/gaussian_blur.h"
11			#include "kleidicv/filters/separable_filter_15x15_neon.h"
12			#include "kleidicv/filters/separable_filter_21x21_neon.h"
13			#include "kleidicv/filters/separable_filter_3x3_neon.h"
14			#include "kleidicv/filters/separable_filter_5x5_neon.h"
15			#include "kleidicv/filters/separable_filter_7x7_neon.h"
16			#include "kleidicv/filters/sigma.h"
17			#include "kleidicv/neon.h"
18			#include "kleidicv/workspace/border_types.h"
19			#include "kleidicv/workspace/separable.h"
20
21			namespace kleidicv::neon {
22
23			// Primary template for Gaussian Blur filters.
24			template <typename ScalarType, size_t KernelSize, bool IsBinomial>
25			class GaussianBlur;
26
27			// Template for 3x3 Gaussian Blur binomial filters.
28			//
29			// [ 1, 2, 1 ] [ 1 ]
30			// F = 1/16 * [ 2, 4, 2 ] = 1/16 * [ 2 ] * [ 1, 2, 1 ]
31			// [ 1, 2, 1 ] [ 1 ]
32			template <>
33			class GaussianBlur<uint8_t, 3, true> {
34			public:
35			using ScalarType = uint8_t;
36			using SourceType = ScalarType;
37			using SourceVectorType = typename VecTraits<SourceType>::VectorType;
38			using BufferType = double_element_width_t<ScalarType>;
39			using BufferVectorType = typename VecTraits<BufferType>::VectorType;
40			using DestinationType = ScalarType;
41
42			// Applies vertical filtering vector using SIMD operations.
43			//
44			// DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
45		114	void vertical_vector_path(SourceVectorType src[3], BufferType *dst) const {
46			// acc_0_2 = src[0] + src[2]
47		114	BufferVectorType acc_0_2_l = vaddl(vget_low(src[0]), vget_low(src[2]));
48		114	BufferVectorType acc_0_2_h = vaddl(vget_high(src[0]), vget_high(src[2]));
49			// acc_1 = src[1] + src[1]
50		114	BufferVectorType acc_1_l = vshll_n<1>(vget_low(src[1]));
51		114	BufferVectorType acc_1_h = vshll_n<1>(vget_high(src[1]));
52			// acc = acc_0_2 + acc_1
53		114	BufferVectorType acc_l = vaddq(acc_0_2_l, acc_1_l);
54		114	BufferVectorType acc_h = vaddq(acc_0_2_h, acc_1_h);
55
56		114	VecTraits<BufferType>::store_consecutive(acc_l, acc_h, &dst[0]);
57		114	}
58
59			// Applies vertical filtering vector using scalar operations.
60			//
61			// DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
62		320	void vertical_scalar_path(const SourceType src[3], BufferType *dst) const {
63		320	dst[0] = src[0] + 2 * src[1] + src[2];
64		320	}
65
66			// Applies horizontal filtering vector using SIMD operations.
67			//
68			// DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
69		176	void horizontal_vector_path(BufferVectorType src[3],
70			DestinationType *dst) const {
71		176	BufferVectorType acc_wide = vaddq(src[0], src[2]);
72		176	acc_wide = vaddq(acc_wide, vshlq_n<1>(src[1]));
73		176	auto acc_narrow = vrshrn_n<4>(acc_wide);
74		176	vst1(&dst[0], acc_narrow);
75		176	}
76
77			// Applies horizontal filtering vector using scalar operations.
78			//
79			// DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
80		452	void horizontal_scalar_path(const BufferType src[3],
81			DestinationType *dst) const {
82		452	auto acc = src[0] + 2 * src[1] + src[2];
83		452	dst[0] = rounding_shift_right(acc, 4);
84		452	}
85			}; // end of class GaussianBlur<uint8_t, 3, true>
86
87			// Template for 5x5 Gaussian Blur binomial filters.
88			//
89			// [ 1, 4, 6, 4, 1 ] [ 1 ]
90			// [ 4, 16, 24, 16, 4 ] [ 4 ]
91			// F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ]
92			// [ 4, 16, 24, 16, 4 ] [ 4 ]
93			// [ 1, 4, 6, 4, 1 ] [ 1 ]
94			template <>
95			class GaussianBlur<uint8_t, 5, true> {
96			public:
97			using SourceType = uint8_t;
98			using BufferType = uint16_t;
99			using DestinationType = uint8_t;
100
101		51	GaussianBlur()
102		51	: const_6_u8_half_{vdup_n_u8(6)},
103		51	const_6_u16_{vdupq_n_u16(6)},
104		51	const_4_u16_{vdupq_n_u16(4)} {}
105
106			// Applies vertical filtering vector using SIMD operations.
107			//
108			// DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
109		300	void vertical_vector_path(uint8x16_t src[5], BufferType *dst) const {
110		300	uint16x8_t acc_0_4_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[4]));
111		300	uint16x8_t acc_0_4_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[4]));
112		300	uint16x8_t acc_1_3_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[3]));
113		300	uint16x8_t acc_1_3_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[3]));
114		600	uint16x8_t acc_l =
115		300	vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_half_);
116		600	uint16x8_t acc_h =
117		300	vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_half_);
118		300	acc_l = vmlaq_u16(acc_l, acc_1_3_l, const_4_u16_);
119		300	acc_h = vmlaq_u16(acc_h, acc_1_3_h, const_4_u16_);
120		300	vst1q(&dst[0], acc_l);
121		300	vst1q(&dst[8], acc_h);
122		300	}
123
124			// Applies vertical filtering vector using scalar operations.
125			//
126			// DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
127		3820	void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
128		3820	dst[0] = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
129		3820	}
130
131			// Applies horizontal filtering vector using SIMD operations.
132			//
133			// DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
134		604	void horizontal_vector_path(uint16x8_t src[5], DestinationType *dst) const {
135		604	uint16x8_t acc_0_4 = vaddq_u16(src[0], src[4]);
136		604	uint16x8_t acc_1_3 = vaddq_u16(src[1], src[3]);
137		604	uint16x8_t acc_u16 = vmlaq_u16(acc_0_4, src[2], const_6_u16_);
138		604	acc_u16 = vmlaq_u16(acc_u16, acc_1_3, const_4_u16_);
139		604	uint8x8_t acc_u8 = vrshrn_n_u16(acc_u16, 8);
140		604	vst1(&dst[0], acc_u8);
141		604	}
142
143			// Applies horizontal filtering vector using scalar operations.
144			//
145			// DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
146		3068	void horizontal_scalar_path(const BufferType src[5],
147			DestinationType *dst) const {
148		3068	auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
149		3068	dst[0] = rounding_shift_right(acc, 8);
150		3068	}
151
152			private:
153			uint8x8_t const_6_u8_half_;
154			uint16x8_t const_6_u16_;
155			uint16x8_t const_4_u16_;
156			}; // end of class GaussianBlur<uint8_t, 5, true>
157
158			// Template for 7x7 Gaussian Blur binomial filters.
159			//
160			// [ 4, 14, 28, 36, 28, 14, 4 ]
161			// [ 14, 49, 98, 126, 98, 49, 14 ]
162			// [ 28, 98, 196, 252, 196, 98, 28 ]
163			// F = 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] =
164			// [ 28, 98, 196, 252, 196, 98, 28 ]
165			// [ 14, 49, 98, 126, 98, 49, 14 ]
166			// [ 4, 14, 28, 36, 28, 14, 4 ]
167			//
168			// [ 2 ]
169			// [ 7 ]
170			// [ 14 ]
171			// = 1/4096 * [ 18 ] * [ 2, 7, 14, 18, 14, 7, 2 ]
172			// [ 14 ]
173			// [ 7 ]
174			// [ 2 ]
175			template <>
176			class GaussianBlur<uint8_t, 7, true> {
177			public:
178			using SourceType = uint8_t;
179			using BufferType = uint16_t;
180			using DestinationType = uint8_t;
181
182		33	GaussianBlur()
183		33	: const_7_u16_{vdupq_n_u16(7)},
184		33	const_7_u32_{vdupq_n_u32(7)},
185		33	const_9_u16_{vdupq_n_u16(9)} {}
186
187			// Applies vertical filtering vector using SIMD operations.
188			//
189			// DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
190			// * [ 2, 7, 14, 18, 14, 7, 2 ]T
191		240	void vertical_vector_path(uint8x16_t src[7], BufferType *dst) const {
192		240	uint16x8_t acc_0_6_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[6]));
193		240	uint16x8_t acc_0_6_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[6]));
194
195		240	uint16x8_t acc_1_5_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[5]));
196		240	uint16x8_t acc_1_5_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[5]));
197
198		240	uint16x8_t acc_2_4_l = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[4]));
199		240	uint16x8_t acc_2_4_h = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[4]));
200
201		240	uint16x8_t acc_3_l = vmovl_u8(vget_low_u8(src[3]));
202		240	uint16x8_t acc_3_h = vmovl_u8(vget_high_u8(src[3]));
203
204		240	uint16x8_t acc_0_2_4_6_l = vmlaq_u16(acc_0_6_l, acc_2_4_l, const_7_u16_);
205		240	uint16x8_t acc_0_2_4_6_h = vmlaq_u16(acc_0_6_h, acc_2_4_h, const_7_u16_);
206
207		480	uint16x8_t acc_0_2_3_4_6_l =
208		240	vmlaq_u16(acc_0_2_4_6_l, acc_3_l, const_9_u16_);
209		480	uint16x8_t acc_0_2_3_4_6_h =
210		240	vmlaq_u16(acc_0_2_4_6_h, acc_3_h, const_9_u16_);
211
212		240	acc_0_2_3_4_6_l = vshlq_n_u16(acc_0_2_3_4_6_l, 1);
213		240	acc_0_2_3_4_6_h = vshlq_n_u16(acc_0_2_3_4_6_h, 1);
214
215		480	uint16x8_t acc_0_1_2_3_4_5_6_l =
216		240	vmlaq_u16(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u16_);
217		480	uint16x8_t acc_0_1_2_3_4_5_6_h =
218		240	vmlaq_u16(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u16_);
219
220		240	vst1q(&dst[0], acc_0_1_2_3_4_5_6_l);
221		240	vst1q(&dst[8], acc_0_1_2_3_4_5_6_h);
222		240	}
223
224			// Applies vertical filtering vector using scalar operations.
225			//
226			// DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
227			// * [ 2, 7, 14, 18, 14, 7, 2 ]T
228		664	void vertical_scalar_path(const SourceType src[7], BufferType *dst) const {
229		1992	uint16_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 +
230		1328	src[4] * 14 + src[5] * 7 + src[6] * 2;
231		664	dst[0] = acc;
232		664	}
233
234			// Applies horizontal filtering vector using SIMD operations.
235			//
236			// DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
237			// * [ 2, 7, 14, 18, 14, 7, 2 ]T
238		216	void horizontal_vector_path(uint16x8_t src[7], DestinationType *dst) const {
239		432	uint32x4_t acc_0_6_l =
240		216	vaddl_u16(vget_low_u16(src[0]), vget_low_u16(src[6]));
241		432	uint32x4_t acc_0_6_h =
242		216	vaddl_u16(vget_high_u16(src[0]), vget_high_u16(src[6]));
243
244		432	uint32x4_t acc_1_5_l =
245		216	vaddl_u16(vget_low_u16(src[1]), vget_low_u16(src[5]));
246		432	uint32x4_t acc_1_5_h =
247		216	vaddl_u16(vget_high_u16(src[1]), vget_high_u16(src[5]));
248
249		216	uint16x8_t acc_2_4 = vaddq_u16(src[2], src[4]);
250
251		432	uint32x4_t acc_0_2_4_6_l =
252		216	vmlal_u16(acc_0_6_l, vget_low_u16(acc_2_4), vget_low_u16(const_7_u16_));
253		432	uint32x4_t acc_0_2_4_6_h = vmlal_u16(acc_0_6_h, vget_high_u16(acc_2_4),
254		216	vget_high_u16(const_7_u16_));
255
256		432	uint32x4_t acc_0_2_3_4_6_l = vmlal_u16(acc_0_2_4_6_l, vget_low_u16(src[3]),
257		216	vget_low_u16(const_9_u16_));
258		432	uint32x4_t acc_0_2_3_4_6_h = vmlal_u16(acc_0_2_4_6_h, vget_high_u16(src[3]),
259		216	vget_high_u16(const_9_u16_));
260
261		216	acc_0_2_3_4_6_l = vshlq_n_u32(acc_0_2_3_4_6_l, 1);
262		216	acc_0_2_3_4_6_h = vshlq_n_u32(acc_0_2_3_4_6_h, 1);
263
264		432	uint32x4_t acc_0_1_2_3_4_5_6_l =
265		216	vmlaq_u32(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u32_);
266		432	uint32x4_t acc_0_1_2_3_4_5_6_h =
267		216	vmlaq_u32(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u32_);
268
269		216	uint16x4_t acc_0_1_2_3_4_5_6_u16_l = vrshrn_n_u32(acc_0_1_2_3_4_5_6_l, 12);
270		216	uint16x4_t acc_0_1_2_3_4_5_6_u16_h = vrshrn_n_u32(acc_0_1_2_3_4_5_6_h, 12);
271
272		432	uint16x8_t acc_0_1_2_3_4_5_6_u16 =
273		216	vcombine_u16(acc_0_1_2_3_4_5_6_u16_l, acc_0_1_2_3_4_5_6_u16_h);
274		216	uint8x8_t acc_0_1_2_3_4_5_6_u8 = vmovn_u16(acc_0_1_2_3_4_5_6_u16);
275
276		216	vst1(&dst[0], acc_0_1_2_3_4_5_6_u8);
277		216	}
278
279			// Applies horizontal filtering vector using scalar operations.
280			//
281			// DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
282			// * [ 2, 7, 14, 18, 14, 7, 2 ]T
283		1672	void horizontal_scalar_path(const BufferType src[7],
284			DestinationType *dst) const {
285		5016	uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 +
286		3344	src[4] * 14 + src[5] * 7 + src[6] * 2;
287		1672	dst[0] = static_cast<DestinationType>(rounding_shift_right(acc, 12));
288		1672	}
289
290			private:
291			uint16x8_t const_7_u16_;
292			uint32x4_t const_7_u32_;
293			uint16x8_t const_9_u16_;
294			}; // end of class GaussianBlur<uint8_t, 7, true>
295
296			template <size_t KernelSize>
297			class GaussianBlur<uint8_t, KernelSize, false> {
298			public:
299			using SourceType = uint8_t;
300			using BufferType = uint8_t;
301			using DestinationType = uint8_t;
302
303			static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
304
305		115	explicit GaussianBlur(const uint16_t *half_kernel)
306		115	: half_kernel_(half_kernel) {}
307
308		3504	void vertical_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const {
309		3504	common_vector_path(src, dst);
310		3504	}
311
312		40312	void vertical_scalar_path(const SourceType src[KernelSize],
313			BufferType *dst) const {
314		80624	uint16_t acc = static_cast<uint16_t>(src[kHalfKernelSize - 1]) *
315		40312	half_kernel_[kHalfKernelSize - 1];
316
317			// Optimization to avoid unnecessary branching in vector code.
318			KLEIDICV_FORCE_LOOP_UNROLL
319	10/10 ✓ Branch 0 taken 13568 times. ✓ Branch 1 taken 94976 times. ✓ Branch 2 taken 23528 times. ✓ Branch 3 taken 235280 times. ✓ Branch 4 taken 336 times. ✓ Branch 5 taken 336 times. ✓ Branch 6 taken 1096 times. ✓ Branch 7 taken 2192 times. ✓ Branch 8 taken 1784 times. ✓ Branch 9 taken 5352 times.	378448	for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
320		1014408	acc += (static_cast<uint16_t>(src[i]) +
321		676272	static_cast<uint16_t>(src[KernelSize - i - 1])) *
322		338136	half_kernel_[i];
323		338136	}
324
325		40312	dst[0] = static_cast<DestinationType>(rounding_shift_right(acc, 8));
326		40312	}
327
328		960	void horizontal_vector_path(uint8x16_t src[KernelSize],
329			DestinationType *dst) const {
330		960	common_vector_path(src, dst);
331		960	}
332
333		37268	void horizontal_scalar_path(const BufferType src[KernelSize],
334			DestinationType *dst) const {
335		37268	vertical_scalar_path(src, dst);
336		37268	}
337
338			private:
339		4464	void common_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const {
340		4464	uint8x8_t half_kernel_mid = vdup_n_u8(half_kernel_[kHalfKernelSize - 1]);
341		8928	uint16x8_t acc_l =
342		8928	vmlal_u8(vdupq_n_u16(128), vget_low_u8(src[kHalfKernelSize - 1]),
343		4464	half_kernel_mid);
344		8928	uint16x8_t acc_h =
345		8928	vmlal_u8(vdupq_n_u16(128), vget_high_u8(src[kHalfKernelSize - 1]),
346		4464	half_kernel_mid);
347
348			// Optimization to avoid unnecessary branching in vector code.
349			KLEIDICV_FORCE_LOOP_UNROLL
350	10/10 ✓ Branch 0 taken 1344 times. ✓ Branch 1 taken 9408 times. ✓ Branch 2 taken 2736 times. ✓ Branch 3 taken 27360 times. ✓ Branch 4 taken 56 times. ✓ Branch 5 taken 56 times. ✓ Branch 6 taken 112 times. ✓ Branch 7 taken 224 times. ✓ Branch 8 taken 216 times. ✓ Branch 9 taken 648 times.	42160	for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
351		37696	const size_t j = KernelSize - i - 1;
352		37696	uint16x8_t vec_l = vaddl_u8(vget_low_u8(src[i]), vget_low_u8(src[j]));
353		37696	uint16x8_t vec_h = vaddl_high_u8(src[i], src[j]);
354		37696	uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]);
355
356		37696	acc_l = vmlaq_u16(acc_l, vec_l, coeff);
357		37696	acc_h = vmlaq_u16(acc_h, vec_h, coeff);
358		37696	}
359
360			// Keep only the highest 8 bits
361		8928	uint8x16_t result =
362		4464	vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
363		4464	neon::VecTraits<uint8_t>::store(result, &dst[0]);
364		4464	}
365
366			const uint16_t *half_kernel_;
367			}; // end of class GaussianBlur<uint8_t, KernelSize, false>
368
369			template <size_t KernelSize, bool IsBinomial, typename ScalarType>
370		326	static kleidicv_error_t gaussian_blur_fixed_kernel_size(
371			const ScalarType src, size_t src_stride, ScalarType dst,
372			size_t dst_stride, Rectangle &rect, size_t y_begin, size_t y_end,
373			size_t channels, float sigma, FixedBorderType border_type,
374			SeparableFilterWorkspace *workspace) {
375			using GaussianBlurFilter = GaussianBlur<ScalarType, KernelSize, IsBinomial>;
376
377		326	Rows<const ScalarType> src_rows{src, src_stride, channels};
378		326	Rows<ScalarType> dst_rows{dst, dst_stride, channels};
379
380			if constexpr (IsBinomial) {
381		131	GaussianBlurFilter blur;
382		131	SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
383		262	workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
384		131	border_type, filter);
385
386		131	return KLEIDICV_OK;
387		131	} else {
388		195	constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
389		195	uint16_t half_kernel[128];
390		195	generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma);
391			// If sigma is so small that the middle point gets all the weights, it's
392			// just a copy
393	10/10 ✓ Branch 0 taken 32 times. ✓ Branch 1 taken 16 times. ✓ Branch 2 taken 32 times. ✓ Branch 3 taken 16 times. ✓ Branch 4 taken 17 times. ✓ Branch 5 taken 16 times. ✓ Branch 6 taken 17 times. ✓ Branch 7 taken 16 times. ✓ Branch 8 taken 17 times. ✓ Branch 9 taken 16 times.	195	if (half_kernel[kHalfKernelSize - 1] < 256) {
394		115	GaussianBlurFilter blur(half_kernel);
395		115	SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
396		230	workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
397		115	border_type, filter);
398		115	} else {
399	10/10 ✓ Branch 0 taken 228 times. ✓ Branch 1 taken 16 times. ✓ Branch 2 taken 324 times. ✓ Branch 3 taken 16 times. ✓ Branch 4 taken 36 times. ✓ Branch 5 taken 16 times. ✓ Branch 6 taken 68 times. ✓ Branch 7 taken 16 times. ✓ Branch 8 taken 100 times. ✓ Branch 9 taken 16 times.	836	for (size_t row = y_begin; row < y_end; ++row) {
400		1512	std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]),
401		756	static_cast<const void *>(&src_rows.at(row)[0]),
402		756	rect.width() * sizeof(ScalarType) * dst_rows.channels());
403		756	}
404			}
405		195	return KLEIDICV_OK;
406		195	}
407		326	}
408
409			template <bool IsBinomial, typename ScalarType>
410		326	static kleidicv_error_t gaussian_blur_fixed(
411			size_t kernel_size, const ScalarType *src, size_t src_stride,
412			ScalarType *dst, size_t dst_stride, Rectangle &rect, size_t y_begin,
413			size_t y_end, size_t channels, float sigma, FixedBorderType border_type,
414			SeparableFilterWorkspace *workspace) {
415	10/12 ✓ Branch 0 taken 47 times. ✓ Branch 1 taken 51 times. ✓ Branch 2 taken 33 times. ✓ Branch 3 taken 16 times. ✗ Branch 4 not taken. ✓ Branch 5 taken 16 times. ✓ Branch 6 taken 33 times. ✓ Branch 7 taken 33 times. ✓ Branch 8 taken 33 times. ✓ Branch 9 taken 32 times. ✗ Branch 10 not taken. ✓ Branch 11 taken 32 times.	326	switch (kernel_size) {
416			case 3:
417		80	return gaussian_blur_fixed_kernel_size<3, IsBinomial>(
418		80	src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
419		80	sigma, border_type, workspace);
420			case 5:
421		84	return gaussian_blur_fixed_kernel_size<5, IsBinomial>(
422		84	src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
423		84	sigma, border_type, workspace);
424			case 7:
425		66	return gaussian_blur_fixed_kernel_size<7, IsBinomial>(
426		66	src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
427		66	sigma, border_type, workspace);
428			case 15:
429			// 15x15 does not have a binomial variant
430		48	return gaussian_blur_fixed_kernel_size<15, false>(
431		48	src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
432		48	sigma, border_type, workspace);
433			case 21:
434			// 21x21 does not have a binomial variant
435		48	return gaussian_blur_fixed_kernel_size<21, false>(
436		48	src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
437		48	sigma, border_type, workspace);
438			// gaussian_blur_is_implemented checked the kernel size already.
439			// GCOVR_EXCL_START
440			default:
441			assert(!"kernel size not implemented");
442		−	return KLEIDICV_ERROR_NOT_IMPLEMENTED;
443			// GCOVR_EXCL_STOP
444			}
445		326	}
446
447			KLEIDICV_TARGET_FN_ATTRS
448		345	kleidicv_error_t gaussian_blur_fixed_stripe_u8(
449			const uint8_t src, size_t src_stride, uint8_t dst, size_t dst_stride,
450			size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
451			size_t kernel_width, size_t /kernel_height/, float sigma_x,
452			float /sigma_y/, FixedBorderType fixed_border_type,
453			kleidicv_filter_context_t *context) {
454		345	auto workspace = reinterpret_cast<SeparableFilterWorkspace >(context);
455		690	kleidicv_error_t checks_result = gaussian_blur_checks(
456		345	src, src_stride, dst, dst_stride, width, height, channels, workspace);
457
458	2/2 ✓ Branch 0 taken 19 times. ✓ Branch 1 taken 326 times.	345	if (checks_result != KLEIDICV_OK) {
459		19	return checks_result;
460			}
461
462		326	Rectangle rect{width, height};
463
464	2/2 ✓ Branch 0 taken 163 times. ✓ Branch 1 taken 163 times.	326	if (sigma_x == 0.0) {
465		326	return gaussian_blur_fixed<true>(kernel_width, src, src_stride, dst,
466		163	dst_stride, rect, y_begin, y_end, channels,
467		163	sigma_x, fixed_border_type, workspace);
468			}
469
470		326	return gaussian_blur_fixed<false>(kernel_width, src, src_stride, dst,
471		163	dst_stride, rect, y_begin, y_end, channels,
472		163	sigma_x, fixed_border_type, workspace);
473		345	}
474
475			} // namespace kleidicv::neon
476