KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/filters/gaussian_blur_fixed_sc.h
Date:	2025-09-25 14:13:34

	Exec	Total	Coverage
Lines:	193	193	100.0%
Functions:	100	100	100.0%
Branches:	54	56	96.4%

  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #ifndef KLEIDICV_GAUSSIAN_BLUR_SC_H
    
      #define KLEIDICV_GAUSSIAN_BLUR_SC_H
    
      #include <array>
    
      #include <cassert>
    
      #include "kleidicv/filters/gaussian_blur.h"
    
      #include "kleidicv/filters/separable_filter_15x15_sc.h"
    
      #include "kleidicv/filters/separable_filter_21x21_sc.h"
    
      #include "kleidicv/filters/separable_filter_3x3_sc.h"
    
      #include "kleidicv/filters/separable_filter_5x5_sc.h"
    
      #include "kleidicv/filters/separable_filter_7x7_sc.h"
    
      #include "kleidicv/filters/sigma.h"
    
      #include "kleidicv/workspace/separable.h"
    
      #if KLEIDICV_TARGET_SME || KLEIDICV_TARGET_SME2
    
      #include <arm_sme.h>
    
      #endif
    
      namespace KLEIDICV_TARGET_NAMESPACE {
    
      // Primary template for Gaussian Blur filters.
    
      template <typename ScalarType, size_t KernelSize, bool IsBinomial>
    
      class GaussianBlur;
    
      // Template for 3x3 Gaussian Blur binomial filters.
    
      //
    
      //             [ 1, 2, 1 ]          [ 1 ]
    
      //  F = 1/16 * [ 2, 4, 2 ] = 1/16 * [ 2 ] * [ 1, 2, 1 ]
    
      //             [ 1, 2, 1 ]          [ 1 ]
    
      template <>
    
      class GaussianBlur<uint8_t, 3, true> {
    
       public:
    
        using SourceType = uint8_t;
    
        using BufferType = uint16_t;
    
        using DestinationType = uint8_t;
    
        // Applies vertical filtering vector using SIMD operations.
    
        //
    
        // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
    
      504
        void vertical_vector_path(svbool_t pg,
    
                                  std::reference_wrapper<svuint8_t> src[3],
    
                                  BufferType *dst) const KLEIDICV_STREAMING {
    
      504
          svuint16_t acc_0_2_b = svaddlb_u16(src[0], src[2]);
    
      504
          svuint16_t acc_0_2_t = svaddlt_u16(src[0], src[2]);
    
      504
          svuint16_t acc_1_b = svshllb_n_u16(src[1], 1);
    
      504
          svuint16_t acc_1_t = svshllt_n_u16(src[1], 1);
    
      504
          svuint16_t acc_u16_b = svadd_u16_x(pg, acc_0_2_b, acc_1_b);
    
      504
          svuint16_t acc_u16_t = svadd_u16_x(pg, acc_0_2_t, acc_1_t);
    
      504
          svuint16x2_t interleaved = svcreate2(acc_u16_b, acc_u16_t);
    
      504
          svst2(pg, &dst[0], interleaved);
    
      504
        }
    
        // Applies horizontal filtering vector using SIMD operations.
    
        //
    
        // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
    
      400
        void horizontal_vector_path(svbool_t pg,
    
                                    std::reference_wrapper<svuint16_t> src[3],
    
                                    DestinationType *dst) const KLEIDICV_STREAMING {
    
      400
          svuint16_t acc_0_2 = svhadd_u16_x(pg, src[0], src[2]);
    
      400
          svuint16_t acc = svadd_u16_x(pg, acc_0_2, src[1]);
    
      400
          acc = svrshr_x(pg, acc, 3);
    
      400
          svst1b(pg, &dst[0], acc);
    
      400
        }
    
        // Applies horizontal filtering vector using scalar operations.
    
        //
    
        // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
    
      1008
        void horizontal_scalar_path(const BufferType src[3],
    
                                    DestinationType *dst) const KLEIDICV_STREAMING {
    
      1008
          auto acc = src[0] + 2 * src[1] + src[2];
    
      1008
          dst[0] = rounding_shift_right(acc, 4);
    
      1008
        }
    
      };  // end of class GaussianBlur<uint8_t, 3, true>
    
      // Template for 5x5 Gaussian Blur binomial filters.
    
      //
    
      //              [ 1,  4,  6,  4, 1 ]           [ 1 ]
    
      //              [ 4, 16, 24, 16, 4 ]           [ 4 ]
    
      //  F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1,  4,  6,  4, 1 ]
    
      //              [ 4, 16, 24, 16, 4 ]           [ 4 ]
    
      //              [ 1,  4,  6,  4, 1 ]           [ 1 ]
    
      template <>
    
      class GaussianBlur<uint8_t, 5, true> {
    
       public:
    
        using SourceType = uint8_t;
    
        using BufferType = uint16_t;
    
        using DestinationType = uint8_t;
    
        // Applies vertical filtering vector using SIMD operations.
    
        //
    
        // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
    
      1524
        void vertical_vector_path(svbool_t pg,
    
                                  std::reference_wrapper<svuint8_t> src[5],
    
                                  BufferType *dst) const KLEIDICV_STREAMING {
    
      1524
          svuint16_t acc_0_4_b = svaddlb_u16(src[0], src[4]);
    
      1524
          svuint16_t acc_0_4_t = svaddlt_u16(src[0], src[4]);
    
      1524
          svuint16_t acc_1_3_b = svaddlb_u16(src[1], src[3]);
    
      1524
          svuint16_t acc_1_3_t = svaddlt_u16(src[1], src[3]);
    
      1524
          svuint16_t acc_u16_b = svmlalb_n_u16(acc_0_4_b, src[2], 6);
    
      1524
          svuint16_t acc_u16_t = svmlalt_n_u16(acc_0_4_t, src[2], 6);
    
      1524
          acc_u16_b = svmla_n_u16_x(pg, acc_u16_b, acc_1_3_b, 4);
    
      1524
          acc_u16_t = svmla_n_u16_x(pg, acc_u16_t, acc_1_3_t, 4);
    
      1524
          svuint16x2_t interleaved = svcreate2(acc_u16_b, acc_u16_t);
    
      1524
          svst2(pg, &dst[0], interleaved);
    
      1524
        }
    
        // Applies horizontal filtering vector using SIMD operations.
    
        //
    
        // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
    
      1316
        void horizontal_vector_path(svbool_t pg,
    
                                    std::reference_wrapper<svuint16_t> src[5],
    
                                    DestinationType *dst) const KLEIDICV_STREAMING {
    
      1316
          svuint16_t acc_0_4 = svadd_x(pg, src[0], src[4]);
    
      1316
          svuint16_t acc_1_3 = svadd_x(pg, src[1], src[3]);
    
      1316
          svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src[2], 6);
    
      1316
          acc = svmla_n_u16_x(pg, acc, acc_1_3, 4);
    
      1316
          acc = svrshr_x(pg, acc, 8);
    
      1316
          svst1b(pg, &dst[0], acc);
    
      1316
        }
    
        // Applies horizontal filtering vector using scalar operations.
    
        //
    
        // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
    
      5552
        void horizontal_scalar_path(const BufferType src[5],
    
                                    DestinationType *dst) const KLEIDICV_STREAMING {
    
      5552
          auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
    
      5552
          dst[0] = rounding_shift_right(acc, 8);
    
      5552
        }
    
      };  // end of class GaussianBlur<uint8_t, 5, true>
    
      // Template for 7x7 Gaussian Blur binomial filters.
    
      //
    
      //               [  4,  14,  28,  36,  28,  14,  4 ]
    
      //               [ 14,  49,  98, 126,  98,  49, 14 ]
    
      //               [ 28,  98, 196, 252, 196,  98, 28 ]
    
      //  F = 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] =
    
      //               [ 28,  98, 196, 252, 196,  98, 28 ]
    
      //               [ 14,  49,  98, 126,  98,  49, 14 ]
    
      //               [  4,  14,  28,  36,  28,  14,  4 ]
    
      //
    
      //               [  2 ]
    
      //               [  7 ]
    
      //               [ 14 ]
    
      //  = 1/4096  *  [ 18 ] * [ 2, 7, 14, 18, 14, 7, 2 ]
    
      //               [ 14 ]
    
      //               [  7 ]
    
      //               [  2 ]
    
      template <>
    
      class GaussianBlur<uint8_t, 7, true> {
    
       public:
    
        using SourceType = uint8_t;
    
        using BufferType = uint16_t;
    
        using DestinationType = uint8_t;
    
        // Applies vertical filtering vector using SIMD operations.
    
        //
    
        // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
    
        //     * [ 2, 7, 14, 18, 14, 7, 2 ]T
    
      596
        void vertical_vector_path(svbool_t pg,
    
                                  std::reference_wrapper<svuint8_t> src[7],
    
                                  BufferType *dst) const KLEIDICV_STREAMING {
    
      596
          svuint16_t acc_0_6_b = svaddlb_u16(src[0], src[6]);
    
      596
          svuint16_t acc_0_6_t = svaddlt_u16(src[0], src[6]);
    
      596
          svuint16_t acc_1_5_b = svaddlb_u16(src[1], src[5]);
    
      596
          svuint16_t acc_1_5_t = svaddlt_u16(src[1], src[5]);
    
      596
          svuint16_t acc_2_4_b = svaddlb_u16(src[2], src[4]);
    
      596
          svuint16_t acc_2_4_t = svaddlt_u16(src[2], src[4]);
    
      596
          svuint16_t acc_3_b = svmovlb_u16(src[3]);
    
      596
          svuint16_t acc_3_t = svmovlt_u16(src[3]);
    
      596
          svuint16_t acc_0_2_4_6_b = svmla_n_u16_x(pg, acc_0_6_b, acc_2_4_b, 7);
    
      596
          svuint16_t acc_0_2_4_6_t = svmla_n_u16_x(pg, acc_0_6_t, acc_2_4_t, 7);
    
      596
          svuint16_t acc_0_2_3_4_6_b = svmla_n_u16_x(pg, acc_0_2_4_6_b, acc_3_b, 9);
    
      596
          svuint16_t acc_0_2_3_4_6_t = svmla_n_u16_x(pg, acc_0_2_4_6_t, acc_3_t, 9);
    
      596
          acc_0_2_3_4_6_b = svlsl_n_u16_x(pg, acc_0_2_3_4_6_b, 1);
    
      596
          acc_0_2_3_4_6_t = svlsl_n_u16_x(pg, acc_0_2_3_4_6_t, 1);
    
      1192
          svuint16_t acc_0_1_2_3_4_5_6_b =
    
      596
              svmla_n_u16_x(pg, acc_0_2_3_4_6_b, acc_1_5_b, 7);
    
      1192
          svuint16_t acc_0_1_2_3_4_5_6_t =
    
      596
              svmla_n_u16_x(pg, acc_0_2_3_4_6_t, acc_1_5_t, 7);
    
      1192
          svuint16x2_t interleaved =
    
      596
              svcreate2(acc_0_1_2_3_4_5_6_b, acc_0_1_2_3_4_5_6_t);
    
      596
          svst2(pg, &dst[0], interleaved);
    
      596
        }
    
        // Applies horizontal filtering vector using SIMD operations.
    
        //
    
        // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
    
        //              * [ 2, 7, 14, 18, 14, 7, 2 ]T
    
      464
        void horizontal_vector_path(svbool_t pg,
    
                                    std::reference_wrapper<svuint16_t> src[7],
    
                                    DestinationType *dst) const KLEIDICV_STREAMING {
    
      464
          svuint32_t acc_0_6_b = svaddlb_u32(src[0], src[6]);
    
      464
          svuint32_t acc_0_6_t = svaddlt_u32(src[0], src[6]);
    
      464
          svuint32_t acc_1_5_b = svaddlb_u32(src[1], src[5]);
    
      464
          svuint32_t acc_1_5_t = svaddlt_u32(src[1], src[5]);
    
      464
          svuint16_t acc_2_4 = svadd_u16_x(pg, src[2], src[4]);
    
      464
          svuint32_t acc_0_2_4_6_b = svmlalb_n_u32(acc_0_6_b, acc_2_4, 7);
    
      464
          svuint32_t acc_0_2_4_6_t = svmlalt_n_u32(acc_0_6_t, acc_2_4, 7);
    
      464
          svuint32_t acc_0_2_3_4_6_b = svmlalb_n_u32(acc_0_2_4_6_b, src[3], 9);
    
      464
          svuint32_t acc_0_2_3_4_6_t = svmlalt_n_u32(acc_0_2_4_6_t, src[3], 9);
    
      464
          acc_0_2_3_4_6_b = svlsl_n_u32_x(pg, acc_0_2_3_4_6_b, 1);
    
      464
          acc_0_2_3_4_6_t = svlsl_n_u32_x(pg, acc_0_2_3_4_6_t, 1);
    
      928
          svuint32_t acc_0_1_2_3_4_5_6_b =
    
      464
              svmla_n_u32_x(pg, acc_0_2_3_4_6_b, acc_1_5_b, 7);
    
      928
          svuint32_t acc_0_1_2_3_4_5_6_t =
    
      464
              svmla_n_u32_x(pg, acc_0_2_3_4_6_t, acc_1_5_t, 7);
    
      928
          svuint16_t acc_0_1_2_3_4_5_6_u16_b =
    
      464
              svrshrnb_n_u32(acc_0_1_2_3_4_5_6_b, 12);
    
      928
          svuint16_t acc_0_1_2_3_4_5_6_u16 =
    
      464
              svrshrnt_n_u32(acc_0_1_2_3_4_5_6_u16_b, acc_0_1_2_3_4_5_6_t, 12);
    
      464
          svst1b(pg, &dst[0], acc_0_1_2_3_4_5_6_u16);
    
      464
        }
    
        // Applies horizontal filtering vector using scalar operations.
    
        //
    
        // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
    
        //              * [ 2, 7, 14, 18, 14, 7, 2 ]T
    
      3288
        void horizontal_scalar_path(const BufferType src[7],
    
                                    DestinationType *dst) const KLEIDICV_STREAMING {
    
      9864
          uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 +
    
      6576
                         src[4] * 14 + src[5] * 7 + src[6] * 2;
    
      3288
          dst[0] = rounding_shift_right(acc, 12);
    
      3288
        }
    
      };  // end of class GaussianBlur<uint8_t, 7, true>
    
      // CustomSigma variant
    
      template <size_t KernelSize>
    
      class GaussianBlur<uint8_t, KernelSize, false> {
    
       public:
    
        using SourceType = uint8_t;
    
        using BufferType = uint8_t;
    
        using DestinationType = uint8_t;
    
        using SourceVecTraits =
    
            typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits<SourceType>;
    
        using SourceVectorType = typename SourceVecTraits::VectorType;
    
        static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
    
      230
        explicit GaussianBlur(const uint16_t *half_kernel)
    
      230
            : half_kernel_(half_kernel) {}
    
      6048
        void vertical_vector_path(
    
            svbool_t pg, std::reference_wrapper<SourceVectorType> src[KernelSize],
    
            BufferType *dst) const KLEIDICV_STREAMING {
    
      6048
          common_vector_path(pg, src, dst);
    
      6048
        }
    
      70640
        void vertical_scalar_path(const SourceType src[KernelSize],
    
                                  BufferType *dst) const KLEIDICV_STREAMING {
    
      141280
          uint32_t acc = static_cast<uint32_t>(src[kHalfKernelSize - 1]) *
    
      70640
                         half_kernel_[kHalfKernelSize - 1];
    
          // Optimization to avoid unnecessary branching in vector code.
    
          KLEIDICV_FORCE_LOOP_UNROLL
    
        10/10✓ Branch 0 taken 22176 times.
✓ Branch 1 taken 155232 times.
✓ Branch 2 taken 45120 times.
✓ Branch 3 taken 451200 times.
✓ Branch 4 taken 248 times.
✓ Branch 5 taken 248 times.
✓ Branch 6 taken 960 times.
✓ Branch 7 taken 1920 times.
✓ Branch 8 taken 2136 times.
✓ Branch 9 taken 6408 times.

      685648
          for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
    
      1845024
            acc += (static_cast<uint32_t>(src[i]) +
    
      1230016
                    static_cast<uint32_t>(src[KernelSize - i - 1])) *
    
      615008
                   half_kernel_[i];
    
      615008
          }
    
      70640
          dst[0] = static_cast<BufferType>(rounding_shift_right(acc, 8));
    
      70640
        }
    
      3896
        void horizontal_vector_path(
    
            svbool_t pg, std::reference_wrapper<SourceVectorType> src[KernelSize],
    
            BufferType *dst) const KLEIDICV_STREAMING {
    
      3896
          common_vector_path(pg, src, dst);
    
      3896
        }
    
      70640
        void horizontal_scalar_path(const BufferType src[KernelSize],
    
                                    DestinationType *dst) const KLEIDICV_STREAMING {
    
      70640
          vertical_scalar_path(src, dst);
    
      70640
        }
    
       private:
    
      9944
        void common_vector_path(
    
            svbool_t pg, std::reference_wrapper<SourceVectorType> src[KernelSize],
    
            BufferType *dst) const KLEIDICV_STREAMING {
    
      9944
          svbool_t pg16_all = svptrue_b16();
    
      19888
          svuint16_t acc_b = svmullb_n_u16(src[kHalfKernelSize - 1],
    
      9944
                                           half_kernel_[kHalfKernelSize - 1]);
    
      19888
          svuint16_t acc_t = svmullt_n_u16(src[kHalfKernelSize - 1],
    
      9944
                                           half_kernel_[kHalfKernelSize - 1]);
    
          // Optimization to avoid unnecessary branching in vector code.
    
          KLEIDICV_FORCE_LOOP_UNROLL
    
        10/10✓ Branch 0 taken 3280 times.
✓ Branch 1 taken 22960 times.
✓ Branch 2 taken 5320 times.
✓ Branch 3 taken 53200 times.
✓ Branch 4 taken 228 times.
✓ Branch 5 taken 228 times.
✓ Branch 6 taken 440 times.
✓ Branch 7 taken 880 times.
✓ Branch 8 taken 676 times.
✓ Branch 9 taken 2028 times.

      89240
          for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
    
      79296
            const size_t j = KernelSize - i - 1;
    
      79296
            svuint16_t vec_b = svaddlb_u16(src[i], src[j]);
    
      79296
            svuint16_t vec_t = svaddlt_u16(src[i], src[j]);
    
      79296
            acc_b = svmla_n_u16_x(pg16_all, acc_b, vec_b, half_kernel_[i]);
    
      79296
            acc_t = svmla_n_u16_x(pg16_all, acc_t, vec_t, half_kernel_[i]);
    
      79296
          }
    
          // Rounding before narrowing
    
      9944
          acc_b = svqadd_n_u16(acc_b, 128);
    
      9944
          acc_t = svqadd_n_u16(acc_t, 128);
    
          // Keep only the highest 8 bits
    
      19888
          svuint8_t result =
    
      9944
              svtrn2_u8(svreinterpret_u8_u16(acc_b), svreinterpret_u8_u16(acc_t));
    
      9944
          svst1(pg, &dst[0], result);
    
      9944
        }
    
        const uint16_t *half_kernel_;
    
      };  // end of class GaussianBlur<uint8_t, KernelSize, false>
    
      template <size_t KernelSize, bool IsBinomial, typename ScalarType>
    
      652
      static kleidicv_error_t gaussian_blur_fixed_kernel_size(
    
          const ScalarType *src, size_t src_stride, ScalarType *dst,
    
          size_t dst_stride, Rectangle &rect, size_t y_begin, size_t y_end,
    
          size_t channels, float sigma, FixedBorderType border_type,
    
          SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING {
    
        using GaussianBlurFilter = GaussianBlur<ScalarType, KernelSize, IsBinomial>;
    
      652
        Rows<const ScalarType> src_rows{src, src_stride, channels};
    
      652
        Rows<ScalarType> dst_rows{dst, dst_stride, channels};
    
        if constexpr (IsBinomial) {
    
      262
          GaussianBlurFilter blur;
    
      262
          SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
    
      524
          workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
    
      262
                             border_type, filter);
    
      262
          return KLEIDICV_OK;
    
      262
        } else {
    
      390
          constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
    
      390
          uint16_t half_kernel[128];
    
      390
          generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma);
    
          // If sigma is so small that the middle point gets all the weights, it's
    
          // just a copy
    
        10/10✓ Branch 0 taken 64 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 64 times.
✓ Branch 3 taken 32 times.
✓ Branch 4 taken 34 times.
✓ Branch 5 taken 32 times.
✓ Branch 6 taken 34 times.
✓ Branch 7 taken 32 times.
✓ Branch 8 taken 34 times.
✓ Branch 9 taken 32 times.

      390
          if (half_kernel[kHalfKernelSize - 1] < 256) {
    
      230
            GaussianBlurFilter blur(half_kernel);
    
      230
            SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
    
      460
            workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
    
      230
                               border_type, filter);
    
      230
          } else {
    
        10/10✓ Branch 0 taken 456 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 648 times.
✓ Branch 3 taken 32 times.
✓ Branch 4 taken 72 times.
✓ Branch 5 taken 32 times.
✓ Branch 6 taken 136 times.
✓ Branch 7 taken 32 times.
✓ Branch 8 taken 200 times.
✓ Branch 9 taken 32 times.

      1672
            for (size_t row = y_begin; row < y_end; ++row) {
    
      #if KLEIDICV_TARGET_SME && defined(__ANDROID__)
    
              __arm_sc_memcpy(
    
                  static_cast<void *>(&dst_rows.at(row)[0]),
    
                  static_cast<const void *>(&src_rows.at(row)[0]),
    
                  rect.width() * sizeof(ScalarType) * dst_rows.channels());
    
      #else
    
      3024
              std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]),
    
      1512
                          static_cast<const void *>(&src_rows.at(row)[0]),
    
      1512
                          rect.width() * sizeof(ScalarType) * dst_rows.channels());
    
      #endif
    
      1512
            }
    
          }
    
      390
          return KLEIDICV_OK;
    
      390
        }
    
      652
      }
    
      template <bool IsBinomial, typename ScalarType>
    
      652
      static kleidicv_error_t gaussian_blur(
    
          size_t kernel_size, const ScalarType *src, size_t src_stride,
    
          ScalarType *dst, size_t dst_stride, Rectangle &rect, size_t y_begin,
    
          size_t y_end, size_t channels, float sigma, FixedBorderType border_type,
    
          SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING {
    
        10/12✓ Branch 0 taken 94 times.
✓ Branch 1 taken 102 times.
✓ Branch 2 taken 66 times.
✓ Branch 3 taken 32 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 32 times.
✓ Branch 6 taken 66 times.
✓ Branch 7 taken 66 times.
✓ Branch 8 taken 66 times.
✓ Branch 9 taken 64 times.
✗ Branch 10 not taken.
✓ Branch 11 taken 64 times.

      652
        switch (kernel_size) {
    
          case 3:
    
      160
            return gaussian_blur_fixed_kernel_size<3, IsBinomial>(
    
      160
                src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
    
      160
                sigma, border_type, workspace);
    
          case 5:
    
      168
            return gaussian_blur_fixed_kernel_size<5, IsBinomial>(
    
      168
                src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
    
      168
                sigma, border_type, workspace);
    
          case 7:
    
      132
            return gaussian_blur_fixed_kernel_size<7, IsBinomial>(
    
      132
                src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
    
      132
                sigma, border_type, workspace);
    
          case 15:
    
            // 15x15 does not have a binomial variant
    
      96
            return gaussian_blur_fixed_kernel_size<15, false>(
    
      96
                src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
    
      96
                sigma, border_type, workspace);
    
          case 21:
    
            // 21x21 does not have a binomial variant
    
      96
            return gaussian_blur_fixed_kernel_size<21, false>(
    
      96
                src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
    
      96
                sigma, border_type, workspace);
    
            // gaussian_blur_is_implemented checked the kernel size already.
    
          // GCOVR_EXCL_START
    
          default:
    
            assert(!"kernel size not implemented");
    
      −
            return KLEIDICV_ERROR_NOT_IMPLEMENTED;
    
            // GCOVR_EXCL_STOP
    
        }
    
      652
      }
    
      690
      static kleidicv_error_t gaussian_blur_fixed_stripe_u8_sc(
    
          const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
    
          size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
    
          size_t kernel_width, size_t /*kernel_height*/, float sigma_x,
    
          float /*sigma_y*/, FixedBorderType fixed_border_type,
    
          kleidicv_filter_context_t *context) KLEIDICV_STREAMING {
    
      690
        auto *workspace = reinterpret_cast<SeparableFilterWorkspace *>(context);
    
      1380
        kleidicv_error_t checks_result = gaussian_blur_checks(
    
      690
            src, src_stride, dst, dst_stride, width, height, channels, workspace);
    
        2/2✓ Branch 0 taken 38 times.
✓ Branch 1 taken 652 times.

      690
        if (checks_result != KLEIDICV_OK) {
    
      38
          return checks_result;
    
        }
    
      652
        Rectangle rect{width, height};
    
        2/2✓ Branch 0 taken 326 times.
✓ Branch 1 taken 326 times.

      652
        if (sigma_x == 0.0) {
    
      652
          return gaussian_blur<true>(kernel_width, src, src_stride, dst, dst_stride,
    
      326
                                     rect, y_begin, y_end, channels, sigma_x,
    
      326
                                     fixed_border_type, workspace);
    
        }
    
      652
        return gaussian_blur<false>(kernel_width, src, src_stride, dst, dst_stride,
    
      326
                                    rect, y_begin, y_end, channels, sigma_x,
    
      326
                                    fixed_border_type, workspace);
    
      690
      }
    
      }  // namespace KLEIDICV_TARGET_NAMESPACE
    
      #endif  // KLEIDICV_GAUSSIAN_BLUR_SC_H

Line	Branch	Exec	Source
1			// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2			//
3			// SPDX-License-Identifier: Apache-2.0
4
5			#ifndef KLEIDICV_GAUSSIAN_BLUR_SC_H
6			#define KLEIDICV_GAUSSIAN_BLUR_SC_H
7
8			#include <array>
9			#include <cassert>
10
11			#include "kleidicv/filters/gaussian_blur.h"
12			#include "kleidicv/filters/separable_filter_15x15_sc.h"
13			#include "kleidicv/filters/separable_filter_21x21_sc.h"
14			#include "kleidicv/filters/separable_filter_3x3_sc.h"
15			#include "kleidicv/filters/separable_filter_5x5_sc.h"
16			#include "kleidicv/filters/separable_filter_7x7_sc.h"
17			#include "kleidicv/filters/sigma.h"
18			#include "kleidicv/workspace/separable.h"
19
20			#if KLEIDICV_TARGET_SME \|\| KLEIDICV_TARGET_SME2
21			#include <arm_sme.h>
22			#endif
23
24			namespace KLEIDICV_TARGET_NAMESPACE {
25
26			// Primary template for Gaussian Blur filters.
27			template <typename ScalarType, size_t KernelSize, bool IsBinomial>
28			class GaussianBlur;
29
30			// Template for 3x3 Gaussian Blur binomial filters.
31			//
32			// [ 1, 2, 1 ] [ 1 ]
33			// F = 1/16 * [ 2, 4, 2 ] = 1/16 * [ 2 ] * [ 1, 2, 1 ]
34			// [ 1, 2, 1 ] [ 1 ]
35			template <>
36			class GaussianBlur<uint8_t, 3, true> {
37			public:
38			using SourceType = uint8_t;
39			using BufferType = uint16_t;
40			using DestinationType = uint8_t;
41
42			// Applies vertical filtering vector using SIMD operations.
43			//
44			// DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
45		504	void vertical_vector_path(svbool_t pg,
46			std::reference_wrapper<svuint8_t> src[3],
47			BufferType *dst) const KLEIDICV_STREAMING {
48		504	svuint16_t acc_0_2_b = svaddlb_u16(src[0], src[2]);
49		504	svuint16_t acc_0_2_t = svaddlt_u16(src[0], src[2]);
50
51		504	svuint16_t acc_1_b = svshllb_n_u16(src[1], 1);
52		504	svuint16_t acc_1_t = svshllt_n_u16(src[1], 1);
53
54		504	svuint16_t acc_u16_b = svadd_u16_x(pg, acc_0_2_b, acc_1_b);
55		504	svuint16_t acc_u16_t = svadd_u16_x(pg, acc_0_2_t, acc_1_t);
56
57		504	svuint16x2_t interleaved = svcreate2(acc_u16_b, acc_u16_t);
58		504	svst2(pg, &dst[0], interleaved);
59		504	}
60
61			// Applies horizontal filtering vector using SIMD operations.
62			//
63			// DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
64		400	void horizontal_vector_path(svbool_t pg,
65			std::reference_wrapper<svuint16_t> src[3],
66			DestinationType *dst) const KLEIDICV_STREAMING {
67		400	svuint16_t acc_0_2 = svhadd_u16_x(pg, src[0], src[2]);
68
69		400	svuint16_t acc = svadd_u16_x(pg, acc_0_2, src[1]);
70		400	acc = svrshr_x(pg, acc, 3);
71
72		400	svst1b(pg, &dst[0], acc);
73		400	}
74
75			// Applies horizontal filtering vector using scalar operations.
76			//
77			// DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
78		1008	void horizontal_scalar_path(const BufferType src[3],
79			DestinationType *dst) const KLEIDICV_STREAMING {
80		1008	auto acc = src[0] + 2 * src[1] + src[2];
81		1008	dst[0] = rounding_shift_right(acc, 4);
82		1008	}
83			}; // end of class GaussianBlur<uint8_t, 3, true>
84
85			// Template for 5x5 Gaussian Blur binomial filters.
86			//
87			// [ 1, 4, 6, 4, 1 ] [ 1 ]
88			// [ 4, 16, 24, 16, 4 ] [ 4 ]
89			// F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ]
90			// [ 4, 16, 24, 16, 4 ] [ 4 ]
91			// [ 1, 4, 6, 4, 1 ] [ 1 ]
92			template <>
93			class GaussianBlur<uint8_t, 5, true> {
94			public:
95			using SourceType = uint8_t;
96			using BufferType = uint16_t;
97			using DestinationType = uint8_t;
98
99			// Applies vertical filtering vector using SIMD operations.
100			//
101			// DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
102		1524	void vertical_vector_path(svbool_t pg,
103			std::reference_wrapper<svuint8_t> src[5],
104			BufferType *dst) const KLEIDICV_STREAMING {
105		1524	svuint16_t acc_0_4_b = svaddlb_u16(src[0], src[4]);
106		1524	svuint16_t acc_0_4_t = svaddlt_u16(src[0], src[4]);
107		1524	svuint16_t acc_1_3_b = svaddlb_u16(src[1], src[3]);
108		1524	svuint16_t acc_1_3_t = svaddlt_u16(src[1], src[3]);
109
110		1524	svuint16_t acc_u16_b = svmlalb_n_u16(acc_0_4_b, src[2], 6);
111		1524	svuint16_t acc_u16_t = svmlalt_n_u16(acc_0_4_t, src[2], 6);
112		1524	acc_u16_b = svmla_n_u16_x(pg, acc_u16_b, acc_1_3_b, 4);
113		1524	acc_u16_t = svmla_n_u16_x(pg, acc_u16_t, acc_1_3_t, 4);
114
115		1524	svuint16x2_t interleaved = svcreate2(acc_u16_b, acc_u16_t);
116		1524	svst2(pg, &dst[0], interleaved);
117		1524	}
118
119			// Applies horizontal filtering vector using SIMD operations.
120			//
121			// DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
122		1316	void horizontal_vector_path(svbool_t pg,
123			std::reference_wrapper<svuint16_t> src[5],
124			DestinationType *dst) const KLEIDICV_STREAMING {
125		1316	svuint16_t acc_0_4 = svadd_x(pg, src[0], src[4]);
126		1316	svuint16_t acc_1_3 = svadd_x(pg, src[1], src[3]);
127		1316	svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src[2], 6);
128		1316	acc = svmla_n_u16_x(pg, acc, acc_1_3, 4);
129		1316	acc = svrshr_x(pg, acc, 8);
130		1316	svst1b(pg, &dst[0], acc);
131		1316	}
132
133			// Applies horizontal filtering vector using scalar operations.
134			//
135			// DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
136		5552	void horizontal_scalar_path(const BufferType src[5],
137			DestinationType *dst) const KLEIDICV_STREAMING {
138		5552	auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
139		5552	dst[0] = rounding_shift_right(acc, 8);
140		5552	}
141			}; // end of class GaussianBlur<uint8_t, 5, true>
142
143			// Template for 7x7 Gaussian Blur binomial filters.
144			//
145			// [ 4, 14, 28, 36, 28, 14, 4 ]
146			// [ 14, 49, 98, 126, 98, 49, 14 ]
147			// [ 28, 98, 196, 252, 196, 98, 28 ]
148			// F = 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] =
149			// [ 28, 98, 196, 252, 196, 98, 28 ]
150			// [ 14, 49, 98, 126, 98, 49, 14 ]
151			// [ 4, 14, 28, 36, 28, 14, 4 ]
152			//
153			// [ 2 ]
154			// [ 7 ]
155			// [ 14 ]
156			// = 1/4096 * [ 18 ] * [ 2, 7, 14, 18, 14, 7, 2 ]
157			// [ 14 ]
158			// [ 7 ]
159			// [ 2 ]
160			template <>
161			class GaussianBlur<uint8_t, 7, true> {
162			public:
163			using SourceType = uint8_t;
164			using BufferType = uint16_t;
165			using DestinationType = uint8_t;
166
167			// Applies vertical filtering vector using SIMD operations.
168			//
169			// DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
170			// * [ 2, 7, 14, 18, 14, 7, 2 ]T
171		596	void vertical_vector_path(svbool_t pg,
172			std::reference_wrapper<svuint8_t> src[7],
173			BufferType *dst) const KLEIDICV_STREAMING {
174		596	svuint16_t acc_0_6_b = svaddlb_u16(src[0], src[6]);
175		596	svuint16_t acc_0_6_t = svaddlt_u16(src[0], src[6]);
176
177		596	svuint16_t acc_1_5_b = svaddlb_u16(src[1], src[5]);
178		596	svuint16_t acc_1_5_t = svaddlt_u16(src[1], src[5]);
179
180		596	svuint16_t acc_2_4_b = svaddlb_u16(src[2], src[4]);
181		596	svuint16_t acc_2_4_t = svaddlt_u16(src[2], src[4]);
182
183		596	svuint16_t acc_3_b = svmovlb_u16(src[3]);
184		596	svuint16_t acc_3_t = svmovlt_u16(src[3]);
185
186		596	svuint16_t acc_0_2_4_6_b = svmla_n_u16_x(pg, acc_0_6_b, acc_2_4_b, 7);
187		596	svuint16_t acc_0_2_4_6_t = svmla_n_u16_x(pg, acc_0_6_t, acc_2_4_t, 7);
188
189		596	svuint16_t acc_0_2_3_4_6_b = svmla_n_u16_x(pg, acc_0_2_4_6_b, acc_3_b, 9);
190		596	svuint16_t acc_0_2_3_4_6_t = svmla_n_u16_x(pg, acc_0_2_4_6_t, acc_3_t, 9);
191		596	acc_0_2_3_4_6_b = svlsl_n_u16_x(pg, acc_0_2_3_4_6_b, 1);
192		596	acc_0_2_3_4_6_t = svlsl_n_u16_x(pg, acc_0_2_3_4_6_t, 1);
193
194		1192	svuint16_t acc_0_1_2_3_4_5_6_b =
195		596	svmla_n_u16_x(pg, acc_0_2_3_4_6_b, acc_1_5_b, 7);
196		1192	svuint16_t acc_0_1_2_3_4_5_6_t =
197		596	svmla_n_u16_x(pg, acc_0_2_3_4_6_t, acc_1_5_t, 7);
198
199		1192	svuint16x2_t interleaved =
200		596	svcreate2(acc_0_1_2_3_4_5_6_b, acc_0_1_2_3_4_5_6_t);
201		596	svst2(pg, &dst[0], interleaved);
202		596	}
203
204			// Applies horizontal filtering vector using SIMD operations.
205			//
206			// DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
207			// * [ 2, 7, 14, 18, 14, 7, 2 ]T
208		464	void horizontal_vector_path(svbool_t pg,
209			std::reference_wrapper<svuint16_t> src[7],
210			DestinationType *dst) const KLEIDICV_STREAMING {
211		464	svuint32_t acc_0_6_b = svaddlb_u32(src[0], src[6]);
212		464	svuint32_t acc_0_6_t = svaddlt_u32(src[0], src[6]);
213
214		464	svuint32_t acc_1_5_b = svaddlb_u32(src[1], src[5]);
215		464	svuint32_t acc_1_5_t = svaddlt_u32(src[1], src[5]);
216
217		464	svuint16_t acc_2_4 = svadd_u16_x(pg, src[2], src[4]);
218
219		464	svuint32_t acc_0_2_4_6_b = svmlalb_n_u32(acc_0_6_b, acc_2_4, 7);
220		464	svuint32_t acc_0_2_4_6_t = svmlalt_n_u32(acc_0_6_t, acc_2_4, 7);
221
222		464	svuint32_t acc_0_2_3_4_6_b = svmlalb_n_u32(acc_0_2_4_6_b, src[3], 9);
223		464	svuint32_t acc_0_2_3_4_6_t = svmlalt_n_u32(acc_0_2_4_6_t, src[3], 9);
224
225		464	acc_0_2_3_4_6_b = svlsl_n_u32_x(pg, acc_0_2_3_4_6_b, 1);
226		464	acc_0_2_3_4_6_t = svlsl_n_u32_x(pg, acc_0_2_3_4_6_t, 1);
227
228		928	svuint32_t acc_0_1_2_3_4_5_6_b =
229		464	svmla_n_u32_x(pg, acc_0_2_3_4_6_b, acc_1_5_b, 7);
230		928	svuint32_t acc_0_1_2_3_4_5_6_t =
231		464	svmla_n_u32_x(pg, acc_0_2_3_4_6_t, acc_1_5_t, 7);
232
233		928	svuint16_t acc_0_1_2_3_4_5_6_u16_b =
234		464	svrshrnb_n_u32(acc_0_1_2_3_4_5_6_b, 12);
235		928	svuint16_t acc_0_1_2_3_4_5_6_u16 =
236		464	svrshrnt_n_u32(acc_0_1_2_3_4_5_6_u16_b, acc_0_1_2_3_4_5_6_t, 12);
237
238		464	svst1b(pg, &dst[0], acc_0_1_2_3_4_5_6_u16);
239		464	}
240
241			// Applies horizontal filtering vector using scalar operations.
242			//
243			// DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
244			// * [ 2, 7, 14, 18, 14, 7, 2 ]T
245		3288	void horizontal_scalar_path(const BufferType src[7],
246			DestinationType *dst) const KLEIDICV_STREAMING {
247		9864	uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 +
248		6576	src[4] * 14 + src[5] * 7 + src[6] * 2;
249		3288	dst[0] = rounding_shift_right(acc, 12);
250		3288	}
251			}; // end of class GaussianBlur<uint8_t, 7, true>
252
253			// CustomSigma variant
254			template <size_t KernelSize>
255			class GaussianBlur<uint8_t, KernelSize, false> {
256			public:
257			using SourceType = uint8_t;
258			using BufferType = uint8_t;
259			using DestinationType = uint8_t;
260			using SourceVecTraits =
261			typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits<SourceType>;
262			using SourceVectorType = typename SourceVecTraits::VectorType;
263
264			static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
265
266		230	explicit GaussianBlur(const uint16_t *half_kernel)
267		230	: half_kernel_(half_kernel) {}
268
269		6048	void vertical_vector_path(
270			svbool_t pg, std::reference_wrapper<SourceVectorType> src[KernelSize],
271			BufferType *dst) const KLEIDICV_STREAMING {
272		6048	common_vector_path(pg, src, dst);
273		6048	}
274
275		70640	void vertical_scalar_path(const SourceType src[KernelSize],
276			BufferType *dst) const KLEIDICV_STREAMING {
277		141280	uint32_t acc = static_cast<uint32_t>(src[kHalfKernelSize - 1]) *
278		70640	half_kernel_[kHalfKernelSize - 1];
279
280			// Optimization to avoid unnecessary branching in vector code.
281			KLEIDICV_FORCE_LOOP_UNROLL
282	10/10 ✓ Branch 0 taken 22176 times. ✓ Branch 1 taken 155232 times. ✓ Branch 2 taken 45120 times. ✓ Branch 3 taken 451200 times. ✓ Branch 4 taken 248 times. ✓ Branch 5 taken 248 times. ✓ Branch 6 taken 960 times. ✓ Branch 7 taken 1920 times. ✓ Branch 8 taken 2136 times. ✓ Branch 9 taken 6408 times.	685648	for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
283		1845024	acc += (static_cast<uint32_t>(src[i]) +
284		1230016	static_cast<uint32_t>(src[KernelSize - i - 1])) *
285		615008	half_kernel_[i];
286		615008	}
287
288		70640	dst[0] = static_cast<BufferType>(rounding_shift_right(acc, 8));
289		70640	}
290
291		3896	void horizontal_vector_path(
292			svbool_t pg, std::reference_wrapper<SourceVectorType> src[KernelSize],
293			BufferType *dst) const KLEIDICV_STREAMING {
294		3896	common_vector_path(pg, src, dst);
295		3896	}
296
297		70640	void horizontal_scalar_path(const BufferType src[KernelSize],
298			DestinationType *dst) const KLEIDICV_STREAMING {
299		70640	vertical_scalar_path(src, dst);
300		70640	}
301
302			private:
303		9944	void common_vector_path(
304			svbool_t pg, std::reference_wrapper<SourceVectorType> src[KernelSize],
305			BufferType *dst) const KLEIDICV_STREAMING {
306		9944	svbool_t pg16_all = svptrue_b16();
307		19888	svuint16_t acc_b = svmullb_n_u16(src[kHalfKernelSize - 1],
308		9944	half_kernel_[kHalfKernelSize - 1]);
309		19888	svuint16_t acc_t = svmullt_n_u16(src[kHalfKernelSize - 1],
310		9944	half_kernel_[kHalfKernelSize - 1]);
311
312			// Optimization to avoid unnecessary branching in vector code.
313			KLEIDICV_FORCE_LOOP_UNROLL
314	10/10 ✓ Branch 0 taken 3280 times. ✓ Branch 1 taken 22960 times. ✓ Branch 2 taken 5320 times. ✓ Branch 3 taken 53200 times. ✓ Branch 4 taken 228 times. ✓ Branch 5 taken 228 times. ✓ Branch 6 taken 440 times. ✓ Branch 7 taken 880 times. ✓ Branch 8 taken 676 times. ✓ Branch 9 taken 2028 times.	89240	for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
315		79296	const size_t j = KernelSize - i - 1;
316		79296	svuint16_t vec_b = svaddlb_u16(src[i], src[j]);
317		79296	svuint16_t vec_t = svaddlt_u16(src[i], src[j]);
318
319		79296	acc_b = svmla_n_u16_x(pg16_all, acc_b, vec_b, half_kernel_[i]);
320		79296	acc_t = svmla_n_u16_x(pg16_all, acc_t, vec_t, half_kernel_[i]);
321		79296	}
322
323			// Rounding before narrowing
324		9944	acc_b = svqadd_n_u16(acc_b, 128);
325		9944	acc_t = svqadd_n_u16(acc_t, 128);
326			// Keep only the highest 8 bits
327		19888	svuint8_t result =
328		9944	svtrn2_u8(svreinterpret_u8_u16(acc_b), svreinterpret_u8_u16(acc_t));
329		9944	svst1(pg, &dst[0], result);
330		9944	}
331
332			const uint16_t *half_kernel_;
333			}; // end of class GaussianBlur<uint8_t, KernelSize, false>
334
335			template <size_t KernelSize, bool IsBinomial, typename ScalarType>
336		652	static kleidicv_error_t gaussian_blur_fixed_kernel_size(
337			const ScalarType src, size_t src_stride, ScalarType dst,
338			size_t dst_stride, Rectangle &rect, size_t y_begin, size_t y_end,
339			size_t channels, float sigma, FixedBorderType border_type,
340			SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING {
341			using GaussianBlurFilter = GaussianBlur<ScalarType, KernelSize, IsBinomial>;
342
343		652	Rows<const ScalarType> src_rows{src, src_stride, channels};
344		652	Rows<ScalarType> dst_rows{dst, dst_stride, channels};
345
346			if constexpr (IsBinomial) {
347		262	GaussianBlurFilter blur;
348		262	SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
349		524	workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
350		262	border_type, filter);
351
352		262	return KLEIDICV_OK;
353		262	} else {
354		390	constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
355		390	uint16_t half_kernel[128];
356		390	generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma);
357			// If sigma is so small that the middle point gets all the weights, it's
358			// just a copy
359	10/10 ✓ Branch 0 taken 64 times. ✓ Branch 1 taken 32 times. ✓ Branch 2 taken 64 times. ✓ Branch 3 taken 32 times. ✓ Branch 4 taken 34 times. ✓ Branch 5 taken 32 times. ✓ Branch 6 taken 34 times. ✓ Branch 7 taken 32 times. ✓ Branch 8 taken 34 times. ✓ Branch 9 taken 32 times.	390	if (half_kernel[kHalfKernelSize - 1] < 256) {
360		230	GaussianBlurFilter blur(half_kernel);
361		230	SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
362		460	workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
363		230	border_type, filter);
364		230	} else {
365	10/10 ✓ Branch 0 taken 456 times. ✓ Branch 1 taken 32 times. ✓ Branch 2 taken 648 times. ✓ Branch 3 taken 32 times. ✓ Branch 4 taken 72 times. ✓ Branch 5 taken 32 times. ✓ Branch 6 taken 136 times. ✓ Branch 7 taken 32 times. ✓ Branch 8 taken 200 times. ✓ Branch 9 taken 32 times.	1672	for (size_t row = y_begin; row < y_end; ++row) {
366			#if KLEIDICV_TARGET_SME && defined(__ANDROID__)
367			__arm_sc_memcpy(
368			static_cast<void *>(&dst_rows.at(row)[0]),
369			static_cast<const void *>(&src_rows.at(row)[0]),
370			rect.width() * sizeof(ScalarType) * dst_rows.channels());
371			#else
372		3024	std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]),
373		1512	static_cast<const void *>(&src_rows.at(row)[0]),
374		1512	rect.width() * sizeof(ScalarType) * dst_rows.channels());
375			#endif
376		1512	}
377			}
378		390	return KLEIDICV_OK;
379		390	}
380		652	}
381
382			template <bool IsBinomial, typename ScalarType>
383		652	static kleidicv_error_t gaussian_blur(
384			size_t kernel_size, const ScalarType *src, size_t src_stride,
385			ScalarType *dst, size_t dst_stride, Rectangle &rect, size_t y_begin,
386			size_t y_end, size_t channels, float sigma, FixedBorderType border_type,
387			SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING {
388	10/12 ✓ Branch 0 taken 94 times. ✓ Branch 1 taken 102 times. ✓ Branch 2 taken 66 times. ✓ Branch 3 taken 32 times. ✗ Branch 4 not taken. ✓ Branch 5 taken 32 times. ✓ Branch 6 taken 66 times. ✓ Branch 7 taken 66 times. ✓ Branch 8 taken 66 times. ✓ Branch 9 taken 64 times. ✗ Branch 10 not taken. ✓ Branch 11 taken 64 times.	652	switch (kernel_size) {
389			case 3:
390		160	return gaussian_blur_fixed_kernel_size<3, IsBinomial>(
391		160	src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
392		160	sigma, border_type, workspace);
393			case 5:
394		168	return gaussian_blur_fixed_kernel_size<5, IsBinomial>(
395		168	src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
396		168	sigma, border_type, workspace);
397			case 7:
398		132	return gaussian_blur_fixed_kernel_size<7, IsBinomial>(
399		132	src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
400		132	sigma, border_type, workspace);
401			case 15:
402			// 15x15 does not have a binomial variant
403		96	return gaussian_blur_fixed_kernel_size<15, false>(
404		96	src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
405		96	sigma, border_type, workspace);
406			case 21:
407			// 21x21 does not have a binomial variant
408		96	return gaussian_blur_fixed_kernel_size<21, false>(
409		96	src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
410		96	sigma, border_type, workspace);
411			// gaussian_blur_is_implemented checked the kernel size already.
412			// GCOVR_EXCL_START
413			default:
414			assert(!"kernel size not implemented");
415		−	return KLEIDICV_ERROR_NOT_IMPLEMENTED;
416			// GCOVR_EXCL_STOP
417			}
418		652	}
419
420		690	static kleidicv_error_t gaussian_blur_fixed_stripe_u8_sc(
421			const uint8_t src, size_t src_stride, uint8_t dst, size_t dst_stride,
422			size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
423			size_t kernel_width, size_t /kernel_height/, float sigma_x,
424			float /sigma_y/, FixedBorderType fixed_border_type,
425			kleidicv_filter_context_t *context) KLEIDICV_STREAMING {
426		690	auto workspace = reinterpret_cast<SeparableFilterWorkspace >(context);
427		1380	kleidicv_error_t checks_result = gaussian_blur_checks(
428		690	src, src_stride, dst, dst_stride, width, height, channels, workspace);
429
430	2/2 ✓ Branch 0 taken 38 times. ✓ Branch 1 taken 652 times.	690	if (checks_result != KLEIDICV_OK) {
431		38	return checks_result;
432			}
433
434		652	Rectangle rect{width, height};
435
436	2/2 ✓ Branch 0 taken 326 times. ✓ Branch 1 taken 326 times.	652	if (sigma_x == 0.0) {
437		652	return gaussian_blur<true>(kernel_width, src, src_stride, dst, dst_stride,
438		326	rect, y_begin, y_end, channels, sigma_x,
439		326	fixed_border_type, workspace);
440			}
441
442		652	return gaussian_blur<false>(kernel_width, src, src_stride, dst, dst_stride,
443		326	rect, y_begin, y_end, channels, sigma_x,
444		326	fixed_border_type, workspace);
445		690	}
446
447			} // namespace KLEIDICV_TARGET_NAMESPACE
448
449			#endif // KLEIDICV_GAUSSIAN_BLUR_SC_H
450