KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/filters/gaussian_blur_arbitrary_neon.cpp
Date:	2025-09-25 14:13:34

	Exec	Total	Coverage
Lines:	288	288	100.0%
Functions:	21	21	100.0%
Branches:	34	34	100.0%

  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #include <cassert>
    
      #include <cstddef>
    
      #include "border_generic_neon.h"
    
      #include "kleidicv/config.h"
    
      #include "kleidicv/ctypes.h"
    
      #include "kleidicv/filters/gaussian_blur.h"
    
      #include "kleidicv/filters/sigma.h"
    
      #include "kleidicv/neon.h"
    
      #include "kleidicv/workspace/border_types.h"
    
      #include "kleidicv/workspace/separable.h"
    
      namespace kleidicv::neon {
    
      // Template for arbitrary kernel size Gaussian Blur filters.
    
      template <typename ScalarType, FixedBorderType>
    
      class GaussianBlurArbitrary;
    
      template <FixedBorderType BorderT>
    
      class GaussianBlurArbitrary<uint8_t, BorderT> {
    
       public:
    
        using SourceType = uint8_t;
    
        using BufferType = uint8_t;
    
        using DestinationType = uint8_t;
    
        using SourceVecTraits = typename neon::VecTraits<SourceType>;
    
        using SourceVectorType = typename SourceVecTraits::VectorType;
    
        using BufferVecTraits = typename neon::VecTraits<BufferType>;
    
        using BufferVectorType = typename BufferVecTraits::VectorType;
    
        using BorderType = FixedBorderType;
    
      39
        GaussianBlurArbitrary(const uint16_t *half_kernel, ptrdiff_t half_kernel_size,
    
                              Rectangle &rect, size_t channels)
    
      39
            : half_kernel_size_(half_kernel_size),
    
      39
              half_kernel_u16_(half_kernel),
    
      39
              width_(static_cast<ptrdiff_t>(rect.width())),
    
      39
              vertical_border_(rect.height()),
    
      39
              horizontal_border_(rect.width(), channels) {}
    
        // Not border-affected parts
    
      156
        void process_arbitrary_vertical(size_t width, Rows<const SourceType> src_rows,
    
                                        Rows<BufferType> buffer_rows) const {
    
      312
          LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
    
      156
                                               SourceVecTraits::num_lanes()};
    
      612
          loop.unroll_once([&](size_t index) {
    
      456
            vertical_vector_path(src_rows, buffer_rows, index);
    
      456
          });
    
      208
          loop.tail([&](size_t index) {
    
      52
            vertical_scalar_path(src_rows, buffer_rows, index);
    
      52
          });
    
      156
        }
    
        // Border-affected parts
    
      360
        void process_arbitrary_border_vertical(size_t width,
    
                                               Rows<const SourceType> src_rows,
    
                                               ptrdiff_t row_index,
    
                                               Rows<BufferType> buffer_rows) const {
    
      720
          LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
    
      360
                                               SourceVecTraits::num_lanes()};
    
      1760
          loop.unroll_once([&](size_t column_index) {
    
      2800
            vertical_border_vector_path(src_rows, buffer_rows, row_index,
    
      1400
                                        column_index);
    
      1400
          });
    
      880
          loop.tail([&](size_t column_index) {
    
      1040
            vertical_border_scalar_path(src_rows, buffer_rows, row_index,
    
      520
                                        column_index);
    
      520
          });
    
      360
        }
    
      516
        void process_arbitrary_horizontal(
    
            size_t width, size_t kernel_size, Rows<BufferType> buffer_rows,
    
            Rows<DestinationType> dst_rows) KLEIDICV_STREAMING {
    
      516
          size_t x = 0;
    
          // Assume that there is always a widening when calculating, so the
    
          // horizontal vector path processes double-width vectors
    
      516
          const size_t num_lanes = BufferVecTraits::num_lanes() / 2;
    
      516
          const size_t block_len = num_lanes;
    
      516
          const size_t margin = kernel_size / 2;
    
      516
          const size_t border_len = buffer_rows.channels() * margin;
    
      1032
          const size_t border_process_len =
    
      516
              ((border_len + block_len - 1) / block_len) * block_len;
    
        2/2✓ Branch 0 taken 636 times.
✓ Branch 1 taken 516 times.

      1152
          for (; x < border_process_len; x += num_lanes) {
    
      636
            horizontal_left_border_vector_path(buffer_rows, dst_rows, x);
    
      636
          }
    
          // Process data which is not affected by any borders in bulk.
    
        2/2✓ Branch 0 taken 432 times.
✓ Branch 1 taken 84 times.

      516
          if (width * buffer_rows.channels() > 2 * border_process_len) {
    
      864
            size_t total_width_without_borders =
    
      432
                width * buffer_rows.channels() - 2 * border_process_len;
    
      864
            LoopUnroll2<TryToAvoidTailLoop> loop{total_width_without_borders,
    
      432
                                                 BufferVecTraits::num_lanes()};
    
      694
            loop.unroll_twice([&](size_t index) {
    
      262
              horizontal_vector_path(buffer_rows, dst_rows, x + index);
    
      524
              horizontal_vector_path(buffer_rows, dst_rows,
    
      262
                                     x + index + BufferVecTraits::num_lanes());
    
      262
            });
    
      1088
            loop.unroll_once([&](size_t index) {
    
      656
              horizontal_vector_path(buffer_rows, dst_rows, x + index);
    
      656
            });
    
      756
            loop.tail([&](size_t index) {
    
      324
              horizontal_scalar_path(buffer_rows, dst_rows, x + index);
    
      324
            });
    
      432
            x += total_width_without_borders;
    
      432
          } else {
    
            // rewind if needed, so we'll have exact vector paths at the right side
    
      84
            x = width * buffer_rows.channels() - border_process_len;
    
          }
    
        2/2✓ Branch 0 taken 636 times.
✓ Branch 1 taken 516 times.

      1152
          for (; x < width * buffer_rows.channels(); x += num_lanes) {
    
      636
            horizontal_right_border_vector_path(buffer_rows, dst_rows, x);
    
      636
          }
    
      516
        }
    
       private:
    
      456
        void vertical_vector_path(Rows<const SourceType> src_rows,
    
                                  Rows<BufferType> dst_rows, ptrdiff_t x) const {
    
      456
          uint8x16_t src_mid = vld1q_u8(&src_rows[x]);
    
      912
          uint8x8_t half_kernel_mid = vdup_n_u8(
    
      456
              static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
    
      456
          uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
    
      456
          uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
    
      456
          ptrdiff_t i = 0;
    
          // Unroll 4 times
    
        2/2✓ Branch 0 taken 456 times.
✓ Branch 1 taken 456 times.

      912
          for (; i < half_kernel_size_ - 4; i += 4) {
    
      456
            uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]);
    
      456
            uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]);
    
      456
            uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
    
      456
            uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
    
      456
            uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
    
      456
            uint16x8_t prod0_l = vmulq_u16(vec_l, coeff);
    
      456
            uint16x8_t prod0_h = vmulq_u16(vec_h, coeff);
    
      456
            src_i = vld1q_u8(&src_rows.at(i + 2 - half_kernel_size_)[x]);
    
      456
            src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 2)[x]);
    
      456
            vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
    
      456
            vec_h = vaddl_high_u8(src_i, src_j);
    
      456
            coeff = vdupq_n_u16(half_kernel_u16_[i + 1]);
    
      456
            uint16x8_t prod1_l = vmulq_u16(vec_l, coeff);
    
      456
            uint16x8_t prod1_h = vmulq_u16(vec_h, coeff);
    
      456
            src_i = vld1q_u8(&src_rows.at(i + 3 - half_kernel_size_)[x]);
    
      456
            src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 3)[x]);
    
      456
            vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
    
      456
            vec_h = vaddl_high_u8(src_i, src_j);
    
      456
            coeff = vdupq_n_u16(half_kernel_u16_[i + 2]);
    
      456
            uint16x8_t prod2_l = vmulq_u16(vec_l, coeff);
    
      456
            uint16x8_t prod2_h = vmulq_u16(vec_h, coeff);
    
      456
            src_i = vld1q_u8(&src_rows.at(i + 4 - half_kernel_size_)[x]);
    
      456
            src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 4)[x]);
    
      456
            vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
    
      456
            vec_h = vaddl_high_u8(src_i, src_j);
    
      456
            coeff = vdupq_n_u16(half_kernel_u16_[i + 3]);
    
      456
            uint16x8_t prod3_l = vmulq_u16(vec_l, coeff);
    
      456
            uint16x8_t prod3_h = vmulq_u16(vec_h, coeff);
    
      456
            uint16x8_t acc0_l = vaddq_u16(prod0_l, prod1_l);
    
      456
            uint16x8_t acc0_h = vaddq_u16(prod0_h, prod1_h);
    
      456
            uint16x8_t acc1_l = vaddq_u16(prod2_l, prod3_l);
    
      456
            uint16x8_t acc1_h = vaddq_u16(prod2_h, prod3_h);
    
      456
            uint16x8_t acc_new_l = vaddq_u16(acc0_l, acc1_l);
    
      456
            uint16x8_t acc_new_h = vaddq_u16(acc0_h, acc1_h);
    
      456
            acc_l = vaddq_u16(acc_l, acc_new_l);
    
      456
            acc_h = vaddq_u16(acc_h, acc_new_h);
    
      456
          }
    
        2/2✓ Branch 0 taken 456 times.
✓ Branch 1 taken 456 times.

      912
          for (; i < half_kernel_size_ - 1; ++i) {
    
      456
            uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]);
    
      456
            uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]);
    
      456
            uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
    
      456
            uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
    
      456
            uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
    
      456
            acc_l = vmlaq_u16(acc_l, vec_l, coeff);
    
      456
            acc_h = vmlaq_u16(acc_h, vec_h, coeff);
    
      456
          }
    
          // Rounding
    
      456
          acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
    
      456
          acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
    
          // Keep only the highest 8 bits
    
      912
          uint8x16_t result =
    
      456
              vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
    
      456
          neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
    
      456
        }
    
        // Where y is affected by border
    
      1400
        void vertical_border_vector_path(Rows<const SourceType> src_rows,
    
                                         Rows<BufferType> dst_rows, ptrdiff_t y,
    
                                         ptrdiff_t x) const {
    
      1400
          uint8x16_t src_mid = vld1q_u8(&src_rows.at(y)[x]);
    
      2800
          uint8x8_t half_kernel_mid = vdup_n_u8(
    
      1400
              static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
    
      1400
          uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
    
      1400
          uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
    
      1400
          ptrdiff_t i = 0;
    
        2/2✓ Branch 0 taken 7000 times.
✓ Branch 1 taken 1400 times.

      8400
          for (; i < half_kernel_size_ - 1; ++i) {
    
      7000
            uint8x16_t src_i = vld1q_u8(&src_rows.at(
    
                vertical_border_.get_row(y - half_kernel_size_ + 1 + i))[x]);
    
      7000
            uint8x16_t src_j = vld1q_u8(&src_rows.at(
    
                vertical_border_.get_row(y + half_kernel_size_ - 1 - i))[x]);
    
      7000
            uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
    
      7000
            uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
    
      7000
            uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
    
      7000
            acc_l = vmlaq_u16(acc_l, vec_l, coeff);
    
      7000
            acc_h = vmlaq_u16(acc_h, vec_h, coeff);
    
      7000
          }
    
          // Rounding
    
      1400
          acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
    
      1400
          acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
    
          // Keep only the highest 8 bits
    
      2800
          uint8x16_t result =
    
      1400
              vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
    
      1400
          neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
    
      1400
        }
    
      52
        void vertical_scalar_path(Rows<const SourceType> src_rows,
    
                                  Rows<BufferType> dst_rows, ptrdiff_t x) const {
    
      104
          uint32_t acc = static_cast<uint32_t>(src_rows[x]) *
    
      52
                         half_kernel_u16_[half_kernel_size_ - 1];
    
        2/2✓ Branch 0 taken 52 times.
✓ Branch 1 taken 260 times.

      312
          for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
    
      260
            acc +=
    
      520
                (static_cast<uint32_t>(src_rows.at(i + 1 - half_kernel_size_)[x]) +
    
      520
                 static_cast<uint32_t>(src_rows.at(half_kernel_size_ - i - 1)[x])) *
    
      260
                half_kernel_u16_[i];
    
      260
          }
    
      52
          dst_rows[x] = static_cast<BufferType>(rounding_shift_right(acc, 8));
    
      52
        }
    
      520
        void vertical_border_scalar_path(Rows<const SourceType> src_rows,
    
                                         Rows<BufferType> dst_rows, ptrdiff_t y,
    
                                         ptrdiff_t x) const {
    
      1040
          uint32_t acc = static_cast<uint32_t>(src_rows.at(y)[x]) *
    
      520
                         half_kernel_u16_[half_kernel_size_ - 1];
    
        2/2✓ Branch 0 taken 520 times.
✓ Branch 1 taken 2600 times.

      3120
          for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
    
      10400
            acc += (static_cast<uint32_t>(src_rows.at(
    
      7800
                        vertical_border_.get_row(y + i + 1 - half_kernel_size_))[x]) +
    
      7800
                    static_cast<uint32_t>(src_rows.at(vertical_border_.get_row(
    
      7800
                        y + half_kernel_size_ - i - 1))[x])) *
    
      2600
                   half_kernel_u16_[i];
    
      2600
          }
    
      520
          dst_rows[x] = static_cast<BufferType>(rounding_shift_right(acc, 8));
    
      520
        }
    
      1180
        void horizontal_vector_path(Rows<BufferType> src_rows,
    
                                    Rows<DestinationType> dst_rows,
    
                                    ptrdiff_t x) const {
    
          // very similar to the vertical path, the difference is only the loading
    
          // pattern
    
      1180
          uint8x16_t src_mid = vld1q_u8(&src_rows[x]);
    
      2360
          uint8x8_t half_kernel_mid = vdup_n_u8(
    
      1180
              static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
    
      1180
          uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
    
      1180
          uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
    
      2360
          ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels()),
    
      1180
                    left = x - ch * (half_kernel_size_ - 1),
    
      1180
                    right = x + ch * (half_kernel_size_ - 1);
    
        2/2✓ Branch 0 taken 1180 times.
✓ Branch 1 taken 5900 times.

      7080
          for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; ++i) {
    
      5900
            uint8x16_t src_i = vld1q_u8(&src_rows[left + i * ch]);
    
      5900
            uint8x16_t src_j = vld1q_u8(&src_rows[right - i * ch]);
    
      5900
            uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
    
      5900
            uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
    
      5900
            uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
    
      5900
            acc_l = vmlaq_u16(acc_l, vec_l, coeff);
    
      5900
            acc_h = vmlaq_u16(acc_h, vec_h, coeff);
    
      5900
          }
    
          // Rounding
    
      1180
          acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
    
      1180
          acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
    
          // Keep only the highest 8 bits
    
      2360
          uint8x16_t result =
    
      1180
              vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
    
      1180
          neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
    
      1180
        }
    
      636
        void horizontal_left_border_vector_path(Rows<BufferType> src_rows,
    
                                                Rows<DestinationType> dst_rows,
    
                                                ptrdiff_t x) const {
    
          // similar to the simple horizontal path, except the loading pattern:
    
          // - this is loading indirect columns, and half of that data
    
      636
          uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x]));
    
      1272
          uint16x8_t acc =
    
      636
              vmulq_n_u16(src_mid, half_kernel_u16_[half_kernel_size_ - 1]);
    
      636
          ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
    
      636
          ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1),
    
      636
                    right = x + ch * (half_kernel_size_ - 1);
    
        2/2✓ Branch 0 taken 2820 times.
✓ Branch 1 taken 636 times.

      3456
          for (; i * ch + left < 0; ++i) {
    
      2820
            uint16x8_t src_i = horizontal_border_.load_left(src_rows, left + i * ch);
    
      2820
            uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
    
      2820
            uint16x8_t vec = vaddq_u16(src_i, src_j);
    
      2820
            uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
    
      2820
            acc = vmlaq_u16(acc, vec, coeff);
    
      2820
          }
    
        2/2✓ Branch 0 taken 360 times.
✓ Branch 1 taken 636 times.

      996
          for (; i < half_kernel_size_ - 1; ++i) {
    
      360
            uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
    
      360
            uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
    
      360
            uint16x8_t vec = vaddq_u16(src_i, src_j);
    
      360
            uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
    
      360
            acc = vmlaq_u16(acc, vec, coeff);
    
      360
          }
    
          // Store only the highest 8 bits
    
      636
          uint8x8_t result = vrshrn_n_u16(acc, 8);
    
      636
          vst1_u8(&dst_rows[x], result);
    
      636
        }
    
      636
        void horizontal_right_border_vector_path(Rows<BufferType> src_rows,
    
                                                 Rows<DestinationType> dst_rows,
    
                                                 ptrdiff_t x) const {
    
          // similar to the simple horizontal path, except the loading pattern:
    
          // - this is loading indirect columns, and half of that data
    
      636
          uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x]));
    
      1272
          uint16x8_t acc =
    
      636
              vmulq_n_u16(src_mid, half_kernel_u16_[half_kernel_size_ - 1]);
    
      636
          ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
    
      636
          ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1),
    
      636
                    right = x + ch * (half_kernel_size_ - 1);
    
        2/2✓ Branch 0 taken 2820 times.
✓ Branch 1 taken 636 times.

      3456
          for (; right - i * ch > width_ * ch - 8; ++i) {
    
      2820
            uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
    
      5640
            uint16x8_t src_j =
    
      2820
                horizontal_border_.load_right(src_rows, right - i * ch);
    
      2820
            uint16x8_t vec = vaddq_u16(src_i, src_j);
    
      2820
            uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
    
      2820
            acc = vmlaq_u16(acc, vec, coeff);
    
      2820
          }
    
        2/2✓ Branch 0 taken 360 times.
✓ Branch 1 taken 636 times.

      996
          for (; i < half_kernel_size_ - 1; ++i) {
    
      360
            uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
    
      360
            uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
    
      360
            uint16x8_t vec = vaddq_u16(src_i, src_j);
    
      360
            uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
    
      360
            acc = vmlaq_u16(acc, vec, coeff);
    
      360
          }
    
          // Store only the highest 8 bits
    
      636
          uint8x8_t result = vrshrn_n_u16(acc, 8);
    
      636
          vst1_u8(&dst_rows[x], result);
    
      636
        }
    
      324
        void horizontal_scalar_path(Rows<BufferType> src_rows,
    
                                    Rows<DestinationType> dst_rows,
    
                                    ptrdiff_t x) const {
    
      648
          uint32_t acc = static_cast<uint32_t>(src_rows[x]) *
    
      324
                         half_kernel_u16_[half_kernel_size_ - 1];
    
      324
          ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
    
      324
          ptrdiff_t channel_offset = x % ch;
    
      324
          ptrdiff_t left_col = x / ch - (half_kernel_size_ - 1),
    
      324
                    right_col = x / ch + (half_kernel_size_ - 1);
    
        2/2✓ Branch 0 taken 324 times.
✓ Branch 1 taken 1620 times.

      1944
          for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
    
      1620
            acc += (static_cast<uint32_t>(
    
      3240
                        src_rows[horizontal_border_.get_column(left_col + i) * ch +
    
      3240
                                 channel_offset]) +
    
                    static_cast<uint32_t>(
    
      3240
                        src_rows[horizontal_border_.get_column(right_col - i) * ch +
    
      3240
                                 channel_offset])) *
    
      1620
                   half_kernel_u16_[i];
    
      1620
          }
    
      324
          dst_rows[x] = static_cast<DestinationType>(rounding_shift_right(acc, 8));
    
      324
        }
    
        const ptrdiff_t half_kernel_size_;
    
        const uint16_t *half_kernel_u16_;
    
        const ptrdiff_t width_;
    
        KLEIDICV_TARGET_NAMESPACE::GenericBorderVertical<BorderT> vertical_border_;
    
        KLEIDICV_TARGET_NAMESPACE::GenericBorderHorizontal<BorderT>
    
            horizontal_border_;
    
      };  // end of class GaussianBlurArbitrary<uint8_t>
    
      template <typename ScalarType>
    
      54
      static kleidicv_error_t gaussian_blur_arbitrary_kernel_size(
    
          const ScalarType *src, size_t src_stride, ScalarType *dst,
    
          size_t dst_stride, Rectangle &rect, size_t kernel_size, size_t y_begin,
    
          size_t y_end, size_t channels, float sigma, FixedBorderType border_type,
    
          SeparableFilterWorkspace *workspace) {
    
      54
        Rows<const ScalarType> src_rows{src, src_stride, channels};
    
      54
        Rows<ScalarType> dst_rows{dst, dst_stride, channels};
    
      108
        const ptrdiff_t kHalfKernelSize =
    
      54
            static_cast<ptrdiff_t>(get_half_kernel_size(kernel_size));
    
      54
        uint16_t half_kernel[128];
    
      54
        generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma);
    
        // If sigma is so small that the middle point gets all the weights, it's
    
        // just a copy
    
        2/2✓ Branch 0 taken 39 times.
✓ Branch 1 taken 15 times.

      54
        if (half_kernel[kHalfKernelSize - 1] < 256) {
    
          // Only replicated border is implemented so far.
    
      78
          GaussianBlurArbitrary<ScalarType, FixedBorderType::REPLICATE> filter{
    
      39
              half_kernel, kHalfKernelSize, rect, src_rows.channels()};
    
      78
          workspace->process_arbitrary(rect, kernel_size, y_begin, y_end, src_rows,
    
      39
                                       dst_rows, channels, border_type, filter);
    
      39
        } else {
    
        2/2✓ Branch 0 taken 156 times.
✓ Branch 1 taken 15 times.

      171
          for (size_t row = y_begin; row < y_end; ++row) {
    
      312
            std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]),
    
      156
                        static_cast<const void *>(&src_rows.at(row)[0]),
    
      156
                        rect.width() * sizeof(ScalarType) * dst_rows.channels());
    
      156
          }
    
        }
    
      54
        return KLEIDICV_OK;
    
      54
      }
    
      KLEIDICV_TARGET_FN_ATTRS
    
      66
      kleidicv_error_t gaussian_blur_arbitrary_stripe_u8(
    
          const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
    
          size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
    
          size_t kernel_width, size_t /*kernel_height*/, float sigma_x,
    
          float /*sigma_y*/, FixedBorderType fixed_border_type,
    
          kleidicv_filter_context_t *context) {
    
      66
        auto *workspace = reinterpret_cast<SeparableFilterWorkspace *>(context);
    
      132
        kleidicv_error_t checks_result = gaussian_blur_checks(
    
      66
            src, src_stride, dst, dst_stride, width, height, channels, workspace);
    
        2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 54 times.

      66
        if (checks_result != KLEIDICV_OK) {
    
      12
          return checks_result;
    
        }
    
      54
        Rectangle rect{width, height};
    
      54
        return gaussian_blur_arbitrary_kernel_size(
    
      54
            src, src_stride, dst, dst_stride, rect, kernel_width, y_begin, y_end,
    
      54
            channels, sigma_x, fixed_border_type, workspace);
    
      66
      }
    
      }  // namespace kleidicv::neon

Line	Branch	Exec	Source
1			// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2			//
3			// SPDX-License-Identifier: Apache-2.0
4
5			#include <cassert>
6			#include <cstddef>
7
8			#include "border_generic_neon.h"
9			#include "kleidicv/config.h"
10			#include "kleidicv/ctypes.h"
11			#include "kleidicv/filters/gaussian_blur.h"
12			#include "kleidicv/filters/sigma.h"
13			#include "kleidicv/neon.h"
14			#include "kleidicv/workspace/border_types.h"
15			#include "kleidicv/workspace/separable.h"
16
17			namespace kleidicv::neon {
18
19			// Template for arbitrary kernel size Gaussian Blur filters.
20			template <typename ScalarType, FixedBorderType>
21			class GaussianBlurArbitrary;
22
23			template <FixedBorderType BorderT>
24			class GaussianBlurArbitrary<uint8_t, BorderT> {
25			public:
26			using SourceType = uint8_t;
27			using BufferType = uint8_t;
28			using DestinationType = uint8_t;
29			using SourceVecTraits = typename neon::VecTraits<SourceType>;
30			using SourceVectorType = typename SourceVecTraits::VectorType;
31			using BufferVecTraits = typename neon::VecTraits<BufferType>;
32			using BufferVectorType = typename BufferVecTraits::VectorType;
33			using BorderType = FixedBorderType;
34
35		39	GaussianBlurArbitrary(const uint16_t *half_kernel, ptrdiff_t half_kernel_size,
36			Rectangle &rect, size_t channels)
37		39	: half_kernel_size_(half_kernel_size),
38		39	half_kernel_u16_(half_kernel),
39		39	width_(static_cast<ptrdiff_t>(rect.width())),
40		39	vertical_border_(rect.height()),
41		39	horizontal_border_(rect.width(), channels) {}
42
43			// Not border-affected parts
44		156	void process_arbitrary_vertical(size_t width, Rows<const SourceType> src_rows,
45			Rows<BufferType> buffer_rows) const {
46		312	LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
47		156	SourceVecTraits::num_lanes()};
48
49		612	loop.unroll_once([&](size_t index) {
50		456	vertical_vector_path(src_rows, buffer_rows, index);
51		456	});
52
53		208	loop.tail([&](size_t index) {
54		52	vertical_scalar_path(src_rows, buffer_rows, index);
55		52	});
56		156	}
57
58			// Border-affected parts
59		360	void process_arbitrary_border_vertical(size_t width,
60			Rows<const SourceType> src_rows,
61			ptrdiff_t row_index,
62			Rows<BufferType> buffer_rows) const {
63		720	LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
64		360	SourceVecTraits::num_lanes()};
65
66		1760	loop.unroll_once([&](size_t column_index) {
67		2800	vertical_border_vector_path(src_rows, buffer_rows, row_index,
68		1400	column_index);
69		1400	});
70
71		880	loop.tail([&](size_t column_index) {
72		1040	vertical_border_scalar_path(src_rows, buffer_rows, row_index,
73		520	column_index);
74		520	});
75		360	}
76
77		516	void process_arbitrary_horizontal(
78			size_t width, size_t kernel_size, Rows<BufferType> buffer_rows,
79			Rows<DestinationType> dst_rows) KLEIDICV_STREAMING {
80		516	size_t x = 0;
81			// Assume that there is always a widening when calculating, so the
82			// horizontal vector path processes double-width vectors
83		516	const size_t num_lanes = BufferVecTraits::num_lanes() / 2;
84		516	const size_t block_len = num_lanes;
85		516	const size_t margin = kernel_size / 2;
86		516	const size_t border_len = buffer_rows.channels() * margin;
87		1032	const size_t border_process_len =
88		516	((border_len + block_len - 1) / block_len) * block_len;
89
90	2/2 ✓ Branch 0 taken 636 times. ✓ Branch 1 taken 516 times.	1152	for (; x < border_process_len; x += num_lanes) {
91		636	horizontal_left_border_vector_path(buffer_rows, dst_rows, x);
92		636	}
93
94			// Process data which is not affected by any borders in bulk.
95	2/2 ✓ Branch 0 taken 432 times. ✓ Branch 1 taken 84 times.	516	if (width * buffer_rows.channels() > 2 * border_process_len) {
96		864	size_t total_width_without_borders =
97		432	width * buffer_rows.channels() - 2 * border_process_len;
98
99		864	LoopUnroll2<TryToAvoidTailLoop> loop{total_width_without_borders,
100		432	BufferVecTraits::num_lanes()};
101
102		694	loop.unroll_twice([&](size_t index) {
103		262	horizontal_vector_path(buffer_rows, dst_rows, x + index);
104		524	horizontal_vector_path(buffer_rows, dst_rows,
105		262	x + index + BufferVecTraits::num_lanes());
106		262	});
107
108		1088	loop.unroll_once([&](size_t index) {
109		656	horizontal_vector_path(buffer_rows, dst_rows, x + index);
110		656	});
111
112		756	loop.tail([&](size_t index) {
113		324	horizontal_scalar_path(buffer_rows, dst_rows, x + index);
114		324	});
115
116		432	x += total_width_without_borders;
117		432	} else {
118			// rewind if needed, so we'll have exact vector paths at the right side
119		84	x = width * buffer_rows.channels() - border_process_len;
120			}
121
122	2/2 ✓ Branch 0 taken 636 times. ✓ Branch 1 taken 516 times.	1152	for (; x < width * buffer_rows.channels(); x += num_lanes) {
123		636	horizontal_right_border_vector_path(buffer_rows, dst_rows, x);
124		636	}
125		516	}
126
127			private:
128		456	void vertical_vector_path(Rows<const SourceType> src_rows,
129			Rows<BufferType> dst_rows, ptrdiff_t x) const {
130		456	uint8x16_t src_mid = vld1q_u8(&src_rows[x]);
131		912	uint8x8_t half_kernel_mid = vdup_n_u8(
132		456	static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
133		456	uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
134		456	uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
135
136		456	ptrdiff_t i = 0;
137			// Unroll 4 times
138	2/2 ✓ Branch 0 taken 456 times. ✓ Branch 1 taken 456 times.	912	for (; i < half_kernel_size_ - 4; i += 4) {
139		456	uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]);
140		456	uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]);
141		456	uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
142		456	uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
143		456	uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
144		456	uint16x8_t prod0_l = vmulq_u16(vec_l, coeff);
145		456	uint16x8_t prod0_h = vmulq_u16(vec_h, coeff);
146
147		456	src_i = vld1q_u8(&src_rows.at(i + 2 - half_kernel_size_)[x]);
148		456	src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 2)[x]);
149		456	vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
150		456	vec_h = vaddl_high_u8(src_i, src_j);
151		456	coeff = vdupq_n_u16(half_kernel_u16_[i + 1]);
152		456	uint16x8_t prod1_l = vmulq_u16(vec_l, coeff);
153		456	uint16x8_t prod1_h = vmulq_u16(vec_h, coeff);
154
155		456	src_i = vld1q_u8(&src_rows.at(i + 3 - half_kernel_size_)[x]);
156		456	src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 3)[x]);
157		456	vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
158		456	vec_h = vaddl_high_u8(src_i, src_j);
159		456	coeff = vdupq_n_u16(half_kernel_u16_[i + 2]);
160		456	uint16x8_t prod2_l = vmulq_u16(vec_l, coeff);
161		456	uint16x8_t prod2_h = vmulq_u16(vec_h, coeff);
162
163		456	src_i = vld1q_u8(&src_rows.at(i + 4 - half_kernel_size_)[x]);
164		456	src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 4)[x]);
165		456	vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
166		456	vec_h = vaddl_high_u8(src_i, src_j);
167		456	coeff = vdupq_n_u16(half_kernel_u16_[i + 3]);
168		456	uint16x8_t prod3_l = vmulq_u16(vec_l, coeff);
169		456	uint16x8_t prod3_h = vmulq_u16(vec_h, coeff);
170
171		456	uint16x8_t acc0_l = vaddq_u16(prod0_l, prod1_l);
172		456	uint16x8_t acc0_h = vaddq_u16(prod0_h, prod1_h);
173		456	uint16x8_t acc1_l = vaddq_u16(prod2_l, prod3_l);
174		456	uint16x8_t acc1_h = vaddq_u16(prod2_h, prod3_h);
175
176		456	uint16x8_t acc_new_l = vaddq_u16(acc0_l, acc1_l);
177		456	uint16x8_t acc_new_h = vaddq_u16(acc0_h, acc1_h);
178
179		456	acc_l = vaddq_u16(acc_l, acc_new_l);
180		456	acc_h = vaddq_u16(acc_h, acc_new_h);
181		456	}
182
183	2/2 ✓ Branch 0 taken 456 times. ✓ Branch 1 taken 456 times.	912	for (; i < half_kernel_size_ - 1; ++i) {
184		456	uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]);
185		456	uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]);
186		456	uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
187		456	uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
188		456	uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
189		456	acc_l = vmlaq_u16(acc_l, vec_l, coeff);
190		456	acc_h = vmlaq_u16(acc_h, vec_h, coeff);
191		456	}
192
193			// Rounding
194		456	acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
195		456	acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
196			// Keep only the highest 8 bits
197		912	uint8x16_t result =
198		456	vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
199		456	neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
200		456	}
201
202			// Where y is affected by border
203		1400	void vertical_border_vector_path(Rows<const SourceType> src_rows,
204			Rows<BufferType> dst_rows, ptrdiff_t y,
205			ptrdiff_t x) const {
206		1400	uint8x16_t src_mid = vld1q_u8(&src_rows.at(y)[x]);
207		2800	uint8x8_t half_kernel_mid = vdup_n_u8(
208		1400	static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
209		1400	uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
210		1400	uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
211
212		1400	ptrdiff_t i = 0;
213	2/2 ✓ Branch 0 taken 7000 times. ✓ Branch 1 taken 1400 times.	8400	for (; i < half_kernel_size_ - 1; ++i) {
214		7000	uint8x16_t src_i = vld1q_u8(&src_rows.at(
215			vertical_border_.get_row(y - half_kernel_size_ + 1 + i))[x]);
216		7000	uint8x16_t src_j = vld1q_u8(&src_rows.at(
217			vertical_border_.get_row(y + half_kernel_size_ - 1 - i))[x]);
218		7000	uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
219		7000	uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
220		7000	uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
221		7000	acc_l = vmlaq_u16(acc_l, vec_l, coeff);
222		7000	acc_h = vmlaq_u16(acc_h, vec_h, coeff);
223		7000	}
224
225			// Rounding
226		1400	acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
227		1400	acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
228			// Keep only the highest 8 bits
229		2800	uint8x16_t result =
230		1400	vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
231		1400	neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
232		1400	}
233
234		52	void vertical_scalar_path(Rows<const SourceType> src_rows,
235			Rows<BufferType> dst_rows, ptrdiff_t x) const {
236		104	uint32_t acc = static_cast<uint32_t>(src_rows[x]) *
237		52	half_kernel_u16_[half_kernel_size_ - 1];
238
239	2/2 ✓ Branch 0 taken 52 times. ✓ Branch 1 taken 260 times.	312	for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
240		260	acc +=
241		520	(static_cast<uint32_t>(src_rows.at(i + 1 - half_kernel_size_)[x]) +
242		520	static_cast<uint32_t>(src_rows.at(half_kernel_size_ - i - 1)[x])) *
243		260	half_kernel_u16_[i];
244		260	}
245
246		52	dst_rows[x] = static_cast<BufferType>(rounding_shift_right(acc, 8));
247		52	}
248
249		520	void vertical_border_scalar_path(Rows<const SourceType> src_rows,
250			Rows<BufferType> dst_rows, ptrdiff_t y,
251			ptrdiff_t x) const {
252		1040	uint32_t acc = static_cast<uint32_t>(src_rows.at(y)[x]) *
253		520	half_kernel_u16_[half_kernel_size_ - 1];
254
255	2/2 ✓ Branch 0 taken 520 times. ✓ Branch 1 taken 2600 times.	3120	for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
256		10400	acc += (static_cast<uint32_t>(src_rows.at(
257		7800	vertical_border_.get_row(y + i + 1 - half_kernel_size_))[x]) +
258		7800	static_cast<uint32_t>(src_rows.at(vertical_border_.get_row(
259		7800	y + half_kernel_size_ - i - 1))[x])) *
260		2600	half_kernel_u16_[i];
261		2600	}
262
263		520	dst_rows[x] = static_cast<BufferType>(rounding_shift_right(acc, 8));
264		520	}
265
266		1180	void horizontal_vector_path(Rows<BufferType> src_rows,
267			Rows<DestinationType> dst_rows,
268			ptrdiff_t x) const {
269			// very similar to the vertical path, the difference is only the loading
270			// pattern
271		1180	uint8x16_t src_mid = vld1q_u8(&src_rows[x]);
272		2360	uint8x8_t half_kernel_mid = vdup_n_u8(
273		1180	static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
274		1180	uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
275		1180	uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
276
277		2360	ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels()),
278		1180	left = x - ch * (half_kernel_size_ - 1),
279		1180	right = x + ch * (half_kernel_size_ - 1);
280	2/2 ✓ Branch 0 taken 1180 times. ✓ Branch 1 taken 5900 times.	7080	for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; ++i) {
281		5900	uint8x16_t src_i = vld1q_u8(&src_rows[left + i * ch]);
282		5900	uint8x16_t src_j = vld1q_u8(&src_rows[right - i * ch]);
283		5900	uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
284		5900	uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
285		5900	uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
286		5900	acc_l = vmlaq_u16(acc_l, vec_l, coeff);
287		5900	acc_h = vmlaq_u16(acc_h, vec_h, coeff);
288		5900	}
289
290			// Rounding
291		1180	acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
292		1180	acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
293			// Keep only the highest 8 bits
294		2360	uint8x16_t result =
295		1180	vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
296		1180	neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
297		1180	}
298
299		636	void horizontal_left_border_vector_path(Rows<BufferType> src_rows,
300			Rows<DestinationType> dst_rows,
301			ptrdiff_t x) const {
302			// similar to the simple horizontal path, except the loading pattern:
303			// - this is loading indirect columns, and half of that data
304		636	uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x]));
305		1272	uint16x8_t acc =
306		636	vmulq_n_u16(src_mid, half_kernel_u16_[half_kernel_size_ - 1]);
307
308		636	ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
309		636	ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1),
310		636	right = x + ch * (half_kernel_size_ - 1);
311	2/2 ✓ Branch 0 taken 2820 times. ✓ Branch 1 taken 636 times.	3456	for (; i * ch + left < 0; ++i) {
312		2820	uint16x8_t src_i = horizontal_border_.load_left(src_rows, left + i * ch);
313		2820	uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
314		2820	uint16x8_t vec = vaddq_u16(src_i, src_j);
315		2820	uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
316		2820	acc = vmlaq_u16(acc, vec, coeff);
317		2820	}
318
319	2/2 ✓ Branch 0 taken 360 times. ✓ Branch 1 taken 636 times.	996	for (; i < half_kernel_size_ - 1; ++i) {
320		360	uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
321		360	uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
322		360	uint16x8_t vec = vaddq_u16(src_i, src_j);
323		360	uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
324		360	acc = vmlaq_u16(acc, vec, coeff);
325		360	}
326
327			// Store only the highest 8 bits
328		636	uint8x8_t result = vrshrn_n_u16(acc, 8);
329		636	vst1_u8(&dst_rows[x], result);
330		636	}
331
332		636	void horizontal_right_border_vector_path(Rows<BufferType> src_rows,
333			Rows<DestinationType> dst_rows,
334			ptrdiff_t x) const {
335			// similar to the simple horizontal path, except the loading pattern:
336			// - this is loading indirect columns, and half of that data
337		636	uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x]));
338		1272	uint16x8_t acc =
339		636	vmulq_n_u16(src_mid, half_kernel_u16_[half_kernel_size_ - 1]);
340
341		636	ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
342		636	ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1),
343		636	right = x + ch * (half_kernel_size_ - 1);
344	2/2 ✓ Branch 0 taken 2820 times. ✓ Branch 1 taken 636 times.	3456	for (; right - i * ch > width_ * ch - 8; ++i) {
345		2820	uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
346		5640	uint16x8_t src_j =
347		2820	horizontal_border_.load_right(src_rows, right - i * ch);
348		2820	uint16x8_t vec = vaddq_u16(src_i, src_j);
349		2820	uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
350		2820	acc = vmlaq_u16(acc, vec, coeff);
351		2820	}
352
353	2/2 ✓ Branch 0 taken 360 times. ✓ Branch 1 taken 636 times.	996	for (; i < half_kernel_size_ - 1; ++i) {
354		360	uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
355		360	uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
356		360	uint16x8_t vec = vaddq_u16(src_i, src_j);
357		360	uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
358		360	acc = vmlaq_u16(acc, vec, coeff);
359		360	}
360
361			// Store only the highest 8 bits
362		636	uint8x8_t result = vrshrn_n_u16(acc, 8);
363		636	vst1_u8(&dst_rows[x], result);
364		636	}
365
366		324	void horizontal_scalar_path(Rows<BufferType> src_rows,
367			Rows<DestinationType> dst_rows,
368			ptrdiff_t x) const {
369		648	uint32_t acc = static_cast<uint32_t>(src_rows[x]) *
370		324	half_kernel_u16_[half_kernel_size_ - 1];
371		324	ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
372		324	ptrdiff_t channel_offset = x % ch;
373		324	ptrdiff_t left_col = x / ch - (half_kernel_size_ - 1),
374		324	right_col = x / ch + (half_kernel_size_ - 1);
375
376	2/2 ✓ Branch 0 taken 324 times. ✓ Branch 1 taken 1620 times.	1944	for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
377		1620	acc += (static_cast<uint32_t>(
378		3240	src_rows[horizontal_border_.get_column(left_col + i) * ch +
379		3240	channel_offset]) +
380			static_cast<uint32_t>(
381		3240	src_rows[horizontal_border_.get_column(right_col - i) * ch +
382		3240	channel_offset])) *
383		1620	half_kernel_u16_[i];
384		1620	}
385
386		324	dst_rows[x] = static_cast<DestinationType>(rounding_shift_right(acc, 8));
387		324	}
388
389			const ptrdiff_t half_kernel_size_;
390			const uint16_t *half_kernel_u16_;
391			const ptrdiff_t width_;
392			KLEIDICV_TARGET_NAMESPACE::GenericBorderVertical<BorderT> vertical_border_;
393			KLEIDICV_TARGET_NAMESPACE::GenericBorderHorizontal<BorderT>
394			horizontal_border_;
395			}; // end of class GaussianBlurArbitrary<uint8_t>
396
397			template <typename ScalarType>
398		54	static kleidicv_error_t gaussian_blur_arbitrary_kernel_size(
399			const ScalarType src, size_t src_stride, ScalarType dst,
400			size_t dst_stride, Rectangle &rect, size_t kernel_size, size_t y_begin,
401			size_t y_end, size_t channels, float sigma, FixedBorderType border_type,
402			SeparableFilterWorkspace *workspace) {
403		54	Rows<const ScalarType> src_rows{src, src_stride, channels};
404		54	Rows<ScalarType> dst_rows{dst, dst_stride, channels};
405
406		108	const ptrdiff_t kHalfKernelSize =
407		54	static_cast<ptrdiff_t>(get_half_kernel_size(kernel_size));
408		54	uint16_t half_kernel[128];
409		54	generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma);
410			// If sigma is so small that the middle point gets all the weights, it's
411			// just a copy
412	2/2 ✓ Branch 0 taken 39 times. ✓ Branch 1 taken 15 times.	54	if (half_kernel[kHalfKernelSize - 1] < 256) {
413			// Only replicated border is implemented so far.
414		78	GaussianBlurArbitrary<ScalarType, FixedBorderType::REPLICATE> filter{
415		39	half_kernel, kHalfKernelSize, rect, src_rows.channels()};
416		78	workspace->process_arbitrary(rect, kernel_size, y_begin, y_end, src_rows,
417		39	dst_rows, channels, border_type, filter);
418		39	} else {
419	2/2 ✓ Branch 0 taken 156 times. ✓ Branch 1 taken 15 times.	171	for (size_t row = y_begin; row < y_end; ++row) {
420		312	std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]),
421		156	static_cast<const void *>(&src_rows.at(row)[0]),
422		156	rect.width() * sizeof(ScalarType) * dst_rows.channels());
423		156	}
424			}
425		54	return KLEIDICV_OK;
426		54	}
427
428			KLEIDICV_TARGET_FN_ATTRS
429		66	kleidicv_error_t gaussian_blur_arbitrary_stripe_u8(
430			const uint8_t src, size_t src_stride, uint8_t dst, size_t dst_stride,
431			size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
432			size_t kernel_width, size_t /kernel_height/, float sigma_x,
433			float /sigma_y/, FixedBorderType fixed_border_type,
434			kleidicv_filter_context_t *context) {
435		66	auto workspace = reinterpret_cast<SeparableFilterWorkspace >(context);
436		132	kleidicv_error_t checks_result = gaussian_blur_checks(
437		66	src, src_stride, dst, dst_stride, width, height, channels, workspace);
438
439	2/2 ✓ Branch 0 taken 12 times. ✓ Branch 1 taken 54 times.	66	if (checks_result != KLEIDICV_OK) {
440		12	return checks_result;
441			}
442
443		54	Rectangle rect{width, height};
444
445		54	return gaussian_blur_arbitrary_kernel_size(
446		54	src, src_stride, dst, dst_stride, rect, kernel_width, y_begin, y_end,
447		54	channels, sigma_x, fixed_border_type, workspace);
448		66	}
449
450			} // namespace kleidicv::neon
451