KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/filters/blur_and_downsample_neon.cpp
Date:	2025-09-25 14:13:34

	Exec	Total	Coverage
Lines:	145	145	100.0%
Functions:	15	15	100.0%
Branches:	30	30	100.0%

  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #include "kleidicv/ctypes.h"
    
      #include "kleidicv/filters/blur_and_downsample.h"
    
      #include "kleidicv/kleidicv.h"
    
      #include "kleidicv/neon.h"
    
      #include "kleidicv/utils.h"
    
      #include "kleidicv/workspace/blur_and_downsample_ws.h"
    
      #include "kleidicv/workspace/border_5x5.h"
    
      namespace kleidicv::neon {
    
      // Applies Gaussian Blur binomial filter to even rows and columns
    
      //
    
      //              [ 1,  4,  6,  4, 1 ]           [ 1 ]
    
      //              [ 4, 16, 24, 16, 4 ]           [ 4 ]
    
      //  F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1,  4,  6,  4, 1 ]
    
      //              [ 4, 16, 24, 16, 4 ]           [ 4 ]
    
      //              [ 1,  4,  6,  4, 1 ]           [ 1 ]
    
      class BlurAndDownsample {
    
       public:
    
        using SourceType = uint8_t;
    
        using BufferType = uint16_t;
    
        using DestinationType = uint8_t;
    
        using SourceVecTraits = typename neon::VecTraits<SourceType>;
    
        using SourceVectorType = typename SourceVecTraits::VectorType;
    
        using BufferVecTraits = typename neon::VecTraits<BufferType>;
    
        using BufferVectorType = typename BufferVecTraits::VectorType;
    
        using BorderInfoType =
    
            typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5<SourceType>;
    
        using BorderType = FixedBorderType;
    
        using BorderOffsets = typename BorderInfoType::Offsets;
    
      43
        BlurAndDownsample()
    
      43
            : const_6_u8_half_{vdup_n_u8(6)},
    
      43
              const_6_u16_{vdupq_n_u16(6)},
    
      43
              const_4_u16_{vdupq_n_u16(4)} {}
    
        static constexpr size_t margin = 2UL;
    
      363
        void process_vertical(size_t width, Rows<const SourceType> src_rows,
    
                              Rows<BufferType> dst_rows,
    
                              BorderOffsets border_offsets) const {
    
      726
          LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
    
      363
                                               SourceVecTraits::num_lanes()};
    
      441
          loop.unroll_twice([&](ptrdiff_t index) {
    
      78
            const auto *src_0 = &src_rows.at(border_offsets.c0())[index];
    
      78
            const auto *src_1 = &src_rows.at(border_offsets.c1())[index];
    
      78
            const auto *src_2 = &src_rows.at(border_offsets.c2())[index];
    
      78
            const auto *src_3 = &src_rows.at(border_offsets.c3())[index];
    
      78
            const auto *src_4 = &src_rows.at(border_offsets.c4())[index];
    
      78
            SourceVectorType src_a[5], src_b[5];
    
      78
            src_a[0] = vld1q(&src_0[0]);
    
      78
            src_b[0] = vld1q(&src_0[SourceVecTraits::num_lanes()]);
    
      78
            src_a[1] = vld1q(&src_1[0]);
    
      78
            src_b[1] = vld1q(&src_1[SourceVecTraits::num_lanes()]);
    
      78
            src_a[2] = vld1q(&src_2[0]);
    
      78
            src_b[2] = vld1q(&src_2[SourceVecTraits::num_lanes()]);
    
      78
            src_a[3] = vld1q(&src_3[0]);
    
      78
            src_b[3] = vld1q(&src_3[SourceVecTraits::num_lanes()]);
    
      78
            src_a[4] = vld1q(&src_4[0]);
    
      78
            src_b[4] = vld1q(&src_4[SourceVecTraits::num_lanes()]);
    
      78
            vertical_vector_path(src_a, &dst_rows[index]);
    
      78
            vertical_vector_path(
    
      156
                src_b, &dst_rows[index + static_cast<ptrdiff_t>(
    
      78
                                             SourceVecTraits::num_lanes())]);
    
      78
          });
    
      437
          loop.unroll_once([&](ptrdiff_t index) {
    
      74
            SourceVectorType src[5];
    
      74
            src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]);
    
      74
            src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]);
    
      74
            src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]);
    
      74
            src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]);
    
      74
            src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]);
    
      74
            vertical_vector_path(src, &dst_rows[index]);
    
      74
          });
    
      2202
          loop.tail([&](ptrdiff_t index) {
    
      1839
            SourceType src[5];
    
      1839
            src[0] = src_rows.at(border_offsets.c0())[index];
    
      1839
            src[1] = src_rows.at(border_offsets.c1())[index];
    
      1839
            src[2] = src_rows.at(border_offsets.c2())[index];
    
      1839
            src[3] = src_rows.at(border_offsets.c3())[index];
    
      1839
            src[4] = src_rows.at(border_offsets.c4())[index];
    
      1839
            vertical_scalar_path(src, &dst_rows[index]);
    
      1839
          });
    
      363
        }
    
      363
        void process_horizontal(size_t width, Rows<const BufferType> src_rows,
    
                                Rows<DestinationType> dst_rows,
    
                                BorderOffsets border_offsets) const {
    
      726
          LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
    
      363
                                               BufferVecTraits::num_lanes()};
    
      487
          loop.unroll_twice([&](ptrdiff_t index) {
    
      124
            const auto *src_0 = &src_rows.at(0, border_offsets.c0())[index];
    
      124
            const auto *src_1 = &src_rows.at(0, border_offsets.c1())[index];
    
      124
            const auto *src_2 = &src_rows.at(0, border_offsets.c2())[index];
    
      124
            const auto *src_3 = &src_rows.at(0, border_offsets.c3())[index];
    
      124
            const auto *src_4 = &src_rows.at(0, border_offsets.c4())[index];
    
      124
            BufferVectorType src_a[5], src_b[5];
    
      124
            src_a[0] = vld1q(&src_0[0]);
    
      124
            src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]);
    
      124
            src_a[1] = vld1q(&src_1[0]);
    
      124
            src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]);
    
      124
            src_a[2] = vld1q(&src_2[0]);
    
      124
            src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]);
    
      124
            src_a[3] = vld1q(&src_3[0]);
    
      124
            src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]);
    
      124
            src_a[4] = vld1q(&src_4[0]);
    
      124
            src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]);
    
      124
            uint8x8_t res_a = horizontal_vector_path(src_a);
    
      124
            uint8x8_t res_b = horizontal_vector_path(src_b);
    
            // Only store even indices
    
      124
            vst1(&dst_rows[index / 2], vuzp1_u8(res_a, res_b));
    
      124
          });
    
      576
          loop.remaining([&](ptrdiff_t index, size_t max_index) {
    
      213
            index = align_up(index, 2);
    
        2/2✓ Branch 0 taken 683 times.
✓ Branch 1 taken 213 times.

      896
            while (index < static_cast<ptrdiff_t>(max_index)) {
    
      683
              process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
    
      683
              index += 2;
    
            }
    
      213
          });
    
      363
        }
    
      726
        void process_horizontal_borders(Rows<const BufferType> src_rows,
    
                                        Rows<DestinationType> dst_rows,
    
                                        BorderOffsets border_offsets) const {
    
        2/2✓ Branch 0 taken 726 times.
✓ Branch 1 taken 726 times.

      1452
          for (ptrdiff_t index = 0;
    
      1452
               index < static_cast<ptrdiff_t>(src_rows.channels()); ++index) {
    
      726
            disable_loop_vectorization();
    
      726
            process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
    
      726
          }
    
      726
        }
    
       private:
    
        // Applies vertical filtering vector using SIMD operations.
    
        //
    
        // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
    
      230
        void vertical_vector_path(uint8x16_t src[5], BufferType *dst) const {
    
      230
          uint16x8_t acc_0_4_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[4]));
    
      230
          uint16x8_t acc_0_4_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[4]));
    
      230
          uint16x8_t acc_1_3_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[3]));
    
      230
          uint16x8_t acc_1_3_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[3]));
    
      460
          uint16x8_t acc_l =
    
      230
              vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_half_);
    
      460
          uint16x8_t acc_h =
    
      230
              vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_half_);
    
      230
          acc_l = vmlaq_u16(acc_l, acc_1_3_l, const_4_u16_);
    
      230
          acc_h = vmlaq_u16(acc_h, acc_1_3_h, const_4_u16_);
    
      230
          vst1q(&dst[0], acc_l);
    
      230
          vst1q(&dst[8], acc_h);
    
      230
        }
    
        // Applies vertical filtering vector using scalar operations.
    
        //
    
        // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
    
      1839
        void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
    
      1839
          dst[0] = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
    
      1839
        }
    
        // Applies horizontal filtering vector using SIMD operations.
    
        //
    
        // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
    
      248
        uint8x8_t horizontal_vector_path(uint16x8_t src[5]) const {
    
      248
          uint16x8_t acc_0_4 = vaddq_u16(src[0], src[4]);
    
      248
          uint16x8_t acc_1_3 = vaddq_u16(src[1], src[3]);
    
      248
          uint16x8_t acc_u16 = vmlaq_u16(acc_0_4, src[2], const_6_u16_);
    
      248
          acc_u16 = vmlaq_u16(acc_u16, acc_1_3, const_4_u16_);
    
      496
          return vrshrn_n_u16(acc_u16, 8);
    
      248
        }
    
        // Applies horizontal filtering vector using scalar operations.
    
        //
    
        // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
    
      1409
        void process_horizontal_scalar(Rows<const BufferType> src_rows,
    
                                       Rows<DestinationType> dst_rows,
    
                                       BorderOffsets border_offsets,
    
                                       ptrdiff_t index) const {
    
      1409
          BufferType src[5];
    
      1409
          src[0] = src_rows.at(0, border_offsets.c0())[index];
    
      1409
          src[1] = src_rows.at(0, border_offsets.c1())[index];
    
      1409
          src[2] = src_rows.at(0, border_offsets.c2())[index];
    
      1409
          src[3] = src_rows.at(0, border_offsets.c3())[index];
    
      1409
          src[4] = src_rows.at(0, border_offsets.c4())[index];
    
      1409
          auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
    
      1409
          dst_rows[index / 2] = rounding_shift_right(acc, 8);
    
      1409
        }
    
        uint8x8_t const_6_u8_half_;
    
        uint16x8_t const_6_u16_;
    
        uint16x8_t const_4_u16_;
    
      };  // end of class BlurAndDownsample
    
      // Does not include checks for whether the operation is implemented.
    
      // This must be done earlier, by blur_and_downsample_is_implemented.
    
      51
      static kleidicv_error_t blur_and_downsample_checks(
    
          const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
    
          uint8_t *dst, size_t dst_stride, size_t channels,
    
          BlurAndDownsampleFilterWorkspace *workspace) {
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 50 times.

      51
        CHECK_POINTERS(workspace);
    
        4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 49 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 49 times.

      50
        CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
    
        4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 48 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 48 times.

      49
        CHECK_POINTER_AND_STRIDE(dst, dst_stride, (src_height + 1) / 2);
    
        6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 47 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 46 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 46 times.

      48
        CHECK_IMAGE_SIZE(src_width, src_height);
    
      46
        Rectangle rect{src_width, src_height};
    
      46
        const Rectangle &context_rect = workspace->image_size();
    
        4/4✓ Branch 0 taken 44 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 43 times.

      46
        if (context_rect.width() < src_width || context_rect.height() < src_height) {
    
      3
          return KLEIDICV_ERROR_CONTEXT_MISMATCH;
    
        }
    
        // Currently supports only one channel, so it cannot be tested.
    
        // GCOVR_EXCL_START
    
      −
        if (workspace->channels() < channels) {
    
      −
          return KLEIDICV_ERROR_CONTEXT_MISMATCH;
    
        }
    
        // GCOVR_EXCL_STOP
    
      43
        return KLEIDICV_OK;
    
      51
      }
    
      KLEIDICV_TARGET_FN_ATTRS
    
      51
      kleidicv_error_t kleidicv_blur_and_downsample_stripe_u8(
    
          const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
    
          uint8_t *dst, size_t dst_stride, size_t y_begin, size_t y_end,
    
          size_t channels, FixedBorderType fixed_border_type,
    
          kleidicv_filter_context_t *context) {
    
        // Does not include checks for whether the operation is implemented.
    
        // This must be done earlier, by blur_and_downsample_is_implemented.
    
      102
        auto *workspace =
    
      51
            reinterpret_cast<BlurAndDownsampleFilterWorkspace *>(context);
    
        6/6✓ Branch 0 taken 8 times.
✓ Branch 1 taken 43 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 43 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 43 times.

      110
        if (auto check_result =
    
      102
                blur_and_downsample_checks(src, src_stride, src_width, src_height,
    
      51
                                           dst, dst_stride, channels, workspace)) {
    
      8
          return check_result;
    
        }
    
      43
        Rectangle rect{src_width, src_height};
    
      43
        Rows<const uint8_t> src_rows{src, src_stride, channels};
    
      43
        Rows<uint8_t> dst_rows{dst, dst_stride, channels};
    
      86
        workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
    
      43
                           fixed_border_type, BlurAndDownsample{});
    
      43
        return KLEIDICV_OK;
    
      51
      }
    
      }  // namespace kleidicv::neon

Line	Branch	Exec	Source
1			// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2			//
3			// SPDX-License-Identifier: Apache-2.0
4
5			#include "kleidicv/ctypes.h"
6			#include "kleidicv/filters/blur_and_downsample.h"
7			#include "kleidicv/kleidicv.h"
8			#include "kleidicv/neon.h"
9			#include "kleidicv/utils.h"
10			#include "kleidicv/workspace/blur_and_downsample_ws.h"
11			#include "kleidicv/workspace/border_5x5.h"
12
13			namespace kleidicv::neon {
14
15			// Applies Gaussian Blur binomial filter to even rows and columns
16			//
17			//              [ 1,  4,  6,  4, 1 ]           [ 1 ]
18			//              [ 4, 16, 24, 16, 4 ]           [ 4 ]
19			//  F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1,  4,  6,  4, 1 ]
20			//              [ 4, 16, 24, 16, 4 ]           [ 4 ]
21			//              [ 1,  4,  6,  4, 1 ]           [ 1 ]
22			class BlurAndDownsample {
23			public:
24			using SourceType = uint8_t;
25			using BufferType = uint16_t;
26			using DestinationType = uint8_t;
27			using SourceVecTraits = typename neon::VecTraits<SourceType>;
28			using SourceVectorType = typename SourceVecTraits::VectorType;
29			using BufferVecTraits = typename neon::VecTraits<BufferType>;
30			using BufferVectorType = typename BufferVecTraits::VectorType;
31			using BorderInfoType =
32			typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5<SourceType>;
33			using BorderType = FixedBorderType;
34			using BorderOffsets = typename BorderInfoType::Offsets;
35
36		43	BlurAndDownsample()
37		43	: const_6_u8_half_{vdup_n_u8(6)},
38		43	const_6_u16_{vdupq_n_u16(6)},
39		43	const_4_u16_{vdupq_n_u16(4)} {}
40
41			static constexpr size_t margin = 2UL;
42
43		363	void process_vertical(size_t width, Rows<const SourceType> src_rows,
44			Rows<BufferType> dst_rows,
45			BorderOffsets border_offsets) const {
46		726	LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
47		363	SourceVecTraits::num_lanes()};
48
49		441	loop.unroll_twice([&](ptrdiff_t index) {
50		78	const auto *src_0 = &src_rows.at(border_offsets.c0())[index];
51		78	const auto *src_1 = &src_rows.at(border_offsets.c1())[index];
52		78	const auto *src_2 = &src_rows.at(border_offsets.c2())[index];
53		78	const auto *src_3 = &src_rows.at(border_offsets.c3())[index];
54		78	const auto *src_4 = &src_rows.at(border_offsets.c4())[index];
55
56		78	SourceVectorType src_a[5], src_b[5];
57		78	src_a[0] = vld1q(&src_0[0]);
58		78	src_b[0] = vld1q(&src_0[SourceVecTraits::num_lanes()]);
59		78	src_a[1] = vld1q(&src_1[0]);
60		78	src_b[1] = vld1q(&src_1[SourceVecTraits::num_lanes()]);
61		78	src_a[2] = vld1q(&src_2[0]);
62		78	src_b[2] = vld1q(&src_2[SourceVecTraits::num_lanes()]);
63		78	src_a[3] = vld1q(&src_3[0]);
64		78	src_b[3] = vld1q(&src_3[SourceVecTraits::num_lanes()]);
65		78	src_a[4] = vld1q(&src_4[0]);
66		78	src_b[4] = vld1q(&src_4[SourceVecTraits::num_lanes()]);
67		78	vertical_vector_path(src_a, &dst_rows[index]);
68		78	vertical_vector_path(
69		156	src_b, &dst_rows[index + static_cast<ptrdiff_t>(
70		78	SourceVecTraits::num_lanes())]);
71		78	});
72
73		437	loop.unroll_once([&](ptrdiff_t index) {
74		74	SourceVectorType src[5];
75		74	src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]);
76		74	src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]);
77		74	src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]);
78		74	src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]);
79		74	src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]);
80		74	vertical_vector_path(src, &dst_rows[index]);
81		74	});
82
83		2202	loop.tail([&](ptrdiff_t index) {
84		1839	SourceType src[5];
85		1839	src[0] = src_rows.at(border_offsets.c0())[index];
86		1839	src[1] = src_rows.at(border_offsets.c1())[index];
87		1839	src[2] = src_rows.at(border_offsets.c2())[index];
88		1839	src[3] = src_rows.at(border_offsets.c3())[index];
89		1839	src[4] = src_rows.at(border_offsets.c4())[index];
90		1839	vertical_scalar_path(src, &dst_rows[index]);
91		1839	});
92		363	}
93
94		363	void process_horizontal(size_t width, Rows<const BufferType> src_rows,
95			Rows<DestinationType> dst_rows,
96			BorderOffsets border_offsets) const {
97		726	LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
98		363	BufferVecTraits::num_lanes()};
99
100		487	loop.unroll_twice([&](ptrdiff_t index) {
101		124	const auto *src_0 = &src_rows.at(0, border_offsets.c0())[index];
102		124	const auto *src_1 = &src_rows.at(0, border_offsets.c1())[index];
103		124	const auto *src_2 = &src_rows.at(0, border_offsets.c2())[index];
104		124	const auto *src_3 = &src_rows.at(0, border_offsets.c3())[index];
105		124	const auto *src_4 = &src_rows.at(0, border_offsets.c4())[index];
106
107		124	BufferVectorType src_a[5], src_b[5];
108		124	src_a[0] = vld1q(&src_0[0]);
109		124	src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]);
110		124	src_a[1] = vld1q(&src_1[0]);
111		124	src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]);
112		124	src_a[2] = vld1q(&src_2[0]);
113		124	src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]);
114		124	src_a[3] = vld1q(&src_3[0]);
115		124	src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]);
116		124	src_a[4] = vld1q(&src_4[0]);
117		124	src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]);
118
119		124	uint8x8_t res_a = horizontal_vector_path(src_a);
120		124	uint8x8_t res_b = horizontal_vector_path(src_b);
121
122			// Only store even indices
123		124	vst1(&dst_rows[index / 2], vuzp1_u8(res_a, res_b));
124		124	});
125
126		576	loop.remaining([&](ptrdiff_t index, size_t max_index) {
127		213	index = align_up(index, 2);
128	2/2 ✓ Branch 0 taken 683 times. ✓ Branch 1 taken 213 times.	896	while (index < static_cast<ptrdiff_t>(max_index)) {
129		683	process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
130		683	index += 2;
131			}
132		213	});
133		363	}
134
135		726	void process_horizontal_borders(Rows<const BufferType> src_rows,
136			Rows<DestinationType> dst_rows,
137			BorderOffsets border_offsets) const {
138	2/2 ✓ Branch 0 taken 726 times. ✓ Branch 1 taken 726 times.	1452	for (ptrdiff_t index = 0;
139		1452	index < static_cast<ptrdiff_t>(src_rows.channels()); ++index) {
140		726	disable_loop_vectorization();
141		726	process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
142		726	}
143		726	}
144
145			private:
146			// Applies vertical filtering vector using SIMD operations.
147			//
148			// DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
149		230	void vertical_vector_path(uint8x16_t src[5], BufferType *dst) const {
150		230	uint16x8_t acc_0_4_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[4]));
151		230	uint16x8_t acc_0_4_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[4]));
152		230	uint16x8_t acc_1_3_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[3]));
153		230	uint16x8_t acc_1_3_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[3]));
154		460	uint16x8_t acc_l =
155		230	vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_half_);
156		460	uint16x8_t acc_h =
157		230	vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_half_);
158		230	acc_l = vmlaq_u16(acc_l, acc_1_3_l, const_4_u16_);
159		230	acc_h = vmlaq_u16(acc_h, acc_1_3_h, const_4_u16_);
160		230	vst1q(&dst[0], acc_l);
161		230	vst1q(&dst[8], acc_h);
162		230	}
163
164			// Applies vertical filtering vector using scalar operations.
165			//
166			// DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
167		1839	void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
168		1839	dst[0] = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
169		1839	}
170
171			// Applies horizontal filtering vector using SIMD operations.
172			//
173			// DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
174		248	uint8x8_t horizontal_vector_path(uint16x8_t src[5]) const {
175		248	uint16x8_t acc_0_4 = vaddq_u16(src[0], src[4]);
176		248	uint16x8_t acc_1_3 = vaddq_u16(src[1], src[3]);
177		248	uint16x8_t acc_u16 = vmlaq_u16(acc_0_4, src[2], const_6_u16_);
178		248	acc_u16 = vmlaq_u16(acc_u16, acc_1_3, const_4_u16_);
179		496	return vrshrn_n_u16(acc_u16, 8);
180		248	}
181
182			// Applies horizontal filtering vector using scalar operations.
183			//
184			// DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
185		1409	void process_horizontal_scalar(Rows<const BufferType> src_rows,
186			Rows<DestinationType> dst_rows,
187			BorderOffsets border_offsets,
188			ptrdiff_t index) const {
189		1409	BufferType src[5];
190		1409	src[0] = src_rows.at(0, border_offsets.c0())[index];
191		1409	src[1] = src_rows.at(0, border_offsets.c1())[index];
192		1409	src[2] = src_rows.at(0, border_offsets.c2())[index];
193		1409	src[3] = src_rows.at(0, border_offsets.c3())[index];
194		1409	src[4] = src_rows.at(0, border_offsets.c4())[index];
195
196		1409	auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
197		1409	dst_rows[index / 2] = rounding_shift_right(acc, 8);
198		1409	}
199
200			uint8x8_t const_6_u8_half_;
201			uint16x8_t const_6_u16_;
202			uint16x8_t const_4_u16_;
203			}; // end of class BlurAndDownsample
204
205			// Does not include checks for whether the operation is implemented.
206			// This must be done earlier, by blur_and_downsample_is_implemented.
207		51	static kleidicv_error_t blur_and_downsample_checks(
208			const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
209			uint8_t *dst, size_t dst_stride, size_t channels,
210			BlurAndDownsampleFilterWorkspace *workspace) {
211	2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 50 times.	51	CHECK_POINTERS(workspace);
212	4/4 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 49 times. ✓ Branch 2 taken 1 times. ✓ Branch 3 taken 49 times.	50	CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
213	4/4 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 48 times. ✓ Branch 2 taken 1 times. ✓ Branch 3 taken 48 times.	49	CHECK_POINTER_AND_STRIDE(dst, dst_stride, (src_height + 1) / 2);
214	6/6 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 47 times. ✓ Branch 2 taken 1 times. ✓ Branch 3 taken 46 times. ✓ Branch 4 taken 2 times. ✓ Branch 5 taken 46 times.	48	CHECK_IMAGE_SIZE(src_width, src_height);
215
216		46	Rectangle rect{src_width, src_height};
217		46	const Rectangle &context_rect = workspace->image_size();
218	4/4 ✓ Branch 0 taken 44 times. ✓ Branch 1 taken 2 times. ✓ Branch 2 taken 1 times. ✓ Branch 3 taken 43 times.	46	if (context_rect.width() < src_width || context_rect.height() < src_height) {
219		3	return KLEIDICV_ERROR_CONTEXT_MISMATCH;
220			}
221
222			// Currently supports only one channel, so it cannot be tested.
223			// GCOVR_EXCL_START
224		−	if (workspace->channels() < channels) {
225		−	return KLEIDICV_ERROR_CONTEXT_MISMATCH;
226			}
227			// GCOVR_EXCL_STOP
228
229		43	return KLEIDICV_OK;
230		51	}
231
232			KLEIDICV_TARGET_FN_ATTRS
233		51	kleidicv_error_t kleidicv_blur_and_downsample_stripe_u8(
234			const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
235			uint8_t *dst, size_t dst_stride, size_t y_begin, size_t y_end,
236			size_t channels, FixedBorderType fixed_border_type,
237			kleidicv_filter_context_t *context) {
238			// Does not include checks for whether the operation is implemented.
239			// This must be done earlier, by blur_and_downsample_is_implemented.
240		102	auto *workspace =
241		51	reinterpret_cast<BlurAndDownsampleFilterWorkspace *>(context);
242
243	6/6 ✓ Branch 0 taken 8 times. ✓ Branch 1 taken 43 times. ✓ Branch 2 taken 8 times. ✓ Branch 3 taken 43 times. ✓ Branch 4 taken 8 times. ✓ Branch 5 taken 43 times.	110	if (auto check_result =
244		102	blur_and_downsample_checks(src, src_stride, src_width, src_height,
245		51	dst, dst_stride, channels, workspace)) {
246		8	return check_result;
247			}
248
249		43	Rectangle rect{src_width, src_height};
250
251		43	Rows<const uint8_t> src_rows{src, src_stride, channels};
252		43	Rows<uint8_t> dst_rows{dst, dst_stride, channels};
253		86	workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
254		43	fixed_border_type, BlurAndDownsample{});
255
256		43	return KLEIDICV_OK;
257		51	}
258
259			} // namespace kleidicv::neon
260