KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/blur_and_downsample_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 145 145 100.0%
Functions: 15 15 100.0%
Branches: 30 30 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/ctypes.h"
6 #include "kleidicv/filters/blur_and_downsample.h"
7 #include "kleidicv/kleidicv.h"
8 #include "kleidicv/neon.h"
9 #include "kleidicv/utils.h"
10 #include "kleidicv/workspace/blur_and_downsample_ws.h"
11 #include "kleidicv/workspace/border_5x5.h"
12
13 namespace kleidicv::neon {
14
15 // Applies Gaussian Blur binomial filter to even rows and columns
16 //
17 // [ 1, 4, 6, 4, 1 ] [ 1 ]
18 // [ 4, 16, 24, 16, 4 ] [ 4 ]
19 // F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ]
20 // [ 4, 16, 24, 16, 4 ] [ 4 ]
21 // [ 1, 4, 6, 4, 1 ] [ 1 ]
22 class BlurAndDownsample {
23 public:
24 using SourceType = uint8_t;
25 using BufferType = uint16_t;
26 using DestinationType = uint8_t;
27 using SourceVecTraits = typename neon::VecTraits<SourceType>;
28 using SourceVectorType = typename SourceVecTraits::VectorType;
29 using BufferVecTraits = typename neon::VecTraits<BufferType>;
30 using BufferVectorType = typename BufferVecTraits::VectorType;
31 using BorderInfoType =
32 typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5<SourceType>;
33 using BorderType = FixedBorderType;
34 using BorderOffsets = typename BorderInfoType::Offsets;
35
36 43 BlurAndDownsample()
37 43 : const_6_u8_half_{vdup_n_u8(6)},
38 43 const_6_u16_{vdupq_n_u16(6)},
39 43 const_4_u16_{vdupq_n_u16(4)} {}
40
41 static constexpr size_t margin = 2UL;
42
43 363 void process_vertical(size_t width, Rows<const SourceType> src_rows,
44 Rows<BufferType> dst_rows,
45 BorderOffsets border_offsets) const {
46 726 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
47 363 SourceVecTraits::num_lanes()};
48
49 441 loop.unroll_twice([&](ptrdiff_t index) {
50 78 const auto *src_0 = &src_rows.at(border_offsets.c0())[index];
51 78 const auto *src_1 = &src_rows.at(border_offsets.c1())[index];
52 78 const auto *src_2 = &src_rows.at(border_offsets.c2())[index];
53 78 const auto *src_3 = &src_rows.at(border_offsets.c3())[index];
54 78 const auto *src_4 = &src_rows.at(border_offsets.c4())[index];
55
56 78 SourceVectorType src_a[5], src_b[5];
57 78 src_a[0] = vld1q(&src_0[0]);
58 78 src_b[0] = vld1q(&src_0[SourceVecTraits::num_lanes()]);
59 78 src_a[1] = vld1q(&src_1[0]);
60 78 src_b[1] = vld1q(&src_1[SourceVecTraits::num_lanes()]);
61 78 src_a[2] = vld1q(&src_2[0]);
62 78 src_b[2] = vld1q(&src_2[SourceVecTraits::num_lanes()]);
63 78 src_a[3] = vld1q(&src_3[0]);
64 78 src_b[3] = vld1q(&src_3[SourceVecTraits::num_lanes()]);
65 78 src_a[4] = vld1q(&src_4[0]);
66 78 src_b[4] = vld1q(&src_4[SourceVecTraits::num_lanes()]);
67 78 vertical_vector_path(src_a, &dst_rows[index]);
68 78 vertical_vector_path(
69 156 src_b, &dst_rows[index + static_cast<ptrdiff_t>(
70 78 SourceVecTraits::num_lanes())]);
71 78 });
72
73 437 loop.unroll_once([&](ptrdiff_t index) {
74 74 SourceVectorType src[5];
75 74 src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]);
76 74 src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]);
77 74 src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]);
78 74 src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]);
79 74 src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]);
80 74 vertical_vector_path(src, &dst_rows[index]);
81 74 });
82
83 2202 loop.tail([&](ptrdiff_t index) {
84 1839 SourceType src[5];
85 1839 src[0] = src_rows.at(border_offsets.c0())[index];
86 1839 src[1] = src_rows.at(border_offsets.c1())[index];
87 1839 src[2] = src_rows.at(border_offsets.c2())[index];
88 1839 src[3] = src_rows.at(border_offsets.c3())[index];
89 1839 src[4] = src_rows.at(border_offsets.c4())[index];
90 1839 vertical_scalar_path(src, &dst_rows[index]);
91 1839 });
92 363 }
93
94 363 void process_horizontal(size_t width, Rows<const BufferType> src_rows,
95 Rows<DestinationType> dst_rows,
96 BorderOffsets border_offsets) const {
97 726 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
98 363 BufferVecTraits::num_lanes()};
99
100 487 loop.unroll_twice([&](ptrdiff_t index) {
101 124 const auto *src_0 = &src_rows.at(0, border_offsets.c0())[index];
102 124 const auto *src_1 = &src_rows.at(0, border_offsets.c1())[index];
103 124 const auto *src_2 = &src_rows.at(0, border_offsets.c2())[index];
104 124 const auto *src_3 = &src_rows.at(0, border_offsets.c3())[index];
105 124 const auto *src_4 = &src_rows.at(0, border_offsets.c4())[index];
106
107 124 BufferVectorType src_a[5], src_b[5];
108 124 src_a[0] = vld1q(&src_0[0]);
109 124 src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]);
110 124 src_a[1] = vld1q(&src_1[0]);
111 124 src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]);
112 124 src_a[2] = vld1q(&src_2[0]);
113 124 src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]);
114 124 src_a[3] = vld1q(&src_3[0]);
115 124 src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]);
116 124 src_a[4] = vld1q(&src_4[0]);
117 124 src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]);
118
119 124 uint8x8_t res_a = horizontal_vector_path(src_a);
120 124 uint8x8_t res_b = horizontal_vector_path(src_b);
121
122 // Only store even indices
123 124 vst1(&dst_rows[index / 2], vuzp1_u8(res_a, res_b));
124 124 });
125
126 576 loop.remaining([&](ptrdiff_t index, size_t max_index) {
127 213 index = align_up(index, 2);
128
2/2
✓ Branch 0 taken 683 times.
✓ Branch 1 taken 213 times.
896 while (index < static_cast<ptrdiff_t>(max_index)) {
129 683 process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
130 683 index += 2;
131 }
132 213 });
133 363 }
134
135 726 void process_horizontal_borders(Rows<const BufferType> src_rows,
136 Rows<DestinationType> dst_rows,
137 BorderOffsets border_offsets) const {
138
2/2
✓ Branch 0 taken 726 times.
✓ Branch 1 taken 726 times.
1452 for (ptrdiff_t index = 0;
139 1452 index < static_cast<ptrdiff_t>(src_rows.channels()); ++index) {
140 726 disable_loop_vectorization();
141 726 process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
142 726 }
143 726 }
144
145 private:
146 // Applies vertical filtering vector using SIMD operations.
147 //
148 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
149 230 void vertical_vector_path(uint8x16_t src[5], BufferType *dst) const {
150 230 uint16x8_t acc_0_4_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[4]));
151 230 uint16x8_t acc_0_4_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[4]));
152 230 uint16x8_t acc_1_3_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[3]));
153 230 uint16x8_t acc_1_3_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[3]));
154 460 uint16x8_t acc_l =
155 230 vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_half_);
156 460 uint16x8_t acc_h =
157 230 vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_half_);
158 230 acc_l = vmlaq_u16(acc_l, acc_1_3_l, const_4_u16_);
159 230 acc_h = vmlaq_u16(acc_h, acc_1_3_h, const_4_u16_);
160 230 vst1q(&dst[0], acc_l);
161 230 vst1q(&dst[8], acc_h);
162 230 }
163
164 // Applies vertical filtering vector using scalar operations.
165 //
166 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
167 1839 void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
168 1839 dst[0] = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
169 1839 }
170
171 // Applies horizontal filtering vector using SIMD operations.
172 //
173 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
174 248 uint8x8_t horizontal_vector_path(uint16x8_t src[5]) const {
175 248 uint16x8_t acc_0_4 = vaddq_u16(src[0], src[4]);
176 248 uint16x8_t acc_1_3 = vaddq_u16(src[1], src[3]);
177 248 uint16x8_t acc_u16 = vmlaq_u16(acc_0_4, src[2], const_6_u16_);
178 248 acc_u16 = vmlaq_u16(acc_u16, acc_1_3, const_4_u16_);
179 496 return vrshrn_n_u16(acc_u16, 8);
180 248 }
181
182 // Applies horizontal filtering vector using scalar operations.
183 //
184 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
185 1409 void process_horizontal_scalar(Rows<const BufferType> src_rows,
186 Rows<DestinationType> dst_rows,
187 BorderOffsets border_offsets,
188 ptrdiff_t index) const {
189 1409 BufferType src[5];
190 1409 src[0] = src_rows.at(0, border_offsets.c0())[index];
191 1409 src[1] = src_rows.at(0, border_offsets.c1())[index];
192 1409 src[2] = src_rows.at(0, border_offsets.c2())[index];
193 1409 src[3] = src_rows.at(0, border_offsets.c3())[index];
194 1409 src[4] = src_rows.at(0, border_offsets.c4())[index];
195
196 1409 auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
197 1409 dst_rows[index / 2] = rounding_shift_right(acc, 8);
198 1409 }
199
200 uint8x8_t const_6_u8_half_;
201 uint16x8_t const_6_u16_;
202 uint16x8_t const_4_u16_;
203 }; // end of class BlurAndDownsample
204
205 // Does not include checks for whether the operation is implemented.
206 // This must be done earlier, by blur_and_downsample_is_implemented.
207 51 static kleidicv_error_t blur_and_downsample_checks(
208 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
209 uint8_t *dst, size_t dst_stride, size_t channels,
210 BlurAndDownsampleFilterWorkspace *workspace) {
211
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 50 times.
51 CHECK_POINTERS(workspace);
212
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 49 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 49 times.
50 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
213
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 48 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 48 times.
49 CHECK_POINTER_AND_STRIDE(dst, dst_stride, (src_height + 1) / 2);
214
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 47 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 46 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 46 times.
48 CHECK_IMAGE_SIZE(src_width, src_height);
215
216 46 Rectangle rect{src_width, src_height};
217 46 const Rectangle &context_rect = workspace->image_size();
218
4/4
✓ Branch 0 taken 44 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 43 times.
46 if (context_rect.width() < src_width || context_rect.height() < src_height) {
219 3 return KLEIDICV_ERROR_CONTEXT_MISMATCH;
220 }
221
222 // Currently supports only one channel, so it cannot be tested.
223 // GCOVR_EXCL_START
224 if (workspace->channels() < channels) {
225 return KLEIDICV_ERROR_CONTEXT_MISMATCH;
226 }
227 // GCOVR_EXCL_STOP
228
229 43 return KLEIDICV_OK;
230 51 }
231
232 KLEIDICV_TARGET_FN_ATTRS
233 51 kleidicv_error_t kleidicv_blur_and_downsample_stripe_u8(
234 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
235 uint8_t *dst, size_t dst_stride, size_t y_begin, size_t y_end,
236 size_t channels, FixedBorderType fixed_border_type,
237 kleidicv_filter_context_t *context) {
238 // Does not include checks for whether the operation is implemented.
239 // This must be done earlier, by blur_and_downsample_is_implemented.
240 102 auto *workspace =
241 51 reinterpret_cast<BlurAndDownsampleFilterWorkspace *>(context);
242
243
6/6
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 43 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 43 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 43 times.
110 if (auto check_result =
244 102 blur_and_downsample_checks(src, src_stride, src_width, src_height,
245 51 dst, dst_stride, channels, workspace)) {
246 8 return check_result;
247 }
248
249 43 Rectangle rect{src_width, src_height};
250
251 43 Rows<const uint8_t> src_rows{src, src_stride, channels};
252 43 Rows<uint8_t> dst_rows{dst, dst_stride, channels};
253 86 workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
254 43 fixed_border_type, BlurAndDownsample{});
255
256 43 return KLEIDICV_OK;
257 51 }
258
259 } // namespace kleidicv::neon
260