KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/blur_and_downsample_neon.cpp
Date: 2026-03-05 15:57:40
Exec Total Coverage
Lines: 223 223 100.0%
Functions: 24 24 100.0%
Branches: 40 40 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/ctypes.h"
6 #include "kleidicv/filters/blur_and_downsample.h"
7 #include "kleidicv/kleidicv.h"
8 #include "kleidicv/neon.h"
9 #include "kleidicv/utils.h"
10 #include "kleidicv/workspace/blur_and_downsample_ws.h"
11 #include "kleidicv/workspace/border_5x5.h"
12
13 namespace kleidicv::neon {
14
15 // Applies Gaussian Blur binomial filter to even rows and columns
16 //
17 // [ 1, 4, 6, 4, 1 ] [ 1 ]
18 // [ 4, 16, 24, 16, 4 ] [ 4 ]
19 // F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ]
20 // [ 4, 16, 24, 16, 4 ] [ 4 ]
21 // [ 1, 4, 6, 4, 1 ] [ 1 ]
22 class BlurAndDownsample {
23 public:
24 using SourceType = uint8_t;
25 using BufferType = uint16_t;
26 using DestinationType = uint8_t;
27 using SourceVecTraits = typename neon::VecTraits<SourceType>;
28 using SourceVectorType = typename SourceVecTraits::VectorType;
29 using BufferVecTraits = typename neon::VecTraits<BufferType>;
30 using BufferVectorType = typename BufferVecTraits::VectorType;
31 using BorderInfoType =
32 typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5<SourceType>;
33 using BorderType = FixedBorderType;
34 using BorderOffsets = typename BorderInfoType::Offsets;
35
36 137 BlurAndDownsample()
37 137 : const_6_u8_half_{vdup_n_u8(6)},
38 137 const_6_u16_{vdupq_n_u16(6)},
39 137 const_4_u16_{vdupq_n_u16(4)} {}
40
41 static constexpr size_t margin = 2UL;
42
43 2511 void process_vertical(size_t width, Rows<const SourceType> src_rows,
44 Rows<BufferType> dst_rows,
45 BorderOffsets border_offsets) const {
46 5022 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
47 2511 SourceVecTraits::num_lanes()};
48
49 22981 loop.unroll_twice([&](ptrdiff_t index) {
50 20470 const auto *src_0 = &src_rows.at(border_offsets.c0())[index];
51 20470 const auto *src_1 = &src_rows.at(border_offsets.c1())[index];
52 20470 const auto *src_2 = &src_rows.at(border_offsets.c2())[index];
53 20470 const auto *src_3 = &src_rows.at(border_offsets.c3())[index];
54 20470 const auto *src_4 = &src_rows.at(border_offsets.c4())[index];
55
56 20470 SourceVectorType src_a[5], src_b[5];
57 20470 src_a[0] = vld1q(&src_0[0]);
58 20470 src_b[0] = vld1q(&src_0[SourceVecTraits::num_lanes()]);
59 20470 src_a[1] = vld1q(&src_1[0]);
60 20470 src_b[1] = vld1q(&src_1[SourceVecTraits::num_lanes()]);
61 20470 src_a[2] = vld1q(&src_2[0]);
62 20470 src_b[2] = vld1q(&src_2[SourceVecTraits::num_lanes()]);
63 20470 src_a[3] = vld1q(&src_3[0]);
64 20470 src_b[3] = vld1q(&src_3[SourceVecTraits::num_lanes()]);
65 20470 src_a[4] = vld1q(&src_4[0]);
66 20470 src_b[4] = vld1q(&src_4[SourceVecTraits::num_lanes()]);
67 20470 vertical_vector_path(src_a, &dst_rows[index]);
68 20470 vertical_vector_path(
69 40940 src_b, &dst_rows[index + static_cast<ptrdiff_t>(
70 20470 SourceVecTraits::num_lanes())]);
71 20470 });
72
73 3769 loop.unroll_once([&](ptrdiff_t index) {
74 1258 SourceVectorType src[5];
75 1258 src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]);
76 1258 src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]);
77 1258 src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]);
78 1258 src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]);
79 1258 src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]);
80 1258 vertical_vector_path(src, &dst_rows[index]);
81 1258 });
82
83 6630 loop.tail([&](ptrdiff_t index) {
84 4119 SourceType src[5];
85 4119 src[0] = src_rows.at(border_offsets.c0())[index];
86 4119 src[1] = src_rows.at(border_offsets.c1())[index];
87 4119 src[2] = src_rows.at(border_offsets.c2())[index];
88 4119 src[3] = src_rows.at(border_offsets.c3())[index];
89 4119 src[4] = src_rows.at(border_offsets.c4())[index];
90 4119 vertical_scalar_path(src, &dst_rows[index]);
91 4119 });
92 2511 }
93
94 2511 void process_horizontal(size_t width, Rows<const BufferType> src_rows,
95 Rows<DestinationType> dst_rows,
96 BorderOffsets border_offsets) const {
97
4/4
✓ Branch 0 taken 740 times.
✓ Branch 1 taken 760 times.
✓ Branch 2 taken 499 times.
✓ Branch 3 taken 512 times.
2511 switch (src_rows.channels()) {
98 case 1:
99 499 process_horizontal<1>(width, src_rows, dst_rows, border_offsets);
100 499 break;
101 case 2:
102 740 process_horizontal<2>(width, src_rows, dst_rows, border_offsets);
103 740 break;
104 case 3:
105 512 process_horizontal_3channels(width, src_rows, dst_rows, border_offsets);
106 512 break;
107 default /* channel == 4 */:
108 760 process_horizontal<4>(width, src_rows, dst_rows, border_offsets);
109 760 break;
110 }
111 2511 }
112
113 5022 void process_horizontal_borders(Rows<const BufferType> src_rows,
114 Rows<DestinationType> dst_rows,
115 BorderOffsets border_offsets) const {
116 5022 const ptrdiff_t channels = static_cast<ptrdiff_t>(src_rows.channels());
117
2/2
✓ Branch 0 taken 5022 times.
✓ Branch 1 taken 13110 times.
18132 for (ptrdiff_t channel = 0; channel < channels; ++channel) {
118 13110 disable_loop_vectorization();
119 26220 process_horizontal_scalar(src_rows, dst_rows, border_offsets, channel,
120 13110 channel);
121 13110 }
122 5022 }
123
124 private:
125 // Applies vertical filtering vector using SIMD operations.
126 //
127 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
128 42198 void vertical_vector_path(uint8x16_t src[5], BufferType *dst) const {
129 42198 uint16x8_t acc_0_4_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[4]));
130 42198 uint16x8_t acc_0_4_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[4]));
131 42198 uint16x8_t acc_1_3_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[3]));
132 42198 uint16x8_t acc_1_3_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[3]));
133 84396 uint16x8_t acc_l =
134 42198 vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_half_);
135 84396 uint16x8_t acc_h =
136 42198 vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_half_);
137 42198 acc_l = vmlaq_u16(acc_l, acc_1_3_l, const_4_u16_);
138 42198 acc_h = vmlaq_u16(acc_h, acc_1_3_h, const_4_u16_);
139 42198 vst1q(&dst[0], acc_l);
140 42198 vst1q(&dst[8], acc_h);
141 42198 }
142
143 // Applies vertical filtering vector using scalar operations.
144 //
145 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
146 4119 void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
147 4119 dst[0] = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
148 4119 }
149
150 // Applies horizontal filtering vector using SIMD operations.
151 //
152 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
153 77684 uint8x8_t horizontal_vector_path(uint16x8_t src[5]) const {
154 77684 uint16x8_t acc_0_4 = vaddq_u16(src[0], src[4]);
155 77684 uint16x8_t acc_1_3 = vaddq_u16(src[1], src[3]);
156 77684 uint16x8_t acc_u16 = vmlaq_u16(acc_0_4, src[2], const_6_u16_);
157 77684 acc_u16 = vmlaq_u16(acc_u16, acc_1_3, const_4_u16_);
158 155368 return vrshrn_n_u16(acc_u16, 8);
159 77684 }
160
161 // Applies horizontal filtering vector using scalar operations.
162 //
163 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
164 24753 void process_horizontal_scalar(Rows<const BufferType> src_rows,
165 Rows<DestinationType> dst_rows,
166 BorderOffsets border_offsets, ptrdiff_t index,
167 ptrdiff_t dst_index) const {
168 24753 BufferType src[5];
169 24753 src[0] = src_rows.at(0, border_offsets.c0())[index];
170 24753 src[1] = src_rows.at(0, border_offsets.c1())[index];
171 24753 src[2] = src_rows.at(0, border_offsets.c2())[index];
172 24753 src[3] = src_rows.at(0, border_offsets.c3())[index];
173 24753 src[4] = src_rows.at(0, border_offsets.c4())[index];
174
175 24753 auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
176 24753 dst_rows[dst_index] = rounding_shift_right(acc, 8);
177 24753 }
178
179 template <ptrdiff_t Channels>
180 1999 void process_horizontal(size_t width, Rows<const BufferType> src_rows,
181 Rows<DestinationType> dst_rows,
182 BorderOffsets border_offsets) const {
183 3998 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
184 1999 BufferVecTraits::num_lanes()};
185
186 33725 loop.unroll_twice([&](ptrdiff_t index) {
187 31726 const auto *src_0 = &src_rows.at(0, border_offsets.c0())[index];
188 31726 const auto *src_1 = &src_rows.at(0, border_offsets.c1())[index];
189 31726 const auto *src_2 = &src_rows.at(0, border_offsets.c2())[index];
190 31726 const auto *src_3 = &src_rows.at(0, border_offsets.c3())[index];
191 31726 const auto *src_4 = &src_rows.at(0, border_offsets.c4())[index];
192
193 31726 BufferVectorType src_a[5], src_b[5];
194 31726 src_a[0] = vld1q(&src_0[0]);
195 31726 src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]);
196 31726 src_a[1] = vld1q(&src_1[0]);
197 31726 src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]);
198 31726 src_a[2] = vld1q(&src_2[0]);
199 31726 src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]);
200 31726 src_a[3] = vld1q(&src_3[0]);
201 31726 src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]);
202 31726 src_a[4] = vld1q(&src_4[0]);
203 31726 src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]);
204
205 31726 uint8x8_t res_a = horizontal_vector_path(src_a);
206 31726 uint8x8_t res_b = horizontal_vector_path(src_b);
207
208 // Only store even indices.
209 if constexpr (Channels == 1) {
210 2228 vst1(&dst_rows[index / 2], vuzp1_u8(res_a, res_b));
211 } else if constexpr (Channels == 2) {
212 18800 vst1(&dst_rows[index / 2],
213 18800 vreinterpret_u8_u16(vuzp1_u16(vreinterpret_u16_u8(res_a),
214 9400 vreinterpret_u16_u8(res_b))));
215 } else {
216 static_assert(Channels == 4);
217 40196 vst1(&dst_rows[index / 2],
218 40196 vreinterpret_u8_u32(vuzp1_u32(vreinterpret_u32_u8(res_a),
219 20098 vreinterpret_u32_u8(res_b))));
220 }
221 31726 });
222
223 3314 loop.remaining([&](ptrdiff_t index, size_t max_index) {
224 1315 ptrdiff_t pixel = index / Channels;
225 1315 pixel = align_up(pixel, static_cast<ptrdiff_t>(2));
226 1315 index = pixel * Channels;
227
6/6
✓ Branch 0 taken 1851 times.
✓ Branch 1 taken 413 times.
✓ Branch 2 taken 1348 times.
✓ Branch 3 taken 586 times.
✓ Branch 4 taken 316 times.
✓ Branch 5 taken 316 times.
4830 while (index < static_cast<ptrdiff_t>(max_index)) {
228
6/6
✓ Branch 0 taken 1851 times.
✓ Branch 1 taken 1851 times.
✓ Branch 2 taken 2696 times.
✓ Branch 3 taken 1348 times.
✓ Branch 4 taken 1264 times.
✓ Branch 5 taken 316 times.
9326 for (ptrdiff_t channel = 0; channel < Channels; ++channel) {
229 11622 process_horizontal_scalar(src_rows, dst_rows, border_offsets,
230 5811 index + channel, index / 2 + channel);
231 5811 }
232 3515 index += 2 * Channels;
233 }
234 1315 });
235 1999 }
236
237 512 void process_horizontal_3channels(size_t width,
238 Rows<const BufferType> src_rows,
239 Rows<DestinationType> dst_rows,
240 BorderOffsets border_offsets) const {
241 512 constexpr ptrdiff_t channels = 3;
242 512 const ptrdiff_t vec_stride =
243 static_cast<ptrdiff_t>(BufferVecTraits::num_lanes()) * channels;
244 512 LoopUnroll2<TryToAvoidTailLoop> loop{width, BufferVecTraits::num_lanes()};
245
246 2884 loop.unroll_twice([&](ptrdiff_t column) {
247 4744 const auto *src_0 =
248 2372 &src_rows.at(0, border_offsets.c0())[column * channels];
249 4744 const auto *src_1 =
250 2372 &src_rows.at(0, border_offsets.c1())[column * channels];
251 4744 const auto *src_2 =
252 2372 &src_rows.at(0, border_offsets.c2())[column * channels];
253 4744 const auto *src_3 =
254 2372 &src_rows.at(0, border_offsets.c3())[column * channels];
255 4744 const auto *src_4 =
256 2372 &src_rows.at(0, border_offsets.c4())[column * channels];
257
258 2372 uint16x8x3_t src0_a = vld3q_u16(&src_0[0]);
259 2372 uint16x8x3_t src1_a = vld3q_u16(&src_1[0]);
260 2372 uint16x8x3_t src2_a = vld3q_u16(&src_2[0]);
261 2372 uint16x8x3_t src3_a = vld3q_u16(&src_3[0]);
262 2372 uint16x8x3_t src4_a = vld3q_u16(&src_4[0]);
263
264 2372 uint16x8x3_t src0_b = vld3q_u16(&src_0[vec_stride]);
265 2372 uint16x8x3_t src1_b = vld3q_u16(&src_1[vec_stride]);
266 2372 uint16x8x3_t src2_b = vld3q_u16(&src_2[vec_stride]);
267 2372 uint16x8x3_t src3_b = vld3q_u16(&src_3[vec_stride]);
268 2372 uint16x8x3_t src4_b = vld3q_u16(&src_4[vec_stride]);
269
270 7116 uint16x8_t ch0_a[5] = {src0_a.val[0], src1_a.val[0], src2_a.val[0],
271 4744 src3_a.val[0], src4_a.val[0]};
272 7116 uint16x8_t ch0_b[5] = {src0_b.val[0], src1_b.val[0], src2_b.val[0],
273 4744 src3_b.val[0], src4_b.val[0]};
274 7116 uint16x8_t ch1_a[5] = {src0_a.val[1], src1_a.val[1], src2_a.val[1],
275 4744 src3_a.val[1], src4_a.val[1]};
276 7116 uint16x8_t ch1_b[5] = {src0_b.val[1], src1_b.val[1], src2_b.val[1],
277 4744 src3_b.val[1], src4_b.val[1]};
278 7116 uint16x8_t ch2_a[5] = {src0_a.val[2], src1_a.val[2], src2_a.val[2],
279 4744 src3_a.val[2], src4_a.val[2]};
280 7116 uint16x8_t ch2_b[5] = {src0_b.val[2], src1_b.val[2], src2_b.val[2],
281 4744 src3_b.val[2], src4_b.val[2]};
282
283 2372 uint8x8_t res0_a = horizontal_vector_path(ch0_a);
284 2372 uint8x8_t res0_b = horizontal_vector_path(ch0_b);
285 2372 uint8x8_t res1_a = horizontal_vector_path(ch1_a);
286 2372 uint8x8_t res1_b = horizontal_vector_path(ch1_b);
287 2372 uint8x8_t res2_a = horizontal_vector_path(ch2_a);
288 2372 uint8x8_t res2_b = horizontal_vector_path(ch2_b);
289
290 2372 uint8x8_t out0 = vuzp1_u8(res0_a, res0_b);
291 2372 uint8x8_t out1 = vuzp1_u8(res1_a, res1_b);
292 2372 uint8x8_t out2 = vuzp1_u8(res2_a, res2_b);
293
294 2372 uint8x8x3_t interleaved{out0, out1, out2};
295 2372 vst3_u8(&dst_rows.at(0, column / 2)[0], interleaved);
296 2372 });
297
298 938 loop.remaining([&](ptrdiff_t column, size_t max_column) {
299 426 column = align_up(column, 2);
300
2/2
✓ Branch 0 taken 1944 times.
✓ Branch 1 taken 426 times.
2370 while (column < static_cast<ptrdiff_t>(max_column)) {
301 1944 Rows<const BufferType> src_row = src_rows.at(0, column);
302 1944 Rows<DestinationType> dst_row = dst_rows.at(0, column / 2);
303
2/2
✓ Branch 0 taken 5832 times.
✓ Branch 1 taken 1944 times.
7776 for (ptrdiff_t channel = 0; channel < channels; ++channel) {
304 11664 process_horizontal_scalar(src_row, dst_row, border_offsets, channel,
305 5832 channel);
306 5832 }
307 1944 column += 2;
308 1944 }
309 426 });
310 512 }
311
312 uint8x8_t const_6_u8_half_;
313 uint16x8_t const_6_u16_;
314 uint16x8_t const_4_u16_;
315 }; // end of class BlurAndDownsample
316
317 KLEIDICV_TARGET_FN_ATTRS
318 142 kleidicv_error_t kleidicv_blur_and_downsample_stripe_u8(
319 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
320 uint8_t *dst, size_t dst_stride, size_t y_begin, size_t y_end,
321 size_t channels, FixedBorderType fixed_border_type) {
322
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 141 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 141 times.
142 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
323
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 140 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 140 times.
141 CHECK_POINTER_AND_STRIDE(dst, dst_stride, (src_height + 1) / 2);
324
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 139 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 138 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 138 times.
140 CHECK_IMAGE_SIZE(src_width, src_height);
325
326 138 Rectangle rect{src_width, src_height};
327 138 constexpr size_t intermediate_size{
328 sizeof(typename BlurAndDownsample::BufferType)};
329
330 276 auto workspace_variant = BlurAndDownsampleFilterWorkspace::create(
331 138 rect, channels, intermediate_size);
332
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 137 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 137 times.
139 if (auto *err = std::get_if<kleidicv_error_t>(&workspace_variant)) {
333 1 return *err;
334 }
335 274 auto &workspace =
336 137 *std::get_if<BlurAndDownsampleFilterWorkspace>(&workspace_variant);
337
338 137 Rows<const uint8_t> src_rows{src, src_stride, channels};
339 137 Rows<uint8_t> dst_rows{dst, dst_stride, channels};
340 274 workspace.process(y_begin, y_end, src_rows, dst_rows, fixed_border_type,
341 137 BlurAndDownsample{});
342
343 137 return KLEIDICV_OK;
344 142 }
345
346 } // namespace kleidicv::neon
347