KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/gaussian_blur_fixed_neon.cpp
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 225 225 100.0%
Functions: 55 55 100.0%
Branches: 62 65 95.4%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6 #include <cstddef>
7
8 #include "kleidicv/config.h"
9 #include "kleidicv/ctypes.h"
10 #include "kleidicv/filters/gaussian_blur.h"
11 #include "kleidicv/filters/separable_filter_15x15_neon.h"
12 #include "kleidicv/filters/separable_filter_21x21_neon.h"
13 #include "kleidicv/filters/separable_filter_3x3_neon.h"
14 #include "kleidicv/filters/separable_filter_5x5_neon.h"
15 #include "kleidicv/filters/separable_filter_7x7_neon.h"
16 #include "kleidicv/filters/sigma.h"
17 #include "kleidicv/neon.h"
18 #include "kleidicv/workspace/border_types.h"
19 #include "kleidicv/workspace/separable.h"
20
21 namespace kleidicv::neon {
22
23 // Primary template for Gaussian Blur filters.
24 template <typename ScalarType, size_t KernelSize, bool IsBinomial>
25 class GaussianBlur;
26
27 // Template for 3x3 Gaussian Blur binomial filters.
28 //
29 //            [ 1, 2, 1 ]          [ 1 ]
30 // F = 1/16 * [ 2, 4, 2 ] = 1/16 * [ 2 ] * [ 1, 2, 1 ]
31 //            [ 1, 2, 1 ]          [ 1 ]
32 template <>
33 class GaussianBlur<uint8_t, 3, true> {
34 public:
35 using ScalarType = uint8_t;
36 using SourceType = ScalarType;
37 using SourceVectorType = typename VecTraits<SourceType>::VectorType;
38 using BufferType = double_element_width_t<ScalarType>;
39 using BufferVectorType = typename VecTraits<BufferType>::VectorType;
40 using DestinationType = ScalarType;
41
42 // Applies vertical filtering vector using SIMD operations.
43 //
44 // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
45 114 void vertical_vector_path(SourceVectorType src[3], BufferType *dst) const {
46 // acc_0_2 = src[0] + src[2]
47 114 BufferVectorType acc_0_2_l = vaddl(vget_low(src[0]), vget_low(src[2]));
48 114 BufferVectorType acc_0_2_h = vaddl(vget_high(src[0]), vget_high(src[2]));
49 // acc_1 = src[1] + src[1]
50 114 BufferVectorType acc_1_l = vshll_n<1>(vget_low(src[1]));
51 114 BufferVectorType acc_1_h = vshll_n<1>(vget_high(src[1]));
52 // acc = acc_0_2 + acc_1
53 114 BufferVectorType acc_l = vaddq(acc_0_2_l, acc_1_l);
54 114 BufferVectorType acc_h = vaddq(acc_0_2_h, acc_1_h);
55
56 114 VecTraits<BufferType>::store_consecutive(acc_l, acc_h, &dst[0]);
57 114 }
58
59 // Applies vertical filtering vector using scalar operations.
60 //
61 // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
62 320 void vertical_scalar_path(const SourceType src[3], BufferType *dst) const {
63 320 dst[0] = src[0] + 2 * src[1] + src[2];
64 320 }
65
66 // Applies horizontal filtering vector using SIMD operations.
67 //
68 // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
69 176 void horizontal_vector_path(BufferVectorType src[3],
70 DestinationType *dst) const {
71 176 BufferVectorType acc_wide = vaddq(src[0], src[2]);
72 176 acc_wide = vaddq(acc_wide, vshlq_n<1>(src[1]));
73 176 auto acc_narrow = vrshrn_n<4>(acc_wide);
74 176 vst1(&dst[0], acc_narrow);
75 176 }
76
77 // Applies horizontal filtering vector using scalar operations.
78 //
79 // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
80 452 void horizontal_scalar_path(const BufferType src[3],
81 DestinationType *dst) const {
82 452 auto acc = src[0] + 2 * src[1] + src[2];
83 452 dst[0] = rounding_shift_right(acc, 4);
84 452 }
85 }; // end of class GaussianBlur<uint8_t, 3, true>
86
87 // Template for 5x5 Gaussian Blur binomial filters.
88 //
89 //             [ 1,  4,  6,  4, 1 ]           [ 1 ]
90 //             [ 4, 16, 24, 16, 4 ]           [ 4 ]
91 // F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ]
92 //             [ 4, 16, 24, 16, 4 ]           [ 4 ]
93 //             [ 1,  4,  6,  4, 1 ]           [ 1 ]
94 template <>
95 class GaussianBlur<uint8_t, 5, true> {
96 public:
97 using SourceType = uint8_t;
98 using BufferType = uint16_t;
99 using DestinationType = uint8_t;
100
101 61 GaussianBlur()
102 61 : const_6_u8_half_{vdup_n_u8(6)},
103 61 const_6_u16_{vdupq_n_u16(6)},
104 61 const_4_u16_{vdupq_n_u16(4)} {}
105
106 // Applies vertical filtering vector using SIMD operations.
107 //
108 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
109 300 void vertical_vector_path(uint8x16_t src[5], BufferType *dst) const {
110 300 uint16x8_t acc_0_4_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[4]));
111 300 uint16x8_t acc_0_4_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[4]));
112 300 uint16x8_t acc_1_3_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[3]));
113 300 uint16x8_t acc_1_3_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[3]));
114 600 uint16x8_t acc_l =
115 300 vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_half_);
116 600 uint16x8_t acc_h =
117 300 vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_half_);
118 300 acc_l = vmlaq_u16(acc_l, acc_1_3_l, const_4_u16_);
119 300 acc_h = vmlaq_u16(acc_h, acc_1_3_h, const_4_u16_);
120 300 vst1q(&dst[0], acc_l);
121 300 vst1q(&dst[8], acc_h);
122 300 }
123
124 // Applies vertical filtering vector using scalar operations.
125 //
126 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
127 3820 void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
128 3820 dst[0] = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
129 3820 }
130
131 // Applies horizontal filtering vector using SIMD operations.
132 //
133 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
134 604 void horizontal_vector_path(uint16x8_t src[5], DestinationType *dst) const {
135 604 uint16x8_t acc_0_4 = vaddq_u16(src[0], src[4]);
136 604 uint16x8_t acc_1_3 = vaddq_u16(src[1], src[3]);
137 604 uint16x8_t acc_u16 = vmlaq_u16(acc_0_4, src[2], const_6_u16_);
138 604 acc_u16 = vmlaq_u16(acc_u16, acc_1_3, const_4_u16_);
139 604 uint8x8_t acc_u8 = vrshrn_n_u16(acc_u16, 8);
140 604 vst1(&dst[0], acc_u8);
141 604 }
142
143 // Applies horizontal filtering vector using scalar operations.
144 //
145 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
146 3068 void horizontal_scalar_path(const BufferType src[5],
147 DestinationType *dst) const {
148 3068 auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
149 3068 dst[0] = rounding_shift_right(acc, 8);
150 3068 }
151
152 private:
153 uint8x8_t const_6_u8_half_;
154 uint16x8_t const_6_u16_;
155 uint16x8_t const_4_u16_;
156 }; // end of class GaussianBlur<uint8_t, 5, true>
157
158 // Template for 7x7 Gaussian Blur binomial filters.
159 //
160 //              [  4,  14,  28,  36,  28,  14,  4 ]
161 //              [ 14,  49,  98, 126,  98,  49, 14 ]
162 //              [ 28,  98, 196, 252, 196,  98, 28 ]
163 // F = 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] =
164 //              [ 28,  98, 196, 252, 196,  98, 28 ]
165 //              [ 14,  49,  98, 126,  98,  49, 14 ]
166 //              [  4,  14,  28,  36,  28,  14,  4 ]
167 //
168 //            [  2 ]
169 //            [  7 ]
170 //            [ 14 ]
171 // = 1/4096 * [ 18 ] * [ 2, 7, 14, 18, 14, 7, 2 ]
172 //            [ 14 ]
173 //            [  7 ]
174 //            [  2 ]
175 template <>
176 class GaussianBlur<uint8_t, 7, true> {
177 public:
178 using SourceType = uint8_t;
179 using BufferType = uint16_t;
180 using DestinationType = uint8_t;
181
182 33 GaussianBlur()
183 33 : const_7_u16_{vdupq_n_u16(7)},
184 33 const_7_u32_{vdupq_n_u32(7)},
185 33 const_9_u16_{vdupq_n_u16(9)} {}
186
187 // Applies vertical filtering vector using SIMD operations.
188 //
189 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
190 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
191 240 void vertical_vector_path(uint8x16_t src[7], BufferType *dst) const {
192 240 uint16x8_t acc_0_6_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[6]));
193 240 uint16x8_t acc_0_6_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[6]));
194
195 240 uint16x8_t acc_1_5_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[5]));
196 240 uint16x8_t acc_1_5_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[5]));
197
198 240 uint16x8_t acc_2_4_l = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[4]));
199 240 uint16x8_t acc_2_4_h = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[4]));
200
201 240 uint16x8_t acc_3_l = vmovl_u8(vget_low_u8(src[3]));
202 240 uint16x8_t acc_3_h = vmovl_u8(vget_high_u8(src[3]));
203
204 240 uint16x8_t acc_0_2_4_6_l = vmlaq_u16(acc_0_6_l, acc_2_4_l, const_7_u16_);
205 240 uint16x8_t acc_0_2_4_6_h = vmlaq_u16(acc_0_6_h, acc_2_4_h, const_7_u16_);
206
207 480 uint16x8_t acc_0_2_3_4_6_l =
208 240 vmlaq_u16(acc_0_2_4_6_l, acc_3_l, const_9_u16_);
209 480 uint16x8_t acc_0_2_3_4_6_h =
210 240 vmlaq_u16(acc_0_2_4_6_h, acc_3_h, const_9_u16_);
211
212 240 acc_0_2_3_4_6_l = vshlq_n_u16(acc_0_2_3_4_6_l, 1);
213 240 acc_0_2_3_4_6_h = vshlq_n_u16(acc_0_2_3_4_6_h, 1);
214
215 480 uint16x8_t acc_0_1_2_3_4_5_6_l =
216 240 vmlaq_u16(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u16_);
217 480 uint16x8_t acc_0_1_2_3_4_5_6_h =
218 240 vmlaq_u16(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u16_);
219
220 240 vst1q(&dst[0], acc_0_1_2_3_4_5_6_l);
221 240 vst1q(&dst[8], acc_0_1_2_3_4_5_6_h);
222 240 }
223
224 // Applies vertical filtering vector using scalar operations.
225 //
226 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
227 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
228 664 void vertical_scalar_path(const SourceType src[7], BufferType *dst) const {
229 1992 uint16_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 +
230 1328 src[4] * 14 + src[5] * 7 + src[6] * 2;
231 664 dst[0] = acc;
232 664 }
233
234 // Applies horizontal filtering vector using SIMD operations.
235 //
236 // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
237 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
238 216 void horizontal_vector_path(uint16x8_t src[7], DestinationType *dst) const {
239 432 uint32x4_t acc_0_6_l =
240 216 vaddl_u16(vget_low_u16(src[0]), vget_low_u16(src[6]));
241 432 uint32x4_t acc_0_6_h =
242 216 vaddl_u16(vget_high_u16(src[0]), vget_high_u16(src[6]));
243
244 432 uint32x4_t acc_1_5_l =
245 216 vaddl_u16(vget_low_u16(src[1]), vget_low_u16(src[5]));
246 432 uint32x4_t acc_1_5_h =
247 216 vaddl_u16(vget_high_u16(src[1]), vget_high_u16(src[5]));
248
249 216 uint16x8_t acc_2_4 = vaddq_u16(src[2], src[4]);
250
251 432 uint32x4_t acc_0_2_4_6_l =
252 216 vmlal_u16(acc_0_6_l, vget_low_u16(acc_2_4), vget_low_u16(const_7_u16_));
253 432 uint32x4_t acc_0_2_4_6_h = vmlal_u16(acc_0_6_h, vget_high_u16(acc_2_4),
254 216 vget_high_u16(const_7_u16_));
255
256 432 uint32x4_t acc_0_2_3_4_6_l = vmlal_u16(acc_0_2_4_6_l, vget_low_u16(src[3]),
257 216 vget_low_u16(const_9_u16_));
258 432 uint32x4_t acc_0_2_3_4_6_h = vmlal_u16(acc_0_2_4_6_h, vget_high_u16(src[3]),
259 216 vget_high_u16(const_9_u16_));
260
261 216 acc_0_2_3_4_6_l = vshlq_n_u32(acc_0_2_3_4_6_l, 1);
262 216 acc_0_2_3_4_6_h = vshlq_n_u32(acc_0_2_3_4_6_h, 1);
263
264 432 uint32x4_t acc_0_1_2_3_4_5_6_l =
265 216 vmlaq_u32(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u32_);
266 432 uint32x4_t acc_0_1_2_3_4_5_6_h =
267 216 vmlaq_u32(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u32_);
268
269 216 uint16x4_t acc_0_1_2_3_4_5_6_u16_l = vrshrn_n_u32(acc_0_1_2_3_4_5_6_l, 12);
270 216 uint16x4_t acc_0_1_2_3_4_5_6_u16_h = vrshrn_n_u32(acc_0_1_2_3_4_5_6_h, 12);
271
272 432 uint16x8_t acc_0_1_2_3_4_5_6_u16 =
273 216 vcombine_u16(acc_0_1_2_3_4_5_6_u16_l, acc_0_1_2_3_4_5_6_u16_h);
274 216 uint8x8_t acc_0_1_2_3_4_5_6_u8 = vmovn_u16(acc_0_1_2_3_4_5_6_u16);
275
276 216 vst1(&dst[0], acc_0_1_2_3_4_5_6_u8);
277 216 }
278
279 // Applies horizontal filtering vector using scalar operations.
280 //
281 // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
282 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
283 1672 void horizontal_scalar_path(const BufferType src[7],
284 DestinationType *dst) const {
285 5016 uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 +
286 3344 src[4] * 14 + src[5] * 7 + src[6] * 2;
287 1672 dst[0] = static_cast<DestinationType>(rounding_shift_right(acc, 12));
288 1672 }
289
290 private:
291 uint16x8_t const_7_u16_;
292 uint32x4_t const_7_u32_;
293 uint16x8_t const_9_u16_;
294 }; // end of class GaussianBlur<uint8_t, 7, true>
295
296 template <size_t KernelSize>
297 class GaussianBlur<uint8_t, KernelSize, false> {
298 public:
299 using SourceType = uint8_t;
300 using BufferType = uint8_t;
301 using DestinationType = uint8_t;
302
303 static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
304
305 115 explicit GaussianBlur(const uint8_t *half_kernel)
306 115 : half_kernel_(half_kernel) {}
307
308 3504 void vertical_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const {
309 3504 common_vector_path(src, dst);
310 3504 }
311
312 40312 void vertical_scalar_path(const SourceType src[KernelSize],
313 BufferType *dst) const {
314 40312 uint16_t acc = src[kHalfKernelSize - 1] * half_kernel_[kHalfKernelSize - 1];
315
316 // Optimization to avoid unnecessary branching in vector code.
317 KLEIDICV_FORCE_LOOP_UNROLL
318
10/10
✓ Branch 0 taken 13568 times.
✓ Branch 1 taken 94976 times.
✓ Branch 2 taken 23528 times.
✓ Branch 3 taken 235280 times.
✓ Branch 4 taken 336 times.
✓ Branch 5 taken 336 times.
✓ Branch 6 taken 1096 times.
✓ Branch 7 taken 2192 times.
✓ Branch 8 taken 1784 times.
✓ Branch 9 taken 5352 times.
378448 for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
319 338136 acc += (src[i] + src[KernelSize - i - 1]) * half_kernel_[i];
320 338136 }
321
322 40312 dst[0] = static_cast<DestinationType>(rounding_shift_right(acc, 8));
323 40312 }
324
325 960 void horizontal_vector_path(uint8x16_t src[KernelSize],
326 DestinationType *dst) const {
327 960 common_vector_path(src, dst);
328 960 }
329
330 37268 void horizontal_scalar_path(const BufferType src[KernelSize],
331 DestinationType *dst) const {
332 37268 vertical_scalar_path(src, dst);
333 37268 }
334
335 private:
336 4464 void common_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const {
337 4464 uint8x8_t half_kernel_mid = vdup_n_u8(half_kernel_[kHalfKernelSize - 1]);
338 8928 uint16x8_t acc_l =
339 8928 vmlal_u8(vdupq_n_u16(128), vget_low_u8(src[kHalfKernelSize - 1]),
340 4464 half_kernel_mid);
341 8928 uint16x8_t acc_h =
342 8928 vmlal_u8(vdupq_n_u16(128), vget_high_u8(src[kHalfKernelSize - 1]),
343 4464 half_kernel_mid);
344
345 // Optimization to avoid unnecessary branching in vector code.
346 KLEIDICV_FORCE_LOOP_UNROLL
347
10/10
✓ Branch 0 taken 1344 times.
✓ Branch 1 taken 9408 times.
✓ Branch 2 taken 2736 times.
✓ Branch 3 taken 27360 times.
✓ Branch 4 taken 56 times.
✓ Branch 5 taken 56 times.
✓ Branch 6 taken 112 times.
✓ Branch 7 taken 224 times.
✓ Branch 8 taken 216 times.
✓ Branch 9 taken 648 times.
42160 for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
348 37696 const size_t j = KernelSize - i - 1;
349 37696 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src[i]), vget_low_u8(src[j]));
350 37696 uint16x8_t vec_h = vaddl_high_u8(src[i], src[j]);
351 37696 uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]);
352
353 37696 acc_l = vmlaq_u16(acc_l, vec_l, coeff);
354 37696 acc_h = vmlaq_u16(acc_h, vec_h, coeff);
355 37696 }
356
357 // Keep only the highest 8 bits
358 8928 uint8x16_t result =
359 4464 vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
360 4464 neon::VecTraits<uint8_t>::store(result, &dst[0]);
361 4464 }
362
363 const uint8_t *half_kernel_;
364 }; // end of class GaussianBlur<uint8_t, KernelSize, false>
365
366 template <size_t KernelSize, bool IsBinomial, typename ScalarType>
367 336 static kleidicv_error_t gaussian_blur_fixed_kernel_size(
368 const ScalarType *src, size_t src_stride, ScalarType *dst,
369 size_t dst_stride, Rectangle &rect, size_t y_begin, size_t y_end,
370 size_t channels, float sigma, FixedBorderType border_type,
371 SeparableFilterWorkspace *workspace) {
372 using GaussianBlurFilter = GaussianBlur<ScalarType, KernelSize, IsBinomial>;
373
374 336 Rows<const ScalarType> src_rows{src, src_stride, channels};
375 336 Rows<ScalarType> dst_rows{dst, dst_stride, channels};
376
377 if constexpr (IsBinomial) {
378 141 GaussianBlurFilter blur;
379 141 SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
380 282 workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
381 141 border_type, filter);
382
383 141 return KLEIDICV_OK;
384 141 } else {
385 195 constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
386 195 uint8_t half_kernel[128];
387 390 bool success =
388 195 generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma);
389
10/10
✓ Branch 0 taken 32 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 32 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 17 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 17 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 17 times.
✓ Branch 9 taken 16 times.
195 if (success) {
390 115 GaussianBlurFilter blur(half_kernel);
391 115 SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
392 230 workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
393 115 border_type, filter);
394 115 } else {
395 // Sigma is so small that the middle point would get all the weight
396 // => it's just a copy.
397
10/10
✓ Branch 0 taken 228 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 324 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 36 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 68 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 100 times.
✓ Branch 9 taken 16 times.
836 for (size_t row = y_begin; row < y_end; ++row) {
398 1512 std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]),
399 756 static_cast<const void *>(&src_rows.at(row)[0]),
400 756 rect.width() * sizeof(ScalarType) * dst_rows.channels());
401 756 }
402 }
403 195 return KLEIDICV_OK;
404 195 }
405 336 }
406
407 template <bool IsBinomial, typename ScalarType>
408 336 static kleidicv_error_t gaussian_blur_fixed(
409 size_t kernel_size, const ScalarType *src, size_t src_stride,
410 ScalarType *dst, size_t dst_stride, Rectangle &rect, size_t y_begin,
411 size_t y_end, size_t channels, float sigma, FixedBorderType border_type,
412 SeparableFilterWorkspace *workspace) {
413
10/12
✓ Branch 0 taken 47 times.
✓ Branch 1 taken 61 times.
✓ Branch 2 taken 33 times.
✓ Branch 3 taken 16 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 33 times.
✓ Branch 7 taken 33 times.
✓ Branch 8 taken 33 times.
✓ Branch 9 taken 32 times.
✗ Branch 10 not taken.
✓ Branch 11 taken 32 times.
336 switch (kernel_size) {
414 case 3:
415 80 return gaussian_blur_fixed_kernel_size<3, IsBinomial>(
416 80 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
417 80 sigma, border_type, workspace);
418 case 5:
419 94 return gaussian_blur_fixed_kernel_size<5, IsBinomial>(
420 94 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
421 94 sigma, border_type, workspace);
422 case 7:
423 66 return gaussian_blur_fixed_kernel_size<7, IsBinomial>(
424 66 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
425 66 sigma, border_type, workspace);
426 case 15:
427 // 15x15 does not have a binomial variant
428 48 return gaussian_blur_fixed_kernel_size<15, false>(
429 48 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
430 48 sigma, border_type, workspace);
431 case 21:
432 // 21x21 does not have a binomial variant
433 48 return gaussian_blur_fixed_kernel_size<21, false>(
434 48 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
435 48 sigma, border_type, workspace);
436 // gaussian_blur_is_implemented checked the kernel size already.
437 // GCOVR_EXCL_START
438 default:
439 assert(!"kernel size not implemented");
440 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
441 // GCOVR_EXCL_STOP
442 }
443 336 }
444
445 KLEIDICV_TARGET_FN_ATTRS
446 347 kleidicv_error_t gaussian_blur_fixed_stripe_u8(
447 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
448 size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
449 size_t kernel_width, size_t /*kernel_height*/, float sigma_x,
450 float /*sigma_y*/, FixedBorderType fixed_border_type) {
451
6/6
✓ Branch 0 taken 10 times.
✓ Branch 1 taken 337 times.
✓ Branch 2 taken 10 times.
✓ Branch 3 taken 337 times.
✓ Branch 4 taken 10 times.
✓ Branch 5 taken 337 times.
1041 if (auto result =
452 347 gaussian_blur_checks(src, src_stride, dst, dst_stride, width, height);
453
2/3
✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 337 times.
357 result != KLEIDICV_OK) {
454 10 return result;
455 }
456
457 337 Rectangle rect{width, height};
458 // As we cannot predict the intermediate size based on the parameters given,
459 // just use the largest possible intermediate size out of all available
460 // operations.
461 337 auto workspace =
462 337 SeparableFilterWorkspace::create(rect, channels, sizeof(uint32_t));
463
2/2
✓ Branch 0 taken 336 times.
✓ Branch 1 taken 1 times.
337 if (!workspace) {
464 1 return KLEIDICV_ERROR_ALLOCATION;
465 }
466
467
2/2
✓ Branch 0 taken 173 times.
✓ Branch 1 taken 163 times.
336 if (sigma_x == 0.0) {
468 173 return gaussian_blur_fixed<true>(
469 173 kernel_width, src, src_stride, dst, dst_stride, rect, y_begin, y_end,
470 173 channels, sigma_x, fixed_border_type, workspace.get());
471 }
472
473 163 return gaussian_blur_fixed<false>(
474 163 kernel_width, src, src_stride, dst, dst_stride, rect, y_begin, y_end,
475 163 channels, sigma_x, fixed_border_type, workspace.get());
476 347 }
477
478 } // namespace kleidicv::neon
479