KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/gaussian_blur_fixed_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 224 224 100.0%
Functions: 55 55 100.0%
Branches: 54 56 96.4%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6 #include <cstddef>
7
8 #include "kleidicv/config.h"
9 #include "kleidicv/ctypes.h"
10 #include "kleidicv/filters/gaussian_blur.h"
11 #include "kleidicv/filters/separable_filter_15x15_neon.h"
12 #include "kleidicv/filters/separable_filter_21x21_neon.h"
13 #include "kleidicv/filters/separable_filter_3x3_neon.h"
14 #include "kleidicv/filters/separable_filter_5x5_neon.h"
15 #include "kleidicv/filters/separable_filter_7x7_neon.h"
16 #include "kleidicv/filters/sigma.h"
17 #include "kleidicv/neon.h"
18 #include "kleidicv/workspace/border_types.h"
19 #include "kleidicv/workspace/separable.h"
20
21 namespace kleidicv::neon {
22
23 // Primary template for Gaussian Blur filters.
24 template <typename ScalarType, size_t KernelSize, bool IsBinomial>
25 class GaussianBlur;
26
27 // Template for 3x3 Gaussian Blur binomial filters.
28 //
29 // [ 1, 2, 1 ] [ 1 ]
30 // F = 1/16 * [ 2, 4, 2 ] = 1/16 * [ 2 ] * [ 1, 2, 1 ]
31 // [ 1, 2, 1 ] [ 1 ]
32 template <>
33 class GaussianBlur<uint8_t, 3, true> {
34 public:
35 using ScalarType = uint8_t;
36 using SourceType = ScalarType;
37 using SourceVectorType = typename VecTraits<SourceType>::VectorType;
38 using BufferType = double_element_width_t<ScalarType>;
39 using BufferVectorType = typename VecTraits<BufferType>::VectorType;
40 using DestinationType = ScalarType;
41
42 // Applies vertical filtering vector using SIMD operations.
43 //
44 // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
45 114 void vertical_vector_path(SourceVectorType src[3], BufferType *dst) const {
46 // acc_0_2 = src[0] + src[2]
47 114 BufferVectorType acc_0_2_l = vaddl(vget_low(src[0]), vget_low(src[2]));
48 114 BufferVectorType acc_0_2_h = vaddl(vget_high(src[0]), vget_high(src[2]));
49 // acc_1 = src[1] + src[1]
50 114 BufferVectorType acc_1_l = vshll_n<1>(vget_low(src[1]));
51 114 BufferVectorType acc_1_h = vshll_n<1>(vget_high(src[1]));
52 // acc = acc_0_2 + acc_1
53 114 BufferVectorType acc_l = vaddq(acc_0_2_l, acc_1_l);
54 114 BufferVectorType acc_h = vaddq(acc_0_2_h, acc_1_h);
55
56 114 VecTraits<BufferType>::store_consecutive(acc_l, acc_h, &dst[0]);
57 114 }
58
59 // Applies vertical filtering vector using scalar operations.
60 //
61 // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
62 320 void vertical_scalar_path(const SourceType src[3], BufferType *dst) const {
63 320 dst[0] = src[0] + 2 * src[1] + src[2];
64 320 }
65
66 // Applies horizontal filtering vector using SIMD operations.
67 //
68 // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
69 176 void horizontal_vector_path(BufferVectorType src[3],
70 DestinationType *dst) const {
71 176 BufferVectorType acc_wide = vaddq(src[0], src[2]);
72 176 acc_wide = vaddq(acc_wide, vshlq_n<1>(src[1]));
73 176 auto acc_narrow = vrshrn_n<4>(acc_wide);
74 176 vst1(&dst[0], acc_narrow);
75 176 }
76
77 // Applies horizontal filtering vector using scalar operations.
78 //
79 // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
80 452 void horizontal_scalar_path(const BufferType src[3],
81 DestinationType *dst) const {
82 452 auto acc = src[0] + 2 * src[1] + src[2];
83 452 dst[0] = rounding_shift_right(acc, 4);
84 452 }
85 }; // end of class GaussianBlur<uint8_t, 3, true>
86
87 // Template for 5x5 Gaussian Blur binomial filters.
88 //
89 // [ 1, 4, 6, 4, 1 ] [ 1 ]
90 // [ 4, 16, 24, 16, 4 ] [ 4 ]
91 // F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ]
92 // [ 4, 16, 24, 16, 4 ] [ 4 ]
93 // [ 1, 4, 6, 4, 1 ] [ 1 ]
94 template <>
95 class GaussianBlur<uint8_t, 5, true> {
96 public:
97 using SourceType = uint8_t;
98 using BufferType = uint16_t;
99 using DestinationType = uint8_t;
100
101 51 GaussianBlur()
102 51 : const_6_u8_half_{vdup_n_u8(6)},
103 51 const_6_u16_{vdupq_n_u16(6)},
104 51 const_4_u16_{vdupq_n_u16(4)} {}
105
106 // Applies vertical filtering vector using SIMD operations.
107 //
108 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
109 300 void vertical_vector_path(uint8x16_t src[5], BufferType *dst) const {
110 300 uint16x8_t acc_0_4_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[4]));
111 300 uint16x8_t acc_0_4_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[4]));
112 300 uint16x8_t acc_1_3_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[3]));
113 300 uint16x8_t acc_1_3_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[3]));
114 600 uint16x8_t acc_l =
115 300 vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_half_);
116 600 uint16x8_t acc_h =
117 300 vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_half_);
118 300 acc_l = vmlaq_u16(acc_l, acc_1_3_l, const_4_u16_);
119 300 acc_h = vmlaq_u16(acc_h, acc_1_3_h, const_4_u16_);
120 300 vst1q(&dst[0], acc_l);
121 300 vst1q(&dst[8], acc_h);
122 300 }
123
124 // Applies vertical filtering vector using scalar operations.
125 //
126 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
127 3820 void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
128 3820 dst[0] = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
129 3820 }
130
131 // Applies horizontal filtering vector using SIMD operations.
132 //
133 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
134 604 void horizontal_vector_path(uint16x8_t src[5], DestinationType *dst) const {
135 604 uint16x8_t acc_0_4 = vaddq_u16(src[0], src[4]);
136 604 uint16x8_t acc_1_3 = vaddq_u16(src[1], src[3]);
137 604 uint16x8_t acc_u16 = vmlaq_u16(acc_0_4, src[2], const_6_u16_);
138 604 acc_u16 = vmlaq_u16(acc_u16, acc_1_3, const_4_u16_);
139 604 uint8x8_t acc_u8 = vrshrn_n_u16(acc_u16, 8);
140 604 vst1(&dst[0], acc_u8);
141 604 }
142
143 // Applies horizontal filtering vector using scalar operations.
144 //
145 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
146 3068 void horizontal_scalar_path(const BufferType src[5],
147 DestinationType *dst) const {
148 3068 auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
149 3068 dst[0] = rounding_shift_right(acc, 8);
150 3068 }
151
152 private:
153 uint8x8_t const_6_u8_half_;
154 uint16x8_t const_6_u16_;
155 uint16x8_t const_4_u16_;
156 }; // end of class GaussianBlur<uint8_t, 5, true>
157
158 // Template for 7x7 Gaussian Blur binomial filters.
159 //
160 // [ 4, 14, 28, 36, 28, 14, 4 ]
161 // [ 14, 49, 98, 126, 98, 49, 14 ]
162 // [ 28, 98, 196, 252, 196, 98, 28 ]
163 // F = 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] =
164 // [ 28, 98, 196, 252, 196, 98, 28 ]
165 // [ 14, 49, 98, 126, 98, 49, 14 ]
166 // [ 4, 14, 28, 36, 28, 14, 4 ]
167 //
168 // [ 2 ]
169 // [ 7 ]
170 // [ 14 ]
171 // = 1/4096 * [ 18 ] * [ 2, 7, 14, 18, 14, 7, 2 ]
172 // [ 14 ]
173 // [ 7 ]
174 // [ 2 ]
175 template <>
176 class GaussianBlur<uint8_t, 7, true> {
177 public:
178 using SourceType = uint8_t;
179 using BufferType = uint16_t;
180 using DestinationType = uint8_t;
181
182 33 GaussianBlur()
183 33 : const_7_u16_{vdupq_n_u16(7)},
184 33 const_7_u32_{vdupq_n_u32(7)},
185 33 const_9_u16_{vdupq_n_u16(9)} {}
186
187 // Applies vertical filtering vector using SIMD operations.
188 //
189 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
190 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
191 240 void vertical_vector_path(uint8x16_t src[7], BufferType *dst) const {
192 240 uint16x8_t acc_0_6_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[6]));
193 240 uint16x8_t acc_0_6_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[6]));
194
195 240 uint16x8_t acc_1_5_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[5]));
196 240 uint16x8_t acc_1_5_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[5]));
197
198 240 uint16x8_t acc_2_4_l = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[4]));
199 240 uint16x8_t acc_2_4_h = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[4]));
200
201 240 uint16x8_t acc_3_l = vmovl_u8(vget_low_u8(src[3]));
202 240 uint16x8_t acc_3_h = vmovl_u8(vget_high_u8(src[3]));
203
204 240 uint16x8_t acc_0_2_4_6_l = vmlaq_u16(acc_0_6_l, acc_2_4_l, const_7_u16_);
205 240 uint16x8_t acc_0_2_4_6_h = vmlaq_u16(acc_0_6_h, acc_2_4_h, const_7_u16_);
206
207 480 uint16x8_t acc_0_2_3_4_6_l =
208 240 vmlaq_u16(acc_0_2_4_6_l, acc_3_l, const_9_u16_);
209 480 uint16x8_t acc_0_2_3_4_6_h =
210 240 vmlaq_u16(acc_0_2_4_6_h, acc_3_h, const_9_u16_);
211
212 240 acc_0_2_3_4_6_l = vshlq_n_u16(acc_0_2_3_4_6_l, 1);
213 240 acc_0_2_3_4_6_h = vshlq_n_u16(acc_0_2_3_4_6_h, 1);
214
215 480 uint16x8_t acc_0_1_2_3_4_5_6_l =
216 240 vmlaq_u16(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u16_);
217 480 uint16x8_t acc_0_1_2_3_4_5_6_h =
218 240 vmlaq_u16(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u16_);
219
220 240 vst1q(&dst[0], acc_0_1_2_3_4_5_6_l);
221 240 vst1q(&dst[8], acc_0_1_2_3_4_5_6_h);
222 240 }
223
224 // Applies vertical filtering vector using scalar operations.
225 //
226 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
227 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
228 664 void vertical_scalar_path(const SourceType src[7], BufferType *dst) const {
229 1992 uint16_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 +
230 1328 src[4] * 14 + src[5] * 7 + src[6] * 2;
231 664 dst[0] = acc;
232 664 }
233
234 // Applies horizontal filtering vector using SIMD operations.
235 //
236 // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
237 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
238 216 void horizontal_vector_path(uint16x8_t src[7], DestinationType *dst) const {
239 432 uint32x4_t acc_0_6_l =
240 216 vaddl_u16(vget_low_u16(src[0]), vget_low_u16(src[6]));
241 432 uint32x4_t acc_0_6_h =
242 216 vaddl_u16(vget_high_u16(src[0]), vget_high_u16(src[6]));
243
244 432 uint32x4_t acc_1_5_l =
245 216 vaddl_u16(vget_low_u16(src[1]), vget_low_u16(src[5]));
246 432 uint32x4_t acc_1_5_h =
247 216 vaddl_u16(vget_high_u16(src[1]), vget_high_u16(src[5]));
248
249 216 uint16x8_t acc_2_4 = vaddq_u16(src[2], src[4]);
250
251 432 uint32x4_t acc_0_2_4_6_l =
252 216 vmlal_u16(acc_0_6_l, vget_low_u16(acc_2_4), vget_low_u16(const_7_u16_));
253 432 uint32x4_t acc_0_2_4_6_h = vmlal_u16(acc_0_6_h, vget_high_u16(acc_2_4),
254 216 vget_high_u16(const_7_u16_));
255
256 432 uint32x4_t acc_0_2_3_4_6_l = vmlal_u16(acc_0_2_4_6_l, vget_low_u16(src[3]),
257 216 vget_low_u16(const_9_u16_));
258 432 uint32x4_t acc_0_2_3_4_6_h = vmlal_u16(acc_0_2_4_6_h, vget_high_u16(src[3]),
259 216 vget_high_u16(const_9_u16_));
260
261 216 acc_0_2_3_4_6_l = vshlq_n_u32(acc_0_2_3_4_6_l, 1);
262 216 acc_0_2_3_4_6_h = vshlq_n_u32(acc_0_2_3_4_6_h, 1);
263
264 432 uint32x4_t acc_0_1_2_3_4_5_6_l =
265 216 vmlaq_u32(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u32_);
266 432 uint32x4_t acc_0_1_2_3_4_5_6_h =
267 216 vmlaq_u32(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u32_);
268
269 216 uint16x4_t acc_0_1_2_3_4_5_6_u16_l = vrshrn_n_u32(acc_0_1_2_3_4_5_6_l, 12);
270 216 uint16x4_t acc_0_1_2_3_4_5_6_u16_h = vrshrn_n_u32(acc_0_1_2_3_4_5_6_h, 12);
271
272 432 uint16x8_t acc_0_1_2_3_4_5_6_u16 =
273 216 vcombine_u16(acc_0_1_2_3_4_5_6_u16_l, acc_0_1_2_3_4_5_6_u16_h);
274 216 uint8x8_t acc_0_1_2_3_4_5_6_u8 = vmovn_u16(acc_0_1_2_3_4_5_6_u16);
275
276 216 vst1(&dst[0], acc_0_1_2_3_4_5_6_u8);
277 216 }
278
279 // Applies horizontal filtering vector using scalar operations.
280 //
281 // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
282 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
283 1672 void horizontal_scalar_path(const BufferType src[7],
284 DestinationType *dst) const {
285 5016 uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 +
286 3344 src[4] * 14 + src[5] * 7 + src[6] * 2;
287 1672 dst[0] = static_cast<DestinationType>(rounding_shift_right(acc, 12));
288 1672 }
289
290 private:
291 uint16x8_t const_7_u16_;
292 uint32x4_t const_7_u32_;
293 uint16x8_t const_9_u16_;
294 }; // end of class GaussianBlur<uint8_t, 7, true>
295
296 template <size_t KernelSize>
297 class GaussianBlur<uint8_t, KernelSize, false> {
298 public:
299 using SourceType = uint8_t;
300 using BufferType = uint8_t;
301 using DestinationType = uint8_t;
302
303 static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
304
305 115 explicit GaussianBlur(const uint16_t *half_kernel)
306 115 : half_kernel_(half_kernel) {}
307
308 3504 void vertical_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const {
309 3504 common_vector_path(src, dst);
310 3504 }
311
312 40312 void vertical_scalar_path(const SourceType src[KernelSize],
313 BufferType *dst) const {
314 80624 uint16_t acc = static_cast<uint16_t>(src[kHalfKernelSize - 1]) *
315 40312 half_kernel_[kHalfKernelSize - 1];
316
317 // Optimization to avoid unnecessary branching in vector code.
318 KLEIDICV_FORCE_LOOP_UNROLL
319
10/10
✓ Branch 0 taken 13568 times.
✓ Branch 1 taken 94976 times.
✓ Branch 2 taken 23528 times.
✓ Branch 3 taken 235280 times.
✓ Branch 4 taken 336 times.
✓ Branch 5 taken 336 times.
✓ Branch 6 taken 1096 times.
✓ Branch 7 taken 2192 times.
✓ Branch 8 taken 1784 times.
✓ Branch 9 taken 5352 times.
378448 for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
320 1014408 acc += (static_cast<uint16_t>(src[i]) +
321 676272 static_cast<uint16_t>(src[KernelSize - i - 1])) *
322 338136 half_kernel_[i];
323 338136 }
324
325 40312 dst[0] = static_cast<DestinationType>(rounding_shift_right(acc, 8));
326 40312 }
327
328 960 void horizontal_vector_path(uint8x16_t src[KernelSize],
329 DestinationType *dst) const {
330 960 common_vector_path(src, dst);
331 960 }
332
333 37268 void horizontal_scalar_path(const BufferType src[KernelSize],
334 DestinationType *dst) const {
335 37268 vertical_scalar_path(src, dst);
336 37268 }
337
338 private:
339 4464 void common_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const {
340 4464 uint8x8_t half_kernel_mid = vdup_n_u8(half_kernel_[kHalfKernelSize - 1]);
341 8928 uint16x8_t acc_l =
342 8928 vmlal_u8(vdupq_n_u16(128), vget_low_u8(src[kHalfKernelSize - 1]),
343 4464 half_kernel_mid);
344 8928 uint16x8_t acc_h =
345 8928 vmlal_u8(vdupq_n_u16(128), vget_high_u8(src[kHalfKernelSize - 1]),
346 4464 half_kernel_mid);
347
348 // Optimization to avoid unnecessary branching in vector code.
349 KLEIDICV_FORCE_LOOP_UNROLL
350
10/10
✓ Branch 0 taken 1344 times.
✓ Branch 1 taken 9408 times.
✓ Branch 2 taken 2736 times.
✓ Branch 3 taken 27360 times.
✓ Branch 4 taken 56 times.
✓ Branch 5 taken 56 times.
✓ Branch 6 taken 112 times.
✓ Branch 7 taken 224 times.
✓ Branch 8 taken 216 times.
✓ Branch 9 taken 648 times.
42160 for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
351 37696 const size_t j = KernelSize - i - 1;
352 37696 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src[i]), vget_low_u8(src[j]));
353 37696 uint16x8_t vec_h = vaddl_high_u8(src[i], src[j]);
354 37696 uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]);
355
356 37696 acc_l = vmlaq_u16(acc_l, vec_l, coeff);
357 37696 acc_h = vmlaq_u16(acc_h, vec_h, coeff);
358 37696 }
359
360 // Keep only the highest 8 bits
361 8928 uint8x16_t result =
362 4464 vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
363 4464 neon::VecTraits<uint8_t>::store(result, &dst[0]);
364 4464 }
365
366 const uint16_t *half_kernel_;
367 }; // end of class GaussianBlur<uint8_t, KernelSize, false>
368
369 template <size_t KernelSize, bool IsBinomial, typename ScalarType>
370 326 static kleidicv_error_t gaussian_blur_fixed_kernel_size(
371 const ScalarType *src, size_t src_stride, ScalarType *dst,
372 size_t dst_stride, Rectangle &rect, size_t y_begin, size_t y_end,
373 size_t channels, float sigma, FixedBorderType border_type,
374 SeparableFilterWorkspace *workspace) {
375 using GaussianBlurFilter = GaussianBlur<ScalarType, KernelSize, IsBinomial>;
376
377 326 Rows<const ScalarType> src_rows{src, src_stride, channels};
378 326 Rows<ScalarType> dst_rows{dst, dst_stride, channels};
379
380 if constexpr (IsBinomial) {
381 131 GaussianBlurFilter blur;
382 131 SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
383 262 workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
384 131 border_type, filter);
385
386 131 return KLEIDICV_OK;
387 131 } else {
388 195 constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
389 195 uint16_t half_kernel[128];
390 195 generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma);
391 // If sigma is so small that the middle point gets all the weights, it's
392 // just a copy
393
10/10
✓ Branch 0 taken 32 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 32 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 17 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 17 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 17 times.
✓ Branch 9 taken 16 times.
195 if (half_kernel[kHalfKernelSize - 1] < 256) {
394 115 GaussianBlurFilter blur(half_kernel);
395 115 SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
396 230 workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
397 115 border_type, filter);
398 115 } else {
399
10/10
✓ Branch 0 taken 228 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 324 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 36 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 68 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 100 times.
✓ Branch 9 taken 16 times.
836 for (size_t row = y_begin; row < y_end; ++row) {
400 1512 std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]),
401 756 static_cast<const void *>(&src_rows.at(row)[0]),
402 756 rect.width() * sizeof(ScalarType) * dst_rows.channels());
403 756 }
404 }
405 195 return KLEIDICV_OK;
406 195 }
407 326 }
408
409 template <bool IsBinomial, typename ScalarType>
410 326 static kleidicv_error_t gaussian_blur_fixed(
411 size_t kernel_size, const ScalarType *src, size_t src_stride,
412 ScalarType *dst, size_t dst_stride, Rectangle &rect, size_t y_begin,
413 size_t y_end, size_t channels, float sigma, FixedBorderType border_type,
414 SeparableFilterWorkspace *workspace) {
415
10/12
✓ Branch 0 taken 47 times.
✓ Branch 1 taken 51 times.
✓ Branch 2 taken 33 times.
✓ Branch 3 taken 16 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 33 times.
✓ Branch 7 taken 33 times.
✓ Branch 8 taken 33 times.
✓ Branch 9 taken 32 times.
✗ Branch 10 not taken.
✓ Branch 11 taken 32 times.
326 switch (kernel_size) {
416 case 3:
417 80 return gaussian_blur_fixed_kernel_size<3, IsBinomial>(
418 80 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
419 80 sigma, border_type, workspace);
420 case 5:
421 84 return gaussian_blur_fixed_kernel_size<5, IsBinomial>(
422 84 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
423 84 sigma, border_type, workspace);
424 case 7:
425 66 return gaussian_blur_fixed_kernel_size<7, IsBinomial>(
426 66 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
427 66 sigma, border_type, workspace);
428 case 15:
429 // 15x15 does not have a binomial variant
430 48 return gaussian_blur_fixed_kernel_size<15, false>(
431 48 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
432 48 sigma, border_type, workspace);
433 case 21:
434 // 21x21 does not have a binomial variant
435 48 return gaussian_blur_fixed_kernel_size<21, false>(
436 48 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
437 48 sigma, border_type, workspace);
438 // gaussian_blur_is_implemented checked the kernel size already.
439 // GCOVR_EXCL_START
440 default:
441 assert(!"kernel size not implemented");
442 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
443 // GCOVR_EXCL_STOP
444 }
445 326 }
446
447 KLEIDICV_TARGET_FN_ATTRS
448 345 kleidicv_error_t gaussian_blur_fixed_stripe_u8(
449 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
450 size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
451 size_t kernel_width, size_t /*kernel_height*/, float sigma_x,
452 float /*sigma_y*/, FixedBorderType fixed_border_type,
453 kleidicv_filter_context_t *context) {
454 345 auto *workspace = reinterpret_cast<SeparableFilterWorkspace *>(context);
455 690 kleidicv_error_t checks_result = gaussian_blur_checks(
456 345 src, src_stride, dst, dst_stride, width, height, channels, workspace);
457
458
2/2
✓ Branch 0 taken 19 times.
✓ Branch 1 taken 326 times.
345 if (checks_result != KLEIDICV_OK) {
459 19 return checks_result;
460 }
461
462 326 Rectangle rect{width, height};
463
464
2/2
✓ Branch 0 taken 163 times.
✓ Branch 1 taken 163 times.
326 if (sigma_x == 0.0) {
465 326 return gaussian_blur_fixed<true>(kernel_width, src, src_stride, dst,
466 163 dst_stride, rect, y_begin, y_end, channels,
467 163 sigma_x, fixed_border_type, workspace);
468 }
469
470 326 return gaussian_blur_fixed<false>(kernel_width, src, src_stride, dst,
471 163 dst_stride, rect, y_begin, y_end, channels,
472 163 sigma_x, fixed_border_type, workspace);
473 345 }
474
475 } // namespace kleidicv::neon
476