KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/gaussian_blur_fixed_neon.cpp
Date: 2026-03-05 15:57:40
Exec Total Coverage
Lines: 313 313 100.0%
Functions: 68 68 100.0%
Branches: 110 113 97.3%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6 #include <cstddef>
7
8 #include "kleidicv/config.h"
9 #include "kleidicv/ctypes.h"
10 #include "kleidicv/filters/gaussian_blur.h"
11 #include "kleidicv/filters/separable_filter_15x15_neon.h"
12 #include "kleidicv/filters/separable_filter_21x21_neon.h"
13 #include "kleidicv/filters/separable_filter_3x3_neon.h"
14 #include "kleidicv/filters/separable_filter_5x5_neon.h"
15 #include "kleidicv/filters/separable_filter_7x7_neon.h"
16 #include "kleidicv/filters/separable_filter_9x9_neon.h"
17 #include "kleidicv/filters/sigma.h"
18 #include "kleidicv/neon.h"
19 #include "kleidicv/workspace/border_types.h"
20 #include "kleidicv/workspace/separable.h"
21
22 namespace kleidicv::neon {
23
24 // Primary template for Gaussian Blur filters.
25 template <typename ScalarType, size_t KernelSize, bool IsBinomial>
26 class GaussianBlur;
27
28 // Template for 3x3 Gaussian Blur binomial filters.
29 //
30 // [ 1, 2, 1 ] [ 1 ]
31 // F = 1/16 * [ 2, 4, 2 ] = 1/16 * [ 2 ] * [ 1, 2, 1 ]
32 // [ 1, 2, 1 ] [ 1 ]
33 template <>
34 class GaussianBlur<uint8_t, 3, true> {
35 public:
36 using ScalarType = uint8_t;
37 using SourceType = ScalarType;
38 using SourceVectorType = typename VecTraits<SourceType>::VectorType;
39 using BufferType = double_element_width_t<ScalarType>;
40 using BufferVectorType = typename VecTraits<BufferType>::VectorType;
41 using DestinationType = ScalarType;
42
43 // Applies vertical filtering vector using SIMD operations.
44 //
45 // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
46 114 void vertical_vector_path(SourceVectorType src[3], BufferType *dst) const {
47 // acc_0_2 = src[0] + src[2]
48 114 BufferVectorType acc_0_2_l = vaddl(vget_low(src[0]), vget_low(src[2]));
49 114 BufferVectorType acc_0_2_h = vaddl(vget_high(src[0]), vget_high(src[2]));
50 // acc_1 = src[1] + src[1]
51 114 BufferVectorType acc_1_l = vshll_n<1>(vget_low(src[1]));
52 114 BufferVectorType acc_1_h = vshll_n<1>(vget_high(src[1]));
53 // acc = acc_0_2 + acc_1
54 114 BufferVectorType acc_l = vaddq(acc_0_2_l, acc_1_l);
55 114 BufferVectorType acc_h = vaddq(acc_0_2_h, acc_1_h);
56
57 114 VecTraits<BufferType>::store_consecutive(acc_l, acc_h, &dst[0]);
58 114 }
59
60 // Applies vertical filtering vector using scalar operations.
61 //
62 // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
63 320 void vertical_scalar_path(const SourceType src[3], BufferType *dst) const {
64 320 dst[0] = src[0] + 2 * src[1] + src[2];
65 320 }
66
67 // Applies horizontal filtering vector using SIMD operations.
68 //
69 // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
70 176 void horizontal_vector_path(BufferVectorType src[3],
71 DestinationType *dst) const {
72 176 BufferVectorType acc_wide = vaddq(src[0], src[2]);
73 176 acc_wide = vaddq(acc_wide, vshlq_n<1>(src[1]));
74 176 auto acc_narrow = vrshrn_n<4>(acc_wide);
75 176 vst1(&dst[0], acc_narrow);
76 176 }
77
78 // Applies horizontal filtering vector using scalar operations.
79 //
80 // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
81 452 void horizontal_scalar_path(const BufferType src[3],
82 DestinationType *dst) const {
83 452 auto acc = src[0] + 2 * src[1] + src[2];
84 452 dst[0] = rounding_shift_right(acc, 4);
85 452 }
86 }; // end of class GaussianBlur<uint8_t, 3, true>
87
88 // Template for 5x5 Gaussian Blur binomial filters.
89 //
90 // [ 1, 4, 6, 4, 1 ] [ 1 ]
91 // [ 4, 16, 24, 16, 4 ] [ 4 ]
92 // F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ]
93 // [ 4, 16, 24, 16, 4 ] [ 4 ]
94 // [ 1, 4, 6, 4, 1 ] [ 1 ]
95 template <>
96 class GaussianBlur<uint8_t, 5, true> {
97 public:
98 using SourceType = uint8_t;
99 using BufferType = uint16_t;
100 using DestinationType = uint8_t;
101
102 61 GaussianBlur()
103 61 : const_6_u8_half_{vdup_n_u8(6)},
104 61 const_6_u16_{vdupq_n_u16(6)},
105 61 const_4_u16_{vdupq_n_u16(4)} {}
106
107 // Applies vertical filtering vector using SIMD operations.
108 //
109 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
110 300 void vertical_vector_path(uint8x16_t src[5], BufferType *dst) const {
111 300 uint16x8_t acc_0_4_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[4]));
112 300 uint16x8_t acc_0_4_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[4]));
113 300 uint16x8_t acc_1_3_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[3]));
114 300 uint16x8_t acc_1_3_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[3]));
115 600 uint16x8_t acc_l =
116 300 vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_half_);
117 600 uint16x8_t acc_h =
118 300 vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_half_);
119 300 acc_l = vmlaq_u16(acc_l, acc_1_3_l, const_4_u16_);
120 300 acc_h = vmlaq_u16(acc_h, acc_1_3_h, const_4_u16_);
121 300 vst1q(&dst[0], acc_l);
122 300 vst1q(&dst[8], acc_h);
123 300 }
124
125 // Applies vertical filtering vector using scalar operations.
126 //
127 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
128 3820 void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
129 3820 dst[0] = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
130 3820 }
131
132 // Applies horizontal filtering vector using SIMD operations.
133 //
134 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
135 604 void horizontal_vector_path(uint16x8_t src[5], DestinationType *dst) const {
136 604 uint16x8_t acc_0_4 = vaddq_u16(src[0], src[4]);
137 604 uint16x8_t acc_1_3 = vaddq_u16(src[1], src[3]);
138 604 uint16x8_t acc_u16 = vmlaq_u16(acc_0_4, src[2], const_6_u16_);
139 604 acc_u16 = vmlaq_u16(acc_u16, acc_1_3, const_4_u16_);
140 604 uint8x8_t acc_u8 = vrshrn_n_u16(acc_u16, 8);
141 604 vst1(&dst[0], acc_u8);
142 604 }
143
144 // Applies horizontal filtering vector using scalar operations.
145 //
146 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
147 3068 void horizontal_scalar_path(const BufferType src[5],
148 DestinationType *dst) const {
149 3068 auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
150 3068 dst[0] = rounding_shift_right(acc, 8);
151 3068 }
152
153 private:
154 uint8x8_t const_6_u8_half_;
155 uint16x8_t const_6_u16_;
156 uint16x8_t const_4_u16_;
157 }; // end of class GaussianBlur<uint8_t, 5, true>
158
159 // Template for 7x7 Gaussian Blur binomial filters.
160 //
161 // [ 4, 14, 28, 36, 28, 14, 4 ]
162 // [ 14, 49, 98, 126, 98, 49, 14 ]
163 // [ 28, 98, 196, 252, 196, 98, 28 ]
164 // F = 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] =
165 // [ 28, 98, 196, 252, 196, 98, 28 ]
166 // [ 14, 49, 98, 126, 98, 49, 14 ]
167 // [ 4, 14, 28, 36, 28, 14, 4 ]
168 //
169 // [ 2 ]
170 // [ 7 ]
171 // [ 14 ]
172 // = 1/4096 * [ 18 ] * [ 2, 7, 14, 18, 14, 7, 2 ]
173 // [ 14 ]
174 // [ 7 ]
175 // [ 2 ]
176 template <>
177 class GaussianBlur<uint8_t, 7, true> {
178 public:
179 using SourceType = uint8_t;
180 using BufferType = uint16_t;
181 using DestinationType = uint8_t;
182
183 33 GaussianBlur()
184 33 : const_7_u16_{vdupq_n_u16(7)},
185 33 const_7_u32_{vdupq_n_u32(7)},
186 33 const_9_u16_{vdupq_n_u16(9)} {}
187
188 // Applies vertical filtering vector using SIMD operations.
189 //
190 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
191 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
192 240 void vertical_vector_path(uint8x16_t src[7], BufferType *dst) const {
193 240 uint16x8_t acc_0_6_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[6]));
194 240 uint16x8_t acc_0_6_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[6]));
195
196 240 uint16x8_t acc_1_5_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[5]));
197 240 uint16x8_t acc_1_5_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[5]));
198
199 240 uint16x8_t acc_2_4_l = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[4]));
200 240 uint16x8_t acc_2_4_h = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[4]));
201
202 240 uint16x8_t acc_3_l = vmovl_u8(vget_low_u8(src[3]));
203 240 uint16x8_t acc_3_h = vmovl_u8(vget_high_u8(src[3]));
204
205 240 uint16x8_t acc_0_2_4_6_l = vmlaq_u16(acc_0_6_l, acc_2_4_l, const_7_u16_);
206 240 uint16x8_t acc_0_2_4_6_h = vmlaq_u16(acc_0_6_h, acc_2_4_h, const_7_u16_);
207
208 480 uint16x8_t acc_0_2_3_4_6_l =
209 240 vmlaq_u16(acc_0_2_4_6_l, acc_3_l, const_9_u16_);
210 480 uint16x8_t acc_0_2_3_4_6_h =
211 240 vmlaq_u16(acc_0_2_4_6_h, acc_3_h, const_9_u16_);
212
213 240 acc_0_2_3_4_6_l = vshlq_n_u16(acc_0_2_3_4_6_l, 1);
214 240 acc_0_2_3_4_6_h = vshlq_n_u16(acc_0_2_3_4_6_h, 1);
215
216 480 uint16x8_t acc_0_1_2_3_4_5_6_l =
217 240 vmlaq_u16(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u16_);
218 480 uint16x8_t acc_0_1_2_3_4_5_6_h =
219 240 vmlaq_u16(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u16_);
220
221 240 vst1q(&dst[0], acc_0_1_2_3_4_5_6_l);
222 240 vst1q(&dst[8], acc_0_1_2_3_4_5_6_h);
223 240 }
224
225 // Applies vertical filtering vector using scalar operations.
226 //
227 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
228 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
229 664 void vertical_scalar_path(const SourceType src[7], BufferType *dst) const {
230 1992 uint16_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 +
231 1328 src[4] * 14 + src[5] * 7 + src[6] * 2;
232 664 dst[0] = acc;
233 664 }
234
235 // Applies horizontal filtering vector using SIMD operations.
236 //
237 // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
238 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
239 216 void horizontal_vector_path(uint16x8_t src[7], DestinationType *dst) const {
240 432 uint32x4_t acc_0_6_l =
241 216 vaddl_u16(vget_low_u16(src[0]), vget_low_u16(src[6]));
242 432 uint32x4_t acc_0_6_h =
243 216 vaddl_u16(vget_high_u16(src[0]), vget_high_u16(src[6]));
244
245 432 uint32x4_t acc_1_5_l =
246 216 vaddl_u16(vget_low_u16(src[1]), vget_low_u16(src[5]));
247 432 uint32x4_t acc_1_5_h =
248 216 vaddl_u16(vget_high_u16(src[1]), vget_high_u16(src[5]));
249
250 216 uint16x8_t acc_2_4 = vaddq_u16(src[2], src[4]);
251
252 432 uint32x4_t acc_0_2_4_6_l =
253 216 vmlal_u16(acc_0_6_l, vget_low_u16(acc_2_4), vget_low_u16(const_7_u16_));
254 432 uint32x4_t acc_0_2_4_6_h = vmlal_u16(acc_0_6_h, vget_high_u16(acc_2_4),
255 216 vget_high_u16(const_7_u16_));
256
257 432 uint32x4_t acc_0_2_3_4_6_l = vmlal_u16(acc_0_2_4_6_l, vget_low_u16(src[3]),
258 216 vget_low_u16(const_9_u16_));
259 432 uint32x4_t acc_0_2_3_4_6_h = vmlal_u16(acc_0_2_4_6_h, vget_high_u16(src[3]),
260 216 vget_high_u16(const_9_u16_));
261
262 216 acc_0_2_3_4_6_l = vshlq_n_u32(acc_0_2_3_4_6_l, 1);
263 216 acc_0_2_3_4_6_h = vshlq_n_u32(acc_0_2_3_4_6_h, 1);
264
265 432 uint32x4_t acc_0_1_2_3_4_5_6_l =
266 216 vmlaq_u32(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u32_);
267 432 uint32x4_t acc_0_1_2_3_4_5_6_h =
268 216 vmlaq_u32(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u32_);
269
270 216 uint16x4_t acc_0_1_2_3_4_5_6_u16_l = vrshrn_n_u32(acc_0_1_2_3_4_5_6_l, 12);
271 216 uint16x4_t acc_0_1_2_3_4_5_6_u16_h = vrshrn_n_u32(acc_0_1_2_3_4_5_6_h, 12);
272
273 432 uint16x8_t acc_0_1_2_3_4_5_6_u16 =
274 216 vcombine_u16(acc_0_1_2_3_4_5_6_u16_l, acc_0_1_2_3_4_5_6_u16_h);
275 216 uint8x8_t acc_0_1_2_3_4_5_6_u8 = vmovn_u16(acc_0_1_2_3_4_5_6_u16);
276
277 216 vst1(&dst[0], acc_0_1_2_3_4_5_6_u8);
278 216 }
279
280 // Applies horizontal filtering vector using scalar operations.
281 //
282 // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
283 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
284 1672 void horizontal_scalar_path(const BufferType src[7],
285 DestinationType *dst) const {
286 5016 uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 +
287 3344 src[4] * 14 + src[5] * 7 + src[6] * 2;
288 1672 dst[0] = static_cast<DestinationType>(rounding_shift_right(acc, 12));
289 1672 }
290
291 private:
292 uint16x8_t const_7_u16_;
293 uint32x4_t const_7_u32_;
294 uint16x8_t const_9_u16_;
295 }; // end of class GaussianBlur<uint8_t, 7, true>
296
297 // Template for 9x9 Gaussian Blur binomial filters.
298 //
299 // [ 16, 52, 120, 204, 240, 204, 120, 52, 16 ]
300 // [ 52, 169, 390, 663, 780, 663, 390, 169, 52 ]
301 // [ 120, 390, 900, 1530, 1800, 1530, 900, 390, 120 ]
302 // F = 1/65536 * [ 204, 663, 1530, 2601, 3060, 2601, 1530, 663, 204 ] =
303 // [ 240, 780, 1800, 3060, 3600, 3060, 1800, 780, 240 ]
304 // [ 204, 663, 1530, 2601, 3060, 2601, 1530, 663, 204 ]
305 // [ 120, 390, 900, 1530, 1800, 1530, 900, 390, 120 ]
306 // [ 52, 169, 390, 663, 780, 663, 390, 169, 52 ]
307 // [ 16, 52, 120, 204, 240, 204, 120, 52, 16 ]
308 //
309 // [ 4 ]
310 // [ 13 ]
311 // [ 30 ]
312 // = 1/65536 * [ 51 ] * [ 4, 13, 30, 51, 60, 51, 30, 13, 4 ]
313 // [ 60 ]
314 // [ 51 ]
315 // [ 30 ]
316 // [ 13 ]
317 // [ 4 ]
318 template <>
319 class GaussianBlur<uint8_t, 9, true> {
320 public:
321 using SourceType = uint8_t;
322 using BufferType = uint16_t;
323 using DestinationType = uint8_t;
324
325 51 GaussianBlur()
326 51 : const_13_u16_{vdupq_n_u16(13)},
327 51 const_30_u16_{vdupq_n_u16(30)},
328 51 const_51_u16_{vdupq_n_u16(51)},
329 51 const_60_u16_{vdupq_n_u16(60)},
330 51 const_13_u32_{vdupq_n_u32(13)},
331 51 const_30_u32_{vdupq_n_u32(30)},
332 51 const_51_u32_{vdupq_n_u32(51)},
333 51 const_60_u32_{vdupq_n_u32(60)} {}
334
335 // Applies vertical filtering vector using SIMD operations.
336 //
337 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6, SRC7, SRC8 ] *
338 // * [ 4, 13, 30, 51, 60, 51, 30, 13, 4 ]T
339 556 void vertical_vector_path(uint8x16_t src[9], BufferType *dst) const {
340 556 uint16x8_t acc_0_8_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[8]));
341 556 uint16x8_t acc_0_8_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[8]));
342
343 556 uint16x8_t acc_1_7_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[7]));
344 556 uint16x8_t acc_1_7_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[7]));
345
346 556 uint16x8_t acc_2_6_l = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[6]));
347 556 uint16x8_t acc_2_6_h = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[6]));
348
349 556 uint16x8_t acc_3_5_l = vaddl_u8(vget_low_u8(src[3]), vget_low_u8(src[5]));
350 556 uint16x8_t acc_3_5_h = vaddl_u8(vget_high_u8(src[3]), vget_high_u8(src[5]));
351
352 556 uint16x8_t acc_4_l = vmovl_u8(vget_low_u8(src[4]));
353 556 uint16x8_t acc_4_h = vmovl_u8(vget_high_u8(src[4]));
354
355 // Split the work into two independent accumulators.
356 556 uint16x8_t acc_l_even = vshlq_n_u16(acc_0_8_l, 2);
357 556 uint16x8_t acc_h_even = vshlq_n_u16(acc_0_8_h, 2);
358 556 uint16x8_t acc_l_odd = vmulq_u16(acc_1_7_l, const_13_u16_);
359 556 uint16x8_t acc_h_odd = vmulq_u16(acc_1_7_h, const_13_u16_);
360
361 556 acc_l_even = vmlaq_u16(acc_l_even, acc_2_6_l, const_30_u16_);
362 556 acc_h_even = vmlaq_u16(acc_h_even, acc_2_6_h, const_30_u16_);
363 556 acc_l_odd = vmlaq_u16(acc_l_odd, acc_3_5_l, const_51_u16_);
364 556 acc_h_odd = vmlaq_u16(acc_h_odd, acc_3_5_h, const_51_u16_);
365 556 acc_l_even = vmlaq_u16(acc_l_even, acc_4_l, const_60_u16_);
366 556 acc_h_even = vmlaq_u16(acc_h_even, acc_4_h, const_60_u16_);
367
368 556 uint16x8_t acc_l = vaddq_u16(acc_l_even, acc_l_odd);
369 556 uint16x8_t acc_h = vaddq_u16(acc_h_even, acc_h_odd);
370
371 556 vst1q(&dst[0], acc_l);
372 556 vst1q(&dst[8], acc_h);
373 556 }
374
375 // Applies vertical filtering vector using scalar operations.
376 //
377 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6, SRC7, SRC8 ] *
378 // * [ 4, 13, 30, 51, 60, 51, 30, 13, 4 ]T
379 2276 void vertical_scalar_path(const SourceType src[9], BufferType *dst) const {
380 6828 uint16_t acc = src[0] * 4 + src[1] * 13 + src[2] * 30 + src[3] * 51 +
381 6828 src[4] * 60 + src[5] * 51 + src[6] * 30 + src[7] * 13 +
382 2276 src[8] * 4;
383 2276 dst[0] = acc;
384 2276 }
385
386 // Applies horizontal filtering vector using SIMD operations.
387 //
388 // DST = 1/65536 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6, SRC7, SRC8 ] *
389 // * [ 4, 13, 30, 51, 60, 51, 30, 13, 4 ]T
390 528 void horizontal_vector_path(uint16x8_t src[9], DestinationType *dst) const {
391 1056 uint32x4_t acc_0_8_l =
392 528 vaddl_u16(vget_low_u16(src[0]), vget_low_u16(src[8]));
393 1056 uint32x4_t acc_0_8_h =
394 528 vaddl_u16(vget_high_u16(src[0]), vget_high_u16(src[8]));
395
396 1056 uint32x4_t acc_1_7_l =
397 528 vaddl_u16(vget_low_u16(src[1]), vget_low_u16(src[7]));
398 1056 uint32x4_t acc_1_7_h =
399 528 vaddl_u16(vget_high_u16(src[1]), vget_high_u16(src[7]));
400
401 1056 uint32x4_t acc_2_6_l =
402 528 vaddl_u16(vget_low_u16(src[2]), vget_low_u16(src[6]));
403 1056 uint32x4_t acc_2_6_h =
404 528 vaddl_u16(vget_high_u16(src[2]), vget_high_u16(src[6]));
405
406 1056 uint32x4_t acc_3_5_l =
407 528 vaddl_u16(vget_low_u16(src[3]), vget_low_u16(src[5]));
408 1056 uint32x4_t acc_3_5_h =
409 528 vaddl_u16(vget_high_u16(src[3]), vget_high_u16(src[5]));
410
411 528 uint32x4_t acc_4_l = vmovl_u16(vget_low_u16(src[4]));
412 528 uint32x4_t acc_4_h = vmovl_u16(vget_high_u16(src[4]));
413
414 // Split the work into two independent accumulators.
415 528 uint32x4_t acc_l_even = vshlq_n_u32(acc_0_8_l, 2);
416 528 uint32x4_t acc_h_even = vshlq_n_u32(acc_0_8_h, 2);
417 528 uint32x4_t acc_l_odd = vmulq_u32(acc_1_7_l, const_13_u32_);
418 528 uint32x4_t acc_h_odd = vmulq_u32(acc_1_7_h, const_13_u32_);
419
420 528 acc_l_even = vmlaq_u32(acc_l_even, acc_2_6_l, const_30_u32_);
421 528 acc_h_even = vmlaq_u32(acc_h_even, acc_2_6_h, const_30_u32_);
422 528 acc_l_odd = vmlaq_u32(acc_l_odd, acc_3_5_l, const_51_u32_);
423 528 acc_h_odd = vmlaq_u32(acc_h_odd, acc_3_5_h, const_51_u32_);
424 528 acc_l_even = vmlaq_u32(acc_l_even, acc_4_l, const_60_u32_);
425 528 acc_h_even = vmlaq_u32(acc_h_even, acc_4_h, const_60_u32_);
426
427 528 uint32x4_t acc_l = vaddq_u32(acc_l_even, acc_l_odd);
428 528 uint32x4_t acc_h = vaddq_u32(acc_h_even, acc_h_odd);
429
430 528 uint16x4_t acc_u16_l = vrshrn_n_u32(acc_l, 16);
431 528 uint16x4_t acc_u16_h = vrshrn_n_u32(acc_h, 16);
432 528 uint16x8_t acc_u16 = vcombine_u16(acc_u16_l, acc_u16_h);
433 528 uint8x8_t acc_u8 = vmovn_u16(acc_u16);
434
435 528 vst1(&dst[0], acc_u8);
436 528 }
437
438 // Applies horizontal filtering vector using scalar operations.
439 //
440 // DST = 1/65536 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6, SRC7, SRC8 ] *
441 // * [ 4, 13, 30, 51, 60, 51, 30, 13, 4 ]T
442 5124 void horizontal_scalar_path(const BufferType src[9],
443 DestinationType *dst) const {
444 15372 uint32_t acc = src[0] * 4 + src[1] * 13 + src[2] * 30 + src[3] * 51 +
445 15372 src[4] * 60 + src[5] * 51 + src[6] * 30 + src[7] * 13 +
446 5124 src[8] * 4;
447 5124 dst[0] = static_cast<DestinationType>(rounding_shift_right(acc, 16));
448 5124 }
449
450 private:
451 uint16x8_t const_13_u16_;
452 uint16x8_t const_30_u16_;
453 uint16x8_t const_51_u16_;
454 uint16x8_t const_60_u16_;
455 uint32x4_t const_13_u32_;
456 uint32x4_t const_30_u32_;
457 uint32x4_t const_51_u32_;
458 uint32x4_t const_60_u32_;
459 }; // end of class GaussianBlur<uint8_t, 9, true>
460
461 template <size_t KernelSize>
462 class GaussianBlur<uint8_t, KernelSize, false> {
463 public:
464 using SourceType = uint8_t;
465 using BufferType = uint8_t;
466 using DestinationType = uint8_t;
467
468 static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
469
470 132 explicit GaussianBlur(const uint8_t *half_kernel)
471 132 : half_kernel_(half_kernel) {}
472
473 3760 void vertical_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const {
474 3760 common_vector_path(src, dst);
475 3760 }
476
477 42784 void vertical_scalar_path(const SourceType src[KernelSize],
478 BufferType *dst) const {
479 42784 uint16_t acc = src[kHalfKernelSize - 1] * half_kernel_[kHalfKernelSize - 1];
480
481 // Optimization to avoid unnecessary branching in vector code.
482 KLEIDICV_FORCE_LOOP_UNROLL
483
12/12
✓ Branch 0 taken 13568 times.
✓ Branch 1 taken 94976 times.
✓ Branch 2 taken 23528 times.
✓ Branch 3 taken 235280 times.
✓ Branch 4 taken 336 times.
✓ Branch 5 taken 336 times.
✓ Branch 6 taken 1096 times.
✓ Branch 7 taken 2192 times.
✓ Branch 8 taken 1784 times.
✓ Branch 9 taken 5352 times.
✓ Branch 10 taken 2472 times.
✓ Branch 11 taken 9888 times.
390808 for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
484 348024 acc += (src[i] + src[KernelSize - i - 1]) * half_kernel_[i];
485 348024 }
486
487 42784 dst[0] = static_cast<DestinationType>(rounding_shift_right(acc, 8));
488 42784 }
489
490 1056 void horizontal_vector_path(uint8x16_t src[KernelSize],
491 DestinationType *dst) const {
492 1056 common_vector_path(src, dst);
493 1056 }
494
495 39352 void horizontal_scalar_path(const BufferType src[KernelSize],
496 DestinationType *dst) const {
497 39352 vertical_scalar_path(src, dst);
498 39352 }
499
500 private:
501 4816 void common_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const {
502 4816 uint8x8_t half_kernel_mid = vdup_n_u8(half_kernel_[kHalfKernelSize - 1]);
503 9632 uint16x8_t acc_l =
504 9632 vmlal_u8(vdupq_n_u16(128), vget_low_u8(src[kHalfKernelSize - 1]),
505 4816 half_kernel_mid);
506 9632 uint16x8_t acc_h =
507 9632 vmlal_u8(vdupq_n_u16(128), vget_high_u8(src[kHalfKernelSize - 1]),
508 4816 half_kernel_mid);
509
510 // Optimization to avoid unnecessary branching in vector code.
511 KLEIDICV_FORCE_LOOP_UNROLL
512
12/12
✓ Branch 0 taken 1344 times.
✓ Branch 1 taken 9408 times.
✓ Branch 2 taken 2736 times.
✓ Branch 3 taken 27360 times.
✓ Branch 4 taken 56 times.
✓ Branch 5 taken 56 times.
✓ Branch 6 taken 112 times.
✓ Branch 7 taken 224 times.
✓ Branch 8 taken 216 times.
✓ Branch 9 taken 648 times.
✓ Branch 10 taken 352 times.
✓ Branch 11 taken 1408 times.
43920 for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
513 39104 const size_t j = KernelSize - i - 1;
514 39104 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src[i]), vget_low_u8(src[j]));
515 39104 uint16x8_t vec_h = vaddl_high_u8(src[i], src[j]);
516 39104 uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]);
517
518 39104 acc_l = vmlaq_u16(acc_l, vec_l, coeff);
519 39104 acc_h = vmlaq_u16(acc_h, vec_h, coeff);
520 39104 }
521
522 // Keep only the highest 8 bits
523 9632 uint8x16_t result =
524 4816 vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
525 4816 neon::VecTraits<uint8_t>::store(result, &dst[0]);
526 4816 }
527
528 const uint8_t *half_kernel_;
529 }; // end of class GaussianBlur<uint8_t, KernelSize, false>
530
531 template <size_t KernelSize, bool IsBinomial, typename ScalarType>
532 430 static kleidicv_error_t gaussian_blur_fixed_kernel_size(
533 const ScalarType *src, size_t src_stride, ScalarType *dst,
534 size_t dst_stride, Rectangle &rect, size_t y_begin, size_t y_end,
535 size_t channels, float sigma, FixedBorderType border_type) {
536 using GaussianBlurFilter = GaussianBlur<ScalarType, KernelSize, IsBinomial>;
537 430 constexpr size_t intermediate_size{
538 sizeof(typename GaussianBlurFilter::BufferType)};
539
540 430 auto workspace_variant =
541 430 SeparableFilterWorkspace::create(rect, channels, intermediate_size);
542
40/40
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 47 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 47 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 61 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 61 times.
✓ Branch 8 taken 1 times.
✓ Branch 9 taken 33 times.
✓ Branch 10 taken 1 times.
✓ Branch 11 taken 33 times.
✓ Branch 12 taken 1 times.
✓ Branch 13 taken 51 times.
✓ Branch 14 taken 1 times.
✓ Branch 15 taken 51 times.
✓ Branch 16 taken 1 times.
✓ Branch 17 taken 48 times.
✓ Branch 18 taken 1 times.
✓ Branch 19 taken 48 times.
✓ Branch 20 taken 1 times.
✓ Branch 21 taken 48 times.
✓ Branch 22 taken 1 times.
✓ Branch 23 taken 48 times.
✓ Branch 24 taken 1 times.
✓ Branch 25 taken 33 times.
✓ Branch 26 taken 1 times.
✓ Branch 27 taken 33 times.
✓ Branch 28 taken 1 times.
✓ Branch 29 taken 33 times.
✓ Branch 30 taken 1 times.
✓ Branch 31 taken 33 times.
✓ Branch 32 taken 1 times.
✓ Branch 33 taken 33 times.
✓ Branch 34 taken 1 times.
✓ Branch 35 taken 33 times.
✓ Branch 36 taken 1 times.
✓ Branch 37 taken 33 times.
✓ Branch 38 taken 1 times.
✓ Branch 39 taken 33 times.
440 if (auto *err = std::get_if<kleidicv_error_t>(&workspace_variant)) {
543 10 return *err;
544 }
545 420 auto &workspace = *std::get_if<SeparableFilterWorkspace>(&workspace_variant);
546
547 420 Rows<const ScalarType> src_rows{src, src_stride, channels};
548 420 Rows<ScalarType> dst_rows{dst, dst_stride, channels};
549
550 if constexpr (IsBinomial) {
551 192 GaussianBlurFilter blur;
552 192 SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
553 192 workspace.process(y_begin, y_end, src_rows, dst_rows, border_type, filter);
554 192 return KLEIDICV_OK;
555 192 } else {
556 228 constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
557 228 uint8_t half_kernel[128];
558 456 bool success =
559 228 generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma);
560
12/12
✓ Branch 0 taken 32 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 32 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 17 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 17 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 17 times.
✓ Branch 9 taken 16 times.
✓ Branch 10 taken 17 times.
✓ Branch 11 taken 16 times.
228 if (success) {
561 132 GaussianBlurFilter blur(half_kernel);
562 132 SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
563 264 workspace.process(y_begin, y_end, src_rows, dst_rows, border_type,
564 132 filter);
565 132 } else {
566 // Sigma is too small that the middle point would get all the weight
567 // => it's just a copy.
568
12/12
✓ Branch 0 taken 228 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 324 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 36 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 68 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 100 times.
✓ Branch 9 taken 16 times.
✓ Branch 10 taken 132 times.
✓ Branch 11 taken 16 times.
984 for (size_t row = y_begin; row < y_end; ++row) {
569 1776 std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]),
570 888 static_cast<const void *>(&src_rows.at(row)[0]),
571 888 rect.width() * sizeof(ScalarType) * dst_rows.channels());
572 888 }
573 }
574 228 return KLEIDICV_OK;
575 228 }
576 430 }
577
578 template <bool IsBinomial, typename ScalarType>
579 430 static kleidicv_error_t gaussian_blur_fixed(
580 size_t kernel_size, const ScalarType *src, size_t src_stride,
581 ScalarType *dst, size_t dst_stride, Rectangle &rect, size_t y_begin,
582 size_t y_end, size_t channels, float sigma, FixedBorderType border_type) {
583
12/14
✓ Branch 0 taken 48 times.
✓ Branch 1 taken 62 times.
✓ Branch 2 taken 34 times.
✓ Branch 3 taken 52 times.
✓ Branch 4 taken 16 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 16 times.
✓ Branch 7 taken 34 times.
✓ Branch 8 taken 34 times.
✓ Branch 9 taken 34 times.
✓ Branch 10 taken 34 times.
✓ Branch 11 taken 33 times.
✗ Branch 12 not taken.
✓ Branch 13 taken 33 times.
430 switch (kernel_size) {
584 case 3:
585 82 return gaussian_blur_fixed_kernel_size<3, IsBinomial>(
586 82 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
587 82 sigma, border_type);
588 case 5:
589 96 return gaussian_blur_fixed_kernel_size<5, IsBinomial>(
590 96 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
591 96 sigma, border_type);
592 case 7:
593 68 return gaussian_blur_fixed_kernel_size<7, IsBinomial>(
594 68 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
595 68 sigma, border_type);
596 case 9:
597 86 return gaussian_blur_fixed_kernel_size<9, IsBinomial>(
598 86 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
599 86 sigma, border_type);
600 case 15:
601 // 15x15 does not have a binomial variant
602 49 return gaussian_blur_fixed_kernel_size<15, false>(
603 49 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
604 49 sigma, border_type);
605 case 21:
606 // 21x21 does not have a binomial variant
607 49 return gaussian_blur_fixed_kernel_size<21, false>(
608 49 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
609 49 sigma, border_type);
610 // gaussian_blur_is_implemented checked the kernel size already.
611 // GCOVR_EXCL_START
612 default:
613 assert(!"kernel size not implemented");
614 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
615 // GCOVR_EXCL_STOP
616 }
617 430 }
618
619 KLEIDICV_TARGET_FN_ATTRS
620 440 kleidicv_error_t gaussian_blur_fixed_stripe_u8(
621 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
622 size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
623 size_t kernel_width, size_t /*kernel_height*/, float sigma_x,
624 float /*sigma_y*/, FixedBorderType fixed_border_type) {
625
6/6
✓ Branch 0 taken 10 times.
✓ Branch 1 taken 430 times.
✓ Branch 2 taken 10 times.
✓ Branch 3 taken 430 times.
✓ Branch 4 taken 10 times.
✓ Branch 5 taken 430 times.
1320 if (auto result =
626 440 gaussian_blur_checks(src, src_stride, dst, dst_stride, width, height);
627
2/3
✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 430 times.
450 result != KLEIDICV_OK) {
628 10 return result;
629 }
630
631 430 Rectangle rect{width, height};
632
633
2/2
✓ Branch 0 taken 228 times.
✓ Branch 1 taken 202 times.
430 if (sigma_x == 0.0) {
634 456 return gaussian_blur_fixed<true>(kernel_width, src, src_stride, dst,
635 228 dst_stride, rect, y_begin, y_end, channels,
636 228 sigma_x, fixed_border_type);
637 }
638
639 404 return gaussian_blur_fixed<false>(kernel_width, src, src_stride, dst,
640 202 dst_stride, rect, y_begin, y_end, channels,
641 202 sigma_x, fixed_border_type);
642 440 }
643
644 } // namespace kleidicv::neon
645