KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/gaussian_blur_fixed_sc.h
Date: 2026-03-05 15:57:40
Exec Total Coverage
Lines: 257 257 100.0%
Functions: 122 122 100.0%
Branches: 110 113 97.3%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_GAUSSIAN_BLUR_SC_H
6 #define KLEIDICV_GAUSSIAN_BLUR_SC_H
7
8 #include <array>
9 #include <cassert>
10
11 #include "kleidicv/filters/gaussian_blur.h"
12 #include "kleidicv/filters/separable_filter_15x15_sc.h"
13 #include "kleidicv/filters/separable_filter_21x21_sc.h"
14 #include "kleidicv/filters/separable_filter_3x3_sc.h"
15 #include "kleidicv/filters/separable_filter_5x5_sc.h"
16 #include "kleidicv/filters/separable_filter_7x7_sc.h"
17 #include "kleidicv/filters/separable_filter_9x9_sc.h"
18 #include "kleidicv/filters/sigma.h"
19 #include "kleidicv/workspace/separable.h"
20
21 #if KLEIDICV_TARGET_SME || KLEIDICV_TARGET_SME2
22 #include <arm_sme.h>
23 #endif
24
25 namespace KLEIDICV_TARGET_NAMESPACE {
26
27 // Primary template for Gaussian Blur filters.
28 template <typename ScalarType, size_t KernelSize, bool IsBinomial>
29 class GaussianBlur;
30
31 // Template for 3x3 Gaussian Blur binomial filters.
32 //
33 // [ 1, 2, 1 ] [ 1 ]
34 // F = 1/16 * [ 2, 4, 2 ] = 1/16 * [ 2 ] * [ 1, 2, 1 ]
35 // [ 1, 2, 1 ] [ 1 ]
36 template <>
37 class GaussianBlur<uint8_t, 3, true> {
38 public:
39 using SourceType = uint8_t;
40 using BufferType = uint16_t;
41 using DestinationType = uint8_t;
42
43 // Applies vertical filtering vector using SIMD operations.
44 //
45 // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
46 804 void vertical_vector_path(svbool_t pg,
47 std::reference_wrapper<svuint8_t> src[3],
48 BufferType *dst) const KLEIDICV_STREAMING {
49 804 svuint16_t acc_0_2_b = svaddlb_u16(src[0], src[2]);
50 804 svuint16_t acc_0_2_t = svaddlt_u16(src[0], src[2]);
51
52 804 svuint16_t acc_1_b = svshllb_n_u16(src[1], 1);
53 804 svuint16_t acc_1_t = svshllt_n_u16(src[1], 1);
54
55 804 svuint16_t acc_u16_b = svadd_u16_x(pg, acc_0_2_b, acc_1_b);
56 804 svuint16_t acc_u16_t = svadd_u16_x(pg, acc_0_2_t, acc_1_t);
57
58 804 svuint16x2_t interleaved = svcreate2(acc_u16_b, acc_u16_t);
59 804 svst2(pg, &dst[0], interleaved);
60 804 }
61
62 // Applies horizontal filtering vector using SIMD operations.
63 //
64 // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
65 604 void horizontal_vector_path(svbool_t pg,
66 std::reference_wrapper<svuint16_t> src[3],
67 DestinationType *dst) const KLEIDICV_STREAMING {
68 604 svuint16_t acc_0_2 = svhadd_u16_x(pg, src[0], src[2]);
69
70 604 svuint16_t acc = svadd_u16_x(pg, acc_0_2, src[1]);
71 604 acc = svrshr_x(pg, acc, 3);
72
73 604 svst1b(pg, &dst[0], acc);
74 604 }
75
76 // Applies horizontal filtering vector using scalar operations.
77 //
78 // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T
79 1608 void horizontal_scalar_path(const BufferType src[3],
80 DestinationType *dst) const KLEIDICV_STREAMING {
81 1608 auto acc = src[0] + 2 * src[1] + src[2];
82 1608 dst[0] = rounding_shift_right(acc, 4);
83 1608 }
84 }; // end of class GaussianBlur<uint8_t, 3, true>
85
86 // Template for 5x5 Gaussian Blur binomial filters.
87 //
88 // [ 1, 4, 6, 4, 1 ] [ 1 ]
89 // [ 4, 16, 24, 16, 4 ] [ 4 ]
90 // F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ]
91 // [ 4, 16, 24, 16, 4 ] [ 4 ]
92 // [ 1, 4, 6, 4, 1 ] [ 1 ]
93 template <>
94 class GaussianBlur<uint8_t, 5, true> {
95 public:
96 using SourceType = uint8_t;
97 using BufferType = uint16_t;
98 using DestinationType = uint8_t;
99
100 // Applies vertical filtering vector using SIMD operations.
101 //
102 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
103 2218 void vertical_vector_path(svbool_t pg,
104 std::reference_wrapper<svuint8_t> src[5],
105 BufferType *dst) const KLEIDICV_STREAMING {
106 2218 svuint16_t acc_0_4_b = svaddlb_u16(src[0], src[4]);
107 2218 svuint16_t acc_0_4_t = svaddlt_u16(src[0], src[4]);
108 2218 svuint16_t acc_1_3_b = svaddlb_u16(src[1], src[3]);
109 2218 svuint16_t acc_1_3_t = svaddlt_u16(src[1], src[3]);
110
111 2218 svuint16_t acc_u16_b = svmlalb_n_u16(acc_0_4_b, src[2], 6);
112 2218 svuint16_t acc_u16_t = svmlalt_n_u16(acc_0_4_t, src[2], 6);
113 2218 acc_u16_b = svmla_n_u16_x(pg, acc_u16_b, acc_1_3_b, 4);
114 2218 acc_u16_t = svmla_n_u16_x(pg, acc_u16_t, acc_1_3_t, 4);
115
116 2218 svuint16x2_t interleaved = svcreate2(acc_u16_b, acc_u16_t);
117 2218 svst2(pg, &dst[0], interleaved);
118 2218 }
119
120 // Applies horizontal filtering vector using SIMD operations.
121 //
122 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
123 1872 void horizontal_vector_path(svbool_t pg,
124 std::reference_wrapper<svuint16_t> src[5],
125 DestinationType *dst) const KLEIDICV_STREAMING {
126 1872 svuint16_t acc_0_4 = svadd_x(pg, src[0], src[4]);
127 1872 svuint16_t acc_1_3 = svadd_x(pg, src[1], src[3]);
128 1872 svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src[2], 6);
129 1872 acc = svmla_n_u16_x(pg, acc, acc_1_3, 4);
130 1872 acc = svrshr_x(pg, acc, 8);
131 1872 svst1b(pg, &dst[0], acc);
132 1872 }
133
134 // Applies horizontal filtering vector using scalar operations.
135 //
136 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
137 8328 void horizontal_scalar_path(const BufferType src[5],
138 DestinationType *dst) const KLEIDICV_STREAMING {
139 8328 auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2];
140 8328 dst[0] = rounding_shift_right(acc, 8);
141 8328 }
142 }; // end of class GaussianBlur<uint8_t, 5, true>
143
144 // Template for 7x7 Gaussian Blur binomial filters.
145 //
146 // [ 4, 14, 28, 36, 28, 14, 4 ]
147 // [ 14, 49, 98, 126, 98, 49, 14 ]
148 // [ 28, 98, 196, 252, 196, 98, 28 ]
149 // F = 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] =
150 // [ 28, 98, 196, 252, 196, 98, 28 ]
151 // [ 14, 49, 98, 126, 98, 49, 14 ]
152 // [ 4, 14, 28, 36, 28, 14, 4 ]
153 //
154 // [ 2 ]
155 // [ 7 ]
156 // [ 14 ]
157 // = 1/4096 * [ 18 ] * [ 2, 7, 14, 18, 14, 7, 2 ]
158 // [ 14 ]
159 // [ 7 ]
160 // [ 2 ]
161 template <>
162 class GaussianBlur<uint8_t, 7, true> {
163 public:
164 using SourceType = uint8_t;
165 using BufferType = uint16_t;
166 using DestinationType = uint8_t;
167
168 // Applies vertical filtering vector using SIMD operations.
169 //
170 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
171 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
172 870 void vertical_vector_path(svbool_t pg,
173 std::reference_wrapper<svuint8_t> src[7],
174 BufferType *dst) const KLEIDICV_STREAMING {
175 870 svuint16_t acc_0_6_b = svaddlb_u16(src[0], src[6]);
176 870 svuint16_t acc_0_6_t = svaddlt_u16(src[0], src[6]);
177
178 870 svuint16_t acc_1_5_b = svaddlb_u16(src[1], src[5]);
179 870 svuint16_t acc_1_5_t = svaddlt_u16(src[1], src[5]);
180
181 870 svuint16_t acc_2_4_b = svaddlb_u16(src[2], src[4]);
182 870 svuint16_t acc_2_4_t = svaddlt_u16(src[2], src[4]);
183
184 870 svuint16_t acc_3_b = svmovlb_u16(src[3]);
185 870 svuint16_t acc_3_t = svmovlt_u16(src[3]);
186
187 870 svuint16_t acc_0_2_4_6_b = svmla_n_u16_x(pg, acc_0_6_b, acc_2_4_b, 7);
188 870 svuint16_t acc_0_2_4_6_t = svmla_n_u16_x(pg, acc_0_6_t, acc_2_4_t, 7);
189
190 870 svuint16_t acc_0_2_3_4_6_b = svmla_n_u16_x(pg, acc_0_2_4_6_b, acc_3_b, 9);
191 870 svuint16_t acc_0_2_3_4_6_t = svmla_n_u16_x(pg, acc_0_2_4_6_t, acc_3_t, 9);
192 870 acc_0_2_3_4_6_b = svlsl_n_u16_x(pg, acc_0_2_3_4_6_b, 1);
193 870 acc_0_2_3_4_6_t = svlsl_n_u16_x(pg, acc_0_2_3_4_6_t, 1);
194
195 1740 svuint16_t acc_0_1_2_3_4_5_6_b =
196 870 svmla_n_u16_x(pg, acc_0_2_3_4_6_b, acc_1_5_b, 7);
197 1740 svuint16_t acc_0_1_2_3_4_5_6_t =
198 870 svmla_n_u16_x(pg, acc_0_2_3_4_6_t, acc_1_5_t, 7);
199
200 1740 svuint16x2_t interleaved =
201 870 svcreate2(acc_0_1_2_3_4_5_6_b, acc_0_1_2_3_4_5_6_t);
202 870 svst2(pg, &dst[0], interleaved);
203 870 }
204
205 // Applies horizontal filtering vector using SIMD operations.
206 //
207 // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
208 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
209 684 void horizontal_vector_path(svbool_t pg,
210 std::reference_wrapper<svuint16_t> src[7],
211 DestinationType *dst) const KLEIDICV_STREAMING {
212 684 svuint32_t acc_0_6_b = svaddlb_u32(src[0], src[6]);
213 684 svuint32_t acc_0_6_t = svaddlt_u32(src[0], src[6]);
214
215 684 svuint32_t acc_1_5_b = svaddlb_u32(src[1], src[5]);
216 684 svuint32_t acc_1_5_t = svaddlt_u32(src[1], src[5]);
217
218 684 svuint16_t acc_2_4 = svadd_u16_x(pg, src[2], src[4]);
219
220 684 svuint32_t acc_0_2_4_6_b = svmlalb_n_u32(acc_0_6_b, acc_2_4, 7);
221 684 svuint32_t acc_0_2_4_6_t = svmlalt_n_u32(acc_0_6_t, acc_2_4, 7);
222
223 684 svuint32_t acc_0_2_3_4_6_b = svmlalb_n_u32(acc_0_2_4_6_b, src[3], 9);
224 684 svuint32_t acc_0_2_3_4_6_t = svmlalt_n_u32(acc_0_2_4_6_t, src[3], 9);
225
226 684 acc_0_2_3_4_6_b = svlsl_n_u32_x(pg, acc_0_2_3_4_6_b, 1);
227 684 acc_0_2_3_4_6_t = svlsl_n_u32_x(pg, acc_0_2_3_4_6_t, 1);
228
229 1368 svuint32_t acc_0_1_2_3_4_5_6_b =
230 684 svmla_n_u32_x(pg, acc_0_2_3_4_6_b, acc_1_5_b, 7);
231 1368 svuint32_t acc_0_1_2_3_4_5_6_t =
232 684 svmla_n_u32_x(pg, acc_0_2_3_4_6_t, acc_1_5_t, 7);
233
234 1368 svuint16_t acc_0_1_2_3_4_5_6_u16_b =
235 684 svrshrnb_n_u32(acc_0_1_2_3_4_5_6_b, 12);
236 1368 svuint16_t acc_0_1_2_3_4_5_6_u16 =
237 684 svrshrnt_n_u32(acc_0_1_2_3_4_5_6_u16_b, acc_0_1_2_3_4_5_6_t, 12);
238
239 684 svst1b(pg, &dst[0], acc_0_1_2_3_4_5_6_u16);
240 684 }
241
242 // Applies horizontal filtering vector using scalar operations.
243 //
244 // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] *
245 // * [ 2, 7, 14, 18, 14, 7, 2 ]T
246 4932 void horizontal_scalar_path(const BufferType src[7],
247 DestinationType *dst) const KLEIDICV_STREAMING {
248 14796 uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 +
249 9864 src[4] * 14 + src[5] * 7 + src[6] * 2;
250 4932 dst[0] = rounding_shift_right(acc, 12);
251 4932 }
252 }; // end of class GaussianBlur<uint8_t, 7, true>
253
254 // Template for 9x9 Gaussian Blur binomial filters.
255 //
256 // [ 16, 52, 120, 204, 240, 204, 120, 52, 16 ]
257 // [ 52, 169, 390, 663, 780, 663, 390, 169, 52 ]
258 // [ 120, 390, 900, 1530, 1800, 1530, 900, 390, 120 ]
259 // F = 1/65536 * [ 204, 663, 1530, 2601, 3060, 2601, 1530, 663, 204 ] =
260 // [ 240, 780, 1800, 3060, 3600, 3060, 1800, 780, 240 ]
261 // [ 204, 663, 1530, 2601, 3060, 2601, 1530, 663, 204 ]
262 // [ 120, 390, 900, 1530, 1800, 1530, 900, 390, 120 ]
263 // [ 52, 169, 390, 663, 780, 663, 390, 169, 52 ]
264 // [ 16, 52, 120, 204, 240, 204, 120, 52, 16 ]
265 //
266 // [ 4 ]
267 // [ 13 ]
268 // [ 30 ]
269 // = 1/65536 * [ 51 ] * [ 4, 13, 30, 51, 60, 51, 30, 13, 4 ]
270 // [ 60 ]
271 // [ 51 ]
272 // [ 30 ]
273 // [ 13 ]
274 // [ 4 ]
275 template <>
276 class GaussianBlur<uint8_t, 9, true> {
277 public:
278 using SourceType = uint8_t;
279 using BufferType = uint16_t;
280 using DestinationType = uint8_t;
281
282 // Applies vertical filtering vector using SIMD operations.
283 //
284 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6, SRC7, SRC8 ] *
285 // * [ 4, 13, 30, 51, 60, 51, 30, 13, 4 ]T
286 1904 void vertical_vector_path(svbool_t pg,
287 std::reference_wrapper<svuint8_t> src[9],
288 BufferType *dst) const KLEIDICV_STREAMING {
289 // Lane-level split after widening: *_lo/*_hi are lower/upper lanes.
290 1904 svuint16_t acc_0_8_lo = svaddlb_u16(src[0], src[8]);
291 1904 svuint16_t acc_0_8_hi = svaddlt_u16(src[0], src[8]);
292
293 1904 svuint16_t acc_1_7_lo = svaddlb_u16(src[1], src[7]);
294 1904 svuint16_t acc_1_7_hi = svaddlt_u16(src[1], src[7]);
295
296 1904 svuint16_t acc_2_6_lo = svaddlb_u16(src[2], src[6]);
297 1904 svuint16_t acc_2_6_hi = svaddlt_u16(src[2], src[6]);
298
299 1904 svuint16_t acc_3_5_lo = svaddlb_u16(src[3], src[5]);
300 1904 svuint16_t acc_3_5_hi = svaddlt_u16(src[3], src[5]);
301
302 1904 svuint16_t acc_4_lo = svmovlb_u16(src[4]);
303 1904 svuint16_t acc_4_hi = svmovlt_u16(src[4]);
304
305 // Window-level grouping: *_tap_even/*_tap_odd are even/odd taps
306 // (0, 2, 4, 6, 8 vs 1, 3, 5, 7).
307 1904 svuint16_t acc_lo_tap_even = svlsl_n_u16_x(pg, acc_0_8_lo, 2);
308 1904 svuint16_t acc_hi_tap_even = svlsl_n_u16_x(pg, acc_0_8_hi, 2);
309 1904 acc_lo_tap_even = svmla_n_u16_x(pg, acc_lo_tap_even, acc_2_6_lo, 30);
310 1904 acc_hi_tap_even = svmla_n_u16_x(pg, acc_hi_tap_even, acc_2_6_hi, 30);
311 1904 acc_lo_tap_even = svmla_n_u16_x(pg, acc_lo_tap_even, acc_4_lo, 60);
312 1904 acc_hi_tap_even = svmla_n_u16_x(pg, acc_hi_tap_even, acc_4_hi, 60);
313
314 1904 svuint16_t acc_lo_tap_odd = svmul_n_u16_x(pg, acc_1_7_lo, 13);
315 1904 svuint16_t acc_hi_tap_odd = svmul_n_u16_x(pg, acc_1_7_hi, 13);
316 1904 acc_lo_tap_odd = svmla_n_u16_x(pg, acc_lo_tap_odd, acc_3_5_lo, 51);
317 1904 acc_hi_tap_odd = svmla_n_u16_x(pg, acc_hi_tap_odd, acc_3_5_hi, 51);
318
319 1904 svuint16_t acc_lo = svadd_u16_x(pg, acc_lo_tap_even, acc_lo_tap_odd);
320 1904 svuint16_t acc_hi = svadd_u16_x(pg, acc_hi_tap_even, acc_hi_tap_odd);
321
322 1904 svuint16x2_t interleaved = svcreate2(acc_lo, acc_hi);
323 1904 svst2(pg, &dst[0], interleaved);
324 1904 }
325
326 // Applies horizontal filtering vector using SIMD operations.
327 //
328 // DST = 1/65536 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6, SRC7, SRC8 ] *
329 // * [ 4, 13, 30, 51, 60, 51, 30, 13, 4 ]T
330 1628 void horizontal_vector_path(svbool_t pg,
331 std::reference_wrapper<svuint16_t> src[9],
332 DestinationType *dst) const KLEIDICV_STREAMING {
333 // Lane-level split after widening: *_lo/*_hi are lower/upper lanes.
334 1628 svuint32_t acc_0_8_lo = svaddlb_u32(src[0], src[8]);
335 1628 svuint32_t acc_0_8_hi = svaddlt_u32(src[0], src[8]);
336
337 1628 svuint32_t acc_1_7_lo = svaddlb_u32(src[1], src[7]);
338 1628 svuint32_t acc_1_7_hi = svaddlt_u32(src[1], src[7]);
339
340 1628 svuint32_t acc_2_6_lo = svaddlb_u32(src[2], src[6]);
341 1628 svuint32_t acc_2_6_hi = svaddlt_u32(src[2], src[6]);
342
343 1628 svuint32_t acc_3_5_lo = svaddlb_u32(src[3], src[5]);
344 1628 svuint32_t acc_3_5_hi = svaddlt_u32(src[3], src[5]);
345
346 1628 svuint32_t acc_4_lo = svmovlb_u32(src[4]);
347 1628 svuint32_t acc_4_hi = svmovlt_u32(src[4]);
348
349 // Window-level grouping: *_tap_even/*_tap_odd are even/odd taps
350 // (0, 2, 4, 6, 8 vs 1, 3, 5, 7).
351 1628 svuint32_t acc_lo_tap_even = svlsl_n_u32_x(pg, acc_0_8_lo, 2);
352 1628 svuint32_t acc_hi_tap_even = svlsl_n_u32_x(pg, acc_0_8_hi, 2);
353 1628 acc_lo_tap_even = svmla_n_u32_x(pg, acc_lo_tap_even, acc_2_6_lo, 30);
354 1628 acc_hi_tap_even = svmla_n_u32_x(pg, acc_hi_tap_even, acc_2_6_hi, 30);
355 1628 acc_lo_tap_even = svmla_n_u32_x(pg, acc_lo_tap_even, acc_4_lo, 60);
356 1628 acc_hi_tap_even = svmla_n_u32_x(pg, acc_hi_tap_even, acc_4_hi, 60);
357
358 1628 svuint32_t acc_lo_tap_odd = svmul_n_u32_x(pg, acc_1_7_lo, 13);
359 1628 svuint32_t acc_hi_tap_odd = svmul_n_u32_x(pg, acc_1_7_hi, 13);
360 1628 acc_lo_tap_odd = svmla_n_u32_x(pg, acc_lo_tap_odd, acc_3_5_lo, 51);
361 1628 acc_hi_tap_odd = svmla_n_u32_x(pg, acc_hi_tap_odd, acc_3_5_hi, 51);
362
363 1628 svuint32_t acc_lo = svadd_u32_x(pg, acc_lo_tap_even, acc_lo_tap_odd);
364 1628 svuint32_t acc_hi = svadd_u32_x(pg, acc_hi_tap_even, acc_hi_tap_odd);
365
366 1628 svuint16_t acc_u16_lo = svrshrnb_n_u32(acc_lo, 16);
367 1628 svuint16_t acc_u16 = svrshrnt_n_u32(acc_u16_lo, acc_hi, 16);
368
369 1628 svst1b(pg, &dst[0], acc_u16);
370 1628 }
371
372 // Applies horizontal filtering vector using scalar operations.
373 //
374 // DST = 1/65536 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6, SRC7, SRC8 ] *
375 // * [ 4, 13, 30, 51, 60, 51, 30, 13, 4 ]T
376 13632 void horizontal_scalar_path(const BufferType src[9],
377 DestinationType *dst) const KLEIDICV_STREAMING {
378 40896 uint32_t acc = src[0] * 4 + src[1] * 13 + src[2] * 30 + src[3] * 51 +
379 40896 src[4] * 60 + src[5] * 51 + src[6] * 30 + src[7] * 13 +
380 13632 src[8] * 4;
381 13632 dst[0] = rounding_shift_right(acc, 16);
382 13632 }
383 }; // end of class GaussianBlur<uint8_t, 9, true>
384
385 // CustomSigma variant
386 template <size_t KernelSize>
387 class GaussianBlur<uint8_t, KernelSize, false> {
388 public:
389 using SourceType = uint8_t;
390 using BufferType = uint8_t;
391 using DestinationType = uint8_t;
392 using SourceVecTraits =
393 typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits<SourceType>;
394 using SourceVectorType = typename SourceVecTraits::VectorType;
395
396 static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
397
398 396 explicit GaussianBlur(const uint8_t *half_kernel)
399 396 : half_kernel_(half_kernel) {}
400
401 9100 void vertical_vector_path(
402 svbool_t pg, std::reference_wrapper<SourceVectorType> src[KernelSize],
403 BufferType *dst) const KLEIDICV_STREAMING {
404 9100 common_vector_path(pg, src, dst);
405 9100 }
406
407 111624 void vertical_scalar_path(const SourceType src[KernelSize],
408 BufferType *dst) const KLEIDICV_STREAMING {
409 111624 uint32_t acc = src[kHalfKernelSize - 1] * half_kernel_[kHalfKernelSize - 1];
410
411 // Optimization to avoid unnecessary branching in vector code.
412 KLEIDICV_FORCE_LOOP_UNROLL
413
12/12
✓ Branch 0 taken 33264 times.
✓ Branch 1 taken 232848 times.
✓ Branch 2 taken 67680 times.
✓ Branch 3 taken 676800 times.
✓ Branch 4 taken 372 times.
✓ Branch 5 taken 372 times.
✓ Branch 6 taken 1440 times.
✓ Branch 7 taken 2880 times.
✓ Branch 8 taken 3204 times.
✓ Branch 9 taken 9612 times.
✓ Branch 10 taken 5664 times.
✓ Branch 11 taken 22656 times.
1056792 for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
414 945168 acc += (src[i] + src[KernelSize - i - 1]) * half_kernel_[i];
415 945168 }
416
417 111624 dst[0] = static_cast<BufferType>(rounding_shift_right(acc, 8));
418 111624 }
419
420 6432 void horizontal_vector_path(
421 svbool_t pg, std::reference_wrapper<SourceVectorType> src[KernelSize],
422 BufferType *dst) const KLEIDICV_STREAMING {
423 6432 common_vector_path(pg, src, dst);
424 6432 }
425
426 111624 void horizontal_scalar_path(const BufferType src[KernelSize],
427 DestinationType *dst) const KLEIDICV_STREAMING {
428 111624 vertical_scalar_path(src, dst);
429 111624 }
430
431 private:
432 15532 void common_vector_path(
433 svbool_t pg, std::reference_wrapper<SourceVectorType> src[KernelSize],
434 BufferType *dst) const KLEIDICV_STREAMING {
435 15532 svbool_t pg16_all = svptrue_b16();
436 31064 svuint16_t acc_b = svmullb_n_u16(src[kHalfKernelSize - 1],
437 15532 half_kernel_[kHalfKernelSize - 1]);
438 31064 svuint16_t acc_t = svmullt_n_u16(src[kHalfKernelSize - 1],
439 15532 half_kernel_[kHalfKernelSize - 1]);
440
441 // Optimization to avoid unnecessary branching in vector code.
442 KLEIDICV_FORCE_LOOP_UNROLL
443
12/12
✓ Branch 0 taken 4752 times.
✓ Branch 1 taken 33264 times.
✓ Branch 2 taken 7416 times.
✓ Branch 3 taken 74160 times.
✓ Branch 4 taken 342 times.
✓ Branch 5 taken 342 times.
✓ Branch 6 taken 660 times.
✓ Branch 7 taken 1320 times.
✓ Branch 8 taken 1002 times.
✓ Branch 9 taken 3006 times.
✓ Branch 10 taken 1360 times.
✓ Branch 11 taken 5440 times.
133064 for (size_t i = 0; i < kHalfKernelSize - 1; i++) {
444 117532 const size_t j = KernelSize - i - 1;
445 117532 svuint16_t vec_b = svaddlb_u16(src[i], src[j]);
446 117532 svuint16_t vec_t = svaddlt_u16(src[i], src[j]);
447
448 117532 acc_b = svmla_n_u16_x(pg16_all, acc_b, vec_b, half_kernel_[i]);
449 117532 acc_t = svmla_n_u16_x(pg16_all, acc_t, vec_t, half_kernel_[i]);
450 117532 }
451
452 // Rounding before narrowing
453 15532 acc_b = svqadd_n_u16(acc_b, 128);
454 15532 acc_t = svqadd_n_u16(acc_t, 128);
455 // Keep only the highest 8 bits
456 31064 svuint8_t result =
457 15532 svtrn2_u8(svreinterpret_u8_u16(acc_b), svreinterpret_u8_u16(acc_t));
458 15532 svst1(pg, &dst[0], result);
459 15532 }
460
461 const uint8_t *half_kernel_;
462 }; // end of class GaussianBlur<uint8_t, KernelSize, false>
463
464 template <size_t KernelSize, bool IsBinomial, typename ScalarType>
465 1290 static kleidicv_error_t gaussian_blur_fixed_kernel_size(
466 const ScalarType *src, size_t src_stride, ScalarType *dst,
467 size_t dst_stride, Rectangle &rect, size_t y_begin, size_t y_end,
468 size_t channels, float sigma,
469 FixedBorderType border_type) KLEIDICV_STREAMING {
470 using GaussianBlurFilter = GaussianBlur<ScalarType, KernelSize, IsBinomial>;
471 1290 constexpr size_t intermediate_size{
472 sizeof(typename GaussianBlurFilter::BufferType)};
473
474 1290 auto workspace_variant =
475 1290 SeparableFilterWorkspace::create(rect, channels, intermediate_size);
476
40/40
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 141 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 141 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 183 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 183 times.
✓ Branch 8 taken 3 times.
✓ Branch 9 taken 99 times.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 99 times.
✓ Branch 12 taken 3 times.
✓ Branch 13 taken 153 times.
✓ Branch 14 taken 3 times.
✓ Branch 15 taken 153 times.
✓ Branch 16 taken 3 times.
✓ Branch 17 taken 144 times.
✓ Branch 18 taken 3 times.
✓ Branch 19 taken 144 times.
✓ Branch 20 taken 3 times.
✓ Branch 21 taken 144 times.
✓ Branch 22 taken 3 times.
✓ Branch 23 taken 144 times.
✓ Branch 24 taken 3 times.
✓ Branch 25 taken 99 times.
✓ Branch 26 taken 3 times.
✓ Branch 27 taken 99 times.
✓ Branch 28 taken 3 times.
✓ Branch 29 taken 99 times.
✓ Branch 30 taken 3 times.
✓ Branch 31 taken 99 times.
✓ Branch 32 taken 3 times.
✓ Branch 33 taken 99 times.
✓ Branch 34 taken 3 times.
✓ Branch 35 taken 99 times.
✓ Branch 36 taken 3 times.
✓ Branch 37 taken 99 times.
✓ Branch 38 taken 3 times.
✓ Branch 39 taken 99 times.
1320 if (auto *err = std::get_if<kleidicv_error_t>(&workspace_variant)) {
477 30 return *err;
478 }
479 1260 auto &workspace = *std::get_if<SeparableFilterWorkspace>(&workspace_variant);
480
481 1260 Rows<const ScalarType> src_rows{src, src_stride, channels};
482 1260 Rows<ScalarType> dst_rows{dst, dst_stride, channels};
483
484 if constexpr (IsBinomial) {
485 576 GaussianBlurFilter blur;
486 576 SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
487 576 workspace.process(y_begin, y_end, src_rows, dst_rows, border_type, filter);
488
489 576 return KLEIDICV_OK;
490 576 } else {
491 684 constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize);
492 684 uint8_t half_kernel[128];
493 1368 bool success =
494 684 generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma);
495
12/12
✓ Branch 0 taken 96 times.
✓ Branch 1 taken 48 times.
✓ Branch 2 taken 96 times.
✓ Branch 3 taken 48 times.
✓ Branch 4 taken 51 times.
✓ Branch 5 taken 48 times.
✓ Branch 6 taken 51 times.
✓ Branch 7 taken 48 times.
✓ Branch 8 taken 51 times.
✓ Branch 9 taken 48 times.
✓ Branch 10 taken 51 times.
✓ Branch 11 taken 48 times.
684 if (success) {
496 396 GaussianBlurFilter blur(half_kernel);
497 396 SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur};
498 792 workspace.process(y_begin, y_end, src_rows, dst_rows, border_type,
499 396 filter);
500 396 } else {
501 // Sigma is too small that the middle point would get all the weight
502 // => it's just a copy.
503
12/12
✓ Branch 0 taken 684 times.
✓ Branch 1 taken 48 times.
✓ Branch 2 taken 972 times.
✓ Branch 3 taken 48 times.
✓ Branch 4 taken 108 times.
✓ Branch 5 taken 48 times.
✓ Branch 6 taken 204 times.
✓ Branch 7 taken 48 times.
✓ Branch 8 taken 300 times.
✓ Branch 9 taken 48 times.
✓ Branch 10 taken 396 times.
✓ Branch 11 taken 48 times.
2952 for (size_t row = y_begin; row < y_end; ++row) {
504 #if KLEIDICV_TARGET_SME && defined(__ANDROID__)
505 __arm_sc_memcpy(
506 static_cast<void *>(&dst_rows.at(row)[0]),
507 static_cast<const void *>(&src_rows.at(row)[0]),
508 rect.width() * sizeof(ScalarType) * dst_rows.channels());
509 #else
510 5328 std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]),
511 2664 static_cast<const void *>(&src_rows.at(row)[0]),
512 2664 rect.width() * sizeof(ScalarType) * dst_rows.channels());
513 #endif
514 2664 }
515 }
516 684 return KLEIDICV_OK;
517 684 }
518 1290 }
519
520 template <bool IsBinomial, typename ScalarType>
521 1290 static kleidicv_error_t gaussian_blur(
522 size_t kernel_size, const ScalarType *src, size_t src_stride,
523 ScalarType *dst, size_t dst_stride, Rectangle &rect, size_t y_begin,
524 size_t y_end, size_t channels, float sigma,
525 FixedBorderType border_type) KLEIDICV_STREAMING {
526
12/14
✓ Branch 0 taken 144 times.
✓ Branch 1 taken 186 times.
✓ Branch 2 taken 102 times.
✓ Branch 3 taken 156 times.
✓ Branch 4 taken 48 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 48 times.
✓ Branch 7 taken 102 times.
✓ Branch 8 taken 102 times.
✓ Branch 9 taken 102 times.
✓ Branch 10 taken 102 times.
✓ Branch 11 taken 99 times.
✗ Branch 12 not taken.
✓ Branch 13 taken 99 times.
1290 switch (kernel_size) {
527 case 3:
528 246 return gaussian_blur_fixed_kernel_size<3, IsBinomial>(
529 246 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
530 246 sigma, border_type);
531 case 5:
532 288 return gaussian_blur_fixed_kernel_size<5, IsBinomial>(
533 288 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
534 288 sigma, border_type);
535 case 7:
536 204 return gaussian_blur_fixed_kernel_size<7, IsBinomial>(
537 204 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
538 204 sigma, border_type);
539 case 9:
540 258 return gaussian_blur_fixed_kernel_size<9, IsBinomial>(
541 258 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
542 258 sigma, border_type);
543 case 15:
544 // 15x15 does not have a binomial variant
545 147 return gaussian_blur_fixed_kernel_size<15, false>(
546 147 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
547 147 sigma, border_type);
548 case 21:
549 // 21x21 does not have a binomial variant
550 147 return gaussian_blur_fixed_kernel_size<21, false>(
551 147 src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels,
552 147 sigma, border_type);
553 // gaussian_blur_is_implemented checked the kernel size already.
554 // GCOVR_EXCL_START
555 default:
556 assert(!"kernel size not implemented");
557 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
558 // GCOVR_EXCL_STOP
559 }
560 1290 }
561
562 1320 static kleidicv_error_t gaussian_blur_fixed_stripe_u8_sc(
563 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
564 size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
565 size_t kernel_width, size_t /*kernel_height*/, float sigma_x,
566 float /*sigma_y*/, FixedBorderType fixed_border_type) KLEIDICV_STREAMING {
567
6/6
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 1290 times.
✓ Branch 2 taken 30 times.
✓ Branch 3 taken 1290 times.
✓ Branch 4 taken 30 times.
✓ Branch 5 taken 1290 times.
3960 if (auto result =
568 1320 gaussian_blur_checks(src, src_stride, dst, dst_stride, width, height);
569
2/3
✗ Branch 0 not taken.
✓ Branch 1 taken 30 times.
✓ Branch 2 taken 1290 times.
1350 result != KLEIDICV_OK) {
570 30 return result;
571 }
572
573 1290 Rectangle rect{width, height};
574
575
2/2
✓ Branch 0 taken 684 times.
✓ Branch 1 taken 606 times.
1290 if (sigma_x == 0.0) {
576 1368 return gaussian_blur<true>(kernel_width, src, src_stride, dst, dst_stride,
577 684 rect, y_begin, y_end, channels, sigma_x,
578 684 fixed_border_type);
579 }
580
581 1212 return gaussian_blur<false>(kernel_width, src, src_stride, dst, dst_stride,
582 606 rect, y_begin, y_end, channels, sigma_x,
583 606 fixed_border_type);
584 1320 }
585
586 } // namespace KLEIDICV_TARGET_NAMESPACE
587
588 #endif // KLEIDICV_GAUSSIAN_BLUR_SC_H
589