Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_GAUSSIAN_BLUR_SC_H | ||
6 | #define KLEIDICV_GAUSSIAN_BLUR_SC_H | ||
7 | |||
8 | #include <array> | ||
9 | #include <cassert> | ||
10 | |||
11 | #include "kleidicv/filters/gaussian_blur.h" | ||
12 | #include "kleidicv/filters/separable_filter_15x15_sc.h" | ||
13 | #include "kleidicv/filters/separable_filter_21x21_sc.h" | ||
14 | #include "kleidicv/filters/separable_filter_3x3_sc.h" | ||
15 | #include "kleidicv/filters/separable_filter_5x5_sc.h" | ||
16 | #include "kleidicv/filters/separable_filter_7x7_sc.h" | ||
17 | #include "kleidicv/filters/sigma.h" | ||
18 | #include "kleidicv/workspace/separable.h" | ||
19 | |||
20 | #if KLEIDICV_TARGET_SME || KLEIDICV_TARGET_SME2 | ||
21 | #include <arm_sme.h> | ||
22 | #endif | ||
23 | |||
24 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
25 | |||
26 | // Primary template for Gaussian Blur filters. | ||
27 | template <typename ScalarType, size_t KernelSize, bool IsBinomial> | ||
28 | class GaussianBlur; | ||
29 | |||
30 | // Template for 3x3 Gaussian Blur binomial filters. | ||
31 | // | ||
32 | // [ 1, 2, 1 ] [ 1 ] | ||
33 | // F = 1/16 * [ 2, 4, 2 ] = 1/16 * [ 2 ] * [ 1, 2, 1 ] | ||
34 | // [ 1, 2, 1 ] [ 1 ] | ||
35 | template <> | ||
36 | class GaussianBlur<uint8_t, 3, true> { | ||
37 | public: | ||
38 | using SourceType = uint8_t; | ||
39 | using BufferType = uint16_t; | ||
40 | using DestinationType = uint8_t; | ||
41 | |||
42 | // Applies vertical filtering vector using SIMD operations. | ||
43 | // | ||
44 | // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T | ||
45 | 504 | void vertical_vector_path(svbool_t pg, | |
46 | std::reference_wrapper<svuint8_t> src[3], | ||
47 | BufferType *dst) const KLEIDICV_STREAMING { | ||
48 | 504 | svuint16_t acc_0_2_b = svaddlb_u16(src[0], src[2]); | |
49 | 504 | svuint16_t acc_0_2_t = svaddlt_u16(src[0], src[2]); | |
50 | |||
51 | 504 | svuint16_t acc_1_b = svshllb_n_u16(src[1], 1); | |
52 | 504 | svuint16_t acc_1_t = svshllt_n_u16(src[1], 1); | |
53 | |||
54 | 504 | svuint16_t acc_u16_b = svadd_u16_x(pg, acc_0_2_b, acc_1_b); | |
55 | 504 | svuint16_t acc_u16_t = svadd_u16_x(pg, acc_0_2_t, acc_1_t); | |
56 | |||
57 | 504 | svuint16x2_t interleaved = svcreate2(acc_u16_b, acc_u16_t); | |
58 | 504 | svst2(pg, &dst[0], interleaved); | |
59 | 504 | } | |
60 | |||
61 | // Applies horizontal filtering vector using SIMD operations. | ||
62 | // | ||
63 | // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T | ||
64 | 400 | void horizontal_vector_path(svbool_t pg, | |
65 | std::reference_wrapper<svuint16_t> src[3], | ||
66 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
67 | 400 | svuint16_t acc_0_2 = svhadd_u16_x(pg, src[0], src[2]); | |
68 | |||
69 | 400 | svuint16_t acc = svadd_u16_x(pg, acc_0_2, src[1]); | |
70 | 400 | acc = svrshr_x(pg, acc, 3); | |
71 | |||
72 | 400 | svst1b(pg, &dst[0], acc); | |
73 | 400 | } | |
74 | |||
75 | // Applies horizontal filtering vector using scalar operations. | ||
76 | // | ||
77 | // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T | ||
78 | 1008 | void horizontal_scalar_path(const BufferType src[3], | |
79 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
80 | 1008 | auto acc = src[0] + 2 * src[1] + src[2]; | |
81 | 1008 | dst[0] = rounding_shift_right(acc, 4); | |
82 | 1008 | } | |
83 | }; // end of class GaussianBlur<uint8_t, 3, true> | ||
84 | |||
85 | // Template for 5x5 Gaussian Blur binomial filters. | ||
86 | // | ||
87 | // [ 1, 4, 6, 4, 1 ] [ 1 ] | ||
88 | // [ 4, 16, 24, 16, 4 ] [ 4 ] | ||
89 | // F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ] | ||
90 | // [ 4, 16, 24, 16, 4 ] [ 4 ] | ||
91 | // [ 1, 4, 6, 4, 1 ] [ 1 ] | ||
92 | template <> | ||
93 | class GaussianBlur<uint8_t, 5, true> { | ||
94 | public: | ||
95 | using SourceType = uint8_t; | ||
96 | using BufferType = uint16_t; | ||
97 | using DestinationType = uint8_t; | ||
98 | |||
99 | // Applies vertical filtering vector using SIMD operations. | ||
100 | // | ||
101 | // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T | ||
102 | 1524 | void vertical_vector_path(svbool_t pg, | |
103 | std::reference_wrapper<svuint8_t> src[5], | ||
104 | BufferType *dst) const KLEIDICV_STREAMING { | ||
105 | 1524 | svuint16_t acc_0_4_b = svaddlb_u16(src[0], src[4]); | |
106 | 1524 | svuint16_t acc_0_4_t = svaddlt_u16(src[0], src[4]); | |
107 | 1524 | svuint16_t acc_1_3_b = svaddlb_u16(src[1], src[3]); | |
108 | 1524 | svuint16_t acc_1_3_t = svaddlt_u16(src[1], src[3]); | |
109 | |||
110 | 1524 | svuint16_t acc_u16_b = svmlalb_n_u16(acc_0_4_b, src[2], 6); | |
111 | 1524 | svuint16_t acc_u16_t = svmlalt_n_u16(acc_0_4_t, src[2], 6); | |
112 | 1524 | acc_u16_b = svmla_n_u16_x(pg, acc_u16_b, acc_1_3_b, 4); | |
113 | 1524 | acc_u16_t = svmla_n_u16_x(pg, acc_u16_t, acc_1_3_t, 4); | |
114 | |||
115 | 1524 | svuint16x2_t interleaved = svcreate2(acc_u16_b, acc_u16_t); | |
116 | 1524 | svst2(pg, &dst[0], interleaved); | |
117 | 1524 | } | |
118 | |||
119 | // Applies horizontal filtering vector using SIMD operations. | ||
120 | // | ||
121 | // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T | ||
122 | 1316 | void horizontal_vector_path(svbool_t pg, | |
123 | std::reference_wrapper<svuint16_t> src[5], | ||
124 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
125 | 1316 | svuint16_t acc_0_4 = svadd_x(pg, src[0], src[4]); | |
126 | 1316 | svuint16_t acc_1_3 = svadd_x(pg, src[1], src[3]); | |
127 | 1316 | svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src[2], 6); | |
128 | 1316 | acc = svmla_n_u16_x(pg, acc, acc_1_3, 4); | |
129 | 1316 | acc = svrshr_x(pg, acc, 8); | |
130 | 1316 | svst1b(pg, &dst[0], acc); | |
131 | 1316 | } | |
132 | |||
133 | // Applies horizontal filtering vector using scalar operations. | ||
134 | // | ||
135 | // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T | ||
136 | 5552 | void horizontal_scalar_path(const BufferType src[5], | |
137 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
138 | 5552 | auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2]; | |
139 | 5552 | dst[0] = rounding_shift_right(acc, 8); | |
140 | 5552 | } | |
141 | }; // end of class GaussianBlur<uint8_t, 5, true> | ||
142 | |||
143 | // Template for 7x7 Gaussian Blur binomial filters. | ||
144 | // | ||
145 | // [ 4, 14, 28, 36, 28, 14, 4 ] | ||
146 | // [ 14, 49, 98, 126, 98, 49, 14 ] | ||
147 | // [ 28, 98, 196, 252, 196, 98, 28 ] | ||
148 | // F = 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] = | ||
149 | // [ 28, 98, 196, 252, 196, 98, 28 ] | ||
150 | // [ 14, 49, 98, 126, 98, 49, 14 ] | ||
151 | // [ 4, 14, 28, 36, 28, 14, 4 ] | ||
152 | // | ||
153 | // [ 2 ] | ||
154 | // [ 7 ] | ||
155 | // [ 14 ] | ||
156 | // = 1/4096 * [ 18 ] * [ 2, 7, 14, 18, 14, 7, 2 ] | ||
157 | // [ 14 ] | ||
158 | // [ 7 ] | ||
159 | // [ 2 ] | ||
160 | template <> | ||
161 | class GaussianBlur<uint8_t, 7, true> { | ||
162 | public: | ||
163 | using SourceType = uint8_t; | ||
164 | using BufferType = uint16_t; | ||
165 | using DestinationType = uint8_t; | ||
166 | |||
167 | // Applies vertical filtering vector using SIMD operations. | ||
168 | // | ||
169 | // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * | ||
170 | // * [ 2, 7, 14, 18, 14, 7, 2 ]T | ||
171 | 596 | void vertical_vector_path(svbool_t pg, | |
172 | std::reference_wrapper<svuint8_t> src[7], | ||
173 | BufferType *dst) const KLEIDICV_STREAMING { | ||
174 | 596 | svuint16_t acc_0_6_b = svaddlb_u16(src[0], src[6]); | |
175 | 596 | svuint16_t acc_0_6_t = svaddlt_u16(src[0], src[6]); | |
176 | |||
177 | 596 | svuint16_t acc_1_5_b = svaddlb_u16(src[1], src[5]); | |
178 | 596 | svuint16_t acc_1_5_t = svaddlt_u16(src[1], src[5]); | |
179 | |||
180 | 596 | svuint16_t acc_2_4_b = svaddlb_u16(src[2], src[4]); | |
181 | 596 | svuint16_t acc_2_4_t = svaddlt_u16(src[2], src[4]); | |
182 | |||
183 | 596 | svuint16_t acc_3_b = svmovlb_u16(src[3]); | |
184 | 596 | svuint16_t acc_3_t = svmovlt_u16(src[3]); | |
185 | |||
186 | 596 | svuint16_t acc_0_2_4_6_b = svmla_n_u16_x(pg, acc_0_6_b, acc_2_4_b, 7); | |
187 | 596 | svuint16_t acc_0_2_4_6_t = svmla_n_u16_x(pg, acc_0_6_t, acc_2_4_t, 7); | |
188 | |||
189 | 596 | svuint16_t acc_0_2_3_4_6_b = svmla_n_u16_x(pg, acc_0_2_4_6_b, acc_3_b, 9); | |
190 | 596 | svuint16_t acc_0_2_3_4_6_t = svmla_n_u16_x(pg, acc_0_2_4_6_t, acc_3_t, 9); | |
191 | 596 | acc_0_2_3_4_6_b = svlsl_n_u16_x(pg, acc_0_2_3_4_6_b, 1); | |
192 | 596 | acc_0_2_3_4_6_t = svlsl_n_u16_x(pg, acc_0_2_3_4_6_t, 1); | |
193 | |||
194 | 1192 | svuint16_t acc_0_1_2_3_4_5_6_b = | |
195 | 596 | svmla_n_u16_x(pg, acc_0_2_3_4_6_b, acc_1_5_b, 7); | |
196 | 1192 | svuint16_t acc_0_1_2_3_4_5_6_t = | |
197 | 596 | svmla_n_u16_x(pg, acc_0_2_3_4_6_t, acc_1_5_t, 7); | |
198 | |||
199 | 1192 | svuint16x2_t interleaved = | |
200 | 596 | svcreate2(acc_0_1_2_3_4_5_6_b, acc_0_1_2_3_4_5_6_t); | |
201 | 596 | svst2(pg, &dst[0], interleaved); | |
202 | 596 | } | |
203 | |||
204 | // Applies horizontal filtering vector using SIMD operations. | ||
205 | // | ||
206 | // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * | ||
207 | // * [ 2, 7, 14, 18, 14, 7, 2 ]T | ||
208 | 464 | void horizontal_vector_path(svbool_t pg, | |
209 | std::reference_wrapper<svuint16_t> src[7], | ||
210 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
211 | 464 | svuint32_t acc_0_6_b = svaddlb_u32(src[0], src[6]); | |
212 | 464 | svuint32_t acc_0_6_t = svaddlt_u32(src[0], src[6]); | |
213 | |||
214 | 464 | svuint32_t acc_1_5_b = svaddlb_u32(src[1], src[5]); | |
215 | 464 | svuint32_t acc_1_5_t = svaddlt_u32(src[1], src[5]); | |
216 | |||
217 | 464 | svuint16_t acc_2_4 = svadd_u16_x(pg, src[2], src[4]); | |
218 | |||
219 | 464 | svuint32_t acc_0_2_4_6_b = svmlalb_n_u32(acc_0_6_b, acc_2_4, 7); | |
220 | 464 | svuint32_t acc_0_2_4_6_t = svmlalt_n_u32(acc_0_6_t, acc_2_4, 7); | |
221 | |||
222 | 464 | svuint32_t acc_0_2_3_4_6_b = svmlalb_n_u32(acc_0_2_4_6_b, src[3], 9); | |
223 | 464 | svuint32_t acc_0_2_3_4_6_t = svmlalt_n_u32(acc_0_2_4_6_t, src[3], 9); | |
224 | |||
225 | 464 | acc_0_2_3_4_6_b = svlsl_n_u32_x(pg, acc_0_2_3_4_6_b, 1); | |
226 | 464 | acc_0_2_3_4_6_t = svlsl_n_u32_x(pg, acc_0_2_3_4_6_t, 1); | |
227 | |||
228 | 928 | svuint32_t acc_0_1_2_3_4_5_6_b = | |
229 | 464 | svmla_n_u32_x(pg, acc_0_2_3_4_6_b, acc_1_5_b, 7); | |
230 | 928 | svuint32_t acc_0_1_2_3_4_5_6_t = | |
231 | 464 | svmla_n_u32_x(pg, acc_0_2_3_4_6_t, acc_1_5_t, 7); | |
232 | |||
233 | 928 | svuint16_t acc_0_1_2_3_4_5_6_u16_b = | |
234 | 464 | svrshrnb_n_u32(acc_0_1_2_3_4_5_6_b, 12); | |
235 | 928 | svuint16_t acc_0_1_2_3_4_5_6_u16 = | |
236 | 464 | svrshrnt_n_u32(acc_0_1_2_3_4_5_6_u16_b, acc_0_1_2_3_4_5_6_t, 12); | |
237 | |||
238 | 464 | svst1b(pg, &dst[0], acc_0_1_2_3_4_5_6_u16); | |
239 | 464 | } | |
240 | |||
241 | // Applies horizontal filtering vector using scalar operations. | ||
242 | // | ||
243 | // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * | ||
244 | // * [ 2, 7, 14, 18, 14, 7, 2 ]T | ||
245 | 3288 | void horizontal_scalar_path(const BufferType src[7], | |
246 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
247 | 9864 | uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 + | |
248 | 6576 | src[4] * 14 + src[5] * 7 + src[6] * 2; | |
249 | 3288 | dst[0] = rounding_shift_right(acc, 12); | |
250 | 3288 | } | |
251 | }; // end of class GaussianBlur<uint8_t, 7, true> | ||
252 | |||
253 | // CustomSigma variant | ||
254 | template <size_t KernelSize> | ||
255 | class GaussianBlur<uint8_t, KernelSize, false> { | ||
256 | public: | ||
257 | using SourceType = uint8_t; | ||
258 | using BufferType = uint8_t; | ||
259 | using DestinationType = uint8_t; | ||
260 | using SourceVecTraits = | ||
261 | typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits<SourceType>; | ||
262 | using SourceVectorType = typename SourceVecTraits::VectorType; | ||
263 | |||
264 | static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize); | ||
265 | |||
266 | 230 | explicit GaussianBlur(const uint16_t *half_kernel) | |
267 | 230 | : half_kernel_(half_kernel) {} | |
268 | |||
269 | 6048 | void vertical_vector_path( | |
270 | svbool_t pg, std::reference_wrapper<SourceVectorType> src[KernelSize], | ||
271 | BufferType *dst) const KLEIDICV_STREAMING { | ||
272 | 6048 | common_vector_path(pg, src, dst); | |
273 | 6048 | } | |
274 | |||
275 | 70640 | void vertical_scalar_path(const SourceType src[KernelSize], | |
276 | BufferType *dst) const KLEIDICV_STREAMING { | ||
277 | 141280 | uint32_t acc = static_cast<uint32_t>(src[kHalfKernelSize - 1]) * | |
278 | 70640 | half_kernel_[kHalfKernelSize - 1]; | |
279 | |||
280 | // Optimization to avoid unnecessary branching in vector code. | ||
281 | KLEIDICV_FORCE_LOOP_UNROLL | ||
282 |
10/10✓ Branch 0 taken 22176 times.
✓ Branch 1 taken 155232 times.
✓ Branch 2 taken 45120 times.
✓ Branch 3 taken 451200 times.
✓ Branch 4 taken 248 times.
✓ Branch 5 taken 248 times.
✓ Branch 6 taken 960 times.
✓ Branch 7 taken 1920 times.
✓ Branch 8 taken 2136 times.
✓ Branch 9 taken 6408 times.
|
685648 | for (size_t i = 0; i < kHalfKernelSize - 1; i++) { |
283 | 1845024 | acc += (static_cast<uint32_t>(src[i]) + | |
284 | 1230016 | static_cast<uint32_t>(src[KernelSize - i - 1])) * | |
285 | 615008 | half_kernel_[i]; | |
286 | 615008 | } | |
287 | |||
288 | 70640 | dst[0] = static_cast<BufferType>(rounding_shift_right(acc, 8)); | |
289 | 70640 | } | |
290 | |||
291 | 3896 | void horizontal_vector_path( | |
292 | svbool_t pg, std::reference_wrapper<SourceVectorType> src[KernelSize], | ||
293 | BufferType *dst) const KLEIDICV_STREAMING { | ||
294 | 3896 | common_vector_path(pg, src, dst); | |
295 | 3896 | } | |
296 | |||
297 | 70640 | void horizontal_scalar_path(const BufferType src[KernelSize], | |
298 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
299 | 70640 | vertical_scalar_path(src, dst); | |
300 | 70640 | } | |
301 | |||
302 | private: | ||
303 | 9944 | void common_vector_path( | |
304 | svbool_t pg, std::reference_wrapper<SourceVectorType> src[KernelSize], | ||
305 | BufferType *dst) const KLEIDICV_STREAMING { | ||
306 | 9944 | svbool_t pg16_all = svptrue_b16(); | |
307 | 19888 | svuint16_t acc_b = svmullb_n_u16(src[kHalfKernelSize - 1], | |
308 | 9944 | half_kernel_[kHalfKernelSize - 1]); | |
309 | 19888 | svuint16_t acc_t = svmullt_n_u16(src[kHalfKernelSize - 1], | |
310 | 9944 | half_kernel_[kHalfKernelSize - 1]); | |
311 | |||
312 | // Optimization to avoid unnecessary branching in vector code. | ||
313 | KLEIDICV_FORCE_LOOP_UNROLL | ||
314 |
10/10✓ Branch 0 taken 3280 times.
✓ Branch 1 taken 22960 times.
✓ Branch 2 taken 5320 times.
✓ Branch 3 taken 53200 times.
✓ Branch 4 taken 228 times.
✓ Branch 5 taken 228 times.
✓ Branch 6 taken 440 times.
✓ Branch 7 taken 880 times.
✓ Branch 8 taken 676 times.
✓ Branch 9 taken 2028 times.
|
89240 | for (size_t i = 0; i < kHalfKernelSize - 1; i++) { |
315 | 79296 | const size_t j = KernelSize - i - 1; | |
316 | 79296 | svuint16_t vec_b = svaddlb_u16(src[i], src[j]); | |
317 | 79296 | svuint16_t vec_t = svaddlt_u16(src[i], src[j]); | |
318 | |||
319 | 79296 | acc_b = svmla_n_u16_x(pg16_all, acc_b, vec_b, half_kernel_[i]); | |
320 | 79296 | acc_t = svmla_n_u16_x(pg16_all, acc_t, vec_t, half_kernel_[i]); | |
321 | 79296 | } | |
322 | |||
323 | // Rounding before narrowing | ||
324 | 9944 | acc_b = svqadd_n_u16(acc_b, 128); | |
325 | 9944 | acc_t = svqadd_n_u16(acc_t, 128); | |
326 | // Keep only the highest 8 bits | ||
327 | 19888 | svuint8_t result = | |
328 | 9944 | svtrn2_u8(svreinterpret_u8_u16(acc_b), svreinterpret_u8_u16(acc_t)); | |
329 | 9944 | svst1(pg, &dst[0], result); | |
330 | 9944 | } | |
331 | |||
332 | const uint16_t *half_kernel_; | ||
333 | }; // end of class GaussianBlur<uint8_t, KernelSize, false> | ||
334 | |||
335 | template <size_t KernelSize, bool IsBinomial, typename ScalarType> | ||
336 | 652 | static kleidicv_error_t gaussian_blur_fixed_kernel_size( | |
337 | const ScalarType *src, size_t src_stride, ScalarType *dst, | ||
338 | size_t dst_stride, Rectangle &rect, size_t y_begin, size_t y_end, | ||
339 | size_t channels, float sigma, FixedBorderType border_type, | ||
340 | SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING { | ||
341 | using GaussianBlurFilter = GaussianBlur<ScalarType, KernelSize, IsBinomial>; | ||
342 | |||
343 | 652 | Rows<const ScalarType> src_rows{src, src_stride, channels}; | |
344 | 652 | Rows<ScalarType> dst_rows{dst, dst_stride, channels}; | |
345 | |||
346 | if constexpr (IsBinomial) { | ||
347 | 262 | GaussianBlurFilter blur; | |
348 | 262 | SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur}; | |
349 | 524 | workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, | |
350 | 262 | border_type, filter); | |
351 | |||
352 | 262 | return KLEIDICV_OK; | |
353 | 262 | } else { | |
354 | 390 | constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize); | |
355 | 390 | uint16_t half_kernel[128]; | |
356 | 390 | generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma); | |
357 | // If sigma is so small that the middle point gets all the weights, it's | ||
358 | // just a copy | ||
359 |
10/10✓ Branch 0 taken 64 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 64 times.
✓ Branch 3 taken 32 times.
✓ Branch 4 taken 34 times.
✓ Branch 5 taken 32 times.
✓ Branch 6 taken 34 times.
✓ Branch 7 taken 32 times.
✓ Branch 8 taken 34 times.
✓ Branch 9 taken 32 times.
|
390 | if (half_kernel[kHalfKernelSize - 1] < 256) { |
360 | 230 | GaussianBlurFilter blur(half_kernel); | |
361 | 230 | SeparableFilter<GaussianBlurFilter, KernelSize> filter{blur}; | |
362 | 460 | workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, | |
363 | 230 | border_type, filter); | |
364 | 230 | } else { | |
365 |
10/10✓ Branch 0 taken 456 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 648 times.
✓ Branch 3 taken 32 times.
✓ Branch 4 taken 72 times.
✓ Branch 5 taken 32 times.
✓ Branch 6 taken 136 times.
✓ Branch 7 taken 32 times.
✓ Branch 8 taken 200 times.
✓ Branch 9 taken 32 times.
|
1672 | for (size_t row = y_begin; row < y_end; ++row) { |
366 | #if KLEIDICV_TARGET_SME && defined(__ANDROID__) | ||
367 | __arm_sc_memcpy( | ||
368 | static_cast<void *>(&dst_rows.at(row)[0]), | ||
369 | static_cast<const void *>(&src_rows.at(row)[0]), | ||
370 | rect.width() * sizeof(ScalarType) * dst_rows.channels()); | ||
371 | #else | ||
372 | 3024 | std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]), | |
373 | 1512 | static_cast<const void *>(&src_rows.at(row)[0]), | |
374 | 1512 | rect.width() * sizeof(ScalarType) * dst_rows.channels()); | |
375 | #endif | ||
376 | 1512 | } | |
377 | } | ||
378 | 390 | return KLEIDICV_OK; | |
379 | 390 | } | |
380 | 652 | } | |
381 | |||
382 | template <bool IsBinomial, typename ScalarType> | ||
383 | 652 | static kleidicv_error_t gaussian_blur( | |
384 | size_t kernel_size, const ScalarType *src, size_t src_stride, | ||
385 | ScalarType *dst, size_t dst_stride, Rectangle &rect, size_t y_begin, | ||
386 | size_t y_end, size_t channels, float sigma, FixedBorderType border_type, | ||
387 | SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING { | ||
388 |
10/12✓ Branch 0 taken 94 times.
✓ Branch 1 taken 102 times.
✓ Branch 2 taken 66 times.
✓ Branch 3 taken 32 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 32 times.
✓ Branch 6 taken 66 times.
✓ Branch 7 taken 66 times.
✓ Branch 8 taken 66 times.
✓ Branch 9 taken 64 times.
✗ Branch 10 not taken.
✓ Branch 11 taken 64 times.
|
652 | switch (kernel_size) { |
389 | case 3: | ||
390 | 160 | return gaussian_blur_fixed_kernel_size<3, IsBinomial>( | |
391 | 160 | src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, | |
392 | 160 | sigma, border_type, workspace); | |
393 | case 5: | ||
394 | 168 | return gaussian_blur_fixed_kernel_size<5, IsBinomial>( | |
395 | 168 | src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, | |
396 | 168 | sigma, border_type, workspace); | |
397 | case 7: | ||
398 | 132 | return gaussian_blur_fixed_kernel_size<7, IsBinomial>( | |
399 | 132 | src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, | |
400 | 132 | sigma, border_type, workspace); | |
401 | case 15: | ||
402 | // 15x15 does not have a binomial variant | ||
403 | 96 | return gaussian_blur_fixed_kernel_size<15, false>( | |
404 | 96 | src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, | |
405 | 96 | sigma, border_type, workspace); | |
406 | case 21: | ||
407 | // 21x21 does not have a binomial variant | ||
408 | 96 | return gaussian_blur_fixed_kernel_size<21, false>( | |
409 | 96 | src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, | |
410 | 96 | sigma, border_type, workspace); | |
411 | // gaussian_blur_is_implemented checked the kernel size already. | ||
412 | // GCOVR_EXCL_START | ||
413 | default: | ||
414 | assert(!"kernel size not implemented"); | ||
415 | − | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
416 | // GCOVR_EXCL_STOP | ||
417 | } | ||
418 | 652 | } | |
419 | |||
420 | 690 | static kleidicv_error_t gaussian_blur_fixed_stripe_u8_sc( | |
421 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
422 | size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, | ||
423 | size_t kernel_width, size_t /*kernel_height*/, float sigma_x, | ||
424 | float /*sigma_y*/, FixedBorderType fixed_border_type, | ||
425 | kleidicv_filter_context_t *context) KLEIDICV_STREAMING { | ||
426 | 690 | auto *workspace = reinterpret_cast<SeparableFilterWorkspace *>(context); | |
427 | 1380 | kleidicv_error_t checks_result = gaussian_blur_checks( | |
428 | 690 | src, src_stride, dst, dst_stride, width, height, channels, workspace); | |
429 | |||
430 |
2/2✓ Branch 0 taken 38 times.
✓ Branch 1 taken 652 times.
|
690 | if (checks_result != KLEIDICV_OK) { |
431 | 38 | return checks_result; | |
432 | } | ||
433 | |||
434 | 652 | Rectangle rect{width, height}; | |
435 | |||
436 |
2/2✓ Branch 0 taken 326 times.
✓ Branch 1 taken 326 times.
|
652 | if (sigma_x == 0.0) { |
437 | 652 | return gaussian_blur<true>(kernel_width, src, src_stride, dst, dst_stride, | |
438 | 326 | rect, y_begin, y_end, channels, sigma_x, | |
439 | 326 | fixed_border_type, workspace); | |
440 | } | ||
441 | |||
442 | 652 | return gaussian_blur<false>(kernel_width, src, src_stride, dst, dst_stride, | |
443 | 326 | rect, y_begin, y_end, channels, sigma_x, | |
444 | 326 | fixed_border_type, workspace); | |
445 | 690 | } | |
446 | |||
447 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
448 | |||
449 | #endif // KLEIDICV_GAUSSIAN_BLUR_SC_H | ||
450 |